# Stage 1: Remove Duplicates

In [1]:
import re
import di_parser
from di_parser import Document, document_from_xml
from pathlib import Path

## Helper Functions for Checking File Version

In [2]:
def separate_version_tag(fname:str) -> tuple[str, str]:
    """
    Split `fname` into its identity part and its trailing version tag.

    A version tag is a lowercase `v` followed by one or more digits at the end
    of the name, e.g. `"report_v12"` is split into `("report_", "v12")`.

    Returns `(fname, None)` when the name does not end in such a tag.
    """
    tagged = re.match(r'^(.*?)(v\d+)$', fname)
    if tagged is not None:
        # Group 1 is everything before the tag, group 2 is the tag itself.
        identity, tag = tagged.groups()
        return (identity, tag)
    return fname, None

def is_newer_version(A, B):
    """
    Tell whether filename `A` is a strictly newer version of the same document as `B`.

    Both names are split with `separate_version_tag`. The result is `True` only
    when both names have a non-empty identity part and a version tag, the
    identity parts are equal, and the version number of `A` is greater than
    the version number of `B`. In every other case the result is `False`.
    """
    id_A, version_A = separate_version_tag(A)
    id_B, version_B = separate_version_tag(B)

    # Names lacking a version tag or an identity part cannot be compared.
    if not (version_A and version_B and id_A and id_B):
        return False

    # Different identity parts mean these are different documents.
    if id_A != id_B:
        return False

    # Drop the "v" marker and compare the remaining digits numerically.
    number_A = int(version_A.replace("v", ""))
    number_B = int(version_B.replace("v", ""))
    return number_A > number_B

def has_newer_version(fname:str, doc_name_pool:list):
    """
    Tell whether `doc_name_pool` contains a newer version of `fname`.

    Returns `True` if `is_newer_version(candidate, fname)` holds for at least
    one `candidate` in `doc_name_pool`, otherwise `False` (including when the
    pool is empty).
    """
    # A generator (instead of a list) lets `any` stop at the first hit rather
    # than comparing against every name in the pool.
    return any(is_newer_version(candidate, fname) for candidate in doc_name_pool)

## Get File Names

In [3]:
# Index every `.di` file found anywhere under `root` by its stem (the file name
# without its final extension), mapping it to the file's absolute path.
# If two files share a stem, the one yielded later by `rglob` wins.
root = Path("./resources/2024-Oct")
fname_docfile_map = {
    di_file.stem: di_file.absolute() for di_file in root.rglob("*.di")
}

# Preview the first ten collected names.
print(list(fname_docfile_map)[:10])

['1133011517_2_07851929564', '1133011517_2_03215262820', '1133011517_1_01772187775', '1133011517_2_03859603659', '1133011517_2_01772187775', '1133011517_2_05920610854', '1133011517_2_09868822817', '1133011517_2_09567824840', '1133011517_2_05598817171', '33936849_1133011517_1']


## Get Unique, Latest File Versions

In [4]:
# Limit the size of the pool that goes through version comparison: only names
# that actually end in a version tag (`v<digits>`) need it.
# The split is based on the parsed tag rather than on the mere presence of the
# letter "v": an untagged name such as "inv_" would otherwise share a slot with
# "inv_v2" (both have the identity "inv_") and one of the two files would be
# silently dropped.
doc_has_version = [x for x in fname_docfile_map if separate_version_tag(x)[1]]
doc_no_version = [x for x in fname_docfile_map if not separate_version_tag(x)[1]]


doc_version_unique: dict[str, str] = {} # Collect the unique version of each document

# Each `doc_id` is a slot for a document; each slot finally contains the latest version of the document.
# An empty slot takes the document directly; an occupied slot is replaced only by a newer version.
for doc in doc_has_version:
    doc_id, version = separate_version_tag(doc)
    current = doc_version_unique.get(doc_id)
    if current is None or is_newer_version(doc, current):
        doc_version_unique[doc_id] = doc

# Final unique document name pool: the latest version of each versioned
# document, followed by every document that has no version tag.
doc_name_pool = list(doc_version_unique.values()) + doc_no_version
doc_path_pool = [fname_docfile_map[x] for x in doc_name_pool]

## Finalize: Copy Files to Output Folder

In [5]:
import shutil

In [6]:
# Copy every selected document into the output folder, keeping its file name.
out_root = Path("./resources/2024-Oct-unique")
out_root.mkdir(exist_ok=True)  # reuse the folder if it already exists

for doc_path in doc_path_pool:
    destination = out_root.joinpath(doc_path.name)
    shutil.copy(doc_path, destination)