In [21]:
import pathlib
import pandas as pd
import collections as col
import difflib as diffl

%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"

_PROJECT_CONFIG_NB = str(pathlib.Path("00_project_config.ipynb").resolve(strict=True))

%run $_PROJECT_CONFIG_NB

def read_accession_table(file_path, load_columns, renamed=("file_accession", "project_accession", "file_name")):
    try:
        df = pd.read_csv(file_path, sep="\t", usecols=load_columns)
        df.rename(
            dict((lc, rc) for lc, rc in zip(load_columns, renamed)),
            axis=1, inplace=True
        )
    except ValueError:
        print(file_path.name)
        raise

    file_lut = dict()
    for row in df.itertuples():
        just_file = row.file_name
        if "/" in just_file:
            just_file = just_file.split("/")[-1] 
        if just_file.endswith(".fastq.gz"):
            plain = just_file.rsplit(".", 2)[0]
        elif just_file.endswith(".bam"):
            plain = just_file.rsplit(".", 1)[0]
        else:
            continue
            #raise ValueError(row.file_name)
        file_lut[plain] = (row.file_accession, row.project_accession)

    return file_lut
        
LOAD_FOLDER = PROJECT_BASE.joinpath("annotations", "external")

accession_tables = [
    (
        "20240522_HGSVC3-ENA_fastq_upload-summary_UW.YoungjunKwon.tsv",
        ("runId", "project_accession", "file_name")
    ),
    (
        "20240521_PacBioHiFi_runs_JAX.PilleHallast.tsv",
        ("id", "project_accession", "file_name")
    ),
    (
        "20240521_ONT-UL_runs_JAX.PilleHallast.tsv",
        ("id", "project_accession", "file_name")
    ),
    (
        "PRJEB58376_ENA_filereport.tsv",
        ("run_accession", "study_accession", "submitted_ftp")
    ),
    (
        "PRJNA339722_ENA_filereport.tsv",
        ("run_accession", "study_accession", "run_alias")
    ),
    (
        "PRJNA731524_ENA_filereport.tsv",
        ("run_accession", "study_accession", "run_alias")
    ),
    (
        "PRJNA988114_ENA_filereport.tsv",
        ("run_accession", "study_accession", "run_alias")
    ),
    (
        "PRJEB36100_ENA_filereport.tsv",
        ("run_accession", "study_accession", "run_alias")
    )
]

all_files = col.defaultdict(set)
for table_file, load_columns in accession_tables:
    acc_files = read_accession_table(LOAD_FOLDER.joinpath(table_file), load_columns)
    for k,v in acc_files.items():
        all_files[k].add(v)

data_freeze = pd.read_csv(
    PROJECT_BASE.joinpath("annotations", "data_freezes", "hgsvc3_assembly_data_sources.draft.tsv"),
    comment="#", sep="\t"
)

def check_approx_match(query, targets):

    sm = diffl.SequenceMatcher()
    sm.set_seq2(query)
    max_sim = 0
    selected_t = None
    for t in targets:
        sm.set_seq1(t)
        match = sm.find_longest_match()
        frac_q = match.size / len(query)
        frac_t = match.size / len(t)
        frac = max(frac_q, frac_t)
        if frac > max_sim:
            max_sim = frac
            selected_t = t
    if "m54329U_200715_194535" in query:
        print(query)
        print(selected_t)
        print(max_sim)
        raise
    if max_sim > 0.999:
        return selected_t
    elif max_sim > 0.8:
        print(query, " --- ", selected_t)
        return selected_t
    else:
        return None

missing = []
found = []
known_files = list(all_files.keys())
for row in data_freeze.itertuples():
    if row.sample == "NA24385":
        continue
    if row.filename == "all":
        continue
    if row.datatype == "strandseq":
        continue
    if row.filename.endswith(".fastq.gz"):
        plain = row.filename.rsplit(".", 2)[0]
    else:
        raise ValueError(row.filename)
    
    if plain not in all_files:
        approx_match = check_approx_match(plain, known_files)
        if approx_match is not None:
            file_accessions = all_files[approx_match]
            if len(file_accessions) > 1:
                dup_acc = 1
            else:
                dup_acc = 0
            for accessions in file_accessions:
                file_acc, proj_acc= accessions
                found.append((row.sample, row.datatype, plain, approx_match, row.accession, file_acc, proj_acc, dup_acc))
        else:
            missing.append((row.sample, row.datatype, plain, row.remote_path, row.accession))
    else:
        file_accessions = all_files[plain]
        if len(file_accessions) > 1:
            dup_acc = 1
        else:
            dup_acc = 0
        for accessions in file_accessions:
            file_acc, proj_acc= accessions
            found.append((row.sample, row.datatype, plain, "exact-match", row.accession, file_acc, proj_acc, dup_acc))

print(len(missing))
print(len(found))

with open("hgsvc3_missing_accessions.hifi-ont.ALL.tsv", "w") as dump:
    dump.write("\t".join(["sample", "datatype", "file_name", "remote_path", "accession"]) + "\n")
    for m in missing:
        dump.write("\t".join(m) + "\n")


HG00732_20200722_EEE_m64076_200603_055852.ccs  ---  m64076_200603_055852.Q20
HG00732_20200722_EEE_m64076_200601_234627.ccs  ---  m64076_200601_234627.Q20
HG00732_20200722_EEE_m54329U_200528_200534.ccs  ---  m54329U_200528_200534.Q20
m54329U_200715_194535.ccs
m54329U_200528_200534.Q20
0.44


RuntimeError: No active exception to reraise