In [1]:
import pathlib as pl
import re as re
import collections as col
import pandas as pd
import datetime as dt

# Toggle between the locally mounted project tree and the remote one.
local_run = True

local_imbb_mount = pl.Path("/mounts/hilbert/project/projects/medbioinf")
remote_imbb_path = pl.Path("/gpfs/project/projects/medbioinf")

# All data paths below are derived from whichever root is active.
if local_run:
    data_root_path = local_imbb_mount
else:
    data_root_path = remote_imbb_path

# Folder that is scanned for per-sample subfolders holding *.fofn files.
sample_fofn_path = data_root_path / "data/00_RESTRUCTURE/sample-centric"

# Annotation table consumed by load_hgsvc_cells (kept as a relative path).
hgsvc_cells = pl.Path("../annotations/hgsvc_cells.tsv")

# Output tables: long (entity/attribute/value) and wide (one row per sample).
eav_store = pl.Path("../samples/vrk_samples.eav.tsv").resolve()
row_store = pl.Path("../samples/vrk_samples.tsv").resolve()


def load_hgsvc_cells(file_path):
    """Load the HGSVC cell table and keep only rows marked as complete.

    The first line of the file must be a comment line starting with "#";
    its last whitespace-separated token is returned as the timestamp. The
    remainder of the file is parsed as a tab-separated table with a header
    row, and only rows whose "HHU_complete" column equals "yes" are kept.

    Args:
        file_path: Path of the annotation table to read.

    Returns:
        A tuple (df, ts): the filtered table as an independent copy, and
        the timestamp token taken from the leading comment line.

    Raises:
        ValueError: If the first line does not start with "#", or if no
            row is marked as complete.
    """
    with open(file_path, "r") as table:
        ts_line = table.readline().strip()
        # Explicit check instead of assert: asserts are removed under
        # "python -O", which would let a malformed file slip through and
        # produce a meaningless timestamp.
        if not ts_line.startswith("#"):
            raise ValueError(
                f"Expected a leading '#' timestamp line in {file_path}, "
                f"found: {ts_line!r}"
            )
        ts = ts_line.split()[-1]
        # The handle is positioned after the comment line, so pandas sees
        # the column header as the first line.
        df = pd.read_csv(table, sep="\t", header=0)
    select_complete = df["HHU_complete"] == "yes"
    df = df.loc[select_complete, :].copy()
    if df.empty:
        raise ValueError(
            f"No rows with HHU_complete == 'yes' found in {file_path}"
        )
    return df, ts


def fix_fofn_path(fofn_files):
    """Return the given paths as strings that point to the remote tree.

    Each entry is converted to a string and every occurrence of the local
    mount path (local_imbb_mount) is replaced with the remote path
    (remote_imbb_path). Entries that do not contain the local mount path
    are returned unchanged, apart from the string conversion.
    """
    old_prefix = str(local_imbb_mount)
    new_prefix = str(remote_imbb_path)

    rewritten = []
    for fofn in fofn_files:
        rewritten.append(str(fofn).replace(old_prefix, new_prefix))
    return rewritten


# Table of complete samples plus the timestamp from the table's header line.
complete_data, timestamp = load_hgsvc_cells(hgsvc_cells)

# Lookup from sample identifier ("sin" column) to its "sample_num" value.
sample_order = {
    row.sin: row.sample_num for _, row in complete_data.iterrows()
}
complete_samples = set(complete_data["sin"])

# FOFN file names whose reads are additionally registered under the listed
# extra sample names.
special_runs = {
    "HG00733_hifi_fastq.unknown-EEEext-20XX.fofn": ["HG00733red1", "HG00733red2"],
    "HG00733_ont_fastq.hgsvc-JAX-2021.fofn": ["HG00733red1", "HG00733red2"],
    "HG00733_ont_fastq.hgsvc-UW-2023.fofn": ["HG00733red2"],
}


# samples: (order number, sample name) for every sample that gets registered.
# records: (sample, key, value) triples; a set, so repeated adds are harmless.
samples = []
records = set()
for sample_folder in sample_fofn_path.iterdir():
    if sample_folder.is_symlink():
        continue
    sample_name = sample_folder.name
    if sample_name == "HG002" or sample_name.startswith("GM"):
        continue
    # The identifier is the folder name with its first two characters
    # replaced by the "SIN:" prefix.
    sin = f"SIN:{sample_name[2:]}"
    if sin not in complete_samples:
        continue
    order_num = sample_order[sin]
    hifi_data = list(sample_folder.glob("*hifi*fastq*.fofn"))
    ont_data = list(sample_folder.glob("*ont*fastq*.fofn"))
    # A sample is only registered if it has both HiFi and ONT file lists.
    if not (hifi_data and ont_data):
        continue

    hifi_fofn = fix_fofn_path(hifi_data)
    ont_fofn = fix_fofn_path(ont_data)
    samples.append((order_num, sample_name))
    records.update((
        (sample_name, "target", "unphased"),
        (sample_name, "batch", timestamp),
        (sample_name, "hgsvc_sample_num", order_num),
    ))

    # HiFi and ONT lists are handled identically; only the record key differs.
    for read_type, fofn_list in (("hifi", hifi_fofn), ("ont", ont_fofn)):
        for fofn in fofn_list:
            records.add((sample_name, read_type, fofn))
            # File lists named in special_runs are also attached to the
            # extra sample names, which inherit the same metadata.
            for sn in special_runs.get(fofn.rsplit("/")[-1], ()):
                records.update((
                    (sn, read_type, fofn),
                    (sn, "target", "unphased"),
                    (sn, "hgsvc_sample_num", order_num),
                    (sn, "batch", timestamp),
                ))
        
        
# Report the registered samples, ordered by their sample number, and the count.
sample_listing = "\n".join(f"{n} - {s}" for n, s in sorted(samples))
print(sample_listing)
print(len(samples))

# Long-format table: one row per (sample, key, value) record.
eav_rows = sorted(records)
df = pd.DataFrame.from_records(eav_rows, columns=["sample", "key", "value"])
df.to_csv(eav_store, sep="\t", header=True, index=False)

def make_csv(values):
    """Join the string forms of all values, sorted as strings, with commas."""
    as_text = [str(item) for item in values]
    as_text.sort()
    return ",".join(as_text)

# Wide-format table: one row per sample and one column per key. Keys with
# several values for a sample are collapsed into one cell by make_csv.
wide_columns = ["hgsvc_sample_num", "target", "hifi", "ont", "batch"]
df = df.pivot_table(
    values="value",
    index="sample",
    columns="key",
    aggfunc=make_csv,
)
# Fix the column order of the output file.
df = df[wide_columns]
df.to_csv(row_store, sep="\t", header=True, index=True)


5 - HG00732
6 - HG00733
12 - HG00171
15 - HG02018
16 - NA19036
19 - NA19983
26 - HG01505
34 - NA24385
36 - HG02818
39 - HG01352
40 - HG02059
41 - NA19434
42 - HG04217
43 - HG03807
44 - NA19836
45 - HG02106
46 - HG00268
47 - NA20355
48 - NA19320
49 - NA19129
50 - HG02769
51 - HG03452
52 - HG03520
53 - HG02282
54 - HG02554
55 - HG02953
56 - NA21487
57 - NA18989
58 - NA19331
59 - HG02666
60 - NA19317
61 - NA19347
62 - HG03248
63 - HG04036
64 - HG01457
65 - NA19384
69 - HG00358
37
