In [5]:
import pathlib as pl
import re as re
import collections as col
import pandas as pd

# Selects which of the two project roots below is used to locate the input
# folders: the local mount when True, the remote path otherwise.
local_run = True

remote_imbb_path = pl.Path("/gpfs/project/projects/medbioinf")
local_imbb_mount = pl.Path("/mounts/hilbert/project/projects/medbioinf")

if local_run:
    data_root_path = local_imbb_mount
else:
    data_root_path = remote_imbb_path

# Directory whose entries are iterated as per-sample folders further below.
sample_fofn_path = data_root_path / "data/00_RESTRUCTURE/sample-centric"

# Maps sample name -> batch identifier; filled by load_sample_batch().
sample_batches = {}

# Output tables, resolved against the current working directory:
# - row_store: one row per sample (wide format)
# - eav_store: one row per (sample, key, value) record (long format)
row_store = pl.Path("../samples/vrk_samples.tsv").resolve()
eav_store = pl.Path("../samples/vrk_samples.eav.tsv").resolve()


def load_sample_batch(file_path, batch_id):
    """Register every sample listed in a file under the given batch.

    The file is read line by line; blank (whitespace-only) lines are
    skipped and surrounding whitespace is stripped from each entry. Every
    remaining line is used as a sample name and stored as a key in the
    module-level ``sample_batches`` dict with ``batch_id`` as its value.

    Args:
        file_path: Path to a text file with one sample name per line.
        batch_id: Batch identifier stored for every sample in the file.

    Returns:
        None. The function works by updating ``sample_batches`` in place.

    Raises:
        ValueError: If a sample is already present in ``sample_batches``,
            i.e. it is listed twice or was assigned by an earlier call.
    """
    with open(file_path, "r") as listing:
        for line in listing:
            sample = line.strip()
            if not sample:
                continue
            if sample in sample_batches:
                # Raised explicitly (instead of using ``assert``) so that the
                # duplicate check is not removed when running with ``-O``.
                raise ValueError(
                    f"Sample '{sample}' from '{file_path}' (batch '{batch_id}') "
                    f"is already assigned to batch '{sample_batches[sample]}'"
                )
            sample_batches[sample] = batch_id
    return


def fix_fofn_path(fofn_files):
    """Return the given paths as strings with the local mount swapped out.

    Each entry is converted with ``str`` and every occurrence of the string
    form of ``local_imbb_mount`` is replaced by the string form of
    ``remote_imbb_path``. Entries that do not contain the local mount are
    returned unchanged (as strings).

    Args:
        fofn_files: Iterable of path-like objects or strings.

    Returns:
        A new list of strings, in the same order as the input.
    """
    cluster_prefix = str(remote_imbb_path)
    mount_prefix = str(local_imbb_mount)

    rewritten = []
    for entry in fofn_files:
        rewritten.append(str(entry).replace(mount_prefix, cluster_prefix))
    return rewritten


# Each entry: (path to a sample listing file, batch identifier assigned to
# every sample named in that file).
sample_batch_files = [
    ("../annotations/20230402_complete_samples.tsv", "230402.1")
]

# load_sample_batch() is called only for its side effect on sample_batches,
# so a plain loop is used rather than building a throwaway list.
for fp, bid in sample_batch_files:
    load_sample_batch(fp, bid)

# handle special cases
sample_batches["NA24385"] = "230402.1"

# Maps a fofn file name -> extra sample names that additionally receive that
# file as input when it is encountered in a sample folder.
special_runs = {
    "HG00733_hifi_fastq.unknown-EEEext-20XX.fofn": ["HG00733red1", "HG00733red2"],
    "HG00733_ont_fastq.hgsvc-JAX-2021.fofn": ["HG00733red1", "HG00733red2"],
    "HG00733_ont_fastq.unknown-EEEext-20XX.fofn": ["HG00733red2"]
}


def _add_read_records(sample_name, read_type, fofn_paths):
    """Add one (sample, read_type, fofn) record per path to ``records``.

    If the file name of a path is a key in ``special_runs``, the same path is
    also recorded for each extra sample listed there, together with an
    "unphased" target record for that extra sample.

    Args:
        sample_name: Sample that owns the fofn files.
        read_type: Record key to use, e.g. "hifi" or "ont".
        fofn_paths: Iterable of fofn paths given as strings.
    """
    for fofn in fofn_paths:
        records.add((sample_name, read_type, fofn))
        fofn_name = fofn.rsplit("/", 1)[-1]
        # NOTE(review): no "batch" record is added for the extra samples
        # here - confirm that this is intended.
        for special_sample in special_runs.get(fofn_name, ()):
            records.add((special_sample, read_type, fofn))
            records.add((special_sample, "target", "unphased"))


# samples: names of all sample folders that contributed records
# records: set of (sample, key, value) tuples; a set so repeated entries
# (e.g. the target record of an extra sample) collapse into one
samples = []
records = set()
for sample_folder in sample_fofn_path.iterdir():
    if sample_folder.is_symlink():
        continue
    sample_name = sample_folder.name
    # presumably alternative names of samples that are listed under another
    # identifier - TODO confirm
    if sample_name.startswith("GM") or sample_name == "HG002":
        continue
    if sample_name not in sample_batches:
        continue
    hifi_data = list(sample_folder.glob("*hifi*fastq*.fofn"))
    ont_data = list(sample_folder.glob("*ont*fastq*.fofn"))
    # a sample is only recorded if both read types are available
    if not (hifi_data and ont_data):
        continue

    # recorded paths always use the remote prefix instead of the local mount
    hifi_fofn = fix_fofn_path(hifi_data)
    ont_fofn = fix_fofn_path(ont_data)

    samples.append(sample_name)
    records.add((sample_name, "target", "unphased"))
    records.add((sample_name, "batch", sample_batches[sample_name]))
    _add_read_records(sample_name, "hifi", hifi_fofn)
    _add_read_records(sample_name, "ont", ont_fofn)


print("\n".join(sorted(samples)))
print(len(samples))
      

# Long-format table: one row per collected (sample, key, value) record,
# sorted so that the written file has a stable row order.
eav_columns = ["sample", "key", "value"]
df = pd.DataFrame.from_records(sorted(records), columns=eav_columns)
df.to_csv(eav_store, sep="\t", index=False, header=True)

def make_csv(values):
    """Join the string forms of ``values`` into one comma-separated string.

    Every item is converted with ``str`` first and the resulting strings are
    then sorted, so the order is lexicographic on the text (for example
    ``10`` sorts before ``9``).
    """
    as_text = [str(item) for item in values]
    as_text.sort()
    return ",".join(as_text)

# Wide-format table: one row per sample and one column per record key;
# multiple values for the same (sample, key) are merged by make_csv().
column_order = ["target", "hifi", "ont", "batch"]
df = df.pivot_table(
    values="value", index="sample", columns="key", aggfunc=make_csv
)[column_order]
df.to_csv(row_store, sep="\t", index=True, header=True)


HG00268
HG00358
HG00733
HG01457
HG01505
HG02106
HG02282
HG02554
HG02666
HG03248
HG03452
HG03520
HG03807
HG04036
HG04217
NA18989
NA19129
NA19317
NA19320
NA19331
NA19347
NA19384
NA19434
NA19836
NA20355
NA21487
NA24385
27
