In [8]:
import pathlib as pl
import pandas as pd
import collections as col

# Per-sample record: sample id -> {assembly unit -> file path}.
# The inner dict is created automatically on first access to a sample id.
sample_infos = col.defaultdict(dict)

# Root directory below which the assembly FASTA files are searched.
MOUNT_PREFIX = pl.Path("/mounts/hilbert/project")
# Text that make_remote() substitutes for MOUNT_PREFIX in collected paths.
REMOTE_PREFIX = "/gpfs/project"

# Directory holding the assemblies, relative to MOUNT_PREFIX.
_relative_assembly_dir = (
    "projects/medbioinf/data/00_RESTRUCTURE/project-centric/ceph_pedigree/2024-01/assemblies/males"
)
data_path = pl.Path(_relative_assembly_dir)

def make_remote(file_path):
    """Swap a leading ``MOUNT_PREFIX`` in a path for ``REMOTE_PREFIX``.

    Args:
        file_path: Path to translate; anything ``str()`` can convert
            (e.g. ``str`` or ``pathlib.Path``).

    Returns:
        str: The path with its leading ``MOUNT_PREFIX`` text replaced by
        ``REMOTE_PREFIX``. A path that does not start with
        ``MOUNT_PREFIX`` is returned unchanged (converted to ``str``).
    """
    file_path = str(file_path)
    mount_prefix = str(MOUNT_PREFIX)
    # Rewrite only the leading prefix: str.replace() would also rewrite any
    # later occurrence of the same text deeper inside the path.
    if file_path.startswith(mount_prefix):
        file_path = REMOTE_PREFIX + file_path[len(mount_prefix):]
    return file_path


# Assembly units every sample must provide to get a row in the sample sheet.
required_units = ("hap1", "hap2", "unassigned")

# Walk all FASTA files below the data directory and record, per sample and
# assembly unit, the remote path of the file.
for fasta_file in MOUNT_PREFIX.joinpath(data_path).glob("**/*.fasta"):
    filename = fasta_file.name
    # The sample name is taken from the parent of the file's own directory.
    sample_id = fasta_file.parents[1].name
    if not sample_id.startswith("NA"):
        # Explicit raise instead of `assert`: assertions are removed when
        # Python runs with -O, which would let bad names through silently.
        raise ValueError(
            f"unexpected sample directory name '{sample_id}' for file {fasta_file}"
        )
    sample_id = f"{sample_id}-CEPH"
    # The assembly unit is inferred from a substring of the file name.
    if "hap1" in filename or "haplotype1" in filename:
        asm_unit = "hap1"
    elif "hap2" in filename or "haplotype2" in filename:
        asm_unit = "hap2"
    elif "unassigned" in filename:
        asm_unit = "unassigned"
    else:
        print(f"skipping over {fasta_file}")
        continue
    sample_infos[sample_id][asm_unit] = make_remote(fasta_file)

# Fail early and name the offending sample; otherwise the row construction
# below would die with a bare KeyError that carries only the unit name.
for sample, units in sample_infos.items():
    missing = [unit for unit in required_units if unit not in units]
    if missing:
        raise ValueError(
            f"sample {sample} is missing assembly unit(s): {', '.join(missing)}"
        )

sample_sheet = [(k, d["hap1"], d["hap2"], d["unassigned"]) for k, d in sample_infos.items()]

sample_sheet = pd.DataFrame.from_records(
    sample_sheet,
    columns=["sample", "asm_hap1", "asm_hap2", "asm_unassigned"]
)
sample_sheet.sort_values("sample", inplace=True)
# Constant annotation columns, identical for every row.
sample_sheet["family"] = 1463
sample_sheet["sex"] = "male"

# Resolved against the current working directory.
out_tsv = pl.Path("../../samples/special_runs/cephped_all.tsv").resolve()
print(out_tsv)

with open(out_tsv, "w") as dump:
    # First line of the file is a '#'-prefixed timestamp comment.
    # NOTE(review): the literal has nine digits before the 'T'; presumably
    # "20240214T1648" was intended - confirm before changing the output.
    _ = dump.write("# 202400214T1648\n")
    sample_sheet.to_csv(dump, sep="\t", header=True, index=False)





/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/samples/special_runs/cephped_all.tsv
