In [41]:
import pathlib as pl
import pandas as pd
import re

# Opt in to the pandas option 'future.no_silent_downcasting' for this session.
pd.set_option('future.no_silent_downcasting', True)

# Change into the notebooks directory (-q: do not echo the new path) so that the
# relative notebook name below can be resolved.
%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"
# strict=True makes resolve() raise if the config notebook does not exist.
_PROJECT_CONFIG_NB = str(pl.Path("00_project_config.ipynb").resolve(strict=True))

# Execute the project config notebook.
# NOTE(review): presumably this is what defines get_nb_stamp, PROJECT_BASE,
# HGSVC_SAMPLES and TABLE_OUT_SUPPL used further down -- none of them are
# defined in this cell; confirm against 00_project_config.ipynb.
%run $_PROJECT_CONFIG_NB

# Identifier of this notebook and the stamp derived from it by get_nb_stamp().
_MYNAME="clean-other-data-tables"
_NBSTAMP=get_nb_stamp(_MYNAME)

# Input tables as (data type label, path to TSV file) pairs. The label is used
# for the progress output, the "data_type" column and the output file name.
data_tables = [
    ("HiC", PROJECT_BASE.joinpath("annotations", "external", "20240521_hgsvc_HiC_runs.PH.tsv")),
    ("IsoSeq", PROJECT_BASE.joinpath("annotations", "external", "20240521_hgsvc_isoseq_runs.PH.tsv")),
    ("RNASeq", PROJECT_BASE.joinpath("annotations", "external", "20240521_hgsvc_rnaseq_runs.PH.tsv")),
]

# Columns retained from each input table; "accession" is the input column "id"
# after it has been renamed in the processing loop.
keep_columns = [
    "accession", "project_accession", "sample_alias", "library_name", "file_name"
]

def assert_sample_id(sample):
    """Normalize a sample identifier and look up its sex in HGSVC_SAMPLES.

    Args:
        sample: Sample identifier as found in an input table.

    Returns:
        A ``(sample, sex)`` tuple. Identifiers listed in HGSVC_SAMPLES are
        returned unchanged, "HG02818-replacement" is mapped to "HG02818",
        and the explicitly listed identifiers below yield ``(None, None)``.

    Raises:
        ValueError: If the identifier is neither listed in HGSVC_SAMPLES nor
            covered by one of the special cases.
    """
    known_samples = HGSVC_SAMPLES["sample"]

    if sample in known_samples.values:
        canonical = sample
    elif sample in ("GM19320", "NA19320", "HG02016", "NA19219"):
        # These identifiers are reported as "no sample"; callers drop them.
        return None, None
    elif sample == "HG02818-replacement":
        canonical = "HG02818"
    else:
        raise ValueError(sample)

    # Use the sex recorded in the first row matching the canonical sample.
    matching_sex = HGSVC_SAMPLES.loc[known_samples == canonical, "sex"]
    return canonical, matching_sex.iloc[0]


# Clean each accession table: keep selected columns, attach normalized sample
# and sex, drop rows without a usable sample, and write the result as TSV.
for data_label, data_table in data_tables:
    if data_label == "HiC":
        print("skip until debugged: ", data_label)
        continue
    print(data_label)

    # Load, rename "id" to "accession" and restrict to the columns of interest.
    table = (
        pd.read_csv(data_table, sep="\t", header=0)
        .rename(columns={"id": "accession"})
        .loc[:, keep_columns]
        .copy()
    )

    # One (sample, sex) tuple per row, derived from the library name; rows
    # whose sample resolves to None are discarded.
    sample_sex = table["library_name"].apply(assert_sample_id)
    selector = [pair[0] is not None for pair in sample_sex]
    table = table.loc[selector, :].reset_index(drop=True)

    # Build the sample/sex columns on the fresh index so the join aligns row by row.
    sample_sex = pd.DataFrame.from_records(
        [pair for pair, keep in zip(sample_sex, selector) if keep],
        index=table.index,
        columns=["sample", "sex"],
    )
    table = (
        table.join(sample_sex)
        .drop(columns="library_name")
        .set_index(["sample", "sex"])
    )
    table.insert(0, "data_type", data_label)
    table = table.sort_index()

    out_file_name = f"table_SXPE_{data_label}_accessions.tsv"
    out_file_path = TABLE_OUT_SUPPL.joinpath(out_file_name)
    table.to_csv(out_file_path, sep="\t", header=True, index=True)
    
    

skip until debugged:  HiC
IsoSeq
RNASeq
