In [3]:
import pathlib as pl
import datetime as dt
import pandas as pd

# Base directory below which the annotation tables of this notebook live
# (hard-coded, machine-specific path).
LOCAL_BASE = "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies"
PROJECT_BASE = pl.Path(LOCAL_BASE).resolve()

# Second hard-coded base path; presumably the project data location - confirm.
PROJECT_DATA_ROOT = pl.Path("/home/ebertp/work/projects/hgsvc").resolve()

# Timestamp with minute resolution, taken once when this cell is executed.
# TSNOW and TODAY are plain aliases of TIMESTAMP (same string, time included).
_ts = dt.datetime.now()
TIMESTAMP = f"{_ts:%Y%m%dT%H%M}"
TSNOW = TODAY = TIMESTAMP

# Load the HGSVC sample table (tab-separated, '#' lines are comments)
# and normalize the columns used further below.

HGSVC_SAMPLES_TABLE = PROJECT_BASE / "annotations" / "projectmng" / "hgsvc_samples.tsv"

HGSVC_SAMPLES = pd.read_csv(HGSVC_SAMPLES_TABLE, sep="\t", header=0, comment="#")

# Replace "GM" with "NA" in the sample IDs (presumably to unify the
# GM*/NA* spelling of the same sample - confirm).
HGSVC_SAMPLES["sample"] = HGSVC_SAMPLES["sample"].str.replace("GM", "NA")

# Lower-case the text columns so that comparisons are case-insensitive.
HGSVC_SAMPLES["sex"] = HGSVC_SAMPLES["sex"].str.lower()
HGSVC_SAMPLES["member"] = HGSVC_SAMPLES["member"].str.lower()

# Flag samples whose (lower-cased) family role is one of the child labels.
HGSVC_SAMPLES["is_child"] = HGSVC_SAMPLES["member"].isin(["son", "daughter", "child"])

# Load the data production status table (tab-separated, '#' lines are comments).
DATA_PRODUCTION_TABLE = PROJECT_BASE.joinpath(
    "annotations", "projectmng", "data_production_status.tsv"
)

DATA_PRODUCTION_STATUS = pd.read_csv(DATA_PRODUCTION_TABLE, sep="\t", header=0, comment="#")

# Lookup table: sample ID -> batch number. Every sample is registered under
# its original ID and under both the "GM" and the "NA" spelling so that a
# lookup works regardless of which spelling the caller uses.
HGSVC_SAMPLE_BATCH_NUMBERS = {}
for row in DATA_PRODUCTION_STATUS.itertuples():
    # Explicit check instead of `assert`: assertions are stripped when Python
    # runs with -O, which would let invalid batch numbers pass silently.
    if row.sample_batch not in (1, 2, 3, -1):
        raise ValueError(
            f"Unexpected sample batch number for sample {row.sample}: {row.sample_batch}"
        )
    HGSVC_SAMPLE_BATCH_NUMBERS[row.sample] = row.sample_batch
    HGSVC_SAMPLE_BATCH_NUMBERS[row.sample.replace("NA", "GM")] = row.sample_batch
    HGSVC_SAMPLE_BATCH_NUMBERS[row.sample.replace("GM", "NA")] = row.sample_batch

# Use `map` (a lookup) rather than `replace` (a substitution): with `replace`,
# a sample missing from the lookup table keeps its sample ID as the value of
# "batch_num", silently producing a column that mixes integers and strings.
# With `map`, such samples get NaN and are easy to detect.
HGSVC_SAMPLES["batch_num"] = HGSVC_SAMPLES["sample"].map(HGSVC_SAMPLE_BATCH_NUMBERS)

# Special function to load region annotations

# Tab-separated table of region-of-interest (ROI) coordinates that is
# read by load_ref_roi().
ROI_ANNOTATIONS = PROJECT_BASE / "annotations" / "roi" / "roi_ref_coords.tsv"

def _match_names(query, candidates):
    """Return the entries of *candidates* selected by *query* (case-insensitive).

    A unique exact match takes precedence; otherwise all candidates that
    contain *query* as a substring are returned.
    """
    needle = query.lower()
    exact = [name for name in candidates if name.lower() == needle]
    if len(exact) == 1:
        return exact
    return [name for name in candidates if needle in name.lower()]


def load_ref_roi(reference, roi_name, roi_table_path=None):
    """Look up the coordinates of a region of interest (ROI) in a reference.

    The ROI table is a tab-separated file ('#' starts a comment) from which
    the columns "reference", "name", "chrom", "start" and "end" are read.
    Both `reference` and `roi_name` are matched case-insensitively: a unique
    exact match is preferred, otherwise a unique substring match is accepted.
    Preferring the exact match keeps a name selectable even if it is a
    prefix of another entry (e.g. "hg38" next to "hg38-alt").

    Args:
        reference: (Partial) name of the reference.
        roi_name: (Partial) name of the region of interest.
        roi_table_path: Path or file-like object of the ROI table; defaults
            to the module-level ROI_ANNOTATIONS (resolved at call time).

    Returns:
        Tuple (chrom, start, end) taken from the single matching table row.

    Raises:
        ValueError: If the reference or the ROI cannot be matched, if the
            match is ambiguous, or if the table does not contain exactly
            one row for the selected reference / ROI combination.
    """
    if roi_table_path is None:
        roi_table_path = ROI_ANNOTATIONS
    roi_table = pd.read_csv(roi_table_path, sep="\t", header=0, comment="#")

    match_ref = _match_names(reference, roi_table["reference"].unique())
    match_roi = _match_names(roi_name, roi_table["name"].unique())

    if not match_ref or not match_roi:
        raise ValueError(f"Cannot match ref or ROI: {reference} / {roi_name}")
    if len(match_ref) > 1 or len(match_roi) > 1:
        raise ValueError(f"Ambiguous reference or ROI: {reference} / {roi_name}")
    select_ref, select_roi = match_ref[0], match_roi[0]

    selector = (roi_table["reference"] == select_ref) & (roi_table["name"] == select_roi)
    row = roi_table.loc[selector, :]
    # The reference and the ROI are matched independently of each other, so
    # the combination may be absent from (or duplicated in) the table.
    # Raise explicitly; an `assert` would vanish under `python -O`.
    if row.shape[0] != 1:
        raise ValueError(
            f"Expected exactly one ROI entry for {select_ref} / {select_roi}, "
            f"found {row.shape[0]}"
        )
    return row["chrom"].values[0], row["start"].values[0], row["end"].values[0]

    order_num phase   sample   family    member population supergroup     sex   
0           1   Yr1  NA19238     Y117    mother        YRI        AFR  female  \
1           2   Yr1  NA19239     Y117    father        YRI        YRI    male   
2           3   Yr1  NA19240     Y117  daughter        YRI        AFR  female   
3           4   Yr1  HG00731     PR05    father        PUR        AMR    male   
4           5   Yr1  HG00732     PR05    mother        PUR        AMR  female   
..        ...   ...      ...      ...       ...        ...        ...     ...   
64         65   Yr2  NA19384  NA19384         -        LWK        AFR    male   
65         66   Yr3  HG01890     BB05    father        ACB        AFR    male   
66         67   Yr3  NA19705     2368       son        ASW        AFR    male   
67         68   Yr3  HG03456     SL51       son        MSL        AFR    male   
68         69   Yr3  HG00358  HG00358         -        FIN        EUR    male   

    is_child  batch_num  
0

RuntimeError: No active exception to reraise