In [32]:
import pathlib as pl
import pandas as pd
import re

pd.set_option('future.no_silent_downcasting', True)

%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"
_PROJECT_CONFIG_NB = str(pl.Path("00_project_config.ipynb").resolve(strict=True))

%run $_PROJECT_CONFIG_NB

_MYNAME="norm-assm-stats-table-header"
_NBSTAMP=get_nb_stamp(_MYNAME)

verkko_table_file = PROJECT_BASE.joinpath("annotations", "autogen", "verkko_assemblies.hgsvc3.tsv")
verkko = pd.read_csv(verkko_table_file, sep="\t", comment="#", header=[0,1], index_col=[0,1,2])
verkko_acc_file = PROJECT_BASE.joinpath("annotations", "external", "20240716_ena-upload_assembly-acc.csv")
verkko_acc = pd.read_csv(verkko_acc_file)
verkko_acc["sample"] = verkko_acc["alias"].str.extract("([HGNAGM]{2}[0-9]{5})")
verkko_acc["sample"] = verkko_acc["sample"].str.replace("GM", "NA")
verkko_acc = verkko_acc.set_index("sample", inplace=False)
verkko_acc.rename({"id": "accession"}, axis=1, inplace=True)



def norm_stat_header(header):
    """Translate a raw statistics column name into a human-readable label.

    The header is split on "_". The token "grt" must appear at position 2 or 3
    (0-based) and must be followed by a size token (e.g. "0bp"). A size token
    of "0bp" means "no length threshold" and is omitted from the label; any
    other size token is rendered as " >{size} ".

    Args:
        header: Raw column name, e.g. "length_N50_grt_0bp".

    Returns:
        The normalized label, e.g. "Length N50 (bp)".

    Raises:
        ValueError: If the "grt" marker or the size token is missing, or if
            the header prefix is not one of the known statistics. The
            offending header is passed as the exception argument.
    """
    parts = header.split("_")

    # Locate the "grt" marker; guard the lookups so that short headers are
    # reported as ValueError like every other malformed header.
    for grt_idx in (2, 3):
        if grt_idx < len(parts) and parts[grt_idx] == "grt":
            size_idx = grt_idx + 1
            break
    else:
        raise ValueError(header)

    # The marker must be followed by the size token.
    if size_idx >= len(parts):
        raise ValueError(header)

    size_token = parts[size_idx]
    length_info = " " if size_token == "0bp" else f" >{size_token} "

    if header.startswith("cov_xfold"):
        return f"Coverage{length_info}(x-fold)"
    if header.startswith(("length_N50", "length_auN")):
        # parts[1] is the statistic name ("N50" / "auN")
        return f"Length {parts[1]}{length_info}(bp)"
    if header.startswith("total_length"):
        return f"Length{length_info}(bp)"
    if header.startswith("total_num"):
        return f"Sequences{length_info}(n)"
    if header.startswith("pct_dip"):
        # this label carries no length threshold information
        return "Relative length (% H1/H2)"
    raise ValueError(header)
    

# Rename the column level "sequence" to "asm_unit" and reduce the row index
# to the sample level by dropping "sample_num" and "verkko_batch".
verkko.columns = verkko.columns.rename("asm_unit", level="sequence")
verkko.index = verkko.index.droplevel(["sample_num", "verkko_batch"])

# Build the new row index (sample, sex, accession): the accession is looked
# up in verkko_acc, the sex is the first matching entry in HGSVC_SAMPLES.
index_tuples = []
for sample_id in verkko.index:
    accession = verkko_acc.at[sample_id, "accession"]
    is_sample = HGSVC_SAMPLES["sample"] == sample_id
    sample_sex = HGSVC_SAMPLES.loc[is_sample, "sex"].iloc[0]
    index_tuples.append((sample_id, sample_sex, accession))

new_index = pd.MultiIndex.from_tuples(
    index_tuples, names=["sample", "sex", "accession"]
)
verkko.index = new_index

# Replace the raw statistic names in the second column level with their
# normalized labels; the first column level is carried over unchanged.
new_columns = pd.MultiIndex.from_tuples(
    [
        (asm_unit, norm_stat_header(raw_stat))
        for asm_unit, raw_stat in verkko.columns
    ],
    names=["assembly_unit", "statistic"],
)
verkko.columns = new_columns

# Order the rows by the new index and show the result.
verkko = verkko.sort_index(axis=0)
print(verkko)

assembly_unit                       asm-hap1                                  \
statistic                  Coverage (x-fold) Length N50 (bp) Length auN (bp)   
sample  sex    accession                                                       
HG00096 male   ERZ24811348               1.0     133734599.0     120452682.0   
HG00171 female ERZ24811349               1.0     140737330.0     139859013.0   
HG00268 female ERZ24811369               1.0     146208399.0     144980692.0   
HG00358 male   ERZ24811367               1.0     135607687.0     139957044.0   
HG00512 male   ERZ24811360               1.0     134238662.0     126205847.0   
...                                      ...             ...             ...   
NA20355 female ERZ24811370               1.0     135831886.0     139534678.0   
NA20509 male   ERZ24811753               1.0     135259439.0     138789726.0   
NA20847 female ERZ24811755               1.0     102904928.0     113931144.0   
NA21487 female ERZ24811354              