In [1]:
%run "../00_project_config.ipynb"

import pathlib as pl
import pandas as pd
import re

# Only samples whose "batch_num" equals this value are exported.
DUMP_BATCH_NUM = 2

# Directory that is searched for "*.summary.tsv" files.
input_path = PROJECT_DATA_ROOT.joinpath("2023_assm_stats/verkko/eval_stage/summaries")

# Destination of the generated table; the batch number is part of the file name.
output_table_sheet = PROJECT_BASE.joinpath(
    "annotations",
    "autogen",
    f"verkko_batch{DUMP_BATCH_NUM}_assemblies.tsv",
)

# Collect one long-format table per summary file that belongs to the selected
# batch. File names are split on "."; part 0 is the sample name and part 2 the
# assembly unit (stored in the "sequence" column).
batch_tables = []
# Sorted so that the processing order does not depend on the file system.
for tsv_file in sorted(input_path.glob("*.summary.tsv")):
    name_parts = tsv_file.name.split(".")
    sample = name_parts[0]
    # Select the sample's annotation row(s) once and read both numbers from it;
    # ".values[0]" raises IndexError if the sample is not listed in HGSVC_SAMPLES.
    sample_info = HGSVC_SAMPLES.loc[HGSVC_SAMPLES["sample"] == sample]
    sample_num = int(sample_info["order_num"].values[0])
    sample_batch = int(sample_info["batch_num"].values[0])
    if sample_batch != DUMP_BATCH_NUM:
        continue
    asm_unit = name_parts[2]
    df = pd.read_csv(tsv_file, sep="\t", header=0)
    if df.empty:
        # A summary without data rows is represented by explicit zero
        # length / zero count records so the unit still shows up in the table.
        df = pd.DataFrame(
            [
                ["all", "total_length_grt_0bp", 0, sample, sample_num, asm_unit],
                ["all", "total_num_grt_0bp", 0, sample, sample_num, asm_unit],
            ],
            columns=["source", "statistic", "value", "sample", "sample_num", "sequence"],
        )
    else:
        df["sample"] = sample
        df["sample_num"] = sample_num
        df["sequence"] = asm_unit
    batch_tables.append(df)

if not batch_tables:
    # pd.concat() would fail with an unspecific "No objects to concatenate".
    raise ValueError(
        f"No '*.summary.tsv' files for batch {DUMP_BATCH_NUM} found in {input_path}"
    )

merged = pd.concat(batch_tables, axis=0, ignore_index=False)
# Reshape the long table into a wide one: one row per (sample, sample_num),
# one column per (sequence, statistic).
merged.drop("source", axis=1, inplace=True)

row_index = pd.MultiIndex.from_tuples(
    sorted({(row.sample, int(row.sample_num)) for row in merged.itertuples()}),
    names=["sample", "sample_num"],
)
# The sorted group keys are the observed (sequence, statistic) combinations;
# size() yields them as a MultiIndex without materialising every group.
column_index = merged.groupby(["sequence", "statistic"]).size().index

# Start from an all-NaN float64 frame. An empty object-dtype frame would make
# the fillna() below depend on implicit object -> numeric downcasting, which
# newer pandas versions deprecate, and the arithmetic on ".values" further down
# needs numeric arrays.
pivot = pd.DataFrame(
    float("nan"), index=row_index, columns=column_index, dtype="float64"
)
# Filled cell by cell: if a combination occurs more than once, the last row wins.
for row in merged.itertuples():
    pivot.loc[(row.sample, row.sample_num), (row.sequence, row.statistic)] = row.value
# Combinations that were never observed (or were NaN in the input) count as 0.
pivot = pivot.fillna(0)
    
# Rank of each sequence type; lower ranks are placed further left when the
# columns of the output table are ordered.
sort_seq = {
    seq_name: rank
    for rank, seq_name in enumerate(
        (
            "asm-hap1",
            "asm-hap2",
            "asm-unassigned",
            "asm-rdna",
            "asm-disconnected",
            "contaminants",
        )
    )
}

aun = "length_auN_grt_0bp"
length = "total_length_grt_0bp"

col_levels = ["sequence", "statistic"]

# Combined length of both haplotypes per sample. The loop below only appends
# columns for other sequence types, so this is computed once up front.
hap1_length = pivot.xs(("asm-hap1", length), level=col_levels, axis=1).values
hap2_length = pivot.xs(("asm-hap2", length), level=col_levels, axis=1).values
dip_length = hap1_length + hap2_length

# Add, for each of the two sequence types, its total length expressed as a
# percentage of the combined hap1 + hap2 length (rounded to 3 decimals).
for other_seq in ["asm-unassigned", "asm-disconnected"]:
    other_length = pivot.xs((other_seq, length), level=col_levels, axis=1)
    pct_length = pd.DataFrame(
        (other_length.values / dip_length * 100).round(3),
        index=pivot.index,
        columns=pd.MultiIndex.from_tuples(
            [(other_seq, "pct_dip_length_grt_0bp")], names=col_levels
        ),
    )
    pivot = pivot.merge(pct_length, left_index=True, right_index=True)

# Order the rows by the numeric sample order.
pivot.sort_index(axis=0, level="sample_num", inplace=True)


def sort_column_index(column_index, seq_order=None):
    """Return the columns ordered by sequence type, then by size threshold.

    Every statistic name must contain a size threshold of the form ``0bp``,
    ``<N>kbp`` or ``<N>Mbp``; the leftmost such token determines the
    statistic's rank in base pairs. Columns are ordered by the rank of their
    sequence type, then by that threshold, then by the (sequence, statistic)
    tuple itself.

    Args:
        column_index: Two-level column index with levels named "sequence"
            and "statistic".
        seq_order: Optional mapping from sequence name to sort rank.
            Defaults to the module-level ``sort_seq`` table.

    Returns:
        A ``pandas.MultiIndex`` named ("sequence", "statistic") holding the
        same tuples as ``column_index`` in sorted order.

    Raises:
        ValueError: If a statistic name contains no size threshold; the
            offending name is the exception argument.
        KeyError: If a sequence name has no entry in ``seq_order``.
    """
    if seq_order is None:
        seq_order = sort_seq

    # "0bp" is tried first at every position, exactly like a plain
    # "(0bp|[0-9]+[kM]bp)" alternation; the groups are only set for the
    # kbp / Mbp form.
    threshold_pattern = re.compile(r"0bp|(?P<number>[0-9]+)(?P<unit>[kM])bp")
    unit_factor = {"k": 1_000, "M": 1_000_000}

    stat_rank = {}
    for stat_name in set(column_index.get_level_values("statistic")):
        mobj = threshold_pattern.search(stat_name)
        if mobj is None:
            raise ValueError(stat_name)
        number = mobj.group("number")
        if number is None:
            # Matched the literal "0bp" alternative.
            stat_rank[stat_name] = 0
        else:
            stat_rank[stat_name] = int(number) * unit_factor[mobj.group("unit")]

    sorted_tuples = sorted(
        column_index.to_flat_index(),
        key=lambda col: (seq_order[col[0]], stat_rank[col[1]], col),
    )

    return pd.MultiIndex.from_tuples(sorted_tuples, names=["sequence", "statistic"])

# Bring the columns into their final order before printing and exporting.
sorted_mindex_cols = sort_column_index(pivot.columns)
pivot = pivot.loc[:, sorted_mindex_cols]
print(pivot)

# Write four "#"-prefixed header lines followed by the tab-separated table,
# including its row index and column header rows.
with open(output_table_sheet, "w") as table_out:
    header_lines = (
        "# AUTOGEN TABLE - DO NOT EDIT\n",
        f"# {TODAY}\n",
        f"# Verkko v1.4.1 batch {DUMP_BATCH_NUM} phased assemblies\n",
        f"# {pivot.index.get_level_values('sample').nunique()} samples\n",
    )
    for header_line in header_lines:
        table_out.write(header_line)
    pivot.to_csv(table_out, sep="\t", index=True, header=True)

sequence                            asm-hap1                      
statistic          cov_xfold_grt_0bp_at_3Gbp length_N50_grt_0bp   
sample  sample_num                                                
HG00731 4                                1.0        146173010.0  \
HG03732 10                               1.0        136436893.0   
HG02011 18                               1.0        137566759.0   
HG03371 20                               1.0        146394654.0   
HG02492 21                               1.0        135396680.0   
HG03009 22                               1.0        143732596.0   
HG03065 24                               1.0        134540910.0   
NA18534 25                               1.0        146360782.0   
NA19650 27                               1.0        135197658.0   
NA20847 30                               1.0        102904928.0   
NA20509 31                               1.0        135259439.0   
HG01596 33                               1.0        134755268.