# CZ ID long-read-mngs benchmark

In [None]:
import os
def param(key, default):
    """Resolve one notebook parameter from the process environment.

    Args:
        key: name of the environment variable to read.
        default: value to use when the variable is not set.

    Returns:
        The environment value if present, otherwise ``default``. The resolved
        ``key = value`` pair is also printed so it shows up in the notebook output.
    """
    resolved = os.getenv(key, default)
    print(f"{key} = {resolved}")
    return resolved
# Notebook inputs; each one can be overridden through its environment variable.
# Label for this benchmark run.
run_name = param(key="RUN_NAME", default="long-read-mngs")
# Colon-separated list of harvested taxa JSON files.
datafiles = param(key="HARVEST_DATA", default="combined_taxa.json")
# Optional inputs: the cells that consume them are skipped while they are unset.
ground_truth = param(key="GROUND_TRUTH", default=None)
step_counts = param(key="STEP_COUNTS", default=None)
reference_lib = param(key="REF_LIB", default=None)

In [None]:
import json
import math
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
# Merge every harvested JSON file into a single dict; on key collisions the
# file listed later wins (dict.update semantics).
data = {}
for datafile in datafiles.split(":"):
    if not datafile:
        # empty path segments (e.g. from a stray ':') are ignored
        continue
    with open(datafile) as infile:
        data.update(json.load(infile))

# The reference library is optional: it is read only when a path was given
# and that path is an existing file; otherwise ref_data stays empty.
ref_data = {}
if reference_lib and os.path.isfile(reference_lib):
    with open(reference_lib) as infile:
        ref_data = json.load(infile)

In [None]:
# Shared deviation classifier. Any non-green result also (re)creates an empty
# sentinel file, so its presence after a run signals "at least one deviation".
SENTINEL_FN = ".long-read-mngs-benchmarks-deviation"

# Start from a clean slate: drop a sentinel left over from a previous run.
try:
    os.remove(SENTINEL_FN)
except FileNotFoundError:
    pass


def colorcode(err):
    """Map a relative error to 'green', 'yellow' or 'red'.

    Args:
        err: relative error as a float; anything that is not a float, or is NaN,
            is treated as a deviation.

    Returns:
        'green' for a float error <= 0.01, 'yellow' for a float error <= 0.1,
        'red' otherwise (including NaN and non-float values).

    Side effects:
        For every non-green result the sentinel file SENTINEL_FN is created
        (or truncated) in the current working directory.
    """
    is_float = isinstance(err, float)
    if is_float and not math.isnan(err) and err <= 0.01:
        return 'green'

    # Deviation detected: touch the sentinel before deciding on the severity.
    with open(SENTINEL_FN, 'w'):
        pass

    if is_float and err <= 0.1:
        return 'yellow'
    return 'red'

## read & contig summary counts

In [None]:
pd.set_option("display.precision", 2)

def error_colors(counts_df):
    """Build the per-cell CSS table used to color the step-count comparison.

    Meant to be passed to ``Styler.apply(..., axis=None)``. For each row the
    relative error between the first two columns, ``(col0 - col1) / col0``, is
    classified by ``colorcode`` and written as a background color into the
    ``run_1_step_counts`` (light shade) and ``ref_step_counts`` (full shade)
    cells of that row.

    Args:
        counts_df: step-count table whose first two columns are compared and
            which contains the ``run_1_step_counts`` and ``ref_step_counts``
            columns.

    Returns:
        DataFrame of CSS strings with the same index and columns as
        ``counts_df``; cells that are not colored hold ''.

    Raises:
        KeyError: if either of the two colored columns is missing.
    """
    run_col, ref_col = "run_1_step_counts", "ref_step_counts"
    colors_df = pd.DataFrame('', index=counts_df.index, columns=counts_df.columns)
    # .loc assignment below would silently add a missing column, so fail loudly instead.
    missing = [col for col in (run_col, ref_col) if col not in colors_df.columns]
    if missing:
        raise KeyError(f"step-count table lacks expected column(s): {missing}")

    diff = (counts_df.iloc[:, 0] - counts_df.iloc[:, 1]) / counts_df.iloc[:, 0]
    # Series.items() replaces iteritems(), which no longer exists in pandas 2.x.
    for ind, err in diff.items():
        color = colorcode(math.fabs(err))
        # CSS has no 'lightred'; use 'lightcoral' so red deviations are highlighted too.
        light = 'lightcoral' if color == 'red' else 'light' + color
        # Single-step .loc writes: chained assignment (df[col][ind] = ...) is not
        # guaranteed to modify colors_df.
        colors_df.loc[ind, run_col] = 'background-color: ' + light
        colors_df.loc[ind, ref_col] = 'background-color:' + color
    return colors_df

# Step-count summary: shown only when a step-counts TSV was configured.
if step_counts:
    step_count_df = pd.read_csv(step_counts, sep="\t", index_col=0)
    try:
        step_count_df = step_count_df.style.apply(error_colors, axis=None)
    except Exception as exc:
        # Color-coding is best-effort: the plain table is still displayed, but
        # the reason is reported instead of being swallowed silently.
        print(f"WARNING: step counts shown without color-coding ({type(exc).__name__}: {exc})")
    display(step_count_df)

## taxa accuracy vs truth sets

In [None]:
# Precision/recall curves against the truth set, one line per database.
if ground_truth:
    ground_truth_df = pd.read_json(ground_truth)

    # One long-format frame: the NT rows first, then the NR rows.
    prdf = pd.concat([
        pd.DataFrame.from_dict({
            "precision": ground_truth_df[db]["aupr"]["precision"],
            "recall": ground_truth_df[db]["aupr"]["recall"],
            "db": [db] * len(ground_truth_df[db]["aupr"]["recall"]),
        })
        for db in ("NT", "NR")
    ]).reset_index(drop=True)

    # The area under each curve goes into the plot title, rounded to 4 places.
    aupr_scores = {db: round(ground_truth_df[db]['aupr']['aupr'], 4) for db in ("NT", "NR")}
    pr_axes = sns.lineplot(data=prdf, x="recall", y="precision", hue="db", ci=None)
    pr_axes.set_title(f"AUPR: NT={aupr_scores['NT']} NR={aupr_scores['NR']}")
    plt.xlim(0, 1.05)
    plt.ylim(0, 1.05)
    plt.show()


## taxa abundance spectra

In [None]:
def taxa_dataframe(sample_data, db):
    """Tabulate the per-taxon abundance/assembly fields of one database.

    Args:
        sample_data: mapping of tax_id -> record; each record must provide
            "tax_name", "bPM", "contigs_reads" and "contigs_N50".
        db: prefix for the output column names (the callers pass "NT" or "NR").

    Returns:
        DataFrame indexed by tax_id with one ``{db}_<field>`` column per field.
    """
    fields = ("tax_name", "bPM", "contigs_reads", "contigs_N50")
    rows = {}
    for tax_id, record in sample_data.items():
        rows[tax_id] = {f"{db}_{field}": record[field] for field in fields}
    return pd.DataFrame.from_dict(rows, orient="index")

def joined_taxa_dataframe(sample_data):
    """Combine the NT and NR taxa tables of one sample into a single table.

    Args:
        sample_data: mapping with "NT" and "NR" entries, each in the form
            accepted by ``taxa_dataframe``.

    Returns:
        DataFrame indexed by tax_id (union of both databases) with the columns
        tax_name, max_bPM and the NT/NR contig stats, sorted by max_bPM in
        descending order.
    """
    nt_taxa, nr_taxa = (taxa_dataframe(sample_data[db], db) for db in ("NT", "NR"))

    # Outer join on tax_id keeps taxa that were reported by only one database.
    merged = pd.merge(nt_taxa, nr_taxa, how="outer", left_index=True, right_index=True)

    # Prefer the NT name and fall back to the NR name where NT has none.
    merged["tax_name"] = merged["NT_tax_name"].combine_first(merged["NR_tax_name"])
    # Row-wise maximum of the two abundance values.
    merged["max_bPM"] = merged[["NT_bPM", "NR_bPM"]].max(axis=1)

    output_columns = [
        "tax_name", "max_bPM",
        "NT_contigs_reads", "NT_contigs_N50",
        "NR_contigs_reads", "NR_contigs_N50",
    ]
    return merged[output_columns].sort_values(by="max_bPM", ascending=False)

# Build the taxa table for the (single) sample and, if a reference library was
# loaded, place each reference column next to its counterpart.
taxa_tables = {}
sample = "sample"
sample_data = data
joined = joined_taxa_dataframe(sample_data)
if ref_data:
    # Reference columns carry a ".REF" suffix so they can sit beside the run's columns.
    joined_ref = joined_taxa_dataframe(ref_data).add_suffix(".REF")
    joined = pd.merge(joined, joined_ref, how="outer", left_index=True, right_index=True)

    # Keep one name column: the run's name, else the reference's.
    joined["tax_name"] = joined["tax_name"].combine_first(joined["tax_name.REF"])
    joined = joined.drop(columns="tax_name.REF")

    # Name and abundance columns first, then the remaining columns alphabetically,
    # which puts every measure directly before its ".REF" twin.
    leading_columns = ["tax_name", "max_bPM", "max_bPM.REF"]
    column_order = leading_columns + [
        col for col in sorted(joined.columns) if col not in leading_columns
    ]
    joined = joined.reindex(column_order, axis=1)
    # NOTE(review): the table is recorded only when a reference is present;
    # confirm that taxa_tables is not expected to be filled otherwise.
    taxa_tables[sample] = joined
    
def error_colors(joined):
    """Build the per-cell CSS table comparing each taxon's stats to the reference.

    Meant to be passed to ``Styler.apply(..., axis=None)``. For every measure
    and taxon, the relative error ``|(v - vref) / vref|`` between the run value
    and its ``<measure>.REF`` counterpart is classified by ``colorcode``; the
    run cell gets the light shade and the reference cell the full shade.

    Special cases: equal values, or both values NaN, count as zero error; a
    reference value of zero with a differing run value counts as infinite error.

    Args:
        joined: taxa table indexed by taxon containing each measure column and
            its ``.REF`` twin.

    Returns:
        DataFrame of CSS strings with the same index and columns as ``joined``;
        cells that are not colored hold ''.
    """
    measures = (
        "max_bPM",
        "NT_contigs_reads", "NT_contigs_N50",
        "NR_contigs_reads", "NR_contigs_N50",
    )
    colors_df = pd.DataFrame('', index=joined.index, columns=joined.columns)
    for measure in measures:
        ref_measure = f"{measure}.REF"
        for taxon in joined.index:
            v = joined.loc[taxon, measure]
            vref = joined.loc[taxon, ref_measure]
            if (math.isnan(v) and math.isnan(vref)) or v == vref:
                err = 0.0
            else:
                # A zero reference cannot be divided by: treat it as maximal deviation.
                err = math.fabs((v - vref) / vref) if vref else float('inf')
            color = colorcode(err)
            # CSS has no 'lightred'; use 'lightcoral' so red deviations are highlighted too.
            light = 'lightcoral' if color == 'red' else 'light' + color
            # Single-step .loc writes: chained assignment (df[col][row] = ...) is not
            # guaranteed to modify colors_df.
            colors_df.loc[taxon, measure] = 'background-color: ' + light
            colors_df.loc[taxon, ref_measure] = 'background-color:' + color
    return colors_df

**Below:**

1. abundance spectrum: empirical CDF of log10(max_bPM) over all species output by the pipeline.
   * max_bPM = the larger of the NT and NR bPM values
   * red dashed line: the min_bPM cutoff for display in the subsequent table.
2. table of each species with max_bPM at or above min_bPM, with read & contig stats for NR & NT
   * green = within 1% of reference, yellow = within 10%, red = anything else.

In [None]:
# Abundance spectrum: ECDF of log10(max_bPM), with the table cutoff drawn as a
# dashed red vertical line.
min_bPM = 5000
ax = sns.ecdfplot(data=np.log10(joined["max_bPM"]))
ax.axvline(x=math.log10(min_bPM), ymin=0, ymax=1, color='r', linestyle='--')
ax.set_xlabel("log_10(max(NT_bPM,NR_bPM))")
plt.show()

# Table: keep only taxa at or above the cutoff and show whole numbers.
pd.set_option("display.precision", 0)
above_cutoff = joined["max_bPM"] >= min_bPM
joined = joined.loc[above_cutoff]
if ref_data:
    # With a reference available, color each cell by its deviation from it.
    joined = joined.style.apply(error_colors, axis=None)
display(joined)