In [53]:
from pathlib import Path # type: ignore
from freyja_plot import FreyjaPlotter
import pandas as pd
import sys
import scipy.stats as stats
from collections import defaultdict
import plotly.io as pio
if (module_path:=str(Path(".").absolute().resolve().parent)) not in sys.path:
    sys.path.insert(0, module_path)
from sample_info import get_stats, summary_dict, expected, artic_runs as runs, renameSamples, plotting_dir, p_value_table

In [54]:
# Output directory for every artifact this notebook writes (figures and CSV tables).
outdir = plotting_dir / "percent_of_expected/observed_vs_expected_combined"
# parents=True: the target sits two levels below plotting_dir, so the intermediate
# "percent_of_expected" directory must be created too on a fresh checkout.
outdir.mkdir(parents=True, exist_ok=True)

## ANOVA comparing percent of expected across tools (all backgrounds combined)

In [55]:
# Map every result file to a "<tool>: <background>" label so that each
# tool/background combination is treated as its own scheme; the expected
# abundances are registered under the label "Expected".
file_map = {expected: "Expected"}
for tool, files in runs.items():
    file_map.update({file: f"{tool}: {name}" for name, file in files.items()})

plotter = renameSamples(FreyjaPlotter(file_map=file_map, summary_dict=summary_dict))


def _split_scheme(row):
    """Return the (tool, Background) pair encoded in a row's "scheme" label.

    Labels of the form "<tool>: <background>" are split on ": "; a label
    without that separator (e.g. "Expected") is used for both fields.
    """
    scheme = row["scheme"]
    if ": " in scheme:
        return scheme.split(": ")
    return (scheme, scheme)


# Break the combined scheme label back into separate columns.
plotter.summarized_freyja_df[["tool", "Background"]] = plotter.summarized_freyja_df.apply(
    _split_scheme,
    axis=1,
    result_type="expand",
)

# return_df=True asks the plotter for the underlying data instead of only a figure.
df = plotter.plotPercentExpectedBox(summarized=True, return_df=True)
df

Unnamed: 0,Sample name,lineages,abundances,scheme,tool,Background,expected_abundance,percent_of_expected
375,0adgio1,Alpha,0.1840,Alcov: NWRB,Alcov,NWRB,0.250000,73.6
581,0adgio1-2,Alpha,0.1320,Alcov: NWRB,Alcov,NWRB,0.142857,92.4
364,0adgio1o2o3o4o5,Alpha,0.0500,Alcov: NWRB,Alcov,NWRB,0.066667,75.0
399,0agio1o2,Alpha,0.0430,Alcov: NWRB,Alcov,NWRB,0.062500,68.8
596,0aio1o2o3o4o5,Alpha,0.0880,Alcov: NWRB,Alcov,NWRB,0.125000,70.4
...,...,...,...,...,...,...,...,...
1397,0adgio1,Wuhan-hu-1,0.1002,kallisto: WB,kallisto,WB,0.250000,40.08
2008,0adgio1-2,Wuhan-hu-1,0.1206,kallisto: WB,kallisto,WB,0.285714,42.21
1382,0adgio1o2o3o4o5,Wuhan-hu-1,0.0140,kallisto: WB,kallisto,WB,0.133333,10.5
1447,0agio1o2,Wuhan-hu-1,0.0165,kallisto: WB,kallisto,WB,0.125000,13.2


In [56]:
# Compare percent_of_expected between schemes (one group per "<tool>: <background>"
# label). get_stats is a project helper; judging by its printed output it runs a
# one-way ANOVA at the p_min threshold — presumably followed by pairwise
# comparisons, TODO confirm against sample_info.get_stats.
tukey_df = get_stats(
    df,
    value_col="percent_of_expected",
    batch_col="scheme",
    p_min=0.01,
)

# Render the pairwise results as a table figure (saved to disk in the next cell).
p_table = p_value_table(tukey_df)
p_table

ANOVA for None samples comparing percent of expected across batches
p-value: 9.557988709222302e-27	f-value: 7.611186352020541
The percent of expected was significantly different across Alcov: NWRB (mean=84.79118773946358, std. dev.=32.54083849198575), Alcov: PWRB (mean=88.41556862745102, std. dev.=40.42879399265568), Alcov: WB (mean=86.56931034482756, std. dev.=30.07212149136993), Freyja: NWRB (mean=85.34653305181159, std. dev.=39.90623760283772), Freyja: PWRB (mean=88.2516050710145, std. dev.=48.77037578727342), Freyja: WB (mean=86.3622304340426, std. dev.=37.23119280846171), kallisto (C-WAP): NWRB (mean=92.51099206349207, std. dev.=47.014832481900235), kallisto (C-WAP): PWRB (mean=96.42400793650793, std. dev.=56.73532511470629), kallisto (C-WAP): WB (mean=92.01730158730162, std. dev.=48.26184011312391), kallisto: NWRB (mean=58.913429078014175, std. dev.=29.503945641956797), kallisto: PWRB (mean=64.06914736842101, std. dev.=31.07885803596488), kallisto: WB (mean=59.96308333333334, std

In [57]:
# Save the p-value table figure as a JPEG in the notebook's output directory.
pio.write_image(
    p_table,
    outdir / "p_value_table.jpg",
    width=1200,
    height=550,
)

## ANOVA and Tukey's HSD results when comparing O/E for each background for each tool

In [58]:
# For each tool, test whether percent_of_expected differs between the three
# backgrounds (WB / NWRB / PWRB) and report the pairwise results when it does.
p_min = 0.01  # significance threshold passed to get_stats and used for reporting
tools = runs.keys()
treatments = ["WB","NWRB","PWRB"]
# tool -> [ANOVA summary string, pairwise p-value table]
report = defaultdict(list)
any_significant_p = False
for tool, files in runs.items():

    # One frame per background, each tagged with its background in "batch".
    dfs = []
    for treatment in treatments:
        file_map = {expected:"Expected"}
        for background,file in files.items():
            if treatment == background:
                file_map[file] = tool
        plotter = FreyjaPlotter(
            file_map=file_map,
            summary_dict=summary_dict,
        )
        plotter = renameSamples(plotter)
        df = plotter.plotPercentExpectedBox(summarized=True,return_df=True)
        df["batch"] = treatment
        dfs.append(df)
    anova_df = pd.concat(dfs)

    p_value, statistic, mean_info, tukey_df = get_stats(anova_df, value_col="percent_of_expected", p_min=p_min, return_tuple=True)
    # get_stats hands back None for the pairwise table when there is nothing to
    # report (presumably when the ANOVA is not significant — TODO confirm).
    if tukey_df is not None:
        if p_value < p_min:
            any_significant_p = True
        # Append both entries: plain assignment would let the table overwrite the
        # summary string, and the print loop below would then iterate over the
        # DataFrame's column labels instead of the report lines.
        report[tool].append(f"p-value={p_value}, F={statistic}")
        report[tool].append(tukey_df)

if any_significant_p:
    for tool, info in report.items():
        print(tool)
        for line in info:
            print(line)
        print("\n\n")
else:
    print(f"No significant results with a p-value < {p_min}.")


No significant results with a p-value < 0.01.


## ANOVA and Tukey's HSD results when comparing O/E for each background for each lineage

In [59]:
# For each lineage, test whether percent_of_expected differs between the three
# backgrounds (all tools pooled) and collect the ANOVA and pairwise p-values
# into two frames that are merged into the paper's table 2 in the next cell.
p_min = 0.01  # significance threshold passed to get_stats
lineages = ["Wuhan-hu-1","Alpha","Gamma","Delta","Iota","BA.1.X","BA.2.X","BG.X","BA.4.X","BA.5.X"]
# Background codes; an earlier mapping in this notebook labelled them as
# WB = "Control", NWRB = "Neg Spike-in", PWRB = "Pos Spike-in".
treatments = ["WB","NWRB","PWRB"]
treatment_pairs = [["WB","NWRB"],["WB","PWRB"],["NWRB","PWRB"]]
anova_results = {"Lineage":[],"F-statistic":[],"p-value":[]}
tukey_results = defaultdict(list)

# The per-background data does not depend on the lineage, so build it once here
# instead of re-creating a FreyjaPlotter for every lineage/background pair.
treatment_dfs = {}
for treatment in treatments:
    file_map = {expected:"Expected"}
    for tool, files in runs.items():
        for name,file in files.items():
            if treatment == name:
                file_map[file] = tool
    plotter = FreyjaPlotter(
        file_map=file_map,
        summary_dict=summary_dict,
    )
    plotter = renameSamples(plotter)
    treatment_dfs[treatment] = plotter.plotPercentExpectedBox(summarized=True,return_df=True)

for lineage in lineages:
    print(lineage)
    anova_results["Lineage"].append(lineage)
    tukey_results["Lineage"].append(lineage)

    dfs = []
    for treatment in treatments:
        df = treatment_dfs[treatment]
        # .assign returns a new frame, so the "batch" tag is never written into a
        # filtered slice of the cached frame (avoids SettingWithCopyWarning).
        df = df[df["lineages"]==lineage].assign(batch=treatment)
        dfs.append(df)
    anova_df = pd.concat(dfs)
    p_value, statistic, mean_info, tukey_df = get_stats(anova_df, value_col="percent_of_expected", p_min=p_min, return_tuple=True)
    anova_results["F-statistic"].append(statistic)
    anova_results["p-value"].append(p_value)

    if tukey_df is not None:
        print(tukey_df)
        # The pairwise table is indexed by background on both axes.
        for treatment_pair in treatment_pairs:
            pair_name = " vs ".join(treatment_pair)
            tukey_results[pair_name].append(tukey_df.loc[treatment_pair[0],treatment_pair[1]])
    else:
        # No pairwise table for this lineage: keep the columns aligned with a placeholder.
        for treatment_pair in treatment_pairs:
            pair_name = " vs ".join(treatment_pair)
            tukey_results[pair_name].append("-")
    print("\n\n")
anova = pd.DataFrame(anova_results)
tukey = pd.DataFrame(tukey_results)

Wuhan-hu-1



Alpha
4
[1.7909560765796443e-06, 14.204049898015432, 'WB (mean=90.0815007020202, std. dev.=26.296234414874373), NWRB (mean=72.21485155913977, std. dev.=21.20314535929819), and PWRB (mean=71.82217631249999, std. dev.=18.56435137773008), as determined by one-way ANOVA (F=14.204049898015432, p=1.7909560765796443e-06<0.01).',         WB   NWRB   PWRB
WB    <NA>    0.0    0.0
NWRB   0.0   <NA>  0.995
PWRB   0.0  0.995   <NA>]
        WB   NWRB   PWRB
WB    <NA>    0.0    0.0
NWRB   0.0   <NA>  0.995
PWRB   0.0  0.995   <NA>



Gamma
4
[0.00010274766957374775, 9.73827666366686, 'WB (mean=75.7026555493827, std. dev.=15.496448058787792), NWRB (mean=63.51943741509434, std. dev.=19.81286819679811), and PWRB (mean=60.58794284567901, std. dev.=20.863743930494056), as determined by one-way ANOVA (F=9.73827666366686, p=0.00010274766957374775<0.01).',          WB   NWRB   PWRB
WB     <NA>  0.003    0.0
NWRB  0.003   <NA>  0.701
PWRB    0.0  0.701   <NA>]
         WB   NWRB   PWRB
WB    

In [60]:
# Data behind table 2 of the paper: per-lineage ANOVA results joined with the
# pairwise background comparisons on the shared "Lineage" column.
table = pd.merge(anova, tukey, on="Lineage")

# Persist the combined table next to the other outputs of this notebook.
table.to_csv(outdir / "table2_data_lineage_anova_each_bg.csv")
table

Unnamed: 0,Lineage,F-statistic,p-value,WB vs NWRB,WB vs PWRB,NWRB vs PWRB
0,Wuhan-hu-1,0.047882,0.9532785,-,-,-
1,Alpha,14.20405,1.790956e-06,0.0,0.0,0.995
2,Gamma,9.738277,0.0001027477,0.003,0.0,0.701
3,Delta,2.996111,0.05347146,-,-,-
4,Iota,23.531721,1.22903e-09,0.003,0.0,0.002
5,BA.1.X,0.025037,0.9752771,-,-,-
6,BA.2.X,1.281574,0.2793544,-,-,-
7,BG.X,2.418755,0.09176943,-,-,-
8,BA.4.X,2.470374,0.08652837,-,-,-
9,BA.5.X,13.120129,3.736958e-06,0.044,0.0,0.02
