In [55]:
import pandas as pd
from pathlib import Path # type: ignore
import scipy.stats as stats
from scipy.stats import tukey_hsd

# Absolute form of the current working directory; every other path is built from it.
heatmap_dir = Path(".").resolve()
# Folder the depth-of-coverage CSV files are read from later in this notebook.
outdir = heatmap_dir.joinpath("depth_heatmap_out")

In [56]:
def tukey2df(batch_res, batch_details, decimals=3):
    """Arrange the pairwise p-values of a Tukey HSD result in a labelled square table.

    The p-values are taken from ``batch_res.pvalue`` instead of being sliced
    out of ``str(batch_res)``. The printed report is fixed-width text, so
    character offsets silently pick up the wrong digits as soon as a group
    index has two digits or a statistic is wider than its column.

    Parameters
    ----------
    batch_res
        Object exposing a square ``pvalue`` matrix in which ``pvalue[i][j]`` is
        the p-value for group ``i`` versus group ``j`` (as returned by
        ``scipy.stats.tukey_hsd``).
    batch_details : dict
        Mapping whose keys label the groups, in the order the groups were
        passed to the test. Only the keys are used.
    decimals : int or None, default 3
        Number of decimal places each p-value is rounded to. The default
        reproduces the three decimals of the printed report; pass ``None`` to
        keep full precision.

    Returns
    -------
    pandas.DataFrame
        Table with the group labels as both index and columns, the pairwise
        p-values off the diagonal and ``pd.NA`` on the diagonal.

    Raises
    ------
    ValueError
        If ``batch_res.pvalue`` is not square with one row per group label.
    """
    labels = list(batch_details.keys())
    rows = [list(row) for row in batch_res.pvalue]

    size = len(labels)
    if len(rows) != size or any(len(row) != size for row in rows):
        raise ValueError(
            f"expected a {size}x{size} p-value matrix for {size} groups, "
            f"got {len(rows)} row(s)"
        )

    def _as_p_value(value):
        # Plain Python floats keep the table free of numpy scalar types.
        value = float(value)
        return value if decimals is None else round(value, decimals)

    # A group is never compared with itself, so the diagonal stays missing.
    table = [
        [pd.NA if i == j else _as_p_value(value) for j, value in enumerate(row)]
        for i, row in enumerate(rows)
    ]
    return pd.DataFrame(table, columns=labels, index=labels)
def get_stats(df, scheme:str, x_col:str, alpha:float=0.05):
    """Run a one-way ANOVA of ``x_col`` across batches, with Tukey's HSD as follow-up.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a string ``"batch"`` column and the numeric column ``x_col``.
    scheme : str
        Pattern passed to ``Series.str.contains`` to select batches. It is
        interpreted as a regular expression, so ``"artic|varskip"`` selects
        batches matching either name.
    x_col : str
        Column whose values are compared between batches. Missing values are
        dropped per batch.
    alpha : float, default 0.05
        Significance threshold the ANOVA p-value must fall below before the
        pairwise Tukey HSD comparison is run.

    Returns
    -------
    pandas.DataFrame or None
        Table of pairwise Tukey HSD p-values (built by ``tukey2df``) when the
        ANOVA p-value is below ``alpha``; otherwise ``None``.
    """
    print(f"ANOVA for {scheme} samples comparing {x_col.replace('_',' ')} across batches")

    # Keep only the batches matching the scheme. na=False treats a missing
    # batch label as "no match" instead of producing a NaN mask entry, which
    # boolean indexing would reject.
    df = df[df["batch"].str.contains(scheme, na=False)]

    # Build the per-batch samples once so the ANOVA and Tukey's HSD are
    # guaranteed to see the same groups in the same order.
    batch_details = {
        batch: df.loc[df["batch"] == batch, x_col].dropna().tolist()
        for batch in df["batch"].unique()
    }

    fvalue, pvalue = stats.f_oneway(*batch_details.values())
    print(f"p-value: {pvalue}\tf-value: {fvalue}")

    if pvalue < alpha:
        # The groups differ overall; find out which pairs differ.
        print("\nTukey's HSD:")
        batch_res = tukey_hsd(*batch_details.values())
        return tukey2df(batch_res, batch_details)

    return None


In [57]:
# Read the S depth-of-coverage table and drop the rows whose mixture is
# "NFWC" or "NFWA" in one chained expression, then display the result.
s_gene_df = (
    pd.read_csv(outdir / "S-depth-of-coverage.csv")
    .loc[lambda table: ~table["mixture"].isin(["NFWC", "NFWA"])]
)
s_gene_df

Unnamed: 0,chrom,start,end,gene,mean_coverage,plate,batch,mixture,read_counts,normalized_mean_coverage,normalized_read_counts
0,MN908947.3,21563,25384,S,8537.23,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,11093.304954,7773.575758
1,MN908947.3,21563,25384,S,8198.01,05-05-23-A41,WB: artic,0adgio1,728720.0,11249.876496,7360.808081
2,MN908947.3,21563,25384,S,7887.96,05-05-23-A41,WB: artic,o2o3o4o5,754486.0,10454.746675,7621.070707
3,MN908947.3,21563,25384,S,8681.83,05-05-23-A41,WB: artic,0agio1o2,797075.0,10892.111784,8051.262626
4,MN908947.3,21563,25384,S,11412.52,05-05-23-A41,WB: artic,0o5o3o4,1028126.0,11100.312608,10385.111111
...,...,...,...,...,...,...,...,...,...,...,...
257,MN908947.3,21563,25384,S,12328.83,07-12-23-V2A,PWRB: varskip,o1-2,720699.0,17106.767180,9739.175676
258,MN908947.3,21563,25384,S,12395.33,07-12-23-V2A,PWRB: varskip,o2-3,695223.0,17829.286430,9394.905405
259,MN908947.3,21563,25384,S,11588.39,07-12-23-V2A,PWRB: varskip,o3-4,622205.0,18624.713720,8408.175676
260,MN908947.3,21563,25384,S,13901.36,07-12-23-V2A,PWRB: varskip,o5-4,679895.0,20446.333625,9187.770270


In [58]:
# S-gene table, mean coverage: compare the batches whose name contains "artic"
# (one-way ANOVA; get_stats adds Tukey's HSD when the p-value is below 0.05).
get_stats(s_gene_df, scheme="artic", x_col="mean_coverage")

ANOVA for artic samples comparing mean coverage across batches
p-value: 5.036492815096238e-31	f-value: 129.7156514517661

Tukey's HSD:


Unnamed: 0,WB: artic,NWRB: artic,PWRB: artic
WB: artic,,0.0,0.356
NWRB: artic,0.0,,0.0
PWRB: artic,0.356,0.0,


In [59]:
# S-gene table, mean coverage: same comparison restricted to the batches
# whose name contains "varskip".
get_stats(s_gene_df, scheme="varskip", x_col="mean_coverage")

ANOVA for varskip samples comparing mean coverage across batches
p-value: 1.4145535085226043e-24	f-value: 88.69999651830472

Tukey's HSD:


Unnamed: 0,WB: varskip,NWRB: varskip,PWRB: varskip
WB: varskip,,0.0,0.0
NWRB: varskip,0.0,,0.0
PWRB: varskip,0.0,0.0,


In [60]:
# S-gene table, mean coverage across both schemes at once. get_stats passes
# `scheme` to str.contains, which treats it as a regular expression, so the
# "|" selects batches matching either "varskip" or "artic".
get_stats(s_gene_df, scheme="varskip|artic", x_col="mean_coverage")

ANOVA for varskip|artic samples comparing mean coverage across batches
p-value: 4.584732464132353e-62	f-value: 114.22177092034889

Tukey's HSD:


Unnamed: 0,WB: artic,WB: varskip,NWRB: artic,NWRB: varskip,PWRB: artic,PWRB: varskip
WB: artic,,0.291,0.0,0.034,0.809,0.0
WB: varskip,0.291,,0.0,0.0,0.959,0.0
NWRB: artic,0.0,0.0,,0.0,0.0,0.0
NWRB: varskip,0.034,0.0,0.0,,0.0,0.0
PWRB: artic,0.809,0.959,0.0,0.0,,0.0
PWRB: varskip,0.0,0.0,0.0,0.0,0.0,


In [61]:
# Read the whole-genome depth-of-coverage table and drop the rows whose
# mixture is "NFWC" or "NFWA" in one chained expression, then display it.
whole_genome_df = (
    pd.read_csv(outdir / "whole genome-depth-of-coverage.csv")
    .loc[lambda table: ~table["mixture"].isin(["NFWC", "NFWA"])]
)
whole_genome_df

Unnamed: 0,chrom,start,end,gene,mean_coverage,plate,batch,mixture,read_counts,normalized_mean_coverage,normalized_read_counts
0,MN908947.3,1,29903,whole genome,10277.24,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,13354.279715,7773.575758
1,MN908947.3,1,29903,whole genome,9727.73,05-05-23-A41,WB: artic,0adgio1,728720.0,13349.064112,7360.808081
2,MN908947.3,1,29903,whole genome,10067.49,05-05-23-A41,WB: artic,o2o3o4o5,754486.0,13343.508031,7621.070707
3,MN908947.3,1,29903,whole genome,10632.67,05-05-23-A41,WB: artic,0agio1o2,797075.0,13339.610451,8051.262626
4,MN908947.3,1,29903,whole genome,13735.90,05-05-23-A41,WB: artic,0o5o3o4,1028126.0,13360.132902,10385.111111
...,...,...,...,...,...,...,...,...,...,...,...
257,MN908947.3,1,29903,whole genome,13623.84,07-12-23-V2A,PWRB: varskip,o1-2,720699.0,18903.647709,9739.175676
258,MN908947.3,1,29903,whole genome,13115.62,07-12-23-V2A,PWRB: varskip,o2-3,695223.0,18865.342487,9394.905405
259,MN908947.3,1,29903,whole genome,11692.39,07-12-23-V2A,PWRB: varskip,o3-4,622205.0,18791.861203,8408.175676
260,MN908947.3,1,29903,whole genome,12793.84,07-12-23-V2A,PWRB: varskip,o5-4,679895.0,18817.376213,9187.770270


In [62]:
# Whole-genome table, mean coverage: compare the batches whose name contains
# "artic" (one-way ANOVA; Tukey's HSD follows when the p-value is below 0.05).
get_stats(whole_genome_df, scheme="artic", x_col="mean_coverage")

ANOVA for artic samples comparing mean coverage across batches
p-value: 2.5410462594663847e-39	f-value: 199.3770572997673

Tukey's HSD:


Unnamed: 0,WB: artic,NWRB: artic,PWRB: artic
WB: artic,,0.0,0.091
NWRB: artic,0.0,,0.0
PWRB: artic,0.091,0.0,


In [63]:
# Whole-genome table, raw read counts: compare the batches whose name
# contains "artic".
get_stats(whole_genome_df, scheme="artic", x_col="read_counts")

ANOVA for artic samples comparing read counts across batches
p-value: 3.009143706797375e-39	f-value: 198.66082278853128

Tukey's HSD:


Unnamed: 0,WB: artic,NWRB: artic,PWRB: artic
WB: artic,,0.0,0.097
NWRB: artic,0.0,,0.0
PWRB: artic,0.097,0.0,


In [66]:
# Whole-genome table, raw read counts across both schemes. `scheme` is used
# as a regular expression by get_stats, so "artic|varskip" selects batches
# matching either name.
get_stats(whole_genome_df, scheme="artic|varskip", x_col="read_counts")

ANOVA for artic|varskip samples comparing read counts across batches
p-value: 2.9903754867030037e-74	f-value: 156.3050467314561

Tukey's HSD:


Unnamed: 0,WB: artic,WB: varskip,NWRB: artic,NWRB: varskip,PWRB: artic,PWRB: varskip
WB: artic,,0.0,0.0,0.0,0.126,0.0
WB: varskip,0.0,,0.0,0.0,0.0,0.0
NWRB: artic,0.0,0.0,,0.102,0.0,0.0
NWRB: varskip,0.0,0.0,0.102,,0.0,0.0
PWRB: artic,0.126,0.0,0.0,0.0,,0.0
PWRB: varskip,0.0,0.0,0.0,0.0,0.0,


In [65]:
# Whole-genome table, `normalized_read_counts` column across both schemes
# ("artic|varskip" is a regular expression matching either name).
get_stats(whole_genome_df, scheme="artic|varskip", x_col="normalized_read_counts")

ANOVA for artic|varskip samples comparing normalized read counts across batches
p-value: 1.182125333044218e-69	f-value: 139.29421485722636

Tukey's HSD:


Unnamed: 0,WB: artic,WB: varskip,NWRB: artic,NWRB: varskip,PWRB: artic,PWRB: varskip
WB: artic,,1.0,0.0,0.0,0.211,0.0
WB: varskip,1.0,,0.0,0.0,0.172,0.0
NWRB: artic,0.0,0.0,,0.0,0.0,0.0
NWRB: varskip,0.0,0.0,0.0,,0.0,0.0
PWRB: artic,0.211,0.172,0.0,0.0,,0.013
PWRB: varskip,0.0,0.0,0.0,0.0,0.013,
