In [1]:
import pandas as pd
from pathlib import Path # type: ignore
import sys
# Put the parent of the working directory at the front of sys.path (once),
# so modules that live there can be imported by the cells below.
module_path = str(Path.cwd().resolve().parent)
if module_path not in sys.path:
    sys.path.insert(0, module_path)
from sample_info import get_stats

# Absolute path of the working directory, and the subfolder the
# depth-of-coverage CSVs are read from.
heatmap_dir = Path.cwd().resolve()
outdir = heatmap_dir.joinpath("depth_heatmap_out")

In [2]:
# Load the S-gene depth-of-coverage table and drop the "NFWC"/"NFWA" mixtures
# (presumably control wells -- TODO confirm) before any batch comparison.
# Built as a single expression so re-running the cell always starts from the CSV.
s_gene_df = pd.read_csv(outdir / "S-depth-of-coverage.csv").loc[
    lambda frame: ~frame["mixture"].isin(["NFWC", "NFWA"])
]
s_gene_df.head()

Unnamed: 0,chrom,start,end,gene,mean_coverage,plate,batch,mixture,read_counts,normalized_mean_coverage,normalized_read_counts
0,MN908947.3,21563,25384,S,8537.23,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,11093.304954,7773.575758
1,MN908947.3,21563,25384,S,8198.01,05-05-23-A41,WB: artic,0adgio1,728720.0,11249.876496,7360.808081
2,MN908947.3,21563,25384,S,7887.96,05-05-23-A41,WB: artic,o2o3o4o5,754486.0,10454.746675,7621.070707
3,MN908947.3,21563,25384,S,8681.83,05-05-23-A41,WB: artic,0agio1o2,797075.0,10892.111784,8051.262626
4,MN908947.3,21563,25384,S,11412.52,05-05-23-A41,WB: artic,0o5o3o4,1028126.0,11100.312608,10385.111111


In [3]:
# S gene, artic batches: prints a one-way ANOVA and Tukey's HSD summary for
# mean coverage and displays the pairwise p-value table.
# replace_str=": artic" appears to strip that suffix from batch labels in the
# printed sentences only (the table keeps the full labels) -- confirm in sample_info.
get_stats(s_gene_df, scheme="artic", value_col="mean_coverage", replace_str=": artic")

ANOVA for artic samples comparing mean coverage across batches
p-value: 1.6871658192864803e-27	f-value: 113.03361517999129
The mean coverage was significantly different across WB (mean=9133.470789473684, std. dev.=1518.0989388605835), NWRB (mean=5036.284736842107, std. dev.=595.2729261745535), and PWRB (mean=9579.869736842104, std. dev.=1914.7779205891225), as determined by one-way ANOVA (F=113.03361517999129, p=1.6871658192864803e-27<0.01).

Tukey's HSD results:
The mean coverage for NWRB differed significantly from WB with p-value 0.0.
The mean coverage for NWRB differed significantly from PWRB with p-value 0.0.
No significant difference in mean coverage (p-value=0.376>0.01) was found between PWRB and WB.


Unnamed: 0,WB: artic,NWRB: artic,PWRB: artic
WB: artic,,0.0,0.376
NWRB: artic,0.0,,0.0
PWRB: artic,0.376,0.0,


In [4]:
# S gene, varskip batches: one-way ANOVA and Tukey's HSD on mean coverage.
# replace_str is not passed here, so the printed batch labels keep their
# ": varskip" suffix.
get_stats(s_gene_df, scheme="varskip", value_col="mean_coverage")

ANOVA for varskip samples comparing mean coverage across batches
p-value: 2.0683465697617048e-22	f-value: 80.95909070268559
The mean coverage was significantly different across WB: varskip (mean=9994.73552631579, std. dev.=1444.9227643320078), NWRB: varskip (mean=8144.81105263158, std. dev.=1756.8793992109518), and PWRB: varskip (mean=12964.560263157895, std. dev.=1774.9322758811936), as determined by one-way ANOVA (F=80.95909070268559, p=2.0683465697617048e-22<0.01).

Tukey's HSD results:
The mean coverage for NWRB: varskip differed significantly from WB: varskip with p-value 0.0.
The mean coverage for NWRB: varskip differed significantly from PWRB: varskip with p-value 0.0.
The mean coverage for PWRB: varskip differed significantly from WB: varskip with p-value 0.0.


Unnamed: 0,WB: varskip,NWRB: varskip,PWRB: varskip
WB: varskip,,0.0,0.0
NWRB: varskip,0.0,,0.0
PWRB: varskip,0.0,0.0,


In [5]:
# S gene, both schemes together: "varskip|artic" selects all six batches
# (presumably matched as a regex alternation against the batch label --
# confirm in sample_info), giving a 6x6 Tukey's HSD p-value table.
get_stats(s_gene_df, scheme="varskip|artic", value_col="mean_coverage")

ANOVA for varskip|artic samples comparing mean coverage across batches
p-value: 3.832439865748823e-56	f-value: 103.90526585195887
The mean coverage was significantly different across WB: artic (mean=9133.470789473684, std. dev.=1518.0989388605835), WB: varskip (mean=9994.73552631579, std. dev.=1444.9227643320078), NWRB: artic (mean=5036.284736842107, std. dev.=595.2729261745535), NWRB: varskip (mean=8144.81105263158, std. dev.=1756.8793992109518), PWRB: artic (mean=9579.869736842104, std. dev.=1914.7779205891225), and PWRB: varskip (mean=12964.560263157895, std. dev.=1774.9322758811936), as determined by one-way ANOVA (F=103.90526585195887, p=3.832439865748823e-56<0.01).

Tukey's HSD results:
No significant difference in mean coverage (p-value=0.16>0.01) was found between WB: artic and WB: varskip.
The mean coverage for NWRB: artic differed significantly from WB: artic with p-value 0.0.
The mean coverage for NWRB: artic differed significantly from WB: varskip with p-value 0.0.
The mean

Unnamed: 0,WB: artic,WB: varskip,NWRB: artic,NWRB: varskip,PWRB: artic,PWRB: varskip
WB: artic,,0.16,0.0,0.068,0.814,0.0
WB: varskip,0.16,,0.0,0.0,0.856,0.0
NWRB: artic,0.0,0.0,,0.0,0.0,0.0
NWRB: varskip,0.068,0.0,0.0,,0.001,0.0
PWRB: artic,0.814,0.856,0.0,0.001,,0.0
PWRB: varskip,0.0,0.0,0.0,0.0,0.0,


In [6]:
# Load the whole-genome depth-of-coverage table and drop the "NFWC"/"NFWA"
# mixtures (presumably control wells -- TODO confirm), mirroring the S-gene load.
# Built as a single expression so re-running the cell always starts from the CSV.
whole_genome_df = pd.read_csv(outdir / "whole genome-depth-of-coverage.csv").loc[
    lambda frame: ~frame["mixture"].isin(["NFWC", "NFWA"])
]
whole_genome_df.head()

Unnamed: 0,chrom,start,end,gene,mean_coverage,plate,batch,mixture,read_counts,normalized_mean_coverage,normalized_read_counts
0,MN908947.3,1,29903,whole genome,10277.24,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,13354.279715,7773.575758
1,MN908947.3,1,29903,whole genome,9727.73,05-05-23-A41,WB: artic,0adgio1,728720.0,13349.064112,7360.808081
2,MN908947.3,1,29903,whole genome,10067.49,05-05-23-A41,WB: artic,o2o3o4o5,754486.0,13343.508031,7621.070707
3,MN908947.3,1,29903,whole genome,10632.67,05-05-23-A41,WB: artic,0agio1o2,797075.0,13339.610451,8051.262626
4,MN908947.3,1,29903,whole genome,13735.9,05-05-23-A41,WB: artic,0o5o3o4,1028126.0,13360.132902,10385.111111


In [7]:
# Whole genome, artic batches: one-way ANOVA and Tukey's HSD on mean coverage.
get_stats(whole_genome_df, scheme="artic", value_col="mean_coverage")

ANOVA for artic samples comparing mean coverage across batches
p-value: 5.320378360222447e-35	f-value: 174.56129939670765
The mean coverage was significantly different across WB: artic (mean=11085.813684210527, std. dev.=1673.6694894193079), NWRB: artic (mean=5985.2063157894745, std. dev.=539.6182620045049), and PWRB: artic (mean=11752.876315789472, std. dev.=1846.2408869689211), as determined by one-way ANOVA (F=174.56129939670765, p=5.320378360222447e-35<0.01).

Tukey's HSD results:
The mean coverage for NWRB: artic differed significantly from WB: artic with p-value 0.0.
The mean coverage for NWRB: artic differed significantly from PWRB: artic with p-value 0.0.
No significant difference in mean coverage (p-value=0.123>0.01) was found between PWRB: artic and WB: artic.


Unnamed: 0,WB: artic,NWRB: artic,PWRB: artic
WB: artic,,0.0,0.123
NWRB: artic,0.0,,0.0
PWRB: artic,0.123,0.0,


In [8]:
# Whole genome, varskip batches: one-way ANOVA and Tukey's HSD on mean coverage.
get_stats(whole_genome_df, scheme="varskip", value_col="mean_coverage")

ANOVA for varskip samples comparing mean coverage across batches
p-value: 1.874145781625117e-24	f-value: 93.02858359211771
The mean coverage was significantly different across WB: varskip (mean=11814.909736842106, std. dev.=1329.6259730699046), NWRB: varskip (mean=9485.827105263157, std. dev.=1355.1473030951245), and PWRB: varskip (mean=13330.339473684211, std. dev.=995.6526145494597), as determined by one-way ANOVA (F=93.02858359211771, p=1.874145781625117e-24<0.01).

Tukey's HSD results:
The mean coverage for NWRB: varskip differed significantly from WB: varskip with p-value 0.0.
The mean coverage for NWRB: varskip differed significantly from PWRB: varskip with p-value 0.0.
The mean coverage for PWRB: varskip differed significantly from WB: varskip with p-value 0.0.


Unnamed: 0,WB: varskip,NWRB: varskip,PWRB: varskip
WB: varskip,,0.0,0.0
NWRB: varskip,0.0,,0.0
PWRB: varskip,0.0,0.0,


In [9]:
# Whole genome, artic batches: same comparison as for mean coverage, but on
# the raw read_counts column.
get_stats(whole_genome_df, scheme="artic", value_col="read_counts")

ANOVA for artic samples comparing read counts across batches
p-value: 6.20840612585503e-35	f-value: 173.92232871452896
The read counts was significantly different across WB: artic (mean=830140.1578947369, std. dev.=125055.05493430902), NWRB: artic (mean=449300.7894736842, std. dev.=40160.87892677022), and PWRB: artic (mean=879198.1842105263, std. dev.=138005.38254247507), as determined by one-way ANOVA (F=173.92232871452896, p=6.20840612585503e-35<0.01).

Tukey's HSD results:
The read counts for NWRB: artic differed significantly from WB: artic with p-value 0.0.
The read counts for NWRB: artic differed significantly from PWRB: artic with p-value 0.0.
No significant difference in read counts (p-value=0.131>0.01) was found between PWRB: artic and WB: artic.


Unnamed: 0,WB: artic,NWRB: artic,PWRB: artic
WB: artic,,0.0,0.131
NWRB: artic,0.0,,0.0
PWRB: artic,0.131,0.0,


In [10]:
# Whole genome, both schemes (all six batches): one-way ANOVA and Tukey's HSD
# on raw read_counts.
get_stats(whole_genome_df, scheme="artic|varskip", value_col="read_counts")

ANOVA for artic|varskip samples comparing read counts across batches
p-value: 1.973429340023503e-66	f-value: 139.3820219831702
The read counts was significantly different across WB: artic (mean=830140.1578947369, std. dev.=125055.05493430902), WB: varskip (mean=623316.1842105263, std. dev.=70156.3022966442), NWRB: artic (mean=449300.7894736842, std. dev.=40160.87892677022), NWRB: varskip (mean=501520.05263157893, std. dev.=71462.41467352), PWRB: artic (mean=879198.1842105263, std. dev.=138005.38254247507), and PWRB: varskip (mean=707244.2894736842, std. dev.=52188.3366497275), as determined by one-way ANOVA (F=139.3820219831702, p=1.973429340023503e-66<0.01).

Tukey's HSD results:


The read counts for WB: artic differed significantly from WB: varskip with p-value 0.0.
The read counts for NWRB: artic differed significantly from WB: artic with p-value 0.0.
The read counts for NWRB: artic differed significantly from WB: varskip with p-value 0.0.
No significant difference in read counts (p-value=0.123>0.01) was found between NWRB: artic and NWRB: varskip.
The read counts for NWRB: artic differed significantly from PWRB: artic with p-value 0.0.
The read counts for NWRB: artic differed significantly from PWRB: varskip with p-value 0.0.
The read counts for NWRB: varskip differed significantly from WB: artic with p-value 0.0.
The read counts for NWRB: varskip differed significantly from WB: varskip with p-value 0.0.
The read counts for NWRB: varskip differed significantly from PWRB: artic with p-value 0.0.
The read counts for NWRB: varskip differed significantly from PWRB: varskip with p-value 0.0.
No significant difference in read counts (p-value=0.173>0.01) was found b

Unnamed: 0,WB: artic,WB: varskip,NWRB: artic,NWRB: varskip,PWRB: artic,PWRB: varskip
WB: artic,,0.0,0.0,0.0,0.173,0.0
WB: varskip,0.0,,0.0,0.0,0.0,0.001
NWRB: artic,0.0,0.0,,0.123,0.0,0.0
NWRB: varskip,0.0,0.0,0.123,,0.0,0.0
PWRB: artic,0.173,0.0,0.0,0.0,,0.0
PWRB: varskip,0.0,0.001,0.0,0.0,0.0,


In [11]:
# Whole genome, both schemes (all six batches): same comparison as the raw
# read counts, but on the normalized_read_counts column.
get_stats(whole_genome_df, scheme="artic|varskip", value_col="normalized_read_counts")

ANOVA for artic|varskip samples comparing normalized read counts across batches
p-value: 8.634622570607919e-63	f-value: 125.95448239905728
The normalized read counts was significantly different across WB: artic (mean=8385.254120148857, std. dev.=1263.1823730738286), WB: varskip (mean=8423.191678520623, std. dev.=948.0581391438403), NWRB: artic (mean=4538.391812865497, std. dev.=405.66544370474986), NWRB: varskip (mean=6777.298008534851, std. dev.=965.708306398919), PWRB: artic (mean=8880.789739500266, std. dev.=1393.9937630553034), and PWRB: varskip (mean=9557.355263157895, std. dev.=705.2477925638851), as determined by one-way ANOVA (F=125.95448239905728, p=8.634622570607919e-63<0.01).

Tukey's HSD results:
No significant difference in normalized read counts (p-value=1.0>0.01) was found between WB: artic and WB: varskip.
The normalized read counts for NWRB: artic differed significantly from WB: artic with p-value 0.0.
The normalized read counts for NWRB: artic differed significantly f

Unnamed: 0,WB: artic,WB: varskip,NWRB: artic,NWRB: varskip,PWRB: artic,PWRB: varskip
WB: artic,,1.0,0.0,0.0,0.264,0.0
WB: varskip,1.0,,0.0,0.0,0.352,0.0
NWRB: artic,0.0,0.0,,0.0,0.0,0.0
NWRB: varskip,0.0,0.0,0.0,,0.0,0.0
PWRB: artic,0.264,0.352,0.0,0.0,,0.042
PWRB: varskip,0.0,0.0,0.0,0.0,0.042,


## Varskip vs Artic comparisons

In [12]:
# Whole genome, WB only: "WB" matches two batches (WB: artic, WB: varskip),
# and get_stats reports a t-test rather than an ANOVA.
# NOTE(review): the printed summary says "p=...<0.01" while concluding "not
# significantly different" (p > 0.01) -- the inequality in the t-test message
# looks wrong; check sample_info.get_stats.
get_stats(
    whole_genome_df,
    scheme="WB",
    value_col="mean_coverage",
    replace_str=None,
)

T-test for WB samples comparing mean coverage across two batches
p-value: 0.038899771401127034	t(74): -2.1026278469623394
The mean coverage was not significantly different across WB: artic (mean=11085.813684210527, std. dev.=1673.6694894193079), and WB: varskip (mean=11814.909736842106, std. dev.=1329.6259730699046), as determined by t-test (t(74)=-2.1026278469623394, p=0.038899771401127034<0.01).


In [13]:
# S gene, WB only: "WB" matches two batches (WB: artic, WB: varskip),
# and get_stats reports a t-test rather than an ANOVA.
# NOTE(review): the printed summary says "p=...<0.01" while concluding "not
# significantly different" (p > 0.01) -- the inequality in the t-test message
# looks wrong; check sample_info.get_stats.
get_stats(
    s_gene_df,
    scheme="WB",
    value_col="mean_coverage",
    replace_str=None,
)

T-test for WB samples comparing mean coverage across two batches
p-value: 0.01341909934457707	t(74): -2.5332393675958067
The mean coverage was not significantly different across WB: artic (mean=9133.470789473684, std. dev.=1518.0989388605835), and WB: varskip (mean=9994.73552631579, std. dev.=1444.9227643320078), as determined by t-test (t(74)=-2.5332393675958067, p=0.01341909934457707<0.01).
