In [1]:
import pandas as pd
from pathlib import Path # type: ignore
import sys
# Put the parent of the working directory at the front of sys.path (once),
# presumably so the project-local `sample_info` module imported below resolves.
module_path = str(Path(".").absolute().resolve().parent)
if module_path not in sys.path:
    sys.path.insert(0, module_path)
from sample_info import get_stats, p_value_table

# All inputs are read relative to the directory the notebook is run from.
heatmap_dir = Path.cwd().resolve()
outdir = heatmap_dir.joinpath("depth_heatmap_out")

In [2]:
# Load the S-gene depth-of-coverage table and drop the rows whose mixture is
# "NFWC" or "NFWA" (presumably control samples -- confirm with the data owner).
s_gene_df = (
    pd.read_csv(outdir / "S-depth-of-coverage.csv")
    .loc[lambda df: ~df["mixture"].isin(["NFWC", "NFWA"])]
)
s_gene_df.head()

Unnamed: 0,chrom,start,end,gene,mean_coverage,plate,batch,mixture,read_counts,normalized_mean_coverage,normalized_read_counts
0,MN908947.3,21563,25384,S,8537.23,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,11093.304954,7773.575758
1,MN908947.3,21563,25384,S,8198.01,05-05-23-A41,WB: artic,0adgio1,728720.0,11249.876496,7360.808081
2,MN908947.3,21563,25384,S,7887.96,05-05-23-A41,WB: artic,o2o3o4o5,754486.0,10454.746675,7621.070707
3,MN908947.3,21563,25384,S,8681.83,05-05-23-A41,WB: artic,0agio1o2,797075.0,10892.111784,8051.262626
4,MN908947.3,21563,25384,S,11412.52,05-05-23-A41,WB: artic,0o5o3o4,1028126.0,11100.312608,10385.111111


In [3]:
# S-gene mean coverage across the artic batches; the recorded output reports a
# one-way ANOVA followed by Tukey's HSD.
# replace_str=": artic": the recorded summary sentences show the batch labels
# without that suffix -- presumably what this argument controls; confirm in
# sample_info.get_stats.
get_stats(
    s_gene_df,
    scheme="artic",
    value_col="mean_coverage",
    replace_str=": artic",
)

ANOVA for artic samples comparing mean coverage across batches
p-value: 1.6871658192864803e-27	f-value: 113.03361517999129
The mean coverage was significantly different across WB (mean=9133.470789473684, std. dev.=1518.0989388605835), NWRB (mean=5036.284736842107, std. dev.=595.2729261745535), and PWRB (mean=9579.869736842104, std. dev.=1914.7779205891225), as determined by one-way ANOVA (F=113.03361517999129, p=1.6871658192864803e-27<0.01).
The mean coverage for NWRB differed significantly from WB with p-value 0.0.
The mean coverage for NWRB differed significantly from PWRB with p-value 0.0.
No significant difference in mean coverage (p-value=0.376>0.01) was found between PWRB and WB.

Tukey's HSD results:


Unnamed: 0,WB: artic,NWRB: artic,PWRB: artic
WB: artic,,0.0,0.376
NWRB: artic,0.0,,0.0
PWRB: artic,0.376,0.0,


In [4]:
# S-gene mean coverage across the varskip batches (one-way ANOVA + Tukey's HSD
# per the recorded output); batch labels are left unmodified here.
get_stats(
    s_gene_df,
    scheme="varskip",
    value_col="mean_coverage",
)

ANOVA for varskip samples comparing mean coverage across batches
p-value: 2.0683465697617048e-22	f-value: 80.95909070268559
The mean coverage was significantly different across WB: varskip (mean=9994.73552631579, std. dev.=1444.9227643320078), NWRB: varskip (mean=8144.81105263158, std. dev.=1756.8793992109518), and PWRB: varskip (mean=12964.560263157895, std. dev.=1774.9322758811936), as determined by one-way ANOVA (F=80.95909070268559, p=2.0683465697617048e-22<0.01).
The mean coverage for NWRB: varskip differed significantly from WB: varskip with p-value 0.0.
The mean coverage for NWRB: varskip differed significantly from PWRB: varskip with p-value 0.0.
The mean coverage for PWRB: varskip differed significantly from WB: varskip with p-value 0.0.

Tukey's HSD results:


Unnamed: 0,WB: varskip,NWRB: varskip,PWRB: varskip
WB: varskip,,0.0,0.0
NWRB: varskip,0.0,,0.0
PWRB: varskip,0.0,0.0,


In [5]:
# S-gene mean coverage across every batch of both primer schemes; the recorded
# output covers all six "<background>: <primer>" batches.
# "varskip|artic" presumably acts as a pattern matching either scheme -- confirm
# in sample_info.get_stats.
get_stats(
    s_gene_df,
    scheme="varskip|artic",
    value_col="mean_coverage",
)

ANOVA for varskip|artic samples comparing mean coverage across batches
p-value: 3.832439865748823e-56	f-value: 103.90526585195887
The mean coverage was significantly different across WB: artic (mean=9133.470789473684, std. dev.=1518.0989388605835), WB: varskip (mean=9994.73552631579, std. dev.=1444.9227643320078), NWRB: artic (mean=5036.284736842107, std. dev.=595.2729261745535), NWRB: varskip (mean=8144.81105263158, std. dev.=1756.8793992109518), PWRB: artic (mean=9579.869736842104, std. dev.=1914.7779205891225), and PWRB: varskip (mean=12964.560263157895, std. dev.=1774.9322758811936), as determined by one-way ANOVA (F=103.90526585195887, p=3.832439865748823e-56<0.01).
No significant difference in mean coverage (p-value=0.16>0.01) was found between WB: artic and WB: varskip.
The mean coverage for NWRB: artic differed significantly from WB: artic with p-value 0.0.
The mean coverage for NWRB: artic differed significantly from WB: varskip with p-value 0.0.
The mean coverage for NWRB: ar

Unnamed: 0,WB: artic,WB: varskip,NWRB: artic,NWRB: varskip,PWRB: artic,PWRB: varskip
WB: artic,,0.16,0.0,0.068,0.814,0.0
WB: varskip,0.16,,0.0,0.0,0.856,0.0
NWRB: artic,0.0,0.0,,0.0,0.0,0.0
NWRB: varskip,0.068,0.0,0.0,,0.001,0.0
PWRB: artic,0.814,0.856,0.0,0.001,,0.0
PWRB: varskip,0.0,0.0,0.0,0.0,0.0,


In [6]:
# Load the whole-genome depth-of-coverage table and drop the rows whose mixture
# is "NFWC" or "NFWA" (presumably control samples -- confirm with the data owner).
whole_genome_df = (
    pd.read_csv(outdir / "whole-genome-depth-of-coverage.csv")
    .loc[lambda df: ~df["mixture"].isin(["NFWC", "NFWA"])]
)
whole_genome_df.head()

Unnamed: 0,chrom,start,end,gene,mean_coverage,plate,batch,mixture,read_counts,normalized_mean_coverage,normalized_read_counts
0,MN908947.3,1,29903,whole genome,10277.24,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,13354.279715,7773.575758
1,MN908947.3,1,29903,whole genome,9727.73,05-05-23-A41,WB: artic,0adgio1,728720.0,13349.064112,7360.808081
2,MN908947.3,1,29903,whole genome,10067.49,05-05-23-A41,WB: artic,o2o3o4o5,754486.0,13343.508031,7621.070707
3,MN908947.3,1,29903,whole genome,10632.67,05-05-23-A41,WB: artic,0agio1o2,797075.0,13339.610451,8051.262626
4,MN908947.3,1,29903,whole genome,13735.9,05-05-23-A41,WB: artic,0o5o3o4,1028126.0,13360.132902,10385.111111


In [7]:
# Whole-genome mean coverage across the artic batches (one-way ANOVA + Tukey's
# HSD per the recorded output).
get_stats(
    whole_genome_df,
    scheme="artic",
    value_col="mean_coverage",
)

ANOVA for artic samples comparing mean coverage across batches
p-value: 5.320378360222447e-35	f-value: 174.56129939670765
The mean coverage was significantly different across WB: artic (mean=11085.813684210527, std. dev.=1673.6694894193079), NWRB: artic (mean=5985.2063157894745, std. dev.=539.6182620045049), and PWRB: artic (mean=11752.876315789472, std. dev.=1846.2408869689211), as determined by one-way ANOVA (F=174.56129939670765, p=5.320378360222447e-35<0.01).
The mean coverage for NWRB: artic differed significantly from WB: artic with p-value 0.0.
The mean coverage for NWRB: artic differed significantly from PWRB: artic with p-value 0.0.
No significant difference in mean coverage (p-value=0.123>0.01) was found between PWRB: artic and WB: artic.

Tukey's HSD results:


Unnamed: 0,WB: artic,NWRB: artic,PWRB: artic
WB: artic,,0.0,0.123
NWRB: artic,0.0,,0.0
PWRB: artic,0.123,0.0,


In [8]:
# Whole-genome mean coverage across the varskip batches (one-way ANOVA +
# Tukey's HSD per the recorded output).
get_stats(
    whole_genome_df,
    scheme="varskip",
    value_col="mean_coverage",
)

ANOVA for varskip samples comparing mean coverage across batches
p-value: 1.874145781625117e-24	f-value: 93.02858359211771
The mean coverage was significantly different across WB: varskip (mean=11814.909736842106, std. dev.=1329.6259730699046), NWRB: varskip (mean=9485.827105263157, std. dev.=1355.1473030951245), and PWRB: varskip (mean=13330.339473684211, std. dev.=995.6526145494597), as determined by one-way ANOVA (F=93.02858359211771, p=1.874145781625117e-24<0.01).
The mean coverage for NWRB: varskip differed significantly from WB: varskip with p-value 0.0.
The mean coverage for NWRB: varskip differed significantly from PWRB: varskip with p-value 0.0.
The mean coverage for PWRB: varskip differed significantly from WB: varskip with p-value 0.0.

Tukey's HSD results:


Unnamed: 0,WB: varskip,NWRB: varskip,PWRB: varskip
WB: varskip,,0.0,0.0
NWRB: varskip,0.0,,0.0
PWRB: varskip,0.0,0.0,


In [9]:
# Not used (this call also produces oddly formatted output):
# get_stats(whole_genome_df, scheme="artic", value_col="read_counts", simple_tukey=True)

In [10]:
# Not used (this call also produces oddly formatted output):
# get_stats(whole_genome_df, scheme="artic|varskip", value_col="read_counts")

In [11]:
# Not used in the paper: normalized read counts compared across every artic
# and varskip batch (the recorded output covers all six batches).
get_stats(
    whole_genome_df,
    scheme="artic|varskip",
    value_col="normalized_read_counts",
)

ANOVA for artic|varskip samples comparing normalized read counts across batches
p-value: 8.634622570607919e-63	f-value: 125.95448239905728
The normalized read counts was significantly different across WB: artic (mean=8385.254120148857, std. dev.=1263.1823730738286), WB: varskip (mean=8423.191678520623, std. dev.=948.0581391438403), NWRB: artic (mean=4538.391812865497, std. dev.=405.66544370474986), NWRB: varskip (mean=6777.298008534851, std. dev.=965.708306398919), PWRB: artic (mean=8880.789739500266, std. dev.=1393.9937630553034), and PWRB: varskip (mean=9557.355263157895, std. dev.=705.2477925638851), as determined by one-way ANOVA (F=125.95448239905728, p=8.634622570607919e-63<0.01).
No significant difference in normalized read counts (p-value=1.0>0.01) was found between WB: artic and WB: varskip.
The normalized read counts for NWRB: artic differed significantly from WB: artic with p-value 0.0.
The normalized read counts for NWRB: artic differed significantly from WB: varskip with p

Unnamed: 0,WB: artic,WB: varskip,NWRB: artic,NWRB: varskip,PWRB: artic,PWRB: varskip
WB: artic,,1.0,0.0,0.0,0.264,0.0
WB: varskip,1.0,,0.0,0.0,0.352,0.0
NWRB: artic,0.0,0.0,,0.0,0.0,0.0
NWRB: varskip,0.0,0.0,0.0,,0.0,0.0
PWRB: artic,0.264,0.352,0.0,0.0,,0.042
PWRB: varskip,0.0,0.0,0.0,0.0,0.042,


## Varskip vs Artic comparisons

In [12]:
# Batch labels look like "<background>: <primer>" (e.g. "WB: artic"); keep the
# part after ": " as a separate "primer" column on both tables.
for depth_df in (whole_genome_df, s_gene_df):
    depth_df["primer"] = depth_df["batch"].map(
        lambda batch_label: batch_label.split(": ")[1]
    )
whole_genome_df

Unnamed: 0,chrom,start,end,gene,mean_coverage,plate,batch,mixture,read_counts,normalized_mean_coverage,normalized_read_counts,primer
0,MN908947.3,1,29903,whole genome,10277.24,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,13354.279715,7773.575758,artic
1,MN908947.3,1,29903,whole genome,9727.73,05-05-23-A41,WB: artic,0adgio1,728720.0,13349.064112,7360.808081,artic
2,MN908947.3,1,29903,whole genome,10067.49,05-05-23-A41,WB: artic,o2o3o4o5,754486.0,13343.508031,7621.070707,artic
3,MN908947.3,1,29903,whole genome,10632.67,05-05-23-A41,WB: artic,0agio1o2,797075.0,13339.610451,8051.262626,artic
4,MN908947.3,1,29903,whole genome,13735.90,05-05-23-A41,WB: artic,0o5o3o4,1028126.0,13360.132902,10385.111111,artic
...,...,...,...,...,...,...,...,...,...,...,...,...
223,MN908947.3,1,29903,whole genome,12103.44,07-12-23-V2A,PWRB: varskip,i-2,642371.0,18841.821938,8680.689189,varskip
224,MN908947.3,1,29903,whole genome,13919.07,07-12-23-V2A,PWRB: varskip,d-2,741515.0,18771.123983,10020.472973,varskip
225,MN908947.3,1,29903,whole genome,13623.84,07-12-23-V2A,PWRB: varskip,o1-2,720699.0,18903.647709,9739.175676,varskip
226,MN908947.3,1,29903,whole genome,13115.62,07-12-23-V2A,PWRB: varskip,o2-3,695223.0,18865.342487,9394.905405,varskip


In [13]:
# Whole-genome mean coverage, artic vs varskip, one background at a time.
# With two batches per background the recorded output shows Shapiro-Wilk
# normality checks followed by a t-test.
for background in ("WB", "NWRB", "PWRB"):
    get_stats(whole_genome_df, background, "mean_coverage", replace_str=None)
    print()  # blank line between the per-background reports

# Same comparison with all three backgrounds pooled, grouped by primer.
# NOTE(review): replace_str is the string "None" here, whereas the calls above
# pass the None object -- confirm which one is intended.
get_stats(
    whole_genome_df,
    scheme=None,
    value_col="mean_coverage",
    replace_str="None",
    batch_col="primer",
)

Shapiro-Wilk test for normality of mean coverage for WB: artic batch
p-value: 0.43959220531377474	W = 0.9717077671519408
The mean coverage was not normally distributed across Shapiro-Wilk test for normality of mean coverage for WB: varskip batch
p-value: 0.637574659571028	W = 0.9777497001828147
The mean coverage was not normally distributed across T-test for WB samples comparing mean coverage across two batches
p-value: 0.038899771401127034	t(74): -2.1026278469623394
The mean coverage was not significantly different across WB: artic (mean=11085.813684210527, std. dev.=1673.6694894193079), and WB: varskip (mean=11814.909736842106, std. dev.=1329.6259730699046), as determined by t-test (t(74)=-2.1026278469623394, p=0.038899771401127034<0.01).

Shapiro-Wilk test for normality of mean coverage for NWRB: artic batch
p-value: 0.9900843842323952	W = 0.9913311678373944
The mean coverage was not normally distributed across Shapiro-Wilk test for normality of mean coverage for NWRB: varskip batch

In [14]:
# S-gene mean coverage, artic vs varskip, one background at a time.
# With two batches per background the recorded output shows Shapiro-Wilk
# normality checks followed by a t-test.
for background in ("WB", "NWRB", "PWRB"):
    get_stats(s_gene_df, background, "mean_coverage", replace_str=None)
    print()  # blank line between the per-background reports

# Same comparison with all three backgrounds pooled, grouped by primer.
# NOTE(review): replace_str is the string "None" here, whereas the calls above
# pass the None object -- confirm which one is intended.
get_stats(
    s_gene_df,
    scheme=None,
    value_col="mean_coverage",
    replace_str="None",
    batch_col="primer",
)

Shapiro-Wilk test for normality of mean coverage for WB: artic batch
p-value: 0.2671330373316919	W = 0.9646116578617553
The mean coverage was not normally distributed across Shapiro-Wilk test for normality of mean coverage for WB: varskip batch
p-value: 0.639837052640521	W = 0.9778136659252941
The mean coverage was not normally distributed across T-test for WB samples comparing mean coverage across two batches
p-value: 0.01341909934457707	t(74): -2.5332393675958067
The mean coverage was not significantly different across WB: artic (mean=9133.470789473684, std. dev.=1518.0989388605835), and WB: varskip (mean=9994.73552631579, std. dev.=1444.9227643320078), as determined by t-test (t(74)=-2.5332393675958067, p=0.01341909934457707<0.01).

Shapiro-Wilk test for normality of mean coverage for NWRB: artic batch
p-value: 0.9211525296979172	W = 0.9865809997599354
The mean coverage was not normally distributed across T-test for NWRB samples comparing mean coverage across two batches
p-value: 5.

In [15]:
# Re-display the whole-genome table, which now carries the derived "primer" column.
whole_genome_df

Unnamed: 0,chrom,start,end,gene,mean_coverage,plate,batch,mixture,read_counts,normalized_mean_coverage,normalized_read_counts,primer
0,MN908947.3,1,29903,whole genome,10277.24,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,13354.279715,7773.575758,artic
1,MN908947.3,1,29903,whole genome,9727.73,05-05-23-A41,WB: artic,0adgio1,728720.0,13349.064112,7360.808081,artic
2,MN908947.3,1,29903,whole genome,10067.49,05-05-23-A41,WB: artic,o2o3o4o5,754486.0,13343.508031,7621.070707,artic
3,MN908947.3,1,29903,whole genome,10632.67,05-05-23-A41,WB: artic,0agio1o2,797075.0,13339.610451,8051.262626,artic
4,MN908947.3,1,29903,whole genome,13735.90,05-05-23-A41,WB: artic,0o5o3o4,1028126.0,13360.132902,10385.111111,artic
...,...,...,...,...,...,...,...,...,...,...,...,...
223,MN908947.3,1,29903,whole genome,12103.44,07-12-23-V2A,PWRB: varskip,i-2,642371.0,18841.821938,8680.689189,varskip
224,MN908947.3,1,29903,whole genome,13919.07,07-12-23-V2A,PWRB: varskip,d-2,741515.0,18771.123983,10020.472973,varskip
225,MN908947.3,1,29903,whole genome,13623.84,07-12-23-V2A,PWRB: varskip,o1-2,720699.0,18903.647709,9739.175676,varskip
226,MN908947.3,1,29903,whole genome,13115.62,07-12-23-V2A,PWRB: varskip,o2-3,695223.0,18865.342487,9394.905405,varskip


In [16]:
# Load the per-gene depth table (tab-separated) and keep only the individual
# gene rows; the "whole genome" summary rows are dropped.
gene_df = (
    pd.read_csv(outdir / "gene_depth_df.tsv", sep="\t")
    .loc[lambda df: df["gene"] != "whole genome"]
)
gene_df

Unnamed: 0,chrom,start,end,gene,mean_coverage,plate,batch,mixture,read_counts,normalized_mean_coverage,normalized_read_counts
0,MN908947.3,1,265,5'UTR,11450.99,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,14879.454355,7773.575758
2,MN908947.3,266,13468,ORF1a,9907.51,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,12873.851327,7773.575758
3,MN908947.3,13468,21555,ORF1b,11301.66,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,14685.414458,7773.575758
4,MN908947.3,21563,25384,S,8537.23,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,11093.304954,7773.575758
5,MN908947.3,25393,26220,ORF3a,10165.79,05-05-23-A41,WB: artic,0adgio1o2o3o4o5,769584.0,13209.461215,7773.575758
...,...,...,...,...,...,...,...,...,...,...,...
3187,MN908947.3,27394,27759,ORF7a,17715.32,07-12-23-V2A,PWRB: varskip,o3-4,622205.0,28471.838060,8408.175676
3188,MN908947.3,27894,28259,ORF8,27890.75,07-12-23-V2A,PWRB: varskip,o3-4,622205.0,44825.660353,8408.175676
3189,MN908947.3,28274,29533,N,30552.43,07-12-23-V2A,PWRB: varskip,o3-4,622205.0,49103.478757,8408.175676
3190,MN908947.3,29558,29674,ORF10,37202.25,07-12-23-V2A,PWRB: varskip,o3-4,622205.0,59790.985286,8408.175676


In [17]:
# Mean coverage compared across genes: the "gene" column is used as the
# grouping column and no scheme filter is given (one-way ANOVA per the
# recorded output).
# simple_tukey=True presumably selects a condensed Tukey report -- confirm in
# sample_info.get_stats.
get_stats(
    gene_df,
    scheme=None,
    value_col="mean_coverage",
    batch_col="gene",
    replace_str=None,
    simple_tukey=True,
)

ANOVA for None samples comparing mean coverage across batches
p-value: 0.0	f-value: 230.82551947733344
The mean coverage was significantly different across 5'UTR (mean=4352.6708771929825, std. dev.=5523.024196585334), ORF1a (mean=9931.37850877193, std. dev.=2579.2415278927597), ORF1b (mean=10884.631271929826, std. dev.=2675.4746167042535), S (mean=9142.288684210525, std. dev.=2824.203084900613), ORF3a (mean=11073.771578947368, std. dev.=3767.8419883043416), E (mean=11504.392807017543, std. dev.=4145.521194360869), M (mean=15523.410087719298, std. dev.=6337.568922249558), ORF6 (mean=17142.26802631579, std. dev.=5752.015296692167), ORF7a (mean=11462.758859649122, std. dev.=3677.395899317324), ORF8 (mean=16545.281315789474, std. dev.=6879.637048087659), N (mean=16507.94065789474, std. dev.=4988.400734424781), ORF10 (mean=6189.460438596491, std. dev.=6209.545392126046), and 3'UTR (mean=2205.926052631579, std. dev.=2574.84115199797), as determined by one-way ANOVA (F=230.82551947733344, p=0

In [21]:
# Re-run the per-gene comparison (this time without simple_tukey), hand its
# return value to p_value_table, then resize the result for display.
gene_stats = get_stats(
    gene_df,
    scheme=None,
    value_col="mean_coverage",
    batch_col="gene",
    replace_str=None,
)
p_table = p_value_table(gene_stats)
p_table.update_layout(width=1000, height=600)