In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.io as pio
# import researchpy as rp
# import scipy.stats as stats
# import statsmodels.api as sm
# from statsmodels.formula.api import ols
# import numpy as np
import sys
if (module_path:=str(Path(".").absolute().resolve().parent)) not in sys.path:
    sys.path.insert(0, module_path)
from sample_info import rename_mixtures, platforms, backgrounds, mixtures, plotting_dir, get_mosdepth_regions_file, getHeatmap, mixtures2drop

In [2]:
# Directory that receives every CSV and figure written by this notebook.
# NOTE(review): plotting_dir comes from sample_info and is used here as a pathlib-style path.
outdir = plotting_dir / "coverage_analysis/depth_plots"
# parents=True so the intermediate "coverage_analysis" directory is created too;
# without it mkdir raises FileNotFoundError on a fresh checkout where it is missing.
outdir.mkdir(parents=True, exist_ok=True)

In [3]:

def get_individual_mean_coverage_df(platform, background, mixture):
    """Load the per-region mean coverage table for a single sample.

    The file path is resolved by ``get_mosdepth_regions_file`` for the given
    platform/background/mixture combination. The file is read as a headerless,
    gzip-compressed, tab-separated table with the columns
    ``chrom, start, end, gene, mean_coverage``.

    Args:
        platform: Platform identifier; also used as the first half of ``batch``.
        background: Background identifier; also used as the second half of ``batch``.
        mixture: Mixture identifier, stored in the ``mixture`` column.

    Returns:
        The table with ``mean_coverage`` cast to float plus ``mixture`` and
        ``batch`` ("<platform>: <background>") columns, after being passed
        through ``rename_mixtures``.
    """
    column_names = ["chrom","start","end","gene","mean_coverage"]
    coverage = pd.read_csv(
        get_mosdepth_regions_file(platform, background, mixture),
        sep="\t",
        header=None,
        compression="gzip",
        names=column_names,
    )
    annotated = coverage.assign(
        mean_coverage=coverage["mean_coverage"].astype(float),
        mixture=mixture,
        batch=f"{platform}: {background}",
    )
    return rename_mixtures(annotated)

def generate_mean_coverage_dfs():
    """Lazily yield one coverage DataFrame per (platform, background, mixture).

    Iterates ``platforms`` x ``backgrounds`` x ``mixtures`` in that nesting
    order and skips any mixture listed in ``mixtures2drop``.

    Yields:
        The DataFrame returned by ``get_individual_mean_coverage_df`` for each
        retained combination.
    """
    for platform in platforms:
        for background in backgrounds:
            yield from (
                get_individual_mean_coverage_df(platform, background, mixture)
                for mixture in mixtures
                if mixture not in mixtures2drop
            )

def get_mean_coverage_df():
    """Concatenate every per-sample coverage table into one DataFrame.

    Row labels from the individual tables are kept as-is (``ignore_index`` is
    not set), so index values repeat across samples.

    Returns:
        A single DataFrame stacking all frames from ``generate_mean_coverage_dfs``.
    """
    per_sample_frames = generate_mean_coverage_dfs()
    return pd.concat(per_sample_frames)

In [4]:
# Build the combined per-region coverage table once; the plotting cells below filter it.
gene_df = get_mean_coverage_df()
# Bare expression so the notebook renders the frame as the cell output.
gene_df

Unnamed: 0,chrom,start,end,gene,mean_coverage,mixture,batch
0,MN908947.3,1,265,5'UTR,91.55,0adgio1o2o3o4o5,illumina_ss: wb
1,MN908947.3,1,29903,whole genome,2996.59,0adgio1o2o3o4o5,illumina_ss: wb
2,MN908947.3,266,13468,ORF1a,2466.42,0adgio1o2o3o4o5,illumina_ss: wb
3,MN908947.3,13468,21555,ORF1b,5350.79,0adgio1o2o3o4o5,illumina_ss: wb
4,MN908947.3,21563,25384,S,868.55,0adgio1o2o3o4o5,illumina_ss: wb
...,...,...,...,...,...,...,...
9,MN908947.3,27394,27759,ORF7a,11806.92,o3-4,ont: pwrb
10,MN908947.3,27894,28259,ORF8,14909.73,o3-4,ont: pwrb
11,MN908947.3,28274,29533,N,19280.87,o3-4,ont: pwrb
12,MN908947.3,29558,29674,ORF10,1476.85,o3-4,ont: pwrb


In [11]:
# Regions to export: one CSV and one heatmap image are written per entry.
genes_of_interest = ["S","whole genome"]
# genes_of_interest = gene_df["gene"].unique() # use to create heatmaps for all genes
for gene in genes_of_interest:
    depth_df = gene_df[gene_df["gene"] == gene]
    # Shared filename stem so the CSV and the image are always named consistently.
    file_stem = f"{gene.replace(' ','-')}-depth-of-coverage"
    depth_df.to_csv(outdir / f"{file_stem}.csv", index=False)
    # Only the "whole genome" label is title-cased. Applying str.title() to gene
    # symbols would mangle them (e.g. "ORF1a" -> "Orf1A", "5'UTR" -> "5'Utr")
    # when the all-genes option above is enabled.
    gene_label = gene.title() if "whole" in gene else f"{gene} Gene"
    fig = getHeatmap(
        depth_df, "mean_coverage",
        title=f"Mean Depth of Coverage: {gene_label}",
        labels={"y":"Batch","x":"Mixture"},
        height=800,
        # width=800,
        title_y=.84,
    )
    fig.show()
    pio.write_image(fig, outdir / f"{file_stem}.jpg", width=1600, height=1200, scale=2)

In [None]:
# Per-gene box plot, one box per batch; the "whole genome" rows are excluded
# so only the individual regions are compared.
box_df = gene_df[gene_df["gene"] != "whole genome"]
fig = px.box(
    box_df,
    x="gene",
    y="mean_coverage",
    color="batch",
    hover_data=["mixture"],
    title="Mean Depth Distribution by Gene",
)
fig.update_layout(yaxis_title="Mean depth")
pio.write_image(
    fig,
    outdir / "mean-coverage-depth-distribution-by-gene-platforms-and-backgrounds.jpg",
    width=1400,
    height=800,
    scale=2,
)
fig

In [7]:
# Per-batch box plot across all regions in box_df (built in the preceding cell),
# with every individual point drawn alongside its box.
fig = px.box(
    box_df,
    x="batch",
    y="mean_coverage",
    hover_data=["mixture","batch"],
    points="all",
    title="Mean coverage depth distribution for each platform and background",
)
fig.update_layout(yaxis_title="Mean depth")
pio.write_image(
    fig,
    outdir / "mean-coverage-depth-distribution-platforms-and-backgrounds.jpg",
    width=1400,
    height=800,
    scale=2,
)
fig