In [1]:
import csv
import glob
import os

import pandas as pd
import polars as pl

In [2]:
# Input locations (relative to this notebook): per-sample VCFs and mpileup tables.
vcfs_indir = "../../../data/vcf"
mpileups_indir = "../../../evaluations/depth/mpileup"

# Output locations; the per-sample summaries get their own sub-directory,
# which is created up front if it does not exist yet.
root_outdir = "../../../evaluations/depth"
variant_depth_outdir = f"{root_outdir}/variant-depth-summary"
os.makedirs(variant_depth_outdir, exist_ok=True)

# Sample annotation table. `sample_name` is "<sample_alias>_<run_accession>".
_sample_name_expr = pl.col("sample_alias") + pl.lit("_") + pl.col("run_accession")
lookup_table = pl.read_csv(
    "../../../annot/sample-info_matched-ff-ffpe_on-pat-id.tsv",
    separator="\t",
).with_columns(_sample_name_expr.alias("sample_name"))

lookup_table

sample_title,inferred_id,sample_type,preservation,run_accession,sample_accession,experiment_accession,study_accession,sample_alias,center_name,tax_id,scientific_name,fastq_ftp,sra_ftp,bam_ftp,sample_name
str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str
"""FFPE colorectal cancer liver m…","""Pat01""","""Meta""","""FFPE""","""ERR791893""","""SAMEA3308192""","""ERX836041""","""PRJEB8754""","""Pat01_Meta_FFPE""","""German Cancer Research Center""",9606,"""Homo sapiens""","""ftp.sra.ebi.ac.uk/vol1/fastq/E…",,,"""Pat01_Meta_FFPE_ERR791893"""
"""Frozen colorectal cancer liver…","""Pat01""","""Meta""","""Frozen""","""ERR791883""","""SAMEA3308182""","""ERX836031""","""PRJEB8754""","""Pat01_Meta_Frozen""","""German Cancer Research Center""",9606,"""Homo sapiens""","""ftp.sra.ebi.ac.uk/vol1/fastq/E…",,,"""Pat01_Meta_Frozen_ERR791883"""
"""FFPE colorectal cancer liver m…","""Pat03""","""Meta""","""FFPE""","""ERR791895""","""SAMEA3308194""","""ERX836043""","""PRJEB8754""","""Pat03_Meta_FFPE""","""German Cancer Research Center""",9606,"""Homo sapiens""","""ftp.sra.ebi.ac.uk/vol1/fastq/E…",,,"""Pat03_Meta_FFPE_ERR791895"""
"""Frozen colorectal cancer liver…","""Pat03""","""Meta""","""Frozen""","""ERR791884""","""SAMEA3308183""","""ERX836032""","""PRJEB8754""","""Pat03_Meta_Frozen""","""German Cancer Research Center""",9606,"""Homo sapiens""","""ftp.sra.ebi.ac.uk/vol1/fastq/E…",,,"""Pat03_Meta_Frozen_ERR791884"""
"""FFPE colorectal cancer specime…","""Pat04""","""Prim""","""FFPE""","""ERR791897""","""SAMEA3308196""","""ERX836045""","""PRJEB8754""","""Pat04_Prim_FFPE""","""German Cancer Research Center""",9606,"""Homo sapiens""","""ftp.sra.ebi.ac.uk/vol1/fastq/E…",,,"""Pat04_Prim_FFPE_ERR791897"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""FFPE colorectal cancer liver m…","""Pat13""","""Meta""","""FFPE""","""ERR791908""","""SAMEA3308207""","""ERX836056""","""PRJEB8754""","""Pat13_Meta_FFPE""","""German Cancer Research Center""",9606,"""Homo sapiens""","""ftp.sra.ebi.ac.uk/vol1/fastq/E…",,,"""Pat13_Meta_FFPE_ERR791908"""
"""Frozen colorectal cancer liver…","""Pat13""","""Meta""","""Frozen""","""ERR791890""","""SAMEA3308189""","""ERX836038""","""PRJEB8754""","""Pat13_Meta_Frozen""","""German Cancer Research Center""",9606,"""Homo sapiens""","""ftp.sra.ebi.ac.uk/vol1/fastq/E…",,,"""Pat13_Meta_Frozen_ERR791890"""
"""FFPE colorectal cancer liver m…","""Pat14""","""Meta""","""FFPE""","""ERR791909""","""SAMEA3308208""","""ERX836057""","""PRJEB8754""","""Pat14_Meta_FFPE""","""German Cancer Research Center""",9606,"""Homo sapiens""","""ftp.sra.ebi.ac.uk/vol1/fastq/E…",,,"""Pat14_Meta_FFPE_ERR791909"""
"""FFPE colorectal cancer specime…","""Pat14""","""Prim""","""FFPE""","""ERR791910""","""SAMEA3308209""","""ERX836058""","""PRJEB8754""","""Pat14_Prim_FFPE""","""German Cancer Research Center""",9606,"""Homo sapiens""","""ftp.sra.ebi.ac.uk/vol1/fastq/E…",,,"""Pat14_Prim_FFPE_ERR791910"""


In [3]:


# Split the sample table by the value of its `preservation` column.
_preservation = pl.col("preservation")
ffpe = lookup_table.filter(_preservation == "FFPE")
frozen = lookup_table.filter(_preservation == "Frozen")

def get_stats(df, col, sample_name, sample_col_name="sample"):
    """Summarise one column of `df` as a wide row of descriptive statistics.

    Args:
        df: Frame holding the column to summarise.
        col: Name of the column passed to `describe()`.
        sample_name: Label stored in the header column of the result.
        sample_col_name: Name of the header column that holds `sample_name`.

    Returns:
        The transposed `describe()` output: the header column
        `sample_col_name` followed by one column per statistic name
        (expected to be a single row labelled `sample_name`).
    """
    # describe() yields a long table with "statistic" and "value" columns;
    # relabel the value column so the sample name survives the transpose.
    summary = df.get_column(col).describe().rename({"value": sample_name})
    stat_names = summary.get_column("statistic")
    return summary.drop("statistic").transpose(
        include_header=True,
        header_name=sample_col_name,
        column_names=stat_names,
    )


In [4]:
# Numeric sort keys for the non-numeric chromosome labels (looked up after
# the "chr" prefix has been stripped), so rows sort 1..22, X, Y, M/MT.
chrom_map = {"X": 23, "Y": 24, "M": 25, "MT": 25}


def _read_variants(sample, presence_col):
    """Read chrom/pos/ref/alt from a sample's VCF and add an all-True `presence_col` flag."""
    return (
        pl.read_csv(
            f"{vcfs_indir}/{sample}/{sample}.vcf",
            separator="\t",
            comment_prefix="##",  # skip VCF meta lines but keep the "#CHROM" header row
            infer_schema_length=10000,
            columns=["#CHROM", "POS", "REF", "ALT"],
        )
        .rename(lambda col_name: col_name.lower().replace("#", ""))
        .with_columns(pl.lit(True).alias(presence_col))
    )


def _read_mpileup(sample, prefix):
    """Read a sample's headerless mpileup TSV, naming its per-sample columns `<prefix>_*`."""
    return pl.DataFrame(
        pd.read_csv(
            f"{mpileups_indir}/{sample}_mpileup.tsv",
            sep="\t",
            header=None,
            names=["chrom", "pos", "ref", f"{prefix}_read_depth", f"{prefix}_read_bases", f"{prefix}_qual"],
            # '"' is an ordinary character in pileup base/quality strings, so
            # it must not be interpreted as a CSV quote character.
            quoting=csv.QUOTE_NONE,
        )
    ).drop("ref")


def _depth_stats(mpileup, depth_col, label):
    """Return `describe()` of a read-depth column with its value column renamed to `label`."""
    return mpileup[depth_col].describe().rename({"value": label})


ffpe_variant_only_depth_summary_all_samples = []
frozen_variant_only_depth_summary_all_samples = []
ffpe_frozen_union_ffpe_depth_summary_all_samples = []
ffpe_frozen_union_frozen_depth_summary_all_samples = []

# Whole-pileup depth statistics, one column per sample, joined on "statistic".
mpileup_stats = None

for i, sample_name in enumerate(ffpe.get_column("sample_name"), start=1):

    print(f"{i}. Generating variant depth summary for {sample_name}...")

    ffpe_variants = _read_variants(sample_name, "in_ffpe")
    ffpe_mpileup = _read_mpileup(sample_name, "ffpe")

    ffpe_mpileup_stats = _depth_stats(ffpe_mpileup, "ffpe_read_depth", sample_name)
    if mpileup_stats is None:
        mpileup_stats = ffpe_mpileup_stats
    else:
        mpileup_stats = mpileup_stats.join(ffpe_mpileup_stats, on="statistic")

    # The patient id is the first "_"-separated token of the sample name.
    patient_id = sample_name.split("_")[0]
    # NOTE(review): this takes the first Frozen row for the patient; it assumes
    # each patient has exactly one relevant frozen sample -- confirm.
    frozen_sample_name = lookup_table.filter(
        pl.col("inferred_id") == patient_id,
        pl.col("preservation") == "Frozen",
    )[0, "sample_name"]

    frozen_variants = _read_variants(frozen_sample_name, "in_frozen")
    frozen_mpileup = _read_mpileup(frozen_sample_name, "frozen")

    frozen_mpileup_stats = _depth_stats(frozen_mpileup, "frozen_read_depth", frozen_sample_name)
    mpileup_stats = mpileup_stats.join(frozen_mpileup_stats, on="statistic")

    depth_summary = (
        ffpe_variants
        # Union of both call sets; a variant missing from one side gets a null flag...
        .join(frozen_variants, on=["chrom", "pos", "ref", "alt"], how="full", coalesce=True)
        # ...which becomes an explicit False here.
        .fill_null(False)
        # Default (inner) joins: only sites present in both pileups are kept.
        .join(ffpe_mpileup, on=["chrom", "pos"])
        .join(frozen_mpileup, on=["chrom", "pos"])
        .select(['chrom','pos','ref','alt','in_ffpe','in_frozen','ffpe_read_depth','frozen_read_depth','ffpe_read_bases','frozen_read_bases','ffpe_qual','frozen_qual'])
        # Temporary integer key so chromosomes sort numerically rather than lexically.
        # The lambda always returns a str so it matches the declared return_dtype.
        .with_columns(
            pl.col("chrom")
            .str.replace_all("chr", "")
            .map_elements(lambda x: str(chrom_map.get(x, x)), return_dtype=str)
            .cast(int)
            .alias("chrom_num")
        )
        .sort(["chrom_num", "pos", "ref", "alt"])
        .drop("chrom_num")
    )

    outpath = f"{variant_depth_outdir}/{sample_name}_variant-depth-summary.tsv"
    depth_summary.write_csv(outpath, separator="\t")

    # NOTE(review): every row below, including the frozen-depth ones, is
    # labelled with the FFPE sample name -- confirm this is the intended key.
    ffpe_variant_only_depth_summary = get_stats(depth_summary.filter(pl.col("in_ffpe")), "ffpe_read_depth", sample_name)
    frozen_variant_only_depth_summary = get_stats(depth_summary.filter(pl.col("in_frozen")), "frozen_read_depth", sample_name)
    ffpe_frozen_union_ffpe_depth_summary = get_stats(depth_summary, "ffpe_read_depth", sample_name)
    ffpe_frozen_union_frozen_depth_summary = get_stats(depth_summary, "frozen_read_depth", sample_name)

    ffpe_variant_only_depth_summary_all_samples.append(ffpe_variant_only_depth_summary)
    frozen_variant_only_depth_summary_all_samples.append(frozen_variant_only_depth_summary)
    ffpe_frozen_union_ffpe_depth_summary_all_samples.append(ffpe_frozen_union_ffpe_depth_summary)
    # The frozen list must receive the frozen-depth summary, not the FFPE one;
    # otherwise the frozen union table is a duplicate of the FFPE union table.
    ffpe_frozen_union_frozen_depth_summary_all_samples.append(ffpe_frozen_union_frozen_depth_summary)

    print(f"\tvariant depth summary for {sample_name} saved to: {outpath}\n")

# Pivot the joined statistics so each sample is a row and each statistic a column.
mpileup_stats = mpileup_stats[:, 1:].transpose(include_header=True, header_name="samples", column_names=mpileup_stats.get_column("statistic"))
mpileup_stats.write_csv(f"{root_outdir}/all_samples_mpileup_stats.tsv", separator="\t")

ffpe_variant_only_depth_summary_all_samples = pl.concat(ffpe_variant_only_depth_summary_all_samples, how="vertical_relaxed")
frozen_variant_only_depth_summary_all_samples = pl.concat(frozen_variant_only_depth_summary_all_samples, how="vertical_relaxed")
ffpe_frozen_union_ffpe_depth_summary_all_samples = pl.concat(ffpe_frozen_union_ffpe_depth_summary_all_samples, how="vertical_relaxed")
ffpe_frozen_union_frozen_depth_summary_all_samples = pl.concat(ffpe_frozen_union_frozen_depth_summary_all_samples, how="vertical_relaxed")

# NOTE(review): the first file name has a doubled ".tsv.tsv" extension and an
# "all_samples_" prefix the other three lack. It is kept unchanged here so any
# existing consumer of that path keeps working -- rename once consumers are checked.
ffpe_variant_only_depth_summary_all_samples.write_csv(f"{root_outdir}/all_samples_ffpe_variant_only_depth_summary.tsv.tsv", separator="\t")
frozen_variant_only_depth_summary_all_samples.write_csv(f"{root_outdir}/frozen_variant_only_depth_summary.tsv", separator="\t")
ffpe_frozen_union_ffpe_depth_summary_all_samples.write_csv(f"{root_outdir}/ffpe_frozen_union_ffpe_depth_summary.tsv", separator="\t")
ffpe_frozen_union_frozen_depth_summary_all_samples.write_csv(f"{root_outdir}/ffpe_frozen_union_frozen_depth_summary.tsv", separator="\t")


1. Generating variant depth summary for Pat01_Meta_FFPE_ERR791893...
	variant depth summary for Pat01_Meta_FFPE_ERR791893 saved to: ../../../evaluations/depth/variant-depth-summary/Pat01_Meta_FFPE_ERR791893_variant-depth-summary.tsv

2. Generating variant depth summary for Pat03_Meta_FFPE_ERR791895...
	variant depth summary for Pat03_Meta_FFPE_ERR791895 saved to: ../../../evaluations/depth/variant-depth-summary/Pat03_Meta_FFPE_ERR791895_variant-depth-summary.tsv

3. Generating variant depth summary for Pat04_Prim_FFPE_ERR791897...
	variant depth summary for Pat04_Prim_FFPE_ERR791897 saved to: ../../../evaluations/depth/variant-depth-summary/Pat04_Prim_FFPE_ERR791897_variant-depth-summary.tsv

4. Generating variant depth summary for Pat04_Meta_FFPE_ERR791896...
	variant depth summary for Pat04_Meta_FFPE_ERR791896 saved to: ../../../evaluations/depth/variant-depth-summary/Pat04_Meta_FFPE_ERR791896_variant-depth-summary.tsv

5. Generating variant depth summary for Pat08_Meta_FFPE_ERR79190