In [38]:
import polars as pl
import glob
import os

In [45]:
vcf_dir = "../../data/vcf_filtermutectcalls"
outdir = "../../evaluations/ad_filtered"

In [46]:
lookup = pl.read_csv("../../annot/sample-info_matched-ff-ffpe_on-pat-id.tsv", separator="\t")
lookup = lookup.with_columns((pl.col("sample_alias") + pl.lit("_") + pl.col("run_accession")).alias("sample_id"))

In [47]:
ffpe_samples = lookup.filter(pl.col("preservation") == "FFPE")
frozen_samples = lookup.filter(pl.col("preservation") == "Frozen")

In [48]:
vcf_paths = glob.glob(f"{vcf_dir}/*/*.vcf")

In [53]:
common_cols = ["chrom", "pos", "ref", "alt"]

intersections = {
    "ffpe_sample_id": [],
    "frozen_sample_id": [],
    "unfiltered_ffpe" : [],
    "unfiltered_frozen" : [],
    "unfiltered_intersection": []
}

for i, ffpe_sample_id in enumerate(ffpe_samples.get_column("sample_id")):
    
    ffpe_vcf = (
        pl.read_csv(f"{vcf_dir}/{ffpe_sample_id}/{ffpe_sample_id}.vcf", 
                    separator="\t", 
                    comment_prefix="##")
        .rename(lambda col_name : col_name.lower().replace("#", ""))
        .rename(lambda col_name : f"{col_name}_ffpe" if col_name not in common_cols else col_name)
    )
    
    frozen_sample_id = frozen_samples.filter(pl.col("inferred_id") == lookup[i, "inferred_id"])[0, "sample_id"]
    
    frozen_vcf = (
        pl.read_csv(f"{vcf_dir}/{frozen_sample_id}/{frozen_sample_id}.vcf", 
                    separator="\t", 
                    comment_prefix="##")
        .rename(lambda col_name : col_name.lower()
        .replace("#", ""))
        .rename(lambda col_name : f"{col_name}_frozen" if col_name not in common_cols else col_name)
    )
    
    intersections["ffpe_sample_id"].append(ffpe_sample_id)
    intersections["frozen_sample_id"].append(frozen_sample_id)
    intersections["unfiltered_ffpe"].append(ffpe_vcf.shape[0])
    intersections["unfiltered_frozen"].append(frozen_vcf.shape[0])
    intersections["unfiltered_intersection"].append(ffpe_vcf.join(frozen_vcf, on=["chrom", "pos", "ref", "alt"], how="semi").shape[0])
    break

# intersections_summary = pl.DataFrame(intersections)

In [50]:

# intersections_summary.write_csv(f"{outdir}/intersection_count.tsv", separator="\t")
intersections_summary

ffpe_sample_id,frozen_sample_id,unfiltered_ffpe,unfiltered_frozen,unfiltered_intersection
str,str,i64,i64,i64
"""Pat01_Meta_FFPE_ERR791893""","""Pat01_Meta_Frozen_ERR791883""",1115,511,64
"""Pat03_Meta_FFPE_ERR791895""","""Pat01_Meta_Frozen_ERR791883""",905,511,53
"""Pat04_Prim_FFPE_ERR791897""","""Pat03_Meta_Frozen_ERR791884""",1871,754,37
"""Pat04_Meta_FFPE_ERR791896""","""Pat03_Meta_Frozen_ERR791884""",622,754,58
"""Pat08_Meta_FFPE_ERR791901""","""Pat04_Meta_Frozen_ERR791891""",401,714,34
…,…,…,…,…
"""Pat11_Prim_FFPE_ERR791906""","""Pat09_Meta_Frozen_ERR791887""",825,538,56
"""Pat12_Meta_FFPE_ERR791907""","""Pat09_Meta_Frozen_ERR791887""",832,538,36
"""Pat13_Meta_FFPE_ERR791908""","""Pat10_Meta_Frozen_ERR791888""",663,338,32
"""Pat14_Meta_FFPE_ERR791909""","""Pat10_Meta_Frozen_ERR791888""",785,338,46


In [54]:
(
    ffpe_vcf.join(frozen_vcf, on=["chrom", "pos", "ref", "alt"], how="inner")
    .select(['chrom','pos','ref','alt','filter_ffpe','filter_frozen','info_ffpe','info_frozen','format_ffpe','format_frozen','err791893_ffpe','err791883_frozen'])
)

chrom,pos,ref,alt,filter_ffpe,filter_frozen,info_ffpe,info_frozen,format_ffpe,format_frozen,err791893_ffpe,err791883_frozen
str,i64,str,str,str,str,str,str,str,str,str,str
"""chr1""",114713909,"""G""","""T""","""PASS""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:25,42:0.538:67:23,27:0,0:2…","""0/1:511,40:0.074:551:481,38:0,…"
"""chr2""",211665423,"""C""","""T""","""clustered_events""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:4255,89:0.017:4344:2252,38…","""0/1:9557,49:2.871e-03:9606:537…"
"""chr2""",211713649,"""C""","""T""","""clustered_events""","""clustered_events;strand_bias""","""AS_FilterStatus=SITE;AS_SB_TAB…","""AS_FilterStatus=strand_bias;AS…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:601,20:0.016:621:456,12:0,…","""0/1:3984,10:1.682e-03:3994:260…"
"""chr3""",10146651,"""C""","""T""","""clustered_events""","""clustered_events;strand_bias""","""AS_FilterStatus=SITE;AS_SB_TAB…","""AS_FilterStatus=strand_bias;AS…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:10844,33:2.989e-03:10877:5…","""0/1:10570,13:1.087e-03:10583:5…"
"""chr3""",10146652,"""T""","""C""","""clustered_events""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:10785,87:7.830e-03:10872:5…","""0/1:10525,56:4.926e-03:10581:5…"
…,…,…,…,…,…,…,…,…,…,…,…
"""chr18""",51067032,"""A""","""C""","""clustered_events""","""clustered_events;strand_bias""","""AS_FilterStatus=SITE;AS_SB_TAB…","""AS_FilterStatus=strand_bias;AS…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:6812,15:2.104e-03:6827:359…","""0/1:11026,24:1.643e-03:11050:5…"
"""chr18""",51067034,"""A""","""G""","""clustered_events""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:6807,35:4.035e-03:6842:358…","""0/1:11004,85:4.985e-03:11089:5…"
"""chr19""",3119241,"""C""","""T""","""clustered_events;germline;hapl…","""germline;panel_of_normals""","""AS_FilterStatus=SITE;AS_SB_TAB…","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:PGT:…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0|1:10,2647:0.996:2657:0,0:9,2…","""0/1:12,1856:0.996:1868:0,0:6,1…"
"""chr22""",16574568,"""G""","""A""","""germline;panel_of_normals""","""germline;haplotype;panel_of_no…","""AS_FilterStatus=SITE;AS_SB_TAB…","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""GT:AD:AF:DP:F1R2:F2R1:FAD:PGT:…","""0/1:17,2106:0.992:2123:8,1051:…","""0|1:46,5408:0.993:5454:22,2877…"


In [55]:
ffpe_vcf.join(frozen_vcf, on=["chrom", "pos", "ref", "alt"], how="semi")

chrom,pos,id_ffpe,ref,alt,qual_ffpe,filter_ffpe,info_ffpe,format_ffpe,err791893_ffpe
str,i64,str,str,str,str,str,str,str,str
"""chr1""",114713909,""".""","""G""","""T""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:25,42:0.538:67:23,27:0,0:2…"
"""chr2""",211665423,""".""","""C""","""T""",""".""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:4255,89:0.017:4344:2252,38…"
"""chr2""",211713649,""".""","""C""","""T""",""".""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:601,20:0.016:621:456,12:0,…"
"""chr3""",10146651,""".""","""C""","""T""",""".""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:10844,33:2.989e-03:10877:5…"
"""chr3""",10146652,""".""","""T""","""C""",""".""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:10785,87:7.830e-03:10872:5…"
…,…,…,…,…,…,…,…,…,…
"""chr18""",51067032,""".""","""A""","""C""",""".""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:6812,15:2.104e-03:6827:359…"
"""chr18""",51067034,""".""","""A""","""G""",""".""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:6807,35:4.035e-03:6842:358…"
"""chr19""",3119241,""".""","""C""","""T""",""".""","""clustered_events;germline;hapl…","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:PGT:…","""0|1:10,2647:0.996:2657:0,0:9,2…"
"""chr22""",16574568,""".""","""G""","""A""",""".""","""germline;panel_of_normals""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:17,2106:0.992:2123:8,1051:…"


In [57]:
ffpe_vcf.join(frozen_vcf, on=["chrom", "pos", "ref", "alt"], how="anti")

chrom,pos,id_ffpe,ref,alt,qual_ffpe,filter_ffpe,info_ffpe,format_ffpe,err791893_ffpe
str,i64,str,str,str,str,str,str,str,str
"""chr1""",43349363,""".""","""C""","""T""",""".""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:3144,27:6.265e-03:3171:226…"
"""chr1""",43349364,""".""","""C""","""T""",""".""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:3147,23:5.158e-03:3170:219…"
"""chr1""",43349367,""".""","""CCCCG""","""C""",""".""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:3161,11:3.391e-03:3172:232…"
"""chr1""",43349375,""".""","""G""","""A""",""".""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:3160,12:3.015e-03:3172:241…"
"""chr1""",107236445,""".""","""A""","""G""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:26,2:0.125:28:13,1:0,0:13,…"
…,…,…,…,…,…,…,…,…,…
"""chr22""",26100281,""".""","""C""","""T""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:0,4:0.750:4:0,2:0,0:0,2:0,…"
"""chr22""",37236545,""".""","""T""","""C""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:18,8:0.333:26:0,0:9,4:9,4:…"
"""chrY""",19156591,""".""","""CAA""","""C,CA""",""".""","""clustered_events;multiallelic""","""AS_FilterStatus=weak_evidence|…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1/2:13,3,5:0.172,0.262:21:0,…"
"""chrY""",19156600,""".""","""A""","""AAAAG""",""".""","""clustered_events;haplotype;wea…","""AS_FilterStatus=weak_evidence;…","""GT:AD:AF:DP:F1R2:F2R1:FAD:PGT:…","""0|1:19,2:0.133:21:0,0:12,1:12,…"


In [58]:
frozen_vcf.join(ffpe_vcf, on=["chrom", "pos", "ref", "alt"], how="anti")

chrom,pos,id_frozen,ref,alt,qual_frozen,filter_frozen,info_frozen,format_frozen,err791883_frozen
str,i64,str,str,str,str,str,str,str,str
"""chr1""",60897254,""".""","""A""","""G""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:0,2:0.667:2:0,1:0,0:0,1:0,…"
"""chr1""",110658498,""".""","""C""","""T""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:0,1:0.667:1:0,0:0,1:0,1:0,…"
"""chr1""",114716138,""".""","""A""","""G""",""".""","""base_qual;haplotype;strand_bia…","""AS_FilterStatus=base_qual,stra…","""GT:AD:AF:DP:F1R2:F2R1:FAD:PGT:…","""0|1:7346,31:1.209e-03:7377:509…"
"""chr1""",114716146,""".""","""T""","""C""",""".""","""strand_bias""","""AS_FilterStatus=strand_bias;AS…","""GT:AD:AF:DP:F1R2:F2R1:FAD:PGT:…","""1|0:7423,28:1.537e-03:7451:505…"
"""chr1""",114716199,""".""","""G""","""T""",""".""","""strand_bias""","""AS_FilterStatus=strand_bias;AS…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:8474,83:3.555e-03:8557:495…"
…,…,…,…,…,…,…,…,…,…
"""chr22""",23791857,""".""","""A""","""G""",""".""","""base_qual;clustered_events""","""AS_FilterStatus=base_qual;AS_S…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:6720,98:3.247e-03:6818:395…"
"""chr22""",23791898,""".""","""C""","""T""",""".""","""clustered_events""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:6882,29:3.721e-03:6911:388…"
"""chr22""",23803488,""".""","""G""","""C""",""".""","""germline;panel_of_normals""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:1,3752:1.000:3753:0,2317:0…"
"""chr22""",23803507,""".""","""T""","""C""",""".""","""base_qual""","""AS_FilterStatus=base_qual;AS_S…","""GT:AD:AF:DP:F1R2:F2R1:FAD:SB""","""0/1:3695,30:3.089e-03:3725:205…"
