In [97]:
import pathlib as pl
import pysam
import collections as col
import numpy as np
import scipy.stats as scistats
import pandas as pd

input_dir = pl.Path("/home/ebertp/work/data/raw_vcf")

vcfs = input_dir.glob("*.vcf.gz")

gt_map = lambda x: "." if x is None else x

count_stats = col.Counter()
agg_stats = col.defaultdict(list)
for vcf_file in vcfs:
    if ".sv." not in vcf_file.name:
        continue
    print(vcf_file.name)
    with pysam.VariantFile(vcf_file) as vcf:
        for record in vcf.fetch():
            contig = record.contig
            filter_category = "|".join(record.filter.keys())
            if "PRECISE" in record.info.keys():
                calltype = "precise"
            elif "IMPRECISE" in record.info.keys():
                calltype = "imprecise"
            else:
                calltype = "unspecified"
            if "SUPPORT" in record.info.keys():
                read_support = record.info["SUPPORT"]
            elif "RE" in record.info.keys():
                read_support = record.info["RE"]
            elif "RNAMES" in record.info.keys():
                read_support = len(record.info["RNAMES"])
            else:
                read_support = -1
            vartype = record.info["SVTYPE"]
            try:
                svlen = abs(record.info["SVLEN"])
            except KeyError:
                svlen = -1
            quality = record.qual
            gt = dict(record.samples["SAMPLE"].items())["GT"]
            
            genotype = f"GT:{gt_map(gt[0])}/{gt_map(gt[1])}"
            count_stats[(contig, filter_category, calltype, vartype, genotype)] += 1
            count_stats[("callset", filter_category, calltype, vartype, genotype)] += 1
            agg_stats[(contig, filter_category, calltype, vartype, "length")].append(svlen)
            agg_stats[("callset", filter_category, calltype, vartype, "quality")].append(quality)
            agg_stats[(contig, filter_category, calltype, vartype, "support")].append(read_support)
    
    summary = []
    for key, value in count_stats.items():
        row = list(key)
        row.extend(["count", value])
        summary.append(row)
    
    for key, values in agg_stats.items():
        row = list(key)
        data_array = np.array(values)
        low, quartile1, median, quartile3, high = scistats.scoreatpercentile(
            data_array,
            per=[0.5, 25, 50, 75, 99.5]
        )
        mean = data_array.mean()
        if row[-1] in ["length", "support"]:
            normalize = lambda x: int(round(x, 0))
        else:
            normalize = lambda x: float(round(x, 2))
        labels = ["mean", "pct_005", "quart1_pct_250", "median_pct_500", "quart3_pct_750", "pct_995"]
        values = [mean, low, quartile1, median, quartile3, high]
        for l, v in zip(labels, values):
            norm_v = normalize(v)
            new_row = row + [l, v]
            summary.append(new_row)
            
    df = pd.DataFrame.from_records(
        summary,
        columns=[
            "domain", "filter_category", "call_type", "variant_type", "feature", "statistic", "value"
        ]
    )
    df.sort_values(["domain", "filter_category", "call_type"], ascending=True, inplace=True)
    print(df.head(30))
    raise

9-20b_hifi.mm2-sniffles.t2tv2.sv.vcf.gz


[E::idx_find_and_load] Could not retrieve index file for '/home/ebertp/work/data/raw_vcf/9-20b_hifi.mm2-sniffles.t2tv2.sv.vcf.gz'


       domain filter_category  call_type variant_type  feature   
9     callset              GT  imprecise          DEL   GT:0/1  \
27    callset              GT  imprecise          INS   GT:0/1   
29    callset              GT  imprecise          INS   GT:0/0   
31    callset              GT  imprecise          DEL   GT:0/0   
372   callset              GT  imprecise          DUP   GT:0/0   
453   callset              GT  imprecise          INV   GT:0/0   
570   callset              GT  imprecise          DEL  quality   
571   callset              GT  imprecise          DEL  quality   
572   callset              GT  imprecise          DEL  quality   
573   callset              GT  imprecise          DEL  quality   
574   callset              GT  imprecise          DEL  quality   
575   callset              GT  imprecise          DEL  quality   
642   callset              GT  imprecise          INS  quality   
643   callset              GT  imprecise          INS  quality   
644   call

RuntimeError: No active exception to reraise