In [75]:
import vcf
import vcf.utils
import pysam
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# VCF / BAM comparison tool

Compares the INDEL calls and alignments of two datasets, A (Nextera) and B (Swiftlong). It is designed to answer the question: "Why is Swiftlong (B) so much worse?"

Input files:

For each of datasets A and B:
* VCF from hap.py
* Scored VCF (`*.scored.vcf`)
* BAM file


In [76]:
# Dataset A (Nextera-10ng-1): hap.py VCF, scored VCF and alignment file.
a_happy_vcf_path = "a50_variant_analysis/Nextera-10ng-1_happy/Nextera-10ng-1.vcf.gz"
a_vcf_path = "a40_vcf/Nextera-10ng-1.scored.vcf"
a_bam_path = "/ypool/bulk/nsc/kitComparison/10_bwa/Nextera-10ng-1_HYK7TCCXY_L004.sorted.bam"

# Readers are opened in the same order as the paths are listed above.
a_happy_vcf = vcf.Reader(filename=a_happy_vcf_path)
a_vcf = vcf.Reader(filename=a_vcf_path)
a_bam = pysam.AlignmentFile(a_bam_path)

In [77]:
# Dataset B (Swiftlong-10ng-3): hap.py VCF, scored VCF and alignment file.
b_happy_vcf_path = "a50_variant_analysis/Swiftlong-10ng-3_happy/Swiftlong-10ng-3.vcf.gz"
b_vcf_path = "a40_vcf/Swiftlong-10ng-3.scored.vcf"
# NOTE(review): both VCFs above are for Swiftlong-10ng-3, but this BAM path
# names Nextera-100ng-3 -- confirm this is the intended alignment file.
b_bam_path = "/ypool/bulk/nsc/kitComparison/10_bwa/Nextera-100ng-3_HYK7TCCXY_L002.sorted.bam"

# Readers are opened in the same order as the paths are listed above.
b_happy_vcf = vcf.Reader(filename=b_happy_vcf_path)
b_vcf = vcf.Reader(filename=b_vcf_path)
b_bam = pysam.AlignmentFile(b_bam_path)

In [None]:
# Chromosome most recently reported by the INDEL scan; None until the first
# record is seen. The scan compares each record's CHROM against this value and
# prints a progress line whenever it changes.
current_chromosome = None

In [None]:
# Log depths
#
# Walk all four VCF readers together and, for every INDEL site that has a
# record in at least one hap.py VCF, store the TP status of each dataset and
# the read count each BAM reports for the 1-bp window starting at the
# record's `start`.

# Index of the sample whose BD annotation is inspected.
# NOTE(review): presumably the QUERY column of the hap.py VCF -- confirm.
QUERY_SAMPLE_INDEX = 1


def _is_true_positive(happy_rec):
    """Return True if `happy_rec` is present and its inspected sample has BD == 'TP'."""
    return happy_rec is not None and happy_rec.samples[QUERY_SAMPLE_INDEX].data.BD == 'TP'


items = []
# Reset here so the cell does not depend on state left over from another cell.
current_chromosome = None
# Last hap.py record actually seen. The per-iteration variable cannot be used
# after the loop: it is None whenever the final site has no hap.py record.
last_rec = None

# The scored VCFs are walked alongside the hap.py VCFs but their records are
# not used; sites present only in them are skipped below.
for a_rec, _a_scored_rec, b_rec, _b_scored_rec in vcf.utils.walk_together(
        a_happy_vcf, a_vcf, b_happy_vcf, b_vcf):
    a_or_b_rec = a_rec or b_rec
    if not a_or_b_rec:
        # Neither hap.py VCF has a record at this site.
        continue
    last_rec = a_or_b_rec

    if a_or_b_rec.CHROM != current_chromosome:
        current_chromosome = a_or_b_rec.CHROM
        print("Chromosome:", current_chromosome)

    if not a_or_b_rec.is_indel:
        continue

    chrom, start = a_or_b_rec.CHROM, a_or_b_rec.start
    a_is_tp = _is_true_positive(a_rec)
    b_is_tp = _is_true_positive(b_rec)
    items.append({
        'chrom': chrom,
        'pos': a_or_b_rec.POS,
        'a_is_tp': a_is_tp,
        'a_dp': a_bam.count(chrom, start, start + 1),
        'b_is_tp': b_is_tp,
        'b_dp': b_bam.count(chrom, start, start + 1),
        # True when the site is a TP in at least one of the two datasets.
        'is_tp': b_is_tp or a_is_tp,
    })

if last_rec is None:
    print("No records found in either hap.py VCF")
else:
    print("Got to position", last_rec.CHROM, last_rec.POS)

In [None]:
# One row per INDEL site collected by the scan.
print("Items:", len(items))
df = pd.DataFrame(data=items)
df.head(2)

In [None]:
# Distribution of dataset B's read count at INDEL sites, coloured by whether
# B's call was a TP; the x axis is limited to the range 0-20.
depth_hist_options = dict(
    x='b_dp',
    hue='b_is_tp',
    stat='density',
    common_norm=True,
    discrete=True,
)
sns.displot(data=df, **depth_hist_options)
plt.xlim(0, 20)

In [None]:
# Restrict to sites that are a TP in at least one dataset, then plot B's TP
# flag per chromosome.
# NOTE(review): `b_is_tp` is boolean, so each box can only span 0/1 -- confirm
# a boxplot is the intended view here.
tp_sites = df[df.is_tp]
sns.boxplot(data=tp_sites, x='chrom', y='b_is_tp')