In [23]:
import glob
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from pathlib import Path
from pysam import VariantFile
import seaborn as sns
from vcf_to_df import vcf_to_df

mpl.rcParams['figure.dpi']= 300

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Filter by:
    > Must pass filter
    > Must be SNV
    > Must have an allele frequence > 0.1
    > Must have mean postion in read > 13
    > Must have a depth of >10
    > Must have MAF of < 0.01

In [3]:
filters = {
    'filter':['PASS'],
    'type':['SNV','snp'],
    'af_min':0.1,
    'dp_min':10,
    'pmean_min':13,
    'gmaf_max':0.01,
}

## Read in each VCF file and turn into pandas dataframe pickle

In [4]:
for filename in glob.iglob('/Users/DanielaNachmanson/XTHS-analysis/data/vcf/*.annotate.vcf'):

    path = Path(filename)
    df = vcf_to_df.vcf_to_df(path)
    
    df['af'] = df['af'].astype('float')
    df['pmean'] = df['pmean'].astype('float')
    df['dp'] = [int(dp) if dp else None for dp in df['dp']]
    df['gmaf'] = [float(gmaf) if gmaf else 0 for gmaf in df['gmaf']]

    df = df[df['filter'].isin(filters['filter'])]
    df = df[df['type'].isin(filters['type'])]
    df = df[df['af'] >= filters['af_min']]
    df = df[df['dp'] >= filters['dp_min']]
    df = df[df['pmean'] >= filters['pmean_min']]
    df = df[df['gmaf'] <= filters['gmaf_max']]

    if 'FRFZ' in filename:
        df['chr'] = ['chr' + c for c in df['chr']]

    df = df.reset_index(drop=True)
    df.to_pickle(str(path.parent) + '/pkls/' + str(path.stem) + '.pkl')

AttributeError: 'pysam.libcbcf.VariantRecordInfo' object has no attribute 'get_keys'

## Filter VCF

In [8]:
for filename in glob.iglob('/Users/DanielaNachmanson/XTHS-analysis/data/vcf/*.annotate.vcf'):
#     quals = []
    path = Path(filename)
#     out_file = open(str(path.parent) 
#                     + '/' 
#                     + str(path.stem) 
#                     + '.filter.vcf',mode='w')
    vcf_in = VariantFile(path)
#     out_file.write(vcf_in.header.__str__())
    for rec in vcf_in:
        break
        file.write(rec.__str__())
    
        filter_,qual,af,dp,type_,pmean,maf = ["".join(rec.filter),
                                    rec.qual,
                                    rec.info.get('AF'),
                                    rec.info.get('DP'),
                                    rec.info.get('TYPE'),
                                    rec.info.get('PMEAN'),
                                    vcf_to_df.CAF_to_MAF(rec.info.get('CAF'))]
        quals.append(qual)
        if filter_ == 'PASS' and type_ in ['SNV','snp']:
            if float(af) > 0.1 and int(dp) >= 10:
                if int(pmean) > 10:
                    if not maf or float(maf) < 0.01:
                        if int(qual) > 50:
                            if 'MT' not in rec.contig.upper() and 'Y' not in rec.contig.upper():     
                                out_file.write(rec.__str__())
    plt.hist(quals)
    plt.show()
    out_file.close()

NameError: name 'quals' is not defined

In [36]:
for filename in glob.iglob('/Users/DanielaNachmanson/XTHS-analysis/data/vcf/FRFZ.*.annotate.vcf'):
    path = Path(filename)
    out_file = open(str(path.parent) 
                    + '/' 
                    + str(path.stem) 
                    + '.DOWNSAMPLE.vcf',
                    mode='w')
    vcf_in = VariantFile(path)
    out_file.write(vcf_in.header.__str__())
    for rec in vcf_in:
        
        G5 = False
        COMMON = None
        
        if "SAO" in rec.info.keys():
            try:
                SAO = int(rec.info.get('SAO'))
                if SAO not in [0,1,3]:
                    continue
            except:
                hi = 1
        if "G5" in rec.info.keys():
            continue
            
        if "COMMON" in rec.info.keys():
            try:
                COMMON = int(rec.info.get('COMMON'))
                if COMMON == 1:
                    continue
            except:
                hi = 1
        out_file.write(rec.__str__())
    out_file.close()

In [31]:
filename='/Users/DanielaNachmanson/XTHS-analysis/data/vcf/FRFZ-vardict.vcf'
path = Path(filename)
out_file = open(str(path.parent) 
                + '/' 
                + str(path.stem) 
                + '.filter.second.vcf',mode='w')
vcf_in = VariantFile(path)
out_file.write(vcf_in.header.__str__())
for rec in vcf_in:
    if rec.info.get('TYPE') != "Complex":
        out_file.write(rec.__str__())
out_file.close()

In [14]:
rec.info.get('SAO')

(0, 0)

In [None]:
filename = '/Users/DanielaNachmanson/XTHS-analysis/data/vcf/F'
path = Path(filename)
df = vcf_to_df.vcf_to_df(path)

df['af'] = df['af'].astype('float')
df['pmean'] = df['pmean'].astype('float')
df['dp'] = [int(dp) if dp else None for dp in df['dp']]
df['gmaf'] = [float(gmaf) if gmaf else 0 for gmaf in df['gmaf']]

df = df[df['filter'].isin(filters['filter'])]
df = df[df['type'].isin(filters['type'])]
df = df[df['af'] >= filters['af_min']]
df = df[df['dp'] >= filters['dp_min']]
df = df[df['pmean'] >= filters['pmean_min']]
df = df[df['gmaf'] <= filters['gmaf_max']]

if 'FRFZ' in filename:
    df['chr'] = ['chr' + c for c in df['chr']]

df.to_pickle(str(path.parent) + '/pkls/' + str(path.stem) + '.pkl')

## Read in the vcf from our fresh frozen exome data into a dataframe

In [None]:
frfz_in = VariantFile('/Volumes/oncogxA/Projects/ATHENA/MCLU01/Experiments/Development/DNA-seq/pilot0514/FRFZ/FRFZ-vardict.vcf.gz') 

lst_ = []

for rec in frfz_in:
    rec = rec
    entry = [rec.contig,
            rec.start,
            rec.stop,
            rec.ref,
            rec.id,
            rec.alts[0],
            ",".join(rec.filter.keys()),
            rec.qual,
            rec.info.get('QUAL'),
            rec.info.get('AF')[0],
            rec.info.get('DP'),
            rec.info.get('VD'),
            rec.info.get('TYPE'),
            rec.info.get('PMEAN'),
            rec.info.get('NM'),]
    lst_.append(entry)

frfz_df = pd.DataFrame(np.row_stack(lst_),columns=['chr','start','stop','ref','id','alt','filter','qual','vd_qual','af','dp','vd','type','pmean','nm'])

frfz_df = frfz_df[(frfz_df['type'] == 'snp') | (frfz_df['type'] == 'SNV')]
frfz_df = frfz_df[~frfz_df['chr'].str.startswith('G')]
frfz_df['af'] = frfz_df['af'].astype('float')
frfz_df['pmean'] = frfz_df['pmean'].astype('float')
frfz_df['dp'] = frfz_df['dp'].astype('float')
frfz_df['chr'] = ['chr' + c for c in frfz_df['chr']]

frfz_df = frfz_df[frfz_df['filter'] == 'PASS']

frfz_df.to_pickle('/Users/DanielaNachmanson/XTHS-analysis/data/vcf/pkls/FRFZ-vardict.pkl')

In [26]:
for filename in glob.iglob('/Users/DanielaNachmanson/XTHS-analysis/data/vcf/*.filter.filter.second.vcf'):

    path = Path(filename)
    df = vcf_to_df.vcf_to_df(path)
    
    df['af'] = df['af'].astype('float')
    df['pmean'] = df['pmean'].astype('float')
    df['dp'] = [int(dp) if dp else None for dp in df['dp']]

    if 'FRFZ' in filename:
        df['chr'] = ['chr' + c for c in df['chr']]

    df = df.reset_index(drop=True)
    df.to_pickle(str(path.parent) + '/pkls/' + str(path.stem) + '.pkl')