# Results - smMIPs Sequencing and Data Analysis

## Tools

In [1]:
#!/usr/bin/env python3
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set(style='white')
sns.set_context("talk")

In [2]:
# pyliftover converters between genome builds, named after the arguments below:
# `lo` lifts hg19 coordinates to hg38, `li` lifts hg38 coordinates back to hg19.
from pyliftover import LiftOver
lo = LiftOver('hg19', 'hg38')
li = LiftOver('hg38', 'hg19')

## Pull in Input Files

In [3]:
# Tab-separated table describing the validation samples.
samples = pd.read_csv('../data/validation_samples/sample_dataframe.txt', sep='\t')

In [4]:
# Per-sample sequencing QC table; the cells below read its 'total tags weighted',
# 'count' and 'Type' columns.
samples_QC = pd.read_csv('../data/validation_samples/sequencing_quality_check.txt', sep='\t')

In [5]:
# Variant table (one row per variant, keyed below by 'chrom', 'start' and 'sample')
# that the smMIPs calls are compared against.
overlap_with_smmips = pd.read_csv('../output/supplementary_table_3-variant_overlap.tsv', sep='\t')

## Initial Quality Check

In [6]:
# Summarise the 'total tags weighted' QC column; `mean` and `std` are reused
# by the eligibility cutoff in the following cell.
weighted_tags = samples_QC['total tags weighted']
mean = weighted_tags.mean()
std = weighted_tags.std()
print('Mean: ', mean)
print('Standard Deviation: ', std)

Mean:  5369817.166666667
Standard Deviation:  3334506.223398502


In [7]:
# A sample is ineligible when its weighted tag count is more than one standard
# deviation below the mean. Labels have the form '<count>_<Type>'.
tag_cutoff = mean - 1*std
below_cutoff = samples_QC[samples_QC['total tags weighted'] < tag_cutoff]
ineligible_samples = (below_cutoff['count'] + '_' + below_cutoff['Type']).tolist()
for sample_label in ineligible_samples:
    print(sample_label)
    

H_ML-08-0075-248_normal
Merck016_tumor
Merck012_tumor
HOC71_normal
SCLC19_normal
SCLC31_normal


In [8]:
# Samples remaining once those below the tag-count cutoff are removed.
# (Label spelling corrected from 'elgible'.)
print('Number of eligible Samples: ', len(samples_QC) - len(ineligible_samples))

Number of elgible Samples:  30


In [9]:
# A variant passes QC unless its tumor sample was flagged as ineligible.
# Only the '<sample>_tumor' label is checked; an ineligible normal alone
# does not exclude the variant.
tumor_labels = overlap_with_smmips['sample'] + '_' + 'tumor'
overlap_with_smmips['Passed QC'] = np.where(tumor_labels.isin(ineligible_samples), 'no', 'yes')

In [10]:
# Count the distinct samples that still have at least one QC-passing variant.
eligible_mask = overlap_with_smmips['Passed QC'].eq('yes')
print('Number of Eligible Individuals: ', overlap_with_smmips.loc[eligible_mask, 'sample'].nunique(dropna=False))

Number of Eligible Individuals:  20


In [11]:
# Total QC-passing variants, summed over the per-sample counts.
eligible_variants = overlap_with_smmips[overlap_with_smmips['Passed QC'] == 'yes']
variants_per_sample = eligible_variants.groupby('sample').size()
print('Number of Eligible Variants: ', variants_per_sample.sum())

Number of Eligible Variants:  81


## Accuracy profile of smMIPs CIViC panel when compared to WEX

 ### Pull in VCF Files

In [12]:
# Empty placeholder for the smMIPs calls. The next cell assigns `smmips_variants`
# again before using it, so this initialisation is redundant.
smmips_variants = pd.DataFrame()

In [13]:
# Load every tumor SNV call file from the smMIPs run into one frame.
# Columns kept from each VCF, by position: 0 (chrom), 1 twice (start and stop),
# 3 (reference), 4 (variant) and 9 (the sample field that holds the VAF).
snp_frames = []
# sorted() makes the row order reproducible; glob order depends on the filesystem.
for vcf_path in sorted(glob.glob('../data/smmips_sequencing/*T*.snp.vcf')):
    # basename does not depend on the directory depth or the OS path separator.
    file_name = os.path.basename(vcf_path)
    name_parts = file_name.split('_')
    # Files starting with 'H_' or 'c' keep their first two '_'-separated fields
    # as the sample name; all other files keep only the first field.
    if file_name.startswith('H_') or file_name.startswith('c'):
        name = name_parts[0] + '_' + name_parts[1]
    else:
        name = name_parts[0]
    current = pd.read_csv(vcf_path, sep='\t', comment='#', header=None).filter(items=[0,1,1,3,4,9])
    current['sample'] = name
    snp_frames.append(current)

# DataFrame.append was removed in pandas 2.0; concatenate once instead of
# growing the frame file by file. With no matching files the frame stays empty.
smmips_variants = pd.concat(snp_frames) if snp_frames else pd.DataFrame()

In [14]:
# Add the tumor indel calls to the calls already held in `smmips_variants`.
# Each record is rewritten before merging: the position is shifted by one and the
# first base of REF/ALT is removed (presumably the VCF anchor base -- TODO confirm),
# with '-' standing in for an allele that becomes empty.
indel_frames = []
# sorted() makes the row order reproducible; glob order depends on the filesystem.
for vcf_path in sorted(glob.glob('../data/smmips_sequencing/*T*.indel.vcf')):
    # basename does not depend on the directory depth or the OS path separator.
    file_name = os.path.basename(vcf_path)
    name_parts = file_name.split('_')
    # Same sample-naming rule as for the SNV files: 'H_' / 'c' files keep two fields.
    if file_name.startswith('H_') or file_name.startswith('c'):
        name = name_parts[0] + '_' + name_parts[1]
    else:
        name = name_parts[0]
    raw_calls = pd.read_csv(vcf_path, sep='\t', comment='#', header=None)
    # Shift the position before column 1 is duplicated into start and stop,
    # so both copies carry the shifted value.
    raw_calls[1] = raw_calls[1] + 1
    current = raw_calls.filter(items=[0,1,1,3,4,9])
    current['sample'] = name
    current[3] = current[3].str[1:].replace('', '-')
    current[4] = current[4].str[1:].replace('', '-')
    indel_frames.append(current)

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
# Empty frames are skipped so their (absent) columns cannot disturb the result.
frames_to_join = [frame for frame in [smmips_variants] + indel_frames if not frame.empty]
if frames_to_join:
    smmips_variants = pd.concat(frames_to_join)

In [15]:
# Name the positional VCF columns; the two copies of the position column become
# 'start' and 'stop'.
smmips_columns = ['chrom', 'start', 'stop', 'reference', 'variant', 'CIViC Panel VAF', 'sample']
smmips_variants.columns = smmips_columns
# The old row labels are kept as an 'index' column on purpose: a later cell
# drops that column by name.
smmips_variants = smmips_variants.reset_index(drop=False)

In [16]:
# Turn the raw VCF sample field into a percentage: take the last ':'-separated
# sub-field, read it as a float and scale by 100.
# Done as one column-wise map instead of per-row .loc writes, so the column
# ends up float64 rather than an object column filled one cell at a time.
# A non-string entry still raises, as the row-wise version did.
smmips_variants['CIViC Panel VAF'] = smmips_variants['CIViC Panel VAF'].map(
    lambda sample_field: float(sample_field.split(':')[-1]) * 100
)

In [17]:
# Left-join the smMIPs calls onto the variant table so every original variant is
# kept; columns present in both frames come back with _x / _y suffixes.
overlap_with_smmips = pd.merge(
    overlap_with_smmips,
    smmips_variants,
    how='left',
    on=['chrom', 'start', 'sample'],
)

In [18]:
# Variants without a matching smMIPs call have a missing VAF after the left join;
# record those as 0 and store the column as float.
overlap_with_smmips['CIViC Panel VAF'] = (
    overlap_with_smmips['CIViC Panel VAF']
    .replace(np.nan, 0)
    .astype('float')
)

In [19]:
# Remove the saved-index columns and the duplicated columns left behind by the merge.
redundant_columns = ['Unnamed: 0', 'reference_y', 'stop_x', 'stop_y', 'variant_y', 'index']
overlap_with_smmips = overlap_with_smmips.drop(columns=redundant_columns)

In [20]:
# Report how many QC-passing variants exist and how many of them were also
# detected by the smMIPs panel (panel VAF above 0).
qc_passed = overlap_with_smmips['Passed QC'] == 'yes'
smmips_detected = overlap_with_smmips['CIViC Panel VAF'] > 0
print('Total eligible variants: ', len(overlap_with_smmips[qc_passed]))
print('Total eligible overlap with smMIPs: ', len(overlap_with_smmips[smmips_detected & qc_passed]))

Total eligible variants:  81
Total eligible overlap with smMIPs:  71


In [35]:
# Export the merged comparison table. It is written to the project's ../output
# directory (where the variant overlap table is read from) rather than to a
# user-specific absolute path, so the notebook also runs on other machines.
overlap_with_smmips.to_csv('../output/overlap.tsv', sep='\t')

### Build Waterfall Dataframe 

In [21]:
# Keep only the variants whose tumor sample passed QC.
samples_waterfall = overlap_with_smmips.loc[overlap_with_smmips['Passed QC'].eq('yes')]

In [22]:
# Build the table behind the waterfall plot (Figure 1): one row per QC-passing
# variant, with Validated = 1 when the smMIPs panel reported a VAF above 0.
samples_waterfall = (
    samples_waterfall
    .filter(items=['sample', 'gene_name', 'amino_acid', 'VAF', 'CIViC Panel VAF'])
    .assign(Validated=lambda frame: frame['CIViC Panel VAF'].gt(0).astype('int'))
)

samples_waterfall.to_csv('../data/validation_samples/waterfall_dataframe.tsv', sep='\t')

## Variant allele frequency correlation between smMIPs CIViC panel and exome/genome sequencing

In [23]:
# Sample annotations, joined onto the correlation table below via the 'Sample' column.
# NOTE(review): this is the same file already loaded into `samples` near the top
# of the notebook; the second read looks redundant -- confirm before reusing `samples`.
sample_info = pd.read_csv('../data/validation_samples/sample_dataframe.txt', sep='\t')

In [24]:
# Correlate only variants with a non-zero VAF in both the panel and the original sequencing.
has_panel_vaf = overlap_with_smmips['CIViC Panel VAF'].ne(0)
has_original_vaf = overlap_with_smmips['VAF'].ne(0)
correlation = overlap_with_smmips[has_panel_vaf & has_original_vaf]

In [25]:
# Pearson correlation matrix between the two VAF measurements.
correlation.filter(items=['CIViC Panel VAF', 'VAF']).corr(method='pearson')

Unnamed: 0,CIViC Panel VAF,VAF
CIViC Panel VAF,1.0,0.829144
VAF,0.829144,1.0


In [26]:
# Attach the sample annotations used to colour the plots below. This is an inner
# join, so variants whose sample has no row in `sample_info` are dropped.
correlation = pd.merge(correlation, sample_info, left_on='sample', right_on='Sample')

In [27]:
# Original-sequencing VAF (x) against smMIPs panel VAF (y), coloured by sample.
# The figure is saved to disk and then closed.
fig, ax = plt.subplots(figsize=(5,5))
sns.scatterplot(x="VAF", y="CIViC Panel VAF", data=correlation, alpha=0.8, hue='Sample', palette="deep", ax=ax)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
fig.savefig('../data/Figures/VAF_correlation_Sample.png', bbox_inches='tight', dpi=400)
plt.close(fig)

In [28]:
# Same VAF comparison, coloured by the 'Matched Normal' annotation.
# The figure is saved to disk and then closed.
fig, ax = plt.subplots(figsize=(5,5))
sns.scatterplot(x="VAF", y="CIViC Panel VAF", data=correlation, alpha=0.8, hue='Matched Normal', palette="deep", ax=ax)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
fig.savefig('../data/Figures/VAF_correlation_MatchedNormal.png', bbox_inches='tight', dpi=400)
plt.close(fig)

In [29]:
# Same VAF comparison, coloured by whether the variant's tumor sample passed QC.
# The figure is saved to disk and then closed.
fig, ax = plt.subplots(figsize=(5,5))
sns.scatterplot(x="VAF", y="CIViC Panel VAF", data=correlation, alpha=0.8, hue='Passed QC', palette="deep", ax=ax)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
fig.savefig('../data/Figures/VAF_correlation_PassedQC.png', bbox_inches='tight', dpi=400)
plt.close(fig)

In [30]:
# Same VAF comparison, coloured by the 'Mass' annotation.
# The figure is saved to disk and then closed.
fig, ax = plt.subplots(figsize=(5,5))
sns.scatterplot(x="VAF", y="CIViC Panel VAF", data=correlation, alpha=0.8, hue='Mass', palette="deep", ax=ax)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
fig.savefig('../data/Figures/VAF_correlation_Mass.png', bbox_inches='tight', dpi=400)
plt.close(fig)

In [31]:
# Same VAF comparison, coloured by the 'Coverage' annotation (no explicit palette,
# so seaborn picks its default for this column).
# The figure is saved to disk and then closed.
fig, ax = plt.subplots(figsize=(5,5))
sns.scatterplot(x="VAF", y="CIViC Panel VAF", data=correlation, alpha=0.8, hue='Coverage', ax=ax)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
fig.savefig('../data/Figures/VAF_correlation_Coverage.png', bbox_inches='tight', dpi=400)
plt.close(fig)

In [32]:
# Same VAF comparison, coloured by the 'Tumor Type' annotation.
# The figure is saved to disk and then closed.
fig, ax = plt.subplots(figsize=(5,5))
sns.scatterplot(x="VAF", y="CIViC Panel VAF", data=correlation, alpha=0.8, hue='Tumor Type', palette="deep", ax=ax)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
fig.savefig('../data/Figures/VAF_correlation_TumorType.png', bbox_inches='tight', dpi=400)
plt.close(fig)

In [33]:
# True when the smMIPs panel reported the variant (panel VAF above 0).
overlap_with_smmips['status'] = overlap_with_smmips['CIViC Panel VAF'].gt(0)

In [34]:
# VAF comparison over the full variant table (not just the correlated subset),
# coloured by whether the smMIPs panel detected the variant.
# The figure is saved to disk and then closed.
fig, ax = plt.subplots(figsize=(5,5))
sns.scatterplot(x="VAF", y="CIViC Panel VAF", data=overlap_with_smmips, alpha=0.8, hue='status', palette="deep", ax=ax)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
fig.savefig('../data/Figures/VAF_correlation_Status.png', bbox_inches='tight', dpi=400)
plt.close(fig)

## Retrospective capture of clinically relevant variants eliminated in original sequencing