# Results - Tumor Samples used for smMIPs Panel Validation

## Tools 

In [1]:
#!/usr/bin/env python3
import json
import numpy as np
import requests
import sys
import pandas as pd
import seaborn as sns
sns.set(style='white')
sns.set_context("talk")
import matplotlib.pyplot as plt
from pyliftover import LiftOver
import pybedtools
from pybedtools import BedTool

In [2]:
# Coordinate converters between the two reference builds used in this notebook.
BUILD_37, BUILD_38 = 'hg19', 'hg38'
lo = LiftOver(BUILD_37, BUILD_38)  # hg19 -> hg38
li = LiftOver(BUILD_38, BUILD_37)  # hg38 -> hg19

## Pull in Input Files

In [3]:
# Tab-delimited table of variant calls from the original exome sequencing.
WEX_variants = pd.read_table('../data/original_sequencing/VCF_exome.txt')

In [4]:
# Tab-delimited sample sheet describing the validation samples.
samples = pd.read_table('../data/validation_samples/sample_dataframe.txt')

In [5]:
# Headerless, tab-delimited interval file for the smMIPs panel; name the three columns explicitly.
smMIPs_coverage = pd.read_table(
    '../data/smmips_panel/smmips_coordinates_sorted_merged.bed.txt',
    header=None,
)
smMIPs_coverage.columns = ['chromosome_name', 'start', 'stop']
# Prefix chromosome names with 'chr' (cast to str first so numeric names concatenate cleanly).
panel_chromosomes = smMIPs_coverage['chromosome_name'].astype(str)
smMIPs_coverage['chromosome_name'] = 'chr' + panel_chromosomes

## Evaluate sample types

In [6]:
# Count the distinct values in the 'Tumor Type' column.
n_tumor_types = samples['Tumor Type'].drop_duplicates().shape[0]
print('Total number of cancer genomic studies: ', n_tumor_types)

Total number of cancer genomic studies:  5


In [7]:
# Split the sample sheet by whether a matched normal is recorded ('yes' / 'no').
matched_normal = samples['Matched Normal']
samples_normal = samples.loc[matched_normal == 'yes']
samples_no_normal = samples.loc[matched_normal == 'no']

In [8]:
print('Total number of tumor/normal samples: ')
# Unique (Sample, Tumor Type) pairs, counted per tumor type; the last expression is the cell output.
(
    samples_normal
    .loc[:, ['Sample', 'Tumor Type']]
    .drop_duplicates()
    .groupby(['Tumor Type'])
    .count()
)

Total number of tumor/normal samples: 


Unnamed: 0_level_0,Sample
Tumor Type,Unnamed: 1_level_1
HL,1
OSCC,5
SCLC,9


In [9]:
print('Total number of tumor only samples: ')
# Unique (Sample, Tumor Type) pairs, counted per tumor type; the last expression is the cell output.
(
    samples_no_normal
    .loc[:, ['Sample', 'Tumor Type']]
    .drop_duplicates()
    .groupby(['Tumor Type'])
    .count()
)

Total number of tumor only samples: 


Unnamed: 0_level_0,Sample
Tumor Type,Unnamed: 1_level_1
AML,1
CRC,5
HL,1


## Evaluate Existing Exome Data

In [10]:
# Number of distinct values in the 'sample' column of the exome variant table.
unique_samples = WEX_variants['sample'].drop_duplicates()
total_individuals = unique_samples.shape[0]
print('Number of Individual: ',total_individuals)

Number of Individual:  22


In [11]:
# One row per variant call, so the row count is the total; divide by individuals for the mean.
n_wex_variants = len(WEX_variants)
print('Total number of variants called via WEX: ', n_wex_variants)
print('Average number of variants per individual called via WEX: ', n_wex_variants/total_individuals)

Total number of variants called via WEX:  13724
Average number of variants per individual called via WEX:  623.8181818181819


In [12]:
# Rows per sample, computed once and reused for both ends of the range.
burden_per_sample = WEX_variants.groupby('sample').size()
print('Range of variant burden: ', burden_per_sample.min(), 'to', burden_per_sample.max())

Range of variant burden:  2 to 3900


## Overlap with CIViC 

In [13]:
#Change all of the coordinates to GRCh37 to match CIViC Coordinates
def _lift_to_grch37(chrom, position):
    """Lift one hg38 position with `li`; return the first hit's position, or None if it does not lift."""
    hits = li.convert_coordinate(chrom, position)
    # pyliftover returns None for an unknown chromosome and [] for a locus with no
    # counterpart in the target build; both are falsy, so both map to None here.
    # NOTE(review): only the position (hits[0][1]) is used; the lifted chromosome
    # (hits[0][0]) is not checked against `chrom` -- confirm that is intended.
    return hits[0][1] if hits else None


# Only rows tagged as build 38 need converting; rows are updated in place by index label.
# NOTE(review): pyliftover documents convert_coordinate as taking 0-based positions --
# confirm the coordinate convention of 'start'/'stop' in this table.
for i, row in WEX_variants[WEX_variants['genome'] == 38].iterrows():
    # str() guards against the chromosome column having been parsed as integers
    chrom = 'chr' + str(row['chromosome_name'])
    start_new = _lift_to_grch37(chrom, row['start'])
    stop_new = _lift_to_grch37(chrom, row['stop'])
    # Require BOTH ends to lift. A row that fails keeps genome == 38 and untouched
    # coordinates, so it is removed by the later `genome == 37` filter instead of
    # raising on an empty/None result or being left half-converted.
    if start_new is None or stop_new is None:
        continue
    WEX_variants.loc[i, 'start'] = start_new
    WEX_variants.loc[i, 'stop'] = stop_new
    WEX_variants.loc[i, 'genome'] = 37

In [14]:
#Eliminate any variants that were not transferable to GRCh37
# Keep only rows whose 'genome' value is 37. The explicit .copy() makes the result an
# independent frame, so the later in-place column/.loc writes on WEX_variants do not
# operate on a slice of the unfiltered table (SettingWithCopyWarning).
is_grch37 = WEX_variants['genome'] == 37
WEX_variants = WEX_variants.loc[is_grch37].copy()

In [15]:
#Add 'chr' to chromosome number to make it compatible with BedTool
# Cast to str first so numeric chromosome names can be concatenated.
chromosome_names = WEX_variants['chromosome_name'].astype(str)
# Only prefix names that do not already start with 'chr', so re-running this cell
# does not produce 'chrchr...' values.
already_prefixed = chromosome_names.str.startswith('chr')
# .assign returns a new frame rather than writing into a possibly-sliced one.
WEX_variants = WEX_variants.assign(
    chromosome_name=chromosome_names.where(already_prefixed, 'chr' + chromosome_names)
)

In [16]:
#Make sure that the coordinates are in the correct order for BedTool
# Vectorized swap: wherever start > stop, exchange the two values; all other rows
# (including any where the comparison is False because of missing values) are untouched.
needs_swap = WEX_variants['start'] > WEX_variants['stop']
ordered_start = WEX_variants['start'].where(~needs_swap, WEX_variants['stop'])
ordered_stop = WEX_variants['stop'].where(~needs_swap, WEX_variants['start'])
# Both replacement columns are computed from the original values before either is
# written back; .assign builds a new frame instead of mutating row by row.
WEX_variants = WEX_variants.assign(start=ordered_start, stop=ordered_stop)

In [17]:
#Make bedtool objects from WEX variants and smmips coverage
# Only the first three columns of each frame are passed to BedTool.
# presumably these are chromosome, start, stop for WEX_variants as well -- verify column order
wex_first_three = WEX_variants.columns[:3]
panel_first_three = smMIPs_coverage.columns[:3]
a = BedTool.from_dataframe(WEX_variants[wex_first_three])
b = BedTool.from_dataframe(smMIPs_coverage[panel_first_three])

In [18]:
#Find the intersection of WEX variants and smMIPs coverage
# u=True: report each interval of `a` once if it overlaps anything in `b`
# (the `a` record itself is written, not the overlapping segment).
c = a.intersect(b, u=True)

In [19]:
#Create a dataframe with overlapping variants
# c.fn is the path of the file backing the intersection result; read it back as a 3-column table.
overlap_columns = ['chrom', 'start', 'stop']
overlap = pd.read_csv(c.fn, sep='\t', names=overlap_columns)

In [20]:
#Annotate the overlapping variants using the WEX variants df
# Left-join on (chromosome, start, stop) to pull every WEX column onto the overlapping
# intervals, then collapse rows that are exact duplicates.
merged_overlap = pd.merge(
    overlap,
    WEX_variants,
    how='left',
    left_on=['chrom', 'start', 'stop'],
    right_on=['chromosome_name', 'start', 'stop'],
)
overlap_annotated = merged_overlap.drop_duplicates()

In [21]:
#Drop the 'chromosome_name' column brought in by the merge (the merge keys paired it with 'chrom')
overlap_annotated = overlap_annotated.drop(columns=['chromosome_name'])

In [22]:
# Row count of the annotated overlap table.
n_overlapping = len(overlap_annotated)
print('Total number of overlapping variants from original sequencing with CIViC panel: ', n_overlapping)

Total number of overlapping variants from original sequencing with CIViC panel:  89


In [23]:
# Make TSV for overlapping variants for Supplementary Table
# NOTE(review): the DataFrame index is written as the first column (to_csv default) -- confirm this is wanted.
overlap_annotated.to_csv(
    path_or_buf='../output/supplementary_table_3-variant_overlap.tsv',
    sep='\t',
)