# Retrospective capture of clinically relevant variants

## Tools

In [1]:
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib as plt
import glob

## Pull in Input Files

In [2]:
# Read the tab-separated variant-overlap table into a DataFrame.
overlap_with_smmips = pd.read_table('../output/supplementary_table_3-variant_overlap.tsv')

In [3]:
# Read the tab-separated sequencing quality-check table into a DataFrame.
samples_QC = pd.read_table('../data/validation_samples/sequencing_quality_check.txt')

In [4]:
# Read the tab-separated per-sample table into a DataFrame.
sample_info = pd.read_table('../data/validation_samples/sample_dataframe.txt')

## smMIPs variant rescue for samples with both tumor and matched normal

In [5]:
# Attach per-sample information to every overlap row
# (overlap column 'sample' is matched against sample_info column 'Sample').
overlap_with_smmips_data = pd.merge(
    overlap_with_smmips,
    sample_info,
    left_on='sample',
    right_on='Sample',
)

In [6]:
# Unique sample names whose rows are flagged 'yes' for both
# 'Passed QC' and 'Matched Normal', in order of first appearance.
passed_qc = overlap_with_smmips_data['Passed QC'] == 'yes'
has_matched_normal = overlap_with_smmips_data['Matched Normal'] == 'yes'
tumor_normal_samples = (
    overlap_with_smmips_data.loc[passed_qc & has_matched_normal, 'sample']
    .drop_duplicates()
    .tolist()
)

In [24]:
def find_somatic_variants(name, vcf_dir='../data/smmips_sequencing'):
    """Count smMIPs variant calls seen in a sample's tumor VCFs but not its normal VCFs.

    A VCF in ``vcf_dir`` belongs to the sample when the first two
    underscore-separated pieces of its file name, re-joined with ``_``, equal
    ``name``. The third piece (up to the first ``.``) tags the file as tumor
    (``T``) or normal (``N``). Exactly four matching files are required.

    Parameters
    ----------
    name : str
        Sample identifier, e.g. ``'SCLC_7'``.
    vcf_dir : str, optional
        Directory that is searched for ``*.vcf`` files.

    Returns
    -------
    tuple of (int, int)
        ``(0, 0)`` when the sample does not have exactly four VCF files,
        otherwise ``(1, n)`` where ``n`` is the number of tumor calls whose
        (chrom, start, stop, reference, variant) is absent from the normal calls.

    Notes
    -----
    Progress messages are printed; their wording is kept unchanged.
    """
    variant_columns = ['chrom', 'start', 'stop', 'reference', 'variant']
    print(name)

    # Pair every matching file with its T/N tag. The tag is parsed from the
    # file name only, so underscores in the directory part of the path cannot
    # shift the position of the tag.
    matched = []
    for path in glob.glob(vcf_dir + '/*.vcf'):
        parts = path.split('/')[-1].split('_')
        if '_'.join(parts[:2]) == name:
            tag = parts[2].split('.')[0] if len(parts) > 2 else ''
            matched.append((path, tag))

    if len(matched) != 4:
        print("Normal sequencing Failed for " + name + " ... skiping this sample")
        print()
        return 0, 0

    calls_by_tag = {'T': [], 'N': []}
    for path, tag in matched:
        try:
            calls = pd.read_csv(path, sep='\t', comment='#', header=None)
        except pd.errors.EmptyDataError:
            # A VCF that holds nothing but header lines has no calls to add.
            continue
        # VCF columns 0/1/3/4 are CHROM, POS, REF, ALT; POS is used for both
        # start and stop.
        calls = calls.iloc[:, [0, 1, 1, 3, 4]]
        calls.columns = variant_columns
        # NOTE(review): files with exactly one record are skipped too, as in
        # the original analysis -- confirm that this is intended.
        if len(calls) > 1 and tag in calls_by_tag:
            calls_by_tag[tag].append(calls)

    def _combine(tables):
        # pd.concat rejects an empty list, so fall back to an empty table.
        if tables:
            return pd.concat(tables)
        return pd.DataFrame(columns=variant_columns)

    tumor_variants = _combine(calls_by_tag['T'])
    normal_variants = _combine(calls_by_tag['N'])

    print('Total tumor varinats: ', len(tumor_variants))

    # Anti-join: keep the tumor rows that have no identical row in the normal
    # calls. Each tumor row is kept at most once, so a call present in several
    # tumor files is counted once per file.
    somatic = tumor_variants.merge(
        normal_variants.drop_duplicates(),
        how='left',
        on=variant_columns,
        indicator=True,
    )
    somatic = somatic[somatic['_merge'] == 'left_only']

    print('Total somatic varinats: ', len(somatic))
    return 1, int(len(somatic))
    
    

In [25]:
# Tally how many samples had the expected set of VCFs and how many
# tumor-only variants they contributed in total.
eligible_samples = 0
total_variants = 0
for sample_name in tumor_normal_samples:
    outcome = find_somatic_variants(sample_name)
    eligible_samples += outcome[0]
    total_variants += outcome[1]

OSCC_1
Normal sequencing Failed for OSCC_1 ... skiping this sample

SCLC_7
Total tumor varinats:  764
Total somatic varinats:  681
SCLC_1
Total tumor varinats:  120
Total somatic varinats:  82
SCLC_4
Total tumor varinats:  131
Total somatic varinats:  81
SCLC_2
Total tumor varinats:  149
Total somatic varinats:  101
OSCC_4
Total tumor varinats:  193
Total somatic varinats:  135
SCLC_5
Total tumor varinats:  413
Total somatic varinats:  344
SCLC_6
Total tumor varinats:  257
Total somatic varinats:  200
SCLC_9
Total tumor varinats:  242
Total somatic varinats:  192
HL_1
Total tumor varinats:  100
Total somatic varinats:  65
SCLC_3
Total tumor varinats:  147
Total somatic varinats:  94
OSCC_5
Total tumor varinats:  94
Total somatic varinats:  39
SCLC_8
Total tumor varinats:  122
Total somatic varinats:  63


In [26]:
# Report the two totals accumulated in the previous cell.
for label, value in (
    ('Total samples with paired tumor/normal sequencing: ', eligible_samples),
    ('Total stomatic variants identified exclusively on smMIPs: ', total_variants),
):
    print(label, value)

Total samples with paired tumor/normal sequencing:  12
Total stomatic variants identified exclusively on smMIPs:  2077


# Determine Cause of Missed Variants

In [122]:
# Load every manual-review table and label its rows with the sample name,
# i.e. the file name up to its first '.'.
# Tables are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop is quadratic.
manual_review_frames = []
for item in sorted(glob.glob('../data/manual_review/*')):  # sorted: glob order is not deterministic
    # Take the last path component rather than a fixed index, so the sample
    # name does not depend on how deep the directory is.
    name = item.split('/')[-1].split('.')[0]
    manual_review_frames.append(pd.read_csv(item, sep='\t').assign(Sample=name))

# pd.concat rejects an empty list; keep the empty-frame result in that case.
manual_review_dataframe = (
    pd.concat(manual_review_frames) if manual_review_frames else pd.DataFrame()
)
    
    

### Analysis of germline variants

In [123]:
#GERMLINE VARIANTS
# Count and percentage of manually reviewed variants whose 'Notes' is 'germline'.
germline_count = len(manual_review_dataframe.loc[manual_review_dataframe['Notes'] == 'germline'])

print('Total Germline Variants: ', germline_count)
print(germline_count / len(manual_review_dataframe) * 100, '%')

Total Germline Variants:  25
13.661202185792352 %


### Analysis of variants with sequencing artifacts 

In [124]:
#SEQUENCING ARTIFACTS
# Count and percentage of manually reviewed variants whose 'Notes' is 'Sequencing artifact'.
artifact_count = len(manual_review_dataframe.loc[manual_review_dataframe['Notes'] == 'Sequencing artifact'])

print('Total Sequencing Artifact Variants: ', artifact_count)
print(artifact_count / len(manual_review_dataframe) * 100, '%')

Total Sequencing Artifact Variants:  29
15.846994535519126 %


### Analysis of variants not observed on original sequencing

In [125]:
# Count and percentage of manually reviewed variants whose 'original_VAF'
# is the string '0'.
not_observed_count = len(manual_review_dataframe.loc[manual_review_dataframe['original_VAF'] == '0'])
print('Total Variants not observed on original sequencing: ', not_observed_count)
print(not_observed_count / len(manual_review_dataframe) * 100, '%')

Total Variants not observed on original sequencing:  106
57.92349726775956 %


In [126]:
# Build one table of tumor-only smMIPs calls, including the per-call FORMAT
# string, for every sample that has exactly four VCF files.
variant_key = ['chrom', 'start', 'stop', 'reference', 'variant']
format_column = 'GT:FALT:FTOT:RALT:RTOT:ALT:TOT:FRAC'

# The directory listing does not depend on the sample, so read it once.
smmips_vcf_paths = glob.glob('../data/smmips_sequencing/*.vcf')

somatic_frames = []
for name in tumor_normal_samples:
    # Pair each of the sample's files with its T/N tag. Both are parsed from
    # the file name only, so underscores in the directory path cannot shift them.
    tagged_paths = []
    for item in smmips_vcf_paths:
        parts = item.split('/')[-1].split('_')
        if '_'.join(parts[:2]) == name:
            tag = parts[2].split('.')[0] if len(parts) > 2 else ''
            tagged_paths.append((item, tag))

    if len(tagged_paths) != 4:
        continue

    calls_by_tag = {'T': [], 'N': []}
    for item, tag in tagged_paths:
        try:
            current = pd.read_csv(item, sep='\t', comment='#', header=None)
        except pd.errors.EmptyDataError:
            # A VCF that holds nothing but header lines has no calls to add.
            continue
        # VCF columns 0/1/3/4 are CHROM, POS, REF, ALT (POS doubles as start
        # and stop); column 9 is the per-sample FORMAT value string.
        current = current.iloc[:, [0, 1, 1, 3, 4, 9]]
        current.columns = variant_key + [format_column]
        # NOTE(review): files with exactly one record are skipped too, as in
        # the original analysis -- confirm that this is intended.
        if len(current) > 1 and tag in calls_by_tag:
            calls_by_tag[tag].append(current)

    if not calls_by_tag['T']:
        # No tumor calls means there is nothing to report for this sample.
        continue
    tumor_variants = pd.concat(calls_by_tag['T'])

    if calls_by_tag['N']:
        normal_keys = pd.concat(calls_by_tag['N'])[variant_key].drop_duplicates()
    else:
        normal_keys = pd.DataFrame(columns=variant_key)

    # Anti-join on the variant identity only. Joining on every shared column
    # would also compare the read-count string, so a variant present in the
    # normal with different counts would wrongly be kept as tumor-only.
    somatic = tumor_variants.merge(normal_keys, how='left', on=variant_key, indicator=True)
    # .assign returns a new frame, avoiding a write into a filtered slice.
    somatic = somatic[somatic['_merge'] == 'left_only'].assign(Sample=name)

    somatic_frames.append(somatic)

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
smMIPs_somatic = pd.concat(somatic_frames) if somatic_frames else pd.DataFrame()

In [127]:
# Split the colon-delimited per-call string into one column per field.
# expand=True is used because unpacking the `.str` accessor and passing `n`
# positionally to str.split both fail on pandas 2.x.
format_column = 'GT:FALT:FTOT:RALT:RTOT:ALT:TOT:FRAC'
format_fields = format_column.split(':')
format_values = smMIPs_somatic[format_column].str.split(':', expand=True)
if format_values.shape[1] != len(format_fields):
    raise ValueError(
        'expected ' + str(len(format_fields)) + ' colon-separated fields in '
        + format_column + ', found ' + str(format_values.shape[1])
    )
# Values stay strings here; later cells convert what they need.
for position, field in enumerate(format_fields):
    smMIPs_somatic[field] = format_values[position]

In [128]:
# Remove the packed FORMAT string and the merge indicator now that the
# individual fields have their own columns.
smMIPs_somatic = smMIPs_somatic.drop(
    columns=['GT:FALT:FTOT:RALT:RTOT:ALT:TOT:FRAC', '_merge'],
)

In [129]:
# Manually reviewed variants whose 'original_VAF' is the string '0'.
not_observed = manual_review_dataframe.loc[manual_review_dataframe['original_VAF'].eq('0')]

In [145]:
# Pair each not-observed variant with its smMIPs call for the same sample and
# keep only the rows present on both sides.
review_key_columns = ['Chromosome', 'Start', 'Stop', 'Reference', 'Variant', 'Sample']
smmips_key_columns = ['chrom', 'start', 'stop', 'reference', 'variant', 'Sample']
not_observed_merge = pd.merge(
    not_observed,
    smMIPs_somatic,
    how='outer',
    left_on=review_key_columns,
    right_on=smmips_key_columns,
    indicator=True,
)
not_observed_merge = not_observed_merge.loc[not_observed_merge['_merge'] == 'both']

In [154]:
# A variant counts as having insufficient original coverage when the original
# tumor coverage is 0, or when one read out of that coverage would be a larger
# fraction than the smMIPs VAF.
insufficient_coverage = 0
total = 0
for frac_value, coverage_value in zip(not_observed_merge['FRAC'],
                                      not_observed_merge['tumor coverage']):
    total += 1
    smmips_vaf = float(frac_value)
    tumor_coverage = float(coverage_value)
    # `or` short-circuits, so the division is never reached for zero coverage.
    if tumor_coverage == 0 or 1 / tumor_coverage > smmips_vaf:
        insufficient_coverage += 1
        
    
    

In [155]:
# Fraction (not a count) of the merged not-observed variants flagged above.
insufficient_fraction = insufficient_coverage / total
print('Variants with insufficient coverage: ' + str(insufficient_fraction))

Variants with insufficient coverage: 0.9905660377358491


### Analysis of variants with support in the original sequencing data

In [166]:
# Manually reviewed variants whose 'original_VAF' is neither '0' nor 'None'.
vaf_is_zero = manual_review_dataframe['original_VAF'] == '0'
vaf_is_none = manual_review_dataframe['original_VAF'] == 'None'
original_support = manual_review_dataframe[~(vaf_is_zero | vaf_is_none)]

In [167]:
# Pair each originally supported variant with its smMIPs call for the same
# sample and keep only the rows present on both sides.
review_key_columns = ['Chromosome', 'Start', 'Stop', 'Reference', 'Variant', 'Sample']
smmips_key_columns = ['chrom', 'start', 'stop', 'reference', 'variant', 'Sample']
original_support_merge = pd.merge(
    original_support,
    smMIPs_somatic,
    how='outer',
    left_on=review_key_columns,
    right_on=smmips_key_columns,
    indicator=True,
)
original_support_merge = original_support_merge.loc[original_support_merge['_merge'] == 'both']

In [170]:
# Display the merged table. It is a notebook variable, not an attribute of
# `sns`, which this notebook never imports.
original_support_merge

Unnamed: 0,Chromosome,Start,Stop,Reference,Variant,Call,Tags,tumor coverage,normal coverage,original_VAF,...,variant,GT,FALT,FTOT,RALT,RTOT,ALT,TOT,FRAC,_merge
0,chr9,21994208.0,21994208.0,G,A,S,,225,110,1%,...,A,1,1,3876,1,6866,2,10742,0.0001861850679575,both
1,chr9,139400268.0,139400268.0,G,A,S,,72,26,1%,...,A,1,3,854,1,1418,4,2272,0.0017605633802816,both
2,chr4,106197116.0,106197116.0,C,T,S,,58,42,2%,...,T,1,1,255,1,244,2,499,0.0040080160320641,both
3,chr17,7578202.0,7578202.0,A,G,S,,98,113,17%,...,G,1,136,560,129,556,265,1116,0.237455197132616,both
4,chr17,7578457.0,7578457.0,CG,C,S,,41,49,20%,...,C,1,172,238,765,967,937,1205,0.777593360995851,both
5,chr7,140453160.0,140453160.0,AT,A,S,,73,102,29%,...,A,1,106,242,404,1058,510,1300,0.392307692307692,both
6,chr13,32912785.0,32912785.0,A,T,S,,45,108,33%,...,T,1,52,137,65,196,117,333,0.351351351351351,both
7,chr9,139401330.0,139401330.0,G,A,S,,215,111,85%,...,A,1,836,953,806,920,1642,1873,0.876668446342766,both
8,chr17,7578518.0,7578518.0,C,G,S,,60,86,72%,...,G,1,157,233,285,432,442,665,0.664661654135338,both
9,chr2,25459828.0,25459828.0,G,T,S,,130,133,31%,...,T,1,29,121,57,212,86,333,0.258258258258258,both
