# Results - Annotation of CIViC smMIPs variants using tumor-only samples

## Tools

In [13]:
import pandas as pd
import requests
import glob

In [None]:
# Coordinate converters between the hg19 and hg38 assemblies (pyliftover).
# NOTE(review): neither converter is referenced in the cells shown here — confirm they are still needed.
from pyliftover import LiftOver
lo = LiftOver('hg19', 'hg38')  # hg19 -> hg38
li = LiftOver('hg38', 'hg19')  # hg38 -> hg19

## Pull in input files

In [14]:
# Call eligible variants: every record the CIViC panel endpoint reports for
# the DNA-based panel with minimum_score=0.
civic_response = requests.get(
    'https://civic.genome.wustl.edu/api/panels/DNA-based/qualifying_variants?minimum_score=0',
    timeout=60,  # fail instead of hanging the kernel when the API is unreachable
)
# Surface HTTP errors here rather than as a confusing JSON/KeyError below.
civic_response.raise_for_status()
variants_DNA = civic_response.json()['records']

In [68]:
# Tabular view of the API records; get_evidence_statements looks variants up by the 'id' column.
variants_DNA_df = pd.DataFrame(variants_DNA)

In [15]:
# Tab-separated variant overlap table; its 'sample' column is the merge key used below.
# Presumably written by an earlier notebook into ../output — verify provenance.
overlap_with_smmips = pd.read_csv('../output/variant_overlap.tsv', sep='\t')

In [16]:
# Tab-separated sequencing quality-check table for the validation samples.
# NOTE(review): not referenced again in the cells shown here.
samples_QC = pd.read_csv('../data/validation_samples/sequencing_quality_check.txt', sep='\t')

In [17]:
# Tab-separated per-sample metadata; its 'Sample' column is the merge key used below.
sample_info = pd.read_csv('../data/validation_samples/sample_dataframe.txt', sep='\t')

## Annotate tumor-only missed variants

In [18]:
# Attach sample metadata to every overlap row (overlap 'sample' == sample_info 'Sample').
# merge() defaults to an inner join, so overlap rows without metadata are dropped.
overlap_with_smmips_data = overlap_with_smmips.merge(sample_info, right_on='Sample', left_on='sample')

In [19]:
# Unique sample names, in order of first appearance, for samples that passed QC
# and have no matched normal.
tumor_only_samples = (
    overlap_with_smmips_data
    .loc[
        (overlap_with_smmips_data['Passed QC'] == 'yes')
        & (overlap_with_smmips_data['Matched Normal'] == 'no'),
        'sample',
    ]
    .drop_duplicates()
    .tolist()
)

In [58]:
def find_somatic_variants(name):
    """Count the variants called in the tumor VCF of one sample.

    Scans ``../data/smmips_sequencing/*.vcf`` for files whose first two
    underscore-separated file-name fields, joined by ``_``, equal ``name``.
    Exactly two such files are required. Among them, files whose third
    file-name field (before the extension) is ``T`` are read as tumor calls.

    Parameters
    ----------
    name : str
        Sample name, e.g. ``'CRC_4'``.

    Returns
    -------
    tuple of (int, int)
        ``(0, 0)`` when the sample does not have exactly two VCF files;
        otherwise ``(1, n)`` where ``n`` is the number of tumor records kept.
        A sample whose tumor file yields no usable records returns ``(1, 0)``.
    """
    variant_columns = ['chrom', 'start', 'stop', 'reference', 'variant']

    print(name)

    sample_vcfs = []
    for path in glob.glob('../data/smmips_sequencing/*.vcf'):
        fields = path.split('/')[-1].split('_')
        if len(fields) >= 2 and name == fields[0] + '_' + fields[1]:
            sample_vcfs.append(path)

    if len(sample_vcfs) != 2:
        print("Normal sequencing Failed for " + name + " ... skiping this sample")
        print()
        return 0, 0

    tumor_frames = []
    for path in sample_vcfs:
        # Inspect the file name only, so underscores in the directory part of
        # the path cannot shift which field is read.
        fields = path.split('/')[-1].split('_')
        if len(fields) < 3 or fields[2].split('.')[0] != 'T':
            continue
        calls = pd.read_csv(path, sep='\t', comment='#', header=None)
        # Kept from the original: a file with a single record is ignored.
        # NOTE(review): confirm whether `> 0` was intended.
        if len(calls) > 1:
            # The position column is used for both 'start' and 'stop'.
            selected = calls.iloc[:, [0, 1, 1, 3, 4]].copy()
            selected.columns = variant_columns
            tumor_frames.append(selected)

    if tumor_frames:
        tumor_variants = pd.concat(tumor_frames)
    else:
        # No usable tumor records: report zero instead of failing.
        tumor_variants = pd.DataFrame(columns=variant_columns)

    print('Total tumor varinats: ', len(tumor_variants))

    return 1, int(len(tumor_variants))
    
    

In [59]:
# Tally how many tumor-only samples had usable sequencing and how many tumor
# variants they carried in total.
eligible_samples = 0
total_variants = 0
missed_variants = pd.DataFrame()  # placeholder; nothing in this cell fills it
for item in tumor_only_samples:
    sample, somatic_variants = find_somatic_variants(item)
    # find_somatic_variants returns (1, n) for a usable sample and (0, 0)
    # otherwise, so both values can be added directly. Previously the two
    # counters were initialised but never updated.
    eligible_samples += sample
    total_variants += somatic_variants

CRC_4
Total tumor varinats:  71
CRC_1
Total tumor varinats:  60
CRC_3
Total tumor varinats:  72
AML_31
Total tumor varinats:  60
CRC_5
Total tumor varinats:  44
CRC_2
Total tumor varinats:  94


In [23]:
# Collect the tumor VCF records of every tumor-only sample into one table,
# tagging each row with its sample name. Samples without exactly two VCF files
# are skipped.
smMIPs_columns = ['chrom', 'start', 'stop', 'reference', 'variant',
                  'GT:FALT:FTOT:RALT:RTOT:ALT:TOT:FRAC']
smMIPs_frames = []
for name in tumor_only_samples:
    to_iterate = []
    for item in glob.glob('../data/smmips_sequencing/*.vcf'):
        file_fields = item.split('/')[-1].split('_')
        if len(file_fields) >= 2 and name == file_fields[0] + '_' + file_fields[1]:
            to_iterate.append(item)

    if len(to_iterate) != 2:
        continue

    tumor_frames = []
    for item in to_iterate:
        # Inspect the file name only, so underscores in the directory part of
        # the path cannot shift which field is read.
        file_fields = item.split('/')[-1].split('_')
        if len(file_fields) < 3 or file_fields[2].split('.')[0] != 'T':
            continue
        current = pd.read_csv(item, sep='\t', comment='#', header=None)
        # Kept from the original: a file with a single record is ignored.
        if len(current) > 1:
            # Position is used for both 'start' and 'stop'; column 9 is the
            # per-sample field whose layout is spelled out in its column name.
            selected = current.iloc[:, [0, 1, 1, 3, 4, 9]].copy()
            selected.columns = smMIPs_columns
            tumor_frames.append(selected)

    if not tumor_frames:
        # Nothing usable for this sample; skip it rather than fail.
        continue

    # Row labels from the individual files are kept, so index labels repeat
    # across samples exactly as they did with DataFrame.append.
    tumor_variants = pd.concat(tumor_frames)
    tumor_variants['Sample'] = name
    smMIPs_frames.append(tumor_variants)

# Build the result once; DataFrame.append is gone in pandas 2.0 and growing a
# frame inside the loop is quadratic.
if smMIPs_frames:
    smMIPs_somatic = pd.concat(smMIPs_frames)
else:
    smMIPs_somatic = pd.DataFrame(columns=smMIPs_columns + ['Sample'])

In [111]:
# gnomAD exome sites (file name says r2.0.2, "cut"), read without a header row; '#' lines are skipped.
# NOTE(review): the output below this cell looks like the tail of a pandas warning
# (possibly a mixed-dtype DtypeWarning) — confirm column dtypes before comparing values.
gnomad = pd.read_csv('../../../../tools/gnomad_data/gnomad.exomes.r2.0.2.sites_cut.vcf', comment='#', sep='\t', header=None)

  interactivity=interactivity, compiler=compiler, result=result)


In [112]:
# Name the five columns of the gnomAD table (the file was read with header=None).
gnomad.columns=['chromosome','start','name','ref','var']

In [113]:
# Chromosome labels present in the smMIPs calls, without the 'chr' prefix.
# Numeric labels become ints, as before. Labels that are not purely numeric
# (for example 'X') are kept as strings instead of raising ValueError in int().
chroms = []
for item in list(smMIPs_somatic['chrom'].drop_duplicates()):
    label = item[3:] if item.startswith('chr') else item
    chroms.append(int(label) if label.isdigit() else label)

In [114]:
# Keep only gnomAD rows on chromosomes that occur in the smMIPs calls.
# Both sides are compared as strings: if the chromosome column was parsed with
# mixed types (some 1, some '1'), an integer-only isin() silently misses rows.
gnomad = gnomad[gnomad['chromosome'].astype(str).isin([str(chrom) for chrom in chroms])]

In [115]:
# Print the index label of every smMIPs call whose chromosome and start match
# more than one gnomAD record. Only position is compared, not ref/alt.
# NOTE(review): confirm that `> 1` (rather than `> 0`) is the intended threshold.
# Chromosomes are compared as strings so that mixed int/str values in the
# gnomAD column and non-numeric labels such as 'chrX' are both handled.
gnomad_chromosome = gnomad['chromosome'].astype(str)  # loop-invariant, computed once
for i, row in smMIPs_somatic.iterrows():
    chrom_label = row['chrom']
    chromosome = chrom_label[3:] if chrom_label.startswith('chr') else chrom_label
    start = int(row['start'])
    if ((gnomad['start'] == start) & (gnomad_chromosome == chromosome)).sum() > 1:
        print(i)

In [64]:
def get_ids(sample):
    """Return the variant ids whose region contains a call from ``sample``.

    Every row of ``smMIPs_somatic`` belonging to ``sample`` is compared with
    every entry of the module-level ``variants_dictionary``. An entry matches
    when the chromosomes are equal (after removing 'chr' characters from the
    call) and the call's start/stop lie inside the entry's start/stop.

    NOTE(review): ``variants_dictionary`` is not defined in the cells shown
    here; it is read as a mapping of id -> record with 'chromosome', 'start'
    and 'stop' keys.

    Returns
    -------
    list
        Matching ids, without duplicates, in order of first match.
    """
    matched_ids = []
    sample_calls = smMIPs_somatic[smMIPs_somatic['Sample'] == sample]
    for _, call in sample_calls.iterrows():
        call_chrom = str(call['chrom'].strip('chr'))
        call_start = int(call['start'])
        call_stop = int(call['stop'])
        for variant_id, region in variants_dictionary.items():
            region_chrom = str(region['chromosome'])
            region_start = int(region['start'])
            region_stop = int(region['stop'])
            if call_chrom != region_chrom:
                continue
            if call_start < region_start or call_stop > region_stop:
                continue
            if variant_id in matched_ids:
                continue
            matched_ids.append(variant_id)
    return matched_ids

In [61]:
def _drug_phrase(evidence_item):
    """Return the drug wording for a Predictive evidence item, or None when its interaction type is not handled."""
    interaction = evidence_item['drug_interaction_type']
    if not interaction:
        # No interaction type: only the first listed drug is named.
        return evidence_item['drugs'][0]['name']

    # interaction type -> (text before the drug list, joiner before the last drug)
    wording = {
        'Combination': ('combination of ', ' and '),
        'Substitutes': ('', ' or '),
        'Sequential': ('sequence of ', ' and '),
    }
    if interaction not in wording:
        return None
    prefix, joiner = wording[interaction]
    drug_list = [drug['name'] for drug in evidence_item['drugs']]
    return prefix + ', '.join(drug_list[:-1]) + joiner + str(drug_list[-1])


def get_evidence_statements(variant_ids, sample):
    """Build evidence sentences and per-type counts for one sample.

    For each id in ``variant_ids`` the matching row of the module-level
    ``variants_DNA_df`` is looked up. Variants with
    ``civic_actionability_score`` above 20 and a name outside the
    non-evaluated list contribute their first evidence item, provided its
    evidence level is neither 'D' nor 'E'.

    Parameters
    ----------
    variant_ids : iterable
        Values compared against ``variants_DNA_df['id']``. Ids with no
        matching row are skipped.
    sample : str
        Key under which the counts are stored.

    Returns
    -------
    tuple of (list of str, dict)
        The evidence sentences, and ``{sample: [predictive, prognostic,
        diagnostic, predisposing]}`` counts.
    """
    non_evaluated_variants = ['LOSS-OF-FUNCTION', 'DELETION', 'AMPLIFICATION', 'COPY NUMBER VARIATION', 'EXON 14 SKIPPING MUTATION', 'WILD TYPE']
    # Position of each evidence type inside the per-sample counter list.
    counter_slot = {'Predictive': 0, 'Prognostic': 1, 'Diagnostic': 2, 'Predisposing': 3}

    evidence_statements = []
    sample_evidence_count = {sample: [0, 0, 0, 0]}

    for item in variant_ids:
        current_variant = variants_DNA_df[variants_DNA_df['id'] == item].reset_index()
        if current_variant.empty:
            # Id not present in the table: nothing to score.
            continue
        # Read the scalar explicitly; float() on a Series is deprecated.
        score = float(current_variant['civic_actionability_score'].iloc[0])
        if not score > 20:
            continue

        gene = str(current_variant['entrez_name'][0])
        variant = str(current_variant['name'][0])
        if variant in non_evaluated_variants:
            continue

        # Each element of the column is the list of evidence items of one row.
        # NOTE(review): only the first item of each list is used — confirm
        # that the remaining evidence items are meant to be ignored.
        for evidence in current_variant['evidence_items']:
            if not evidence:
                # Empty evidence list: nothing to report.
                continue
            first = evidence[0]

            initial = str(gene) + ' ' + str(variant) + ' ' + first['evidence_direction'] + ' ' + first['clinical_significance']
            final = '(CIViC ' + first['name'] + '- PMID:' + first['source']['pubmed_id'] + ')'

            if first['evidence_level'] == 'D' or first['evidence_level'] == 'E':
                continue

            evidence_type = first['evidence_type']
            if evidence_type == 'Predictive':
                phrase = _drug_phrase(first)
                if phrase is not None:
                    evidence_statements.append(initial + ' to ' + phrase + ' for patients with ' + first['disease']['name'] + final)
            elif evidence_type == 'Prognostic' or evidence_type == 'Diagnostic':
                evidence_statements.append(initial + ' for patients with ' + first['disease']['name'] + final)
            elif evidence_type == 'Predisposing':
                evidence_statements.append(initial + ' Predisposition For Cancer ' + ' for patients with ' + first['disease']['name'] + final)

            # Counted by type even when a Predictive item produced no sentence
            # (unhandled interaction type), as in the original.
            slot = counter_slot.get(evidence_type)
            if slot is not None:
                sample_evidence_count[sample][slot] += 1

    return evidence_statements, sample_evidence_count

In [69]:
# For every tumor-only sample, gather its evidence sentences and its
# per-type evidence counts.
evidence = {}
evidence_counts_final = {}
for item in tumor_only_samples:
    print(item)
    statements, evidence_count = get_evidence_statements(get_ids(item), item)
    evidence[item] = list(statements)
    evidence_counts_final.update(evidence_count)
        

CRC_4
CRC_1
CRC_3
AML_31
CRC_5
CRC_2


In [70]:
# Reshape {sample: [sentence, ...]} into one row per sentence: from_dict(orient='index')
# gives one row per sample, and stack() turns that into a long Series.
# NOTE(review): the row count is reported below as the evidence total, so this relies on
# stack() discarding the padding added for samples with fewer sentences — verify under the installed pandas.
evidence_statements = pd.DataFrame(pd.DataFrame.from_dict(evidence, orient='index').stack())

In [75]:
# Number of tumor VCF records collected across all tumor-only samples.
print('Total variants observed for tumor-only samples:', len(smMIPs_somatic))

Total variants observed for tumor-only samples: 401


In [71]:
# Number of rows in the stacked evidence-statement table.
print('Total evidence items observed for tumor-only samples:', len(evidence_statements))

Total evidence items observed for tumor-only samples: 127
