# Results - Analysis of variants not called as somatic on original sequencing

## Tools

In [1]:
import pandas as pd
import requests
import sys

## Pull in input files

In [2]:
# Manual review matrix (tab-separated) listing the variants examined in this notebook.
missed_variants = pd.read_csv(
    filepath_or_buffer='../data/manual_review/manual_review_matrix.txt',
    sep='\t',
)

In [3]:
def _fetch_civic_records(url, timeout=60):
    """Return the 'records' list from the JSON document served at *url*.

    A timeout is set so an unresponsive server cannot hang the notebook, and
    HTTP error statuses raise immediately instead of surfacing later as a
    JSON-decoding or KeyError failure.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: no response arrived within *timeout* seconds.
        KeyError: the JSON payload has no 'records' key.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()['records']


# Call eligible variants
variants_DNA = _fetch_civic_records(
    'https://civic.genome.wustl.edu/api/panels/DNA-based/qualifying_variants?minimum_score=0'
)

In [4]:
# Tabular view of the fetched variant records (one row per record).
variants_DNA_df = pd.DataFrame(data=variants_DNA)

In [5]:
# Variant overlap table (tab-separated) from the output directory.
overlap_with_smmips = pd.read_csv(
    filepath_or_buffer='../output/variant_overlap.tsv',
    sep='\t',
)

In [6]:
# Per-sample metadata table (tab-separated).
sample_info = pd.read_csv(
    filepath_or_buffer='../data/validation_samples/sample_dataframe.txt',
    sep='\t',
)

In [7]:
# Attach sample metadata to every overlap row; the key is 'sample' on the
# overlap table and 'Sample' on the metadata table (default inner join).
overlap_with_smmips_data = pd.merge(
    overlap_with_smmips,
    sample_info,
    left_on='sample',
    right_on='Sample',
)

## Annotate tumor/normal missed variants

In [8]:
# Map each variant id to its coordinates record for the interval lookups below.
variants_dictionary = {
    record['id']: record['coordinates'] for record in variants_DNA
}

In [9]:
def get_ids(sample, variants=None, coordinates=None):
    """Return ids of catalogued variants whose coordinates contain a variant of *sample*.

    A catalogued variant matches a row when the chromosomes are equal and the
    row's [Start, Stop] interval lies entirely inside the catalogued
    [start, stop] interval.

    Args:
        sample: Value compared against the 'Sample' column.
        variants: DataFrame with 'Sample', 'Chromosome', 'Start' and 'Stop'
            columns. Defaults to the notebook-level ``missed_variants``.
        coordinates: Mapping of variant id -> dict with 'chromosome', 'start'
            and 'stop' keys. Defaults to the notebook-level
            ``variants_dictionary``.

    Returns:
        List of matching ids, without duplicates, in first-match order.
    """
    if variants is None:
        variants = missed_variants
    if coordinates is None:
        coordinates = variants_dictionary

    variant_ids = []
    seen = set()  # O(1) duplicate check; the list keeps first-match order
    sample_rows = variants[variants['Sample'] == sample]
    for _, row in sample_rows.iterrows():
        # str() first so a numeric chromosome column also works, then drop
        # only a leading 'chr' (str.strip('chr') would strip any of the
        # characters c/h/r from BOTH ends of the name).
        var_chrom = str(row['Chromosome'])
        if var_chrom.startswith('chr'):
            var_chrom = var_chrom[len('chr'):]
        var_start = int(row['Start'])
        var_stop = int(row['Stop'])
        for variant_id, coords in coordinates.items():
            chrom = str(coords['chromosome'])
            start = int(coords['start'])
            stop = int(coords['stop'])
            contained = var_chrom == chrom and var_start >= start and var_stop <= stop
            if contained and variant_id not in seen:
                seen.add(variant_id)
                variant_ids.append(variant_id)
    return variant_ids

In [10]:
# Variant names that are skipped entirely (no statement, no count).
_NON_EVALUATED_VARIANTS = frozenset([
    'LOSS-OF-FUNCTION', 'DELETION', 'AMPLIFICATION', 'COPY NUMBER VARIATION',
    'EXON 14 SKIPPING MUTATION', 'WILD TYPE',
])

# Position of each evidence type inside the per-sample count list.
_EVIDENCE_TYPE_SLOTS = {'Predictive': 0, 'Prognostic': 1, 'Diagnostic': 2, 'Predisposing': 3}

# drug_interaction_type -> (prefix, conjunction) used to word the therapy.
_INTERACTION_WORDING = {
    'Combination': ('combination of ', 'and'),
    'Substitutes': ('', 'or'),
    'Sequential': ('sequence of ', 'and'),
}


def _join_drug_names(drug_names, conjunction):
    """Join names as 'A, B <conjunction> C'; a single name is returned unchanged."""
    if len(drug_names) == 1:
        # Avoids the malformed ' and X' / ' or X' a lone drug used to produce.
        return str(drug_names[0])
    return ', '.join(drug_names[:-1]) + ' ' + conjunction + ' ' + str(drug_names[-1])


def _predictive_therapy_phrase(evidence_item):
    """Return the therapy wording for a Predictive item, or None for an unrecognized interaction type."""
    interaction = evidence_item['drug_interaction_type']
    if not interaction:
        return evidence_item['drugs'][0]['name']
    if interaction not in _INTERACTION_WORDING:
        return None
    prefix, conjunction = _INTERACTION_WORDING[interaction]
    drug_names = [drug['name'] for drug in evidence_item['drugs']]
    return prefix + _join_drug_names(drug_names, conjunction)


def get_evidence_statements(variant_ids, sample, variants_df=None):
    """Build evidence sentences and per-type counts for the given variant ids.

    A variant contributes only if its 'civic_actionability_score' is above 20,
    its name is not in the non-evaluated list, and its evidence level is
    neither 'D' nor 'E'.

    NOTE(review): as in the original code, only the FIRST entry of each
    variant's 'evidence_items' list is used. If every evidence item was meant
    to be reported, this needs a loop -- confirm intent before changing, since
    it alters the reported totals.

    Args:
        variant_ids: Iterable of ids matched against the 'id' column.
        sample: Key under which the counts are returned.
        variants_df: DataFrame with 'id', 'civic_actionability_score',
            'entrez_name', 'name' and 'evidence_items' columns. Defaults to
            the notebook-level ``variants_DNA_df``.

    Returns:
        (evidence_statements, sample_evidence_count): a list of sentences and
        ``{sample: [predictive, prognostic, diagnostic, predisposing]}``.
    """
    if variants_df is None:
        variants_df = variants_DNA_df

    evidence_statements = []
    counts = [0, 0, 0, 0]
    sample_evidence_count = {sample: counts}

    for variant_id in variant_ids:
        matches = variants_df[variants_df['id'] == variant_id]
        if matches.empty:
            # Unknown id: nothing to report for it.
            continue
        # Scalar access via .iloc[0]; calling float() on a one-element Series
        # is deprecated in recent pandas.
        record = matches.iloc[0]
        # Written as "not > 20" so a NaN score is skipped, as before.
        if not float(record['civic_actionability_score']) > 20:
            continue

        gene = str(record['entrez_name'])
        variant = str(record['name'])
        if variant in _NON_EVALUATED_VARIANTS:
            continue

        evidence_items = record['evidence_items']
        if not evidence_items:
            continue
        evidence_item = evidence_items[0]

        initial = (gene + ' ' + variant + ' ' + evidence_item['evidence_direction']
                   + ' ' + evidence_item['clinical_significance'])
        final = ('(CIViC ' + evidence_item['name'] + '- PMID:'
                 + str(evidence_item['source']['pubmed_id']) + ')')

        if evidence_item['evidence_level'] in ('D', 'E'):
            continue

        evidence_type = evidence_item['evidence_type']
        if evidence_type == 'Predictive':
            therapy = _predictive_therapy_phrase(evidence_item)
            if therapy is not None:
                evidence_statements.append(
                    initial + ' to ' + therapy + ' for patients with '
                    + evidence_item['disease']['name'] + final)
        elif evidence_type in ('Prognostic', 'Diagnostic'):
            evidence_statements.append(
                initial + ' for patients with ' + evidence_item['disease']['name'] + final)
        elif evidence_type == 'Predisposing':
            evidence_statements.append(
                initial + ' Predisposition For Cancer ' + ' for patients with '
                + evidence_item['disease']['name'] + final)

        # Counted even when a Predictive item produced no sentence
        # (unrecognized interaction type), matching the original tally.
        if evidence_type in _EVIDENCE_TYPE_SLOTS:
            counts[_EVIDENCE_TYPE_SLOTS[evidence_type]] += 1

    return evidence_statements, sample_evidence_count

In [23]:
# Unique sample names that passed QC and have a matched normal, with
# 'OSCC_1' left out.
samples = [
    name
    for name in overlap_with_smmips_data.loc[
        (overlap_with_smmips_data['Passed QC'] == 'yes')
        & (overlap_with_smmips_data['Matched Normal'] == 'yes'),
        'sample',
    ].drop_duplicates()
    if name != 'OSCC_1'
]

In [24]:
# Report how many samples were selected above.
print('{} {}'.format('Total number of samples:', len(samples)))

Total number of samples: 12


In [12]:
# Per sample: the evidence sentences (evidence) and the per-type counts
# (evidence_counts_final) for every catalogued variant the sample overlaps.
evidence = {}
evidence_counts_final = {}
for item in samples:
    statements, evidence_count = get_evidence_statements(get_ids(item), item)
    evidence[item] = list(statements)
    evidence_counts_final.update(evidence_count)

In [13]:
# One row per (sample, statement position): widen the dict into a frame with
# one row per sample, then stack it into a single long column.
evidence_statements = pd.DataFrame(
    data=pd.DataFrame.from_dict(data=evidence, orient='index').stack()
)

In [14]:
# Report the number of rows in the stacked statements table.
print('{} {}'.format(
    'Total evidence items observed for tumor/normal samples:',
    len(evidence_statements),
))

Total evidence items observed for tumor/normal samples: 73


In [15]:
# Save the statements with the other pipeline outputs. The previous
# destination was an absolute path inside one user's Desktop folder, which
# fails on any other machine; '../output/' is the directory this notebook
# already reads variant_overlap.tsv from.
evidence_statements.to_csv('../output/evidence_statements.txt')

In [16]:
# One row per sample, one column per evidence type (same order as the count
# lists built by get_evidence_statements).
evidence_counts_final_df = pd.DataFrame(data=evidence_counts_final).T
evidence_counts_final_df.columns = [
    'Predictive',
    'Prognostic',
    'Diagnostic',
    'Predisposing',
]

In [17]:
# Column totals: evidence items per type across all samples.
evidence_counts_final_df.sum(axis=0)

Predictive      33
Prognostic      36
Diagnostic       4
Predisposing     0
dtype: int64