# Assess the Overlap of the CIViC coordinates with potential tumors


#### Usage

This script evaluates tumors for overlap with the CIViC coordinates.

#### Input Files:

1) file = text file that contains variant information. Each file requires a header with the following labels: sample, gene_name, VAF, amino_acid, chromosome_name, start, stop (these labels can be in any order). The sample column can contain multiple different tumors, each with a distinct label.

#### Output Files:

1) mutation_overlap.tsv is a tab-separated file that contains each tumor that is analyzed for overlap, the total number of overlapping variants with the smMIPs panel, and information on each variant that overlaps with the smMIPs panel including: variant, gene, VAF.


In [34]:
##Tools
#!/usr/bin/env python3
import pyliftover
from pyliftover import LiftOver
import json
import numpy as np
import requests
import sys


In [10]:
#Create a tool that will append the input files to a list
def append_file_to_list(file):
    """Read a tab-separated text file into a list of rows.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one entry per line of the file, in file order. Each
        entry is the list of strings obtained by removing the line's
        newline characters and splitting the remainder on tabs.
    """
    # The context manager closes the handle even if reading fails; the
    # previous version never closed the file and shadowed the builtin `list`.
    with open(file, 'r') as handle:
        return [line.strip('\n').split('\t') for line in handle]

In [35]:
# Download the variants that qualify for the CIViC CaptureSeq panel and keep
# the 'records' entry of the JSON response.
# The timeout keeps the notebook from hanging indefinitely if the server
# stalls, and raise_for_status() surfaces an HTTP error directly instead of
# failing later while decoding or indexing the response body.
civic_response = requests.get('https://civic.genome.wustl.edu/api/panels/captureseq/qualifying_variants?minimum_score=0', timeout=60)
civic_response.raise_for_status()
variants_capture = civic_response.json()['records']

In [114]:
## For variants listed in the CaptureSeq API, create bed-like files for capture design

# Numeric weight of each evidence level, used to compute the evidence score
score = {'A':5, 'B':4, 'C':3, 'D':2, 'E':1}

# The first row is the column header. It lists the columns in the same order
# as the data rows appended below (chrom, start, stop first); previously the
# header was gene-first and did not line up with the data rows.
capture_sequence_probes = []
capture_sequence_probes.append(['chrom', 'start', 'stop', 'gene', 'so_id', 'variant_type', 'variant_name', 'representative_transcript', 'top_evidence_level', 'diseases', 'evidence_types', 'number_of_evidence_statements', 'evidence_score'])

for record in variants_capture: #iterate through the API records and pull all eligible variants
    gene = record['entrez_name'] #gene name
    variant = record['name'] #variant name
    # NOTE(review): only the first variant type is used; a record with an
    # empty 'variant_types' list would raise IndexError here.
    soid = record['variant_types'][0]['so_id'] #sequence ontology id
    variant_type = record['variant_types'][0]['name'] #variant type
    coordinates = record['coordinates']
    transcript = coordinates['representative_transcript'] #transcript
    chrom = coordinates['chromosome'] #chromosome
    start = coordinates['start'] #start position
    stop = coordinates['stop'] #stop position
    evidence = record['evidence_items'] #evidence items for this variant
    evidence_statements = len(evidence) #number of evidence statements

    diseases = [] #unique diseases for this variant, in order of first appearance
    evidence_type = [] #unique evidence types, in order of first appearance
    evidence_scores = [] #one score per evidence item
    top_evidences = [] #evidence levels seen for this variant
    for item in evidence: #iterate through the evidence items
        disease_name = item['disease']['name']
        if 'Walden' in disease_name:
            #normalize the spelling (per the original author's note, the original name is not accepted by the R code)
            disease_name = 'Waldenstroms Macroglobulinemia'
        if disease_name not in diseases:
            diseases.append(disease_name)
        if item['evidence_type'] not in evidence_type:
            evidence_type.append(item['evidence_type'])
        trust_rating = int(item['rating'] or 0) #a missing rating counts as 0
        evidence_level = int(score[item['evidence_level']]) #weight from the score dictionary
        evidence_scores.append(evidence_level * trust_rating) #score of this evidence item
        if item['evidence_level'] != '[]': #skip evidence levels that are blank
            top_evidences.append(item['evidence_level'].strip())

    #pick the strongest evidence level present; fall back to 'E' when none of A-D was seen
    top_evidence = next((level for level in 'ABCD' if level in top_evidences), 'E')
    evidence_score = sum(evidence_scores) #sum the evidence scores to get a CIViC score
    disease = ', '.join(diseases) #format the diseases
    evidence_types = ', '.join(evidence_type) #format the evidence types

    #Append coordinates and annotation to the capture_sequence_probes list
    capture_sequence_probes.append([chrom, start, stop, gene, soid, variant_type, variant, transcript, top_evidence, disease, evidence_types, evidence_statements, evidence_score])

In [113]:
# Show the first data row as a sanity check (index 0 holds the header row)
print(capture_sequence_probes[1])

['TP53', 'SO:0000817', 'wild_type', 'WILD TYPE', 'ENST00000269305.4', 'B', 'Colorectal Cancer, Non-small Cell Lung Carcinoma, Esophageal Carcinoma, Cancer, Leukemia', 'Predictive', 6, 64, '17', 7571720, 7590856]


In [63]:
# Build one pyliftover converter per direction. The cells below use the
# hg38 -> hg19 converter before comparing tumor variants with the CIViC
# coordinates, and the hg19 -> hg38 converter to map the overlapping
# variants back afterwards.
lo_38_to_37 = LiftOver('hg38', 'hg19')
lo_37_to_38 = LiftOver('hg19', 'hg38')

In [104]:
# Load the tumor variant table, then split the header row off the data rows.
# The row count is printed before and after removing the header.
pembro = append_file_to_list('/Users/ebarnell/Desktop/DNA_Master_Variants.tsv')
print(len(pembro))
header = pembro.pop(0)  # remove the header row from the data and keep it
print(len(pembro))
print(header)

3496
3495
['Chromosome', 'Start', 'Stop', 'Reference', 'Variant', 'Normal_DNA_ref_count', 'Normal_DNA_var_count', 'Normal_DNA_VAF', 'D0_DNA_ref_count', 'D0_DNA_var_count', 'D0_DNA_VAF', 'D14_DNA_ref_count', 'D14_DNA_var_count', 'D14_DNA_VAF', 'sample', 'type', 'transcript_name', 'strand', 'trv_type', 'amino_acid_change', 'mapped_gene_name', 'ensembl_gene_id', 'Normal_DNA_coverage', 'D0_DNA_coverage', 'D14_DNA_coverage']


In [112]:
# Lift every tumor variant over with the hg38 -> hg19 converter.
# Each pembro_37 entry is [chromosome (as given in the input), start, stop]
# with the converted positions.
not_converted = 0  # variants whose start or stop could not be lifted over
pembro_37 = []
for item in pembro:
    chrom = "chr" + item[0]  # the converter is queried with a 'chr'-prefixed name
    start = int(item[1])
    stop = int(item[2])
    converted_start = lo_38_to_37.convert_coordinate(chrom, start)
    converted_stop = lo_38_to_37.convert_coordinate(chrom, stop)
    # Require BOTH ends to convert. Previously only the start was checked, so
    # a variant whose stop had no counterpart crashed on converted_stop[0][1].
    # NOTE(review): only the first match is used, and the chromosome reported
    # by the converter is ignored - confirm that is intended.
    if converted_start and converted_stop:
        pembro_37.append([item[0], converted_start[0][1], converted_stop[0][1]])
    else:
        not_converted += 1

print(pembro_37[1])

['1', 977465, 977465]


In [135]:
# Pair every lifted-over tumor variant with each probe row that is on the
# same chromosome and fully contains the variant's [start, stop] interval.
# Each result is [variant, probe annotation], where the annotation is the
# probe row without its three leading coordinate columns.
final_list = [
    [tumor_variant, probe[3:]]
    for tumor_variant in pembro_37
    for probe in capture_sequence_probes
    if str(tumor_variant[0]) == str(probe[0])
    and int(tumor_variant[1]) >= int(probe[1])
    and int(tumor_variant[2]) <= int(probe[2])
]

In [136]:
# Show the second overlapping variant with its probe annotation as a sanity check
print(final_list[1])

[['1', 115258745, 115258745], ['NRAS', 'SO:0001818', 'protein_altering_variant', 'G12/G13', 'ENST00000369535.4', 'B', 'Acute Myeloid Leukemia', 'Prognostic', 1, 16]]


In [144]:
# Lift the overlapping variants over with the hg19 -> hg38 converter.
# Each pembro_clinical_38 entry is ['chr'-prefixed chromosome, start, stop,
# annotation list], where the annotation is carried over from final_list.
not_converted = 0  # variants whose start or stop could not be lifted over
pembro_clinical_38 = []
for item in final_list:
    chrom = "chr" + item[0][0]
    start = int(item[0][1])
    stop = int(item[0][2])
    converted_start = lo_37_to_38.convert_coordinate(chrom, start)
    converted_stop = lo_37_to_38.convert_coordinate(chrom, stop)
    # Require BOTH ends to convert. Previously only the start was checked, so
    # a variant whose stop had no counterpart crashed on converted_stop[0][1].
    if converted_start and converted_stop:
        pembro_clinical_38.append([chrom, converted_start[0][1], converted_stop[0][1], item[1]])
    else:
        not_converted += 1

print(len(pembro_clinical_38))
print(pembro_clinical_38[1])

28
['chr1', 114716124, 114716124, ['NRAS', 'SO:0001818', 'protein_altering_variant', 'G12/G13', 'ENST00000369535.4', 'B', 'Acute Myeloid Leukemia', 'Prognostic', 1, 16]]


In [152]:
# Write the overlapping variants to a tab-separated text file: one header
# line, then one line per entry of pembro_clinical_38.
output_columns = ["chrom", "start", "stop", "gene", "soid", "variant_type", "variant", "transcript", "top_evidence", "disease", "evidence_types", "evidence_statements", "evidence_score"]

# The context manager closes the file even if a write fails.
with open('/Users/ebarnell/Desktop/pembro_clinical_38.txt', 'w') as file:
    file.write('\t'.join(output_columns) + '\n')
    for item in pembro_clinical_38:
        # Flatten [chrom, start, stop, [annotation...]] into a single row.
        fields = item[:3] + list(item[3])
        # Joining puts tabs only BETWEEN fields. Previously a tab was written
        # after every annotation field, so each data row ended with a trailing
        # tab and had one more (empty) column than the header.
        file.write('\t'.join(str(field) for field in fields) + '\n')
        
