# Create Required Coverage for smMIPs Panel

#### Usage

probe_selection.py pulls existing variants from the CIViC Knowledgebase, iterates through all pulled 
variants, and keeps those that satisfy the provided Actionability Score threshold. The threshold (see Inputs) is the lower limit required for a variant to be considered extensively curated and therefore eligible for probe design. The script creates output for DNA-based and RNA-based variants that details the coverage required to evaluate all extensively curated variants (i.e. variants that have an actionability score > threshold).


#### Inputs:

1) Threshold = minimum requirement for actionability score to be included in output files; default is 20 points.

2) tile_classification = an input file that designates whether a variant must be assessed by tiling all protein-coding exons or whether it can be assessed with a small number of probes.

#### Output Files: 

1) capture_sequence_probes.tsv = list of DNA-based variants

2) nanoString_probes.tsv = list of RNA-based variants

3) tile_classification.txt = updated, curated list of variants stating whether each one needs to be tiled


In [3]:
## Set input values

# Minimum actionability score a variant must reach to be pulled; stored as text.
threshold = '20'

In [1]:
#!/usr/bin/env python3
import json
import numpy as np
import requests
import sys

In [4]:
##Pull in Data from JSON

def _fetch_qualifying_variants(panel, minimum_score):
    """Return the 'records' list of qualifying variants for one CIViC panel.

    panel: panel name used in the API path ('captureseq' or 'nanostring').
    minimum_score: actionability score threshold sent as the `minimum_score`
        query parameter.
    Raises requests.HTTPError when the API answers with an error status.
    """
    url = 'https://civic.genome.wustl.edu/api/panels/' + panel + '/qualifying_variants'
    # The score must go through `params`: passed positionally it is appended as
    # a separate query fragment and `minimum_score` itself is sent empty.
    response = requests.get(url, params={'minimum_score': minimum_score})
    response.raise_for_status()  # fail loudly instead of parsing an error page
    return response.json()['records']


variants_capture = _fetch_qualifying_variants('captureseq', threshold) #Call eligible variants
variants_nanostring = _fetch_qualifying_variants('nanostring', threshold) #Call eligible variants

In [5]:
## Count the eligible variants and the distinct genes they belong to

# Despite its name, variant_list collects each gene symbol once, in the order
# first seen (NanoString records are scanned before CaptureSeq records).
variant_list = []
total_variant_count = 0

for panel_records in (variants_nanostring, variants_capture):
    for record in panel_records:
        symbol = record['entrez_name']
        if symbol not in variant_list:
            variant_list.append(symbol)
    # every record of the panel counts as one eligible variant
    total_variant_count += len(panel_records)

print('Total Number of Eligible Variants: ', total_variant_count)
print('Total Number of Eligible Genes: ', len(variant_list))

Total Number of Eligible Variants:  988
Total Number of Eligible Genes:  275


In [6]:
## For variants listed in the CaptureSeq API, create bed-like files for capture design

# Points per evidence level; multiplied by an item's rating to get its evidence score
score = {'A':5, 'B':4, 'C':3, 'D':2, 'E':1}

# Row 0 is the header; every later row describes one variant.
# NOTE: variants with a second set of coordinates carry four extra trailing
# fields (transcript2, chrom2, start2, stop2) that this header does not label.
capture_sequence_probes = [['gene', 'so_id', 'variant_type', 'variant_name', 'representative_transcript', 'top_evidence_level', 'diseases', 'evidence_types', 'number_of_evidence_statements', 'evidence_score', 'chrom', 'start', 'stop']]

for record in variants_capture: #iterate through the eligible variants
    coordinates = record['coordinates']
    primary_type = record['variant_types'][0] #only the first listed variant type is reported
    evidence = record['evidence_items']

    diseases = [] #distinct disease names, in first-seen order
    evidence_type = [] #distinct evidence types, in first-seen order
    evidence_scores = [] #one score per evidence item
    top_evidences = [] #evidence levels seen for this variant
    for item in evidence:
        disease_name = item['disease']['name']
        if 'Walden' in disease_name:
            #original note: the accented character in this name is not accepted by the R code
            disease_name = 'Waldenstroms Macroglobulinemia'
        if disease_name not in diseases:
            diseases.append(disease_name)
        if item['evidence_type'] not in evidence_type:
            evidence_type.append(item['evidence_type'])
        trust_rating = int(item['rating'] or 0) #a missing rating counts as 0
        evidence_scores.append(score[item['evidence_level']] * trust_rating)
        if item['evidence_level'] != '[]': #skip the blank marker
            top_evidences.append(item['evidence_level'].strip())

    #strongest level present, falling back to 'E' when none of A-D was seen
    top_evidence = next((level for level in 'ABCD' if level in top_evidences), 'E')

    row = [
        record['entrez_name'],
        primary_type['so_id'],
        primary_type['name'],
        record['name'],
        coordinates['representative_transcript'],
        top_evidence,
        ', '.join(diseases),
        ', '.join(evidence_type),
        len(evidence),
        sum(evidence_scores), #summed evidence scores for the variant
        coordinates['chromosome'],
        coordinates['start'],
        coordinates['stop'],
    ]

    second_locus = (coordinates['chromosome2'], coordinates['start2'], coordinates['stop2'])
    if all(value is not None for value in second_locus): #variant has a second set of coordinates
        # Report the second transcript rather than repeating the first one.
        # NOTE(review): assumes the API exposes 'representative_transcript2';
        # if the key is absent the first transcript is used, as before.
        transcript2 = coordinates.get('representative_transcript2', coordinates['representative_transcript'])
        row += [transcript2, *second_locus]
    capture_sequence_probes.append(row)

##Create output files for capture
# Join on tabs instead of testing `field is row[-1]`: identity cannot tell the
# last field apart from an earlier one that is the same object (e.g. None).
with open('../../smMIPs_panel/output_files/capture_sequence_probes.tsv', 'w') as capture:
    for item in capture_sequence_probes:
        capture.write('\t'.join(str(field) for field in item) + '\n')

In [7]:
## For variants listed in the NanoString API, create bed-like files for capture design

# Row 0 is the header; every later row describes one variant. The last four
# columns are only filled for variants with a second set of coordinates.
nanoString_probes = [['gene', 'soid', 'variant_type', 'variant_name', 'representative_transcript', 'top_evidence_level', 'diseases','evidence_types','number_of_evidence_statements', 'chrom', 'start', 'stop', 'transcript2', 'chrom2', 'start2', 'stop2']]

for record in variants_nanostring:  # iterate through the eligible variants
    coordinates = record['coordinates']
    primary_type = record['variant_types'][0]  # only the first listed variant type is reported
    evidence = record['evidence_items']

    diseases = []  # distinct disease names, in first-seen order
    evidence_type = []  # distinct evidence types, in first-seen order
    top_evidences = []  # evidence levels seen for this variant
    for item in evidence:
        disease_name = item['disease']['name']
        if 'Walden' in disease_name:
            # original note: the accented character in this name is not accepted by the R code
            disease_name = 'Waldenstroms Macroglobulinemia'
        if disease_name not in diseases:
            diseases.append(disease_name)
        if item['evidence_type'] not in evidence_type:
            evidence_type.append(item['evidence_type'])
        if item['evidence_level'] != '[]':  # skip the blank marker
            top_evidences.append(item['evidence_level'].strip())

    # strongest level present, falling back to 'E' when none of A-D was seen
    top_evidence = next((level for level in 'ABCD' if level in top_evidences), 'E')

    row = [
        record['entrez_name'],
        primary_type['so_id'],
        primary_type['name'],
        record['name'],
        coordinates['representative_transcript'],
        top_evidence,
        ', '.join(diseases),
        ', '.join(evidence_type),
        len(evidence),
        coordinates['chromosome'],
        coordinates['start'],
        coordinates['stop'],
    ]

    second_locus = (coordinates['chromosome2'], coordinates['start2'], coordinates['stop2'])
    if all(value is not None for value in second_locus):  # variant has a second set of coordinates
        # The header has a 'transcript2' column ahead of chrom2/start2/stop2, so
        # a value must be written there or the second locus shifts one column left.
        # NOTE(review): assumes the API exposes 'representative_transcript2';
        # if the key is absent the first transcript is used instead.
        transcript2 = coordinates.get('representative_transcript2', coordinates['representative_transcript'])
        row += [transcript2, *second_locus]
    nanoString_probes.append(row)

##Create output files for nanostring
# Join on tabs instead of testing `field is row[-1]`: identity cannot tell the
# last field apart from an earlier one that is the same object (e.g. None).
with open('../../smMIPs_panel/output_files/nanoString_probes.tsv', 'w') as nanostring:
    for item in nanoString_probes:
        nanostring.write('\t'.join(str(field) for field in item) + '\n')

In [11]:
## Evaluate and Update Tiling File

tiling_output = [] #rows that will be written back to the classification file
tiling_file = set() #(gene, variant) pairs already present in the classification file

with open('../../smMIPs_panel/input_files/tile_classification.txt', 'r') as tile_panel: #open tiled genes input file
    for line_number, raw_line in enumerate(tile_panel, start=1):
        raw_line = raw_line.strip('\n')
        if not raw_line:
            continue #ignore blank lines
        line = raw_line.split('\t')
        if len(line) < 2:
            #stop before anything is rewritten so no curated data is lost
            raise ValueError('tile_classification.txt line %d has no variant column: %r' % (line_number, raw_line))
        gene = line[0] #pull gene
        variant = line[1] #pull variant
        if len(line) == 2: #entry has not been given a tile value yet
            print(line)
            print('YOU NEED TO UPDATE THE TILE_CLASSIFICATION.TXT!!!')
            # Keep the entry and keep reading: stopping here would drop every
            # classification below this line when the file is rewritten.
            tiling_output.append([gene, variant])
        else:
            tiling_output.append([gene, variant, line[2]]) #pull tile value
        tiling_file.add((gene, variant))

#check and see if there are new items that need to be added to tiling list
# NOTE(review): the header row of capture_sequence_probes is checked too;
# presumably the classification file carries a matching header line - confirm.
for item in capture_sequence_probes:
    pair = (item[0], item[3]) #gene, variant name
    if pair not in tiling_file:
        tiling_file.add(pair) #a pair listed twice upstream is only added once
        tiling_output.append([item[0], item[3]]) #no tile value yet: left for curation

#update tiling folder
# Join on tabs instead of testing `field is row[-1]`: identity cannot tell the
# last field apart from an earlier one that is the same object.
with open('../../smMIPs_panel/input_files/tile_classification.txt', 'w') as tiling:
    for item in tiling_output:
        tiling.write('\t'.join(str(field) for field in item) + '\n')

