# <span style='font-family:"Times New Roman"'> <span styel=''> **OncoKB Annotation**

## <span style='font-family:"Times New Roman"'> <span styel=''> *Emile Cohen* 
*March 2020*

**Goal:** In this notebook, we call the OncoKB API to get the annotation at variant level for MSK-Impact cohort. 

---

In [1]:
%run -i '../../utils/setup_environment.ipy'

import warnings
import requests
import click
import os
import json
import time
warnings.filterwarnings('ignore')

data_path = '../../data/'

Setup environment... done!


<span style="color:green">✅ Working on **mskimpact_env** conda environment.</span>

---
For that we need a token to access the API, that could be found after logging in in OncoKB (https://www.oncokb.org/dataAccess) in *Account Settings*.

My Token is : **f08eeec6-fe2e-4414-9352-df71e2c1db32**

---

In [22]:
def retry_request(method, **kwargs):
    """Retry request operation multiple times."""
    url = kwargs.get('url') or ""
    click.secho(f"Querying to {url}", fg="green")
    for i in [0.2, 1, 5, 10, 60, 90, 120, 300]:  # attempt some retries
        try:
            error = None
            response = getattr(requests, method)(verify=False, **kwargs)
        except requests.exceptions.RequestException as request_error:  # pragma: no cover
            error = request_error
            response = None
        if response is not None and not str(response.status_code).startswith("50"):
            break
        else:  # pragma: no cover
            msg = f"Request to {url} failed with error: {error}, retrying in {i}s..."
            click.secho(msg, fg="yellow", err=True)
            time.sleep(i)
    return response


ONCOKB_ENDPOINTS = {
    "api_info": "https://www.oncokb.org/api/v1/info",
    "cancer_genes": "https://www.oncokb.org/api/v1/utils/cancerGeneList",
    "annotate": "https://www.oncokb.org/api/v1/annotate/mutations/byGenomicChange",
}


def oncokb_query(endpoint, method="get", data=None):
    """Make authenticated calls to newest oncokb api to gen cancer genes."""
    oncokb_token = "f08eeec6-fe2e-4414-9352-df71e2c1db32"
    if not oncokb_token:
        raise click.UsageError(
            "oncoKb authentication token is missing. "
            "Add it as enviromental variable:"
            "\n\texport ONCOKB_TOKEN=<your-oncokb-token>"
            "\nOr add them to a .env file."
        )
    headers = {
        "Authorization": f"Bearer {oncokb_token}",
        "Content-Type": "application/json",
    }
    response = retry_request(method, url=endpoint, headers=headers, data=data)
    if not response.ok:
        raise click.UsageError(f"OncoKb query failed. Reason: {response.reason}")
    return json.loads(response.content)


def get_oncokb_treatments(df, genomic_cols, tumor_type):
    """Return df with extra columns for Drug and Evidence Level."""
    chrom_col, start_col, end_col, ref_col, alt_col = genomic_cols
    genomic_changes = [
        [v[chrom_col], v[start_col], v[end_col], v[ref_col], v[alt_col]]
        for v in df.T.to_dict().values()
    ]
    data = [
        {
            "genomicLocation": ",".join(map(str, i)),
            "tumorType": tumor_type,
            "id": "",
            "evidenceTypes": [],
        }
        for i in genomic_changes
    ]
    annotated_variants = oncokb_query(
        ONCOKB_ENDPOINTS["annotate"], method="post", data=json.dumps(data)
    )
    return annotated_variants


def annotate_variants(df): 
    data = [
    {
      "genomicLocation": "{},{},{},{},{}".format(
        v["Chromosome"], int(v["Start_Position"]), int(v["End_Position"]), v["Reference_Allele"], v["Variant_Allele"]
      ),
      "tumorType": v["Cancer_Type"],
      "id": "",
      "evidenceTypes": [],
    }
    for v in df.T.to_dict().values()
    ]
    print(data[:1])
    
    return oncokb_query(
      ONCOKB_ENDPOINTS["annotate"], method="post", data=json.dumps(data[:1])
  )

def get_oncokb_genes():
    """Get list of genes and biomarker incindence levels from oncokb."""
    return oncokb_query(ONCOKB_ENDPOINTS["cancer_genes"])


### Use Chunks and Multiprocessing to download faster

In [None]:
from math import ceil
chunck_size= 10_000
annotated_variants = []
for i in range(0, ceil(len(df) / chunk_size)):
    chunks = df[(i * chunk_size) : (2i * chunk_size - 1)]
    ...
    annotated_variants.append(response)

In [23]:
maf_cohort = pd.read_pickle(data_path + 'merged_data/maf_cohort.pkl')
maf_cohort = maf_cohort.dropna(subset=["Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Variant_Allele"])

In [24]:
%%time
query = annotate_variants(maf_cohort)

[{'genomicLocation': '3,178936091,178936091,G,A', 'tumorType': 'Breast Cancer', 'id': '', 'evidenceTypes': []}]
[32mQuerying to https://www.oncokb.org/api/v1/annotate/mutations/byGenomicChange[0m
CPU times: user 23.9 s, sys: 620 ms, total: 24.5 s
Wall time: 24.9 s


In [25]:
query

[{'query': {'id': None,
   'type': 'regular',
   'hugoSymbol': 'PIK3CA',
   'entrezGeneId': 5290,
   'alteration': 'E545K',
   'alterationType': None,
   'svType': None,
   'tumorType': 'breast cancer',
   'consequence': 'missense_variant',
   'proteinStart': 545,
   'proteinEnd': 545,
   'hgvs': None},
  'geneExist': True,
  'variantExist': True,
  'alleleExist': True,
  'oncogenic': 'Oncogenic',
  'mutationEffect': {'knownEffect': 'Gain-of-function',
   'description': '',
   'citations': {'pmids': ['16322248',
     '30206110',
     '20593314',
     '16432179',
     '26627007',
     '17376864',
     '30093452'],
    'abstracts': []}},
  'highestSensitiveLevel': 'LEVEL_1',
  'highestResistanceLevel': None,
  'highestDiagnosticImplicationLevel': None,
  'highestPrognosticImplicationLevel': None,
  'otherSignificantSensitiveLevels': [],
  'otherSignificantResistanceLevels': [],
  'hotspot': True,
  'geneSummary': 'PIK3CA, the catalytic subunit of PI3-kinase, is frequently mutated in a di

In [8]:
maf_cohort
maf_cohort = maf_cohort.dropna(subset=["Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Variant_Allele"])

Unnamed: 0,Sample_Id,Tumor_Id,purity,ploidy,dipLogR,frac_loh,Patient_Id,Patient_Current_Age,Cancer_Type,Cancer_Type_Detailed,Ethnicity_Category,Sex,Mutation_Count,Sample_Type,samples_per_patient,Overall Survival Status,Overall Survival (Months),MSI Score,gene,Gene_Id,Variant_Classification,proteinChange,Start_Position,End_Position,Reference_Allele,Variant_Allele,Chromosome,Hugo_Symbol,alt_count,ref_count,mut_key,sample_mut_key,mut_spot,vaf
0,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,"{'entrezGeneId': 5290, 'hugoGeneSymbol': 'PIK3CA', 'type': 'protein-coding'}",5290.0,Missense_Mutation,E545K,178936091.0,178936091.0,G,A,3,PIK3CA,284.0,334.0,3_178936091_G_A,P-0034223-T01-IM6_3_178936091_G_A,545,0.459547
1,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,"{'entrezGeneId': 2064, 'hugoGeneSymbol': 'ERBB2', 'type': 'protein-coding'}",2064.0,Missense_Mutation,L755S,37880220.0,37880220.0,T,C,17,ERBB2,224.0,262.0,17_37880220_T_C,P-0034223-T01-IM6_17_37880220_T_C,755,0.460905
2,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,"{'entrezGeneId': 9641, 'hugoGeneSymbol': 'IKBKE', 'type': 'protein-coding'}",9641.0,Missense_Mutation,R27H,206646650.0,206646650.0,G,A,1,IKBKE,252.0,1027.0,1_206646650_G_A,P-0034223-T01-IM6_1_206646650_G_A,27,0.197029
3,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,"{'entrezGeneId': 6926, 'hugoGeneSymbol': 'TBX3', 'type': 'protein-coding'}",6926.0,Frame_Shift_Ins,S321Vfs*6,115114257.0,115114258.0,-,T,12,TBX3,358.0,384.0,12_115114257_-_T,P-0034223-T01-IM6_12_115114257_-_T,321,0.48248
4,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,"{'entrezGeneId': 3169, 'hugoGeneSymbol': 'FOXA1', 'type': 'protein-coding'}",3169.0,Missense_Mutation,C227Y,38061309.0,38061309.0,C,T,14,FOXA1,410.0,462.0,14_38061309_C_T,P-0034223-T01-IM6_14_38061309_C_T,227,0.470183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260808,P-0050745-T01-IM6_P-0050745-N01-IM6,P-0050745-T01-IM6,0.597798,1.808634,0.084975,0.560,P-0050745,68.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Non-Spanish; Non-Hispanic,Female,6.0,Primary,1.0,LIVING,1.841,1.85,"{'entrezGeneId': 5290, 'hugoGeneSymbol': 'PIK3CA', 'type': 'protein-coding'}",5290.0,Missense_Mutation,A987D,178951905.0,178951905.0,C,A,3,PIK3CA,158.0,409.0,3_178951905_C_A,P-0050745-T01-IM6_3_178951905_C_A,987,0.27866
260809,P-0050745-T01-IM6_P-0050745-N01-IM6,P-0050745-T01-IM6,0.597798,1.808634,0.084975,0.560,P-0050745,68.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Non-Spanish; Non-Hispanic,Female,6.0,Primary,1.0,LIVING,1.841,1.85,"{'entrezGeneId': 63978, 'hugoGeneSymbol': 'PRDM14', 'type': 'protein-coding'}",63978.0,Missense_Mutation,D375H,70978530.0,70978530.0,C,G,8,PRDM14,185.0,261.0,8_70978530_C_G,P-0050745-T01-IM6_8_70978530_C_G,375,0.414798
260810,P-0050745-T01-IM6_P-0050745-N01-IM6,P-0050745-T01-IM6,0.597798,1.808634,0.084975,0.560,P-0050745,68.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Non-Spanish; Non-Hispanic,Female,6.0,Primary,1.0,LIVING,1.841,1.85,"{'entrezGeneId': 54880, 'hugoGeneSymbol': 'BCOR', 'type': 'protein-coding'}",54880.0,Missense_Mutation,Q231E,39933908.0,39933908.0,G,C,23,BCOR,274.0,333.0,23_39933908_G_C,P-0050745-T01-IM6_23_39933908_G_C,231,0.4514
260811,P-0050745-T01-IM6_P-0050745-N01-IM6,P-0050745-T01-IM6,0.597798,1.808634,0.084975,0.560,P-0050745,68.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Non-Spanish; Non-Hispanic,Female,6.0,Primary,1.0,LIVING,1.841,1.85,"{'entrezGeneId': 5058, 'hugoGeneSymbol': 'PAK1', 'type': 'protein-coding'}",5058.0,Fusion,GDPD4-PAK1 fusion,-1.0,-1.0,,,,PAK1,-1.0,-1.0,NA_-1_NA_,P-0050745-T01-IM6_NA_-1_NA_,4,


In [28]:
maf_cohort

Unnamed: 0,Sample_Id,Tumor_Id,purity,ploidy,dipLogR,frac_loh,Patient_Id,Patient_Current_Age,Cancer_Type,Cancer_Type_Detailed,Ethnicity_Category,Sex,Mutation_Count,Sample_Type,samples_per_patient,Overall Survival Status,Overall Survival (Months),MSI Score,gene,Gene_Id,Variant_Classification,proteinChange,Start_Position,End_Position,Reference_Allele,Variant_Allele,Chromosome,Hugo_Symbol,alt_count,ref_count,mut_key,sample_mut_key,mut_spot,vaf
0,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,"{'entrezGeneId': 5290, 'hugoGeneSymbol': 'PIK3CA', 'type': 'protein-coding'}",5290.0,Missense_Mutation,E545K,178936091.0,178936091.0,G,A,3,PIK3CA,284.0,334.0,3_178936091_G_A,P-0034223-T01-IM6_3_178936091_G_A,545,0.459547
1,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,"{'entrezGeneId': 2064, 'hugoGeneSymbol': 'ERBB2', 'type': 'protein-coding'}",2064.0,Missense_Mutation,L755S,37880220.0,37880220.0,T,C,17,ERBB2,224.0,262.0,17_37880220_T_C,P-0034223-T01-IM6_17_37880220_T_C,755,0.460905
2,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,"{'entrezGeneId': 9641, 'hugoGeneSymbol': 'IKBKE', 'type': 'protein-coding'}",9641.0,Missense_Mutation,R27H,206646650.0,206646650.0,G,A,1,IKBKE,252.0,1027.0,1_206646650_G_A,P-0034223-T01-IM6_1_206646650_G_A,27,0.197029
3,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,"{'entrezGeneId': 6926, 'hugoGeneSymbol': 'TBX3', 'type': 'protein-coding'}",6926.0,Frame_Shift_Ins,S321Vfs*6,115114257.0,115114258.0,-,T,12,TBX3,358.0,384.0,12_115114257_-_T,P-0034223-T01-IM6_12_115114257_-_T,321,0.48248
4,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,"{'entrezGeneId': 3169, 'hugoGeneSymbol': 'FOXA1', 'type': 'protein-coding'}",3169.0,Missense_Mutation,C227Y,38061309.0,38061309.0,C,T,14,FOXA1,410.0,462.0,14_38061309_C_T,P-0034223-T01-IM6_14_38061309_C_T,227,0.470183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260808,P-0050745-T01-IM6_P-0050745-N01-IM6,P-0050745-T01-IM6,0.597798,1.808634,0.084975,0.560,P-0050745,68.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Non-Spanish; Non-Hispanic,Female,6.0,Primary,1.0,LIVING,1.841,1.85,"{'entrezGeneId': 5290, 'hugoGeneSymbol': 'PIK3CA', 'type': 'protein-coding'}",5290.0,Missense_Mutation,A987D,178951905.0,178951905.0,C,A,3,PIK3CA,158.0,409.0,3_178951905_C_A,P-0050745-T01-IM6_3_178951905_C_A,987,0.27866
260809,P-0050745-T01-IM6_P-0050745-N01-IM6,P-0050745-T01-IM6,0.597798,1.808634,0.084975,0.560,P-0050745,68.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Non-Spanish; Non-Hispanic,Female,6.0,Primary,1.0,LIVING,1.841,1.85,"{'entrezGeneId': 63978, 'hugoGeneSymbol': 'PRDM14', 'type': 'protein-coding'}",63978.0,Missense_Mutation,D375H,70978530.0,70978530.0,C,G,8,PRDM14,185.0,261.0,8_70978530_C_G,P-0050745-T01-IM6_8_70978530_C_G,375,0.414798
260810,P-0050745-T01-IM6_P-0050745-N01-IM6,P-0050745-T01-IM6,0.597798,1.808634,0.084975,0.560,P-0050745,68.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Non-Spanish; Non-Hispanic,Female,6.0,Primary,1.0,LIVING,1.841,1.85,"{'entrezGeneId': 54880, 'hugoGeneSymbol': 'BCOR', 'type': 'protein-coding'}",54880.0,Missense_Mutation,Q231E,39933908.0,39933908.0,G,C,23,BCOR,274.0,333.0,23_39933908_G_C,P-0050745-T01-IM6_23_39933908_G_C,231,0.4514
260811,P-0050745-T01-IM6_P-0050745-N01-IM6,P-0050745-T01-IM6,0.597798,1.808634,0.084975,0.560,P-0050745,68.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Non-Spanish; Non-Hispanic,Female,6.0,Primary,1.0,LIVING,1.841,1.85,"{'entrezGeneId': 5058, 'hugoGeneSymbol': 'PAK1', 'type': 'protein-coding'}",5058.0,Fusion,GDPD4-PAK1 fusion,-1.0,-1.0,,,,PAK1,-1.0,-1.0,NA_-1_NA_,P-0050745-T01-IM6_NA_-1_NA_,4,
