# <span style='font-family:"Times New Roman"'>**OncoKB Annotation**</span>

*Emile Cohen* 
    
*March 2020*

**Goal:** In this notebook, we call the OncoKB API to get variant-level annotations for the MSK-IMPACT cohort.

---

In [1]:
%run -i '../../utils/setup_environment.ipy'

import warnings
import requests
import click
import os
import json
import time
from tqdm import tqdm_notebook as tqdm
warnings.filterwarnings('ignore')

data_path = '../../data/'

Setup environment... done!


<span style="color:green">✅ Working on **mskimpact_env** conda environment.</span>

# API Call

---
To call the API, we need an access token. It can be found under *Account Settings* after logging in to OncoKB (https://www.oncokb.org/dataAccess).

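My token is: **\<redacted\>** — a personal API token is a credential and must not be written in a shared notebook. Keep it outside the notebook (for example in an `ONCOKB_TOKEN` environment variable) and revoke any token that has already been published.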

---

## Useful functions to call OncoKB API

In [2]:
def retry_request(method, **kwargs):
    """Send an HTTP request through ``requests``, retrying on transient failures.

    An attempt is considered failed when it raises a ``requests`` exception or
    when the server answers with a status code starting with "50". Failed
    attempts are retried after an increasing delay; no delay is spent after
    the last attempt.

    Args:
        method: Name of the ``requests`` function to call (e.g. ``"get"``,
            ``"post"``).
        **kwargs: Forwarded to that function (``url``, ``headers``, ``data``,
            ...). ``verify`` defaults to ``False`` when the caller does not
            provide it.

    Returns:
        The last response obtained. This is ``None`` when every attempt
        raised, and can be a 50x response when the server kept failing, so
        callers must check it before use.
    """
    url = kwargs.get('url') or ""
    # NOTE(review): TLS certificate verification is disabled by default to keep
    # the historical behaviour. Callers sending credentials should pass
    # verify=True (or a CA bundle path) unless there is a reason not to.
    kwargs.setdefault("verify", False)
    click.secho(f"Querying to {url}", fg="green")
    delays = [0.2, 1, 5, 10, 60, 90, 120, 300]  # one attempt per entry
    response = None
    for attempt, delay in enumerate(delays, start=1):
        error = None
        try:
            response = getattr(requests, method)(**kwargs)
        except requests.exceptions.RequestException as request_error:  # pragma: no cover
            error = request_error
            response = None
        if response is not None and not str(response.status_code).startswith("50"):
            break
        # Report the HTTP status when the failure is a server error rather
        # than an exception, instead of printing "error: None".
        reason = error if error is not None else f"HTTP {response.status_code}"
        if attempt == len(delays):
            # Nothing left to retry: do not sleep before handing back the result.
            msg = f"Request to {url} failed with error: {reason}, giving up."
            click.secho(msg, fg="yellow", err=True)
            break
        msg = f"Request to {url} failed with error: {reason}, retrying in {delay}s..."
        click.secho(msg, fg="yellow", err=True)
        time.sleep(delay)
    return response


# OncoKB REST endpoints (API v1), keyed by a short name used by the helper
# functions of this notebook.
ONCOKB_ENDPOINTS = {
    "api_info": "https://www.oncokb.org/api/v1/info",
    "cancer_genes": "https://www.oncokb.org/api/v1/utils/cancerGeneList",
    "annotate": "https://www.oncokb.org/api/v1/annotate/mutations/byGenomicChange",
}


def oncokb_query(endpoint, method="get", data=None):
    """Make an authenticated call to the OncoKB API and return the decoded JSON.

    The bearer token is read from the ``ONCOKB_TOKEN`` environment variable so
    that no credential is stored in the notebook.

    Args:
        endpoint: Full URL of the OncoKB endpoint to query.
        method: Name of the ``requests`` function to use (``"get"``, ``"post"``).
        data: Request body, already serialised (e.g. with ``json.dumps``), or
            ``None``.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        click.UsageError: If the token is missing, if no response could be
            obtained, or if the response status is not OK.
    """
    oncokb_token = os.environ.get("ONCOKB_TOKEN")
    if not oncokb_token:
        raise click.UsageError(
            "oncoKb authentication token is missing. "
            "Add it as enviromental variable:"
            "\n\texport ONCOKB_TOKEN=<your-oncokb-token>"
            "\nOr add them to a .env file."
        )
    headers = {
        "Authorization": f"Bearer {oncokb_token}",
        "Content-Type": "application/json",
    }
    response = retry_request(method, url=endpoint, headers=headers, data=data)
    # retry_request returns None when every attempt raised a network error.
    if response is None:
        raise click.UsageError(
            "OncoKb query failed. Reason: no response received from the server."
        )
    if not response.ok:
        raise click.UsageError(f"OncoKb query failed. Reason: {response.reason}")
    return json.loads(response.content)


def get_oncokb_treatments(df, genomic_cols, tumor_type):
    """Annotate every row of ``df`` by genomic change for a single tumor type.

    Args:
        df: DataFrame holding one variant per row.
        genomic_cols: Five column names, in this order: chromosome, start,
            end, reference allele, alternate allele.
        tumor_type: Tumor type sent with every variant.

    Returns:
        The decoded response of the OncoKB ``annotate`` endpoint for the
        submitted variants (as returned by ``oncokb_query``).
    """

    def _as_text(value):
        # Coordinates stored as floats (e.g. 7172331.0) must be sent as plain
        # integers; other values, including NaN, are stringified unchanged.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    chrom_col, start_col, end_col, ref_col, alt_col = genomic_cols
    # Iterate the columns positionally: going through df.T.to_dict() keyed the
    # rows by index label and silently dropped rows sharing a label.
    genomic_changes = zip(
        df[chrom_col], df[start_col], df[end_col], df[ref_col], df[alt_col]
    )
    data = [
        {
            "genomicLocation": ",".join(map(_as_text, change)),
            "tumorType": tumor_type,
            "id": "",
            "evidenceTypes": [],
        }
        for change in genomic_changes
    ]
    annotated_variants = oncokb_query(
        ONCOKB_ENDPOINTS["annotate"], method="post", data=json.dumps(data)
    )
    return annotated_variants


def annotate_variants(df):
    """Annotate every row of a MAF-like DataFrame with OncoKB.

    The frame must provide the columns ``Chromosome``, ``Start_Position``,
    ``End_Position``, ``Reference_Allele``, ``Variant_Allele`` and
    ``Cancer_Type``. Positions are cast to ``int``, so rows with missing
    coordinates must be removed beforehand. All rows are sent in one POST.

    Args:
        df: DataFrame holding one variant per row.

    Returns:
        One ``[knownEffect, oncogenic, vus, hotspot]`` list per row of ``df``,
        in row order.

    Raises:
        ValueError: If the API does not return exactly one annotation per row.
    """
    # Iterate the columns positionally: going through df.T.to_dict() keyed the
    # rows by index label and silently dropped rows sharing a label, which
    # shifted every following annotation.
    variants = zip(
        df["Chromosome"],
        df["Start_Position"],
        df["End_Position"],
        df["Reference_Allele"],
        df["Variant_Allele"],
        df["Cancer_Type"],
    )
    data = [
        {
            "genomicLocation": "{},{},{},{},{}".format(
                chrom, int(start), int(end), ref, alt
            ),
            "tumorType": cancer_type,
            "id": "",
            "evidenceTypes": [],
        }
        for chrom, start, end, ref, alt, cancer_type in variants
    ]

    annotated_variants = oncokb_query(ONCOKB_ENDPOINTS["annotate"], method="post", data=json.dumps(data))

    # Callers pair the result with the rows of df by position.
    if len(annotated_variants) != len(data):
        raise ValueError(
            "OncoKB returned {} annotations for {} variants".format(
                len(annotated_variants), len(data)
            )
        )

    return [[v['mutationEffect']['knownEffect'], v['oncogenic'], v['vus'], v['hotspot']] for v in annotated_variants]

def get_oncokb_genes():
    """Query the OncoKB cancer gene list endpoint and return the decoded reply."""
    cancer_genes_url = ONCOKB_ENDPOINTS["cancer_genes"]
    return oncokb_query(cancer_genes_url)


### Use Chunks and Multiprocessing to download faster

In [105]:
# Load the merged cohort and keep only the variants whose genomic location is
# complete, since every one of these fields is needed to build the API query.
genomic_location_columns = [
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Reference_Allele",
    "Variant_Allele",
]
maf_cohort = pd.read_pickle(data_path + 'merged_data/maf_cohort.pkl').dropna(
    subset=genomic_location_columns
)

In [41]:
%%time
# Annotate the whole cohort at once: annotate_variants builds a single payload
# from every row of the frame and sends it in one POST request.
query = annotate_variants(maf_cohort)

[32mQuerying to https://www.oncokb.org/api/v1/annotate/mutations/byGenomicChange[0m
CPU times: user 23.1 s, sys: 553 ms, total: 23.7 s
Wall time: 24 s


In [66]:
# Tabulate the four fields returned by annotate_variants, one row per variant.
new_data = pd.DataFrame(query, columns=['mutationEffect', 'oncogenic', 'vus', 'hotspot'])  # 'mutationEffect' holds response['mutationEffect']['knownEffect']

new_data

Unnamed: 0,mutationEffect,oncogenic,vus,hotspot
0,Gain-of-function,Oncogenic,False,True
1,Gain-of-function,Oncogenic,False,True


In [42]:
import time

def chunks(lst, n):
    """Lazily cut ``lst`` into consecutive slices holding at most ``n`` items each."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))


def fill_with_queries(
    df: pd.DataFrame,
    chunk_size: int,
    start_row: int = 120000,
    pause_seconds: float = 60 * 10,
    output_dir: str = None,
):
    """Annotate ``df`` chunk by chunk with OncoKB and checkpoint every chunk.

    Rows are processed by position, from ``start_row`` to the end of the
    frame, ``chunk_size`` rows at a time. Each annotated chunk is pickled as
    ``maf_cohort_annotated_<start>_<start + chunk_size>.pkl`` so that an
    interrupted run can be resumed by passing a later ``start_row``.

    Args:
        df: Cohort to annotate; it is not modified.
        chunk_size: Number of rows sent per API call.
        start_row: Position of the first row to annotate. The default keeps
            the resume offset that used to be hard-coded in this function.
        pause_seconds: Pause between two chunks; none is taken after the last
            chunk.
        output_dir: Directory receiving the per-chunk pickles. Defaults to
            ``<data_path>/annotation``.

    Returns:
        A copy of ``df`` whose ``mutationEffect``, ``oncogenic``, ``vus`` and
        ``hotspot`` columns are filled for every processed row.

    Raises:
        ValueError: On a non-positive ``chunk_size``, a negative
            ``start_row``, or when a chunk does not get one annotation per row.
    """
    annotation_columns = ['mutationEffect', 'oncogenic', 'vus', 'hotspot']
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if start_row < 0:
        raise ValueError("start_row must not be negative")
    if output_dir is None:
        output_dir = os.path.join(data_path, 'annotation')

    annotated = df.copy()
    for column in annotation_columns:
        if column in annotated.columns:
            # Annotations mix strings and booleans: store them as objects.
            annotated[column] = annotated[column].astype(object)
        else:
            annotated[column] = None
    column_positions = [annotated.columns.get_loc(column) for column in annotation_columns]

    chunk_starts = list(range(start_row, len(df), chunk_size))
    for start in tqdm(chunk_starts):
        stop = start + chunk_size
        chunk = df.iloc[start:stop]
        query = annotate_variants(chunk)
        if len(query) != len(chunk):
            raise ValueError(
                "Got {} annotations for the {} rows starting at row {}".format(
                    len(query), len(chunk), start
                )
            )
        new_data = pd.DataFrame(query, columns=annotation_columns)

        # Write by position. new_data carries a fresh 0..n-1 index, so a
        # label-aligned assignment against the cohort index would store NaN
        # instead of the annotations.
        for column, position in zip(annotation_columns, column_positions):
            annotated.iloc[start:stop, position] = new_data[column].to_numpy()

        chunk_file = 'maf_cohort_annotated_{}_{}.pkl'.format(start, stop)
        annotated.iloc[start:stop].to_pickle(os.path.join(output_dir, chunk_file))

        # Space out the calls, but do not wait once the work is finished.
        if pause_seconds > 0 and start != chunk_starts[-1]:
            time.sleep(pause_seconds)

    return annotated

In [51]:
%%time
# Reload the merged cohort and keep only rows with a complete genomic location.
maf_cohort= pd.read_pickle(data_path + 'merged_data/maf_cohort.pkl')
maf_cohort = maf_cohort.dropna(subset=["Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Variant_Allele"])
# Placeholder annotation columns.
# NOTE(review): the placeholder is the string 'NaN', not a missing value, so
# isna()/dropna() will not treat these cells as empty.
maf_cohort['mutationEffect'] = 'NaN' ; maf_cohort['oncogenic'] = 'NaN' ; maf_cohort['vus'] = 'NaN' ; maf_cohort['hotspot'] = 'NaN'
# Distinct start positions among rows 120000-129999 (positional slice).
set(maf_cohort[120000:130000]["Start_Position"])
# Chunked annotation run, left disabled in this cell:
#fill_with_queries(maf_cohort, 10000)

CPU times: user 538 ms, sys: 124 ms, total: 661 ms
Wall time: 661 ms


{134873092.0,
 46268421.0,
 48955419.0,
 137527344.0,
 68845619.0,
 31424569.0,
 89391162.0,
 40370235.0,
 30670915.0,
 70352972.0,
 139395153.0,
 48955480.0,
 139395161.0,
 393310.0,
 117866591.0,
 89391202.0,
 48955494.0,
 161284198.0,
 56852589.0,
 30670958.0,
 68845679.0,
 29556861.0,
 393345.0,
 115114118.0,
 393351.0,
 393357.0,
 115114126.0,
 30671005.0,
 18972832.0,
 56852643.0,
 68845744.0,
 32899248.0,
 20480176.0,
 68845757.0,
 15302860.0,
 11272403.0,
 139395288.0,
 88670433.0,
 42795241.0,
 88670452.0,
 40730874.0,
 27099387.0,
 28901639.0,
 115114249.0,
 46268694.0,
 40730904.0,
 36208923.0,
 11272479.0,
 140476714.0,
 65306928.0,
 70451507.0,
 76939577.0,
 151945532.0,
 70451522.0,
 42795334.0,
 103514440.0,
 27099479.0,
 47677792.0,
 65306977.0,
 201982311.0,
 11141487.0,
 65306997.0,
 37650818.0,
 16056710.0,
 37650823.0,
 89883017.0,
 2130319.0,
 76939674.0,
 37880219.0,
 37880220.0,
 11141550.0,
 133759411.0,
 30671286.0,
 81953214.0,
 47088064.0,
 30671296.0,
 61145

In [134]:
# Persist the current state of maf_cohort.
# NOTE(review): the fill_with_queries call above is commented out, so confirm
# that the annotation columns are actually populated before relying on this file.
maf_cohort.to_pickle(data_path + 'merged_data/maf_cohort_annotated.pkl')

In [142]:
# Display rows 170000-180000 (positional slice) to inspect the annotation columns.
maf_cohort[170000:180001]

Unnamed: 0,Sample_Id,Tumor_Id,purity,ploidy,dipLogR,frac_loh,Patient_Id,Patient_Current_Age,Cancer_Type,Cancer_Type_Detailed,Ethnicity_Category,Sex,Mutation_Count,Sample_Type,samples_per_patient,Overall Survival Status,Overall Survival (Months),MSI Score,TMB_Score,gene,Gene_Id,Variant_Classification,proteinChange,Start_Position,End_Position,Reference_Allele,Variant_Allele,Chromosome,Hugo_Symbol,alt_count,ref_count,mut_key,sample_mut_key,mut_spot,vaf,mutationEffect,oncogenic,vus,hotspot
170749,P-0008997-T01-IM5_P-0008997-N01-IM5,P-0008997-T01-IM5,0.615154,2.475989,-0.197115,0.200,P-0008997,59.0,Melanoma,Cutaneous Melanoma,Non-Spanish; Non-Hispanic,Female,67.0,Metastasis,1.0,LIVING,45.600,1.35,65.9,"{'entrezGeneId': 695, 'hugoGeneSymbol': 'BTK', 'type': 'protein-coding'}",695.0,Missense_Mutation,K185N,100617194.0,100617194.0,T,A,23,BTK,789.0,625.0,23_100617194_T_A,P-0008997-T01-IM5_23_100617194_T_A,185,0.557992,,,,
170750,P-0008997-T01-IM5_P-0008997-N01-IM5,P-0008997-T01-IM5,0.615154,2.475989,-0.197115,0.200,P-0008997,59.0,Melanoma,Cutaneous Melanoma,Non-Spanish; Non-Hispanic,Female,67.0,Metastasis,1.0,LIVING,45.600,1.35,65.9,"{'entrezGeneId': 4068, 'hugoGeneSymbol': 'SH2D1A', 'type': 'protein-coding'}",4068.0,Missense_Mutation,V102F,123504128.0,123504128.0,G,T,23,SH2D1A,289.0,754.0,23_123504128_G_T,P-0008997-T01-IM5_23_123504128_G_T,102,0.277085,,,,
170751,P-0008997-T01-IM5_P-0008997-N01-IM5,P-0008997-T01-IM5,0.615154,2.475989,-0.197115,0.200,P-0008997,59.0,Melanoma,Cutaneous Melanoma,Non-Spanish; Non-Hispanic,Female,67.0,Metastasis,1.0,LIVING,45.600,1.35,65.9,"{'entrezGeneId': 4763, 'hugoGeneSymbol': 'NF1', 'type': 'protein-coding'}",4763.0,Frame_Shift_Del,L164*,29496919.0,29496919.0,T,-,17,NF1,520.0,389.0,17_29496919_T_-,P-0008997-T01-IM5_17_29496919_T_-,164,0.572057,,,,
170752,P-0008997-T01-IM5_P-0008997-N01-IM5,P-0008997-T01-IM5,0.615154,2.475989,-0.197115,0.200,P-0008997,59.0,Melanoma,Cutaneous Melanoma,Non-Spanish; Non-Hispanic,Female,67.0,Metastasis,1.0,LIVING,45.600,1.35,65.9,"{'entrezGeneId': 11122, 'hugoGeneSymbol': 'PTPRT', 'type': 'protein-coding'}",11122.0,Missense_Mutation,R637Q,40944591.0,40944592.0,CC,TT,20,PTPRT,122.0,654.0,20_40944591_CC_TT,P-0008997-T01-IM5_20_40944591_CC_TT,637,0.157216,,,,
170753,P-0037516-T01-IM6_P-0037516-N01-IM6,P-0037516-T01-IM6,0.599926,2.100641,-0.042908,0.038,P-0037516,61.0,Hepatobiliary Cancer,Hepatocellular Carcinoma,Non-Spanish; Non-Hispanic,Male,4.0,Primary,1.0,LIVING,12.427,0.06,4.4,"{'entrezGeneId': 1499, 'hugoGeneSymbol': 'CTNNB1', 'type': 'protein-coding'}",1499.0,Missense_Mutation,T41A,41266124.0,41266124.0,A,G,3,CTNNB1,46.0,245.0,3_41266124_A_G,P-0037516-T01-IM6_3_41266124_A_G,41,0.158076,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180807,P-0012680-T01-IM5_P-0012680-N01-IM5,P-0012680-T01-IM5,0.797244,3.101509,-0.525153,0.075,P-0012680,85.0,Colorectal Cancer,Colorectal Adenocarcinoma,Non-Spanish; Non-Hispanic,Male,9.0,Metastasis,1.0,DECEASED,0.888,1.63,8.9,"{'entrezGeneId': 3845, 'hugoGeneSymbol': 'KRAS', 'type': 'protein-coding'}",3845.0,Missense_Mutation,G12V,25398284.0,25398284.0,C,A,12,KRAS,433.0,564.0,12_25398284_C_A,P-0012680-T01-IM5_12_25398284_C_A,12,0.434303,,,,
180808,P-0012680-T01-IM5_P-0012680-N01-IM5,P-0012680-T01-IM5,0.797244,3.101509,-0.525153,0.075,P-0012680,85.0,Colorectal Cancer,Colorectal Adenocarcinoma,Non-Spanish; Non-Hispanic,Male,9.0,Metastasis,1.0,DECEASED,0.888,1.63,8.9,"{'entrezGeneId': 1027, 'hugoGeneSymbol': 'CDKN1B', 'type': 'protein-coding'}",1027.0,Nonsense_Mutation,Q20*,12870831.0,12870831.0,C,T,12,CDKN1B,284.0,347.0,12_12870831_C_T,P-0012680-T01-IM5_12_12870831_C_T,20,0.450079,,,,
180809,P-0012680-T01-IM5_P-0012680-N01-IM5,P-0012680-T01-IM5,0.797244,3.101509,-0.525153,0.075,P-0012680,85.0,Colorectal Cancer,Colorectal Adenocarcinoma,Non-Spanish; Non-Hispanic,Male,9.0,Metastasis,1.0,DECEASED,0.888,1.63,8.9,"{'entrezGeneId': 196528, 'hugoGeneSymbol': 'ARID2', 'type': 'protein-coding'}",196528.0,Splice_Site,X1758_splice,46287412.0,46287412.0,G,T,12,ARID2,204.0,706.0,12_46287412_G_T,P-0012680-T01-IM5_12_46287412_G_T,1758,0.224176,,,,
180810,P-0012680-T01-IM5_P-0012680-N01-IM5,P-0012680-T01-IM5,0.797244,3.101509,-0.525153,0.075,P-0012680,85.0,Colorectal Cancer,Colorectal Adenocarcinoma,Non-Spanish; Non-Hispanic,Male,9.0,Metastasis,1.0,DECEASED,0.888,1.63,8.9,"{'entrezGeneId': 3643, 'hugoGeneSymbol': 'INSR', 'type': 'protein-coding'}",3643.0,Missense_Mutation,R413H,7172331.0,7172331.0,C,T,19,INSR,315.0,857.0,19_7172331_C_T,P-0012680-T01-IM5_19_7172331_C_T,413,0.268771,,,,


In [24]:
# Scratch check of slice semantics: l[2:] drops the first two items.
l = [1,2,3,4,5,6,7]
l[2:]

[3, 4, 5, 6, 7]