# <span style='font-family:"Times New Roman"'> <span style=''> **OncoKB Annotation** </span></span>

*Emile Cohen* 
    
*March 2020*

**Goal:** In this notebook, we call the OncoKB API to get variant-level annotations for the MSK-IMPACT cohort. 

---

In [1]:
%run -i '../../utils/setup_environment.ipy'

import warnings
import requests
import click
import os
import json
import time
from tqdm import tqdm_notebook as tqdm
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Folder holding the project data; every path below is built by appending to it.
data_path = '../../data/'

Setup environment... done!


<span style="color:green">✅ Working on **mskimpact_env** conda environment.</span>

# API Call

---
To call the API, we need an access token, which can be found after logging in to OncoKB (https://www.oncokb.org/dataAccess), under *Account Settings*.

The token is a personal secret: keep it out of the notebook (for example in the `ONCOKB_TOKEN` environment variable) rather than pasting it here.

---

## Useful functions to call OncoKB API

In [142]:
def retry_request(method, **kwargs):
    """Send an HTTP request with ``requests``, retrying on transient failures.

    Up to three attempts are made. An attempt is considered failed when it raises a
    ``requests.exceptions.RequestException`` or when the status code starts with
    ``50``. The function pauses 0.2 s and then 1 s between attempts; nothing is
    slept after the last one since no further attempt follows.

    Args:
        method: Name of the ``requests`` function to call (e.g. ``"get"``, ``"post"``).
        **kwargs: Forwarded to that function (``url``, ``headers``, ``data``...).
            ``verify`` defaults to ``False`` when the caller does not provide it.

    Returns:
        The response of the last attempt, or ``None`` if that attempt raised.
    """
    url = kwargs.get('url') or ""
    click.secho(f"Querying to {url}", fg="green")

    # NOTE(review): certificate verification stays disabled by default to keep the
    # previous behaviour, but callers can now pass verify=True (this used to raise
    # a TypeError because ``verify`` was passed twice). Consider enabling it, since
    # the request carries a bearer token.
    kwargs.setdefault("verify", False)

    send = getattr(requests, method)
    pauses = (0.2, 1)  # seconds to wait before the 2nd and the 3rd attempt
    response = None
    for attempt in range(len(pauses) + 1):
        error = None
        try:
            response = send(**kwargs)
        except requests.exceptions.RequestException as request_error:  # pragma: no cover
            error = request_error
            response = None

        if response is not None and not str(response.status_code).startswith("50"):
            break

        # Report the HTTP status when no exception was raised, instead of "None".
        reason = error if error is not None else f"HTTP {response.status_code}"
        if attempt < len(pauses):  # pragma: no cover
            delay = pauses[attempt]
            msg = f"Request to {url} failed with error: {reason}, retrying in {delay}s..."
            click.secho(msg, fg="yellow", err=True)
            time.sleep(delay)
        else:  # pragma: no cover
            msg = f"Request to {url} failed with error: {reason}, giving up."
            click.secho(msg, fg="red", err=True)
    return response


# Common prefix of every OncoKB REST endpoint used in this notebook.
_ONCOKB_API_ROOT = "https://www.oncokb.org/api/v1"

# Short endpoint name -> absolute URL, looked up by the query helpers below.
ONCOKB_ENDPOINTS = {
    name: f"{_ONCOKB_API_ROOT}/{path}"
    for name, path in (
        ("api_info", "info"),
        ("cancer_genes", "utils/cancerGeneList"),
        ("annotate", "annotate/mutations/byGenomicChange"),
    )
}


def oncokb_query(endpoint, method="get", data=None):
    """Make an authenticated call to the OncoKB API and return the decoded JSON.

    The API token is read from the ``ONCOKB_TOKEN`` environment variable so that
    the secret does not have to be written in the notebook.

    Args:
        endpoint: Full URL to query (see ``ONCOKB_ENDPOINTS``).
        method: Name of the ``requests`` function used by ``retry_request``.
        data: Request body, already serialized (e.g. with ``json.dumps``), or ``None``.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        click.UsageError: If the token is missing, if no response could be
            obtained, or if the response is not OK.
    """
    oncokb_token = os.environ.get("ONCOKB_TOKEN")
    if not oncokb_token:
        raise click.UsageError(
            "oncoKb authentication token is missing. "
            "Add it as enviromental variable:"
            "\n\texport ONCOKB_TOKEN=<your-oncokb-token>"
            "\nOr add them to a .env file."
        )
    headers = {
        "Authorization": f"Bearer {oncokb_token}",
        "Content-Type": "application/json",
    }
    response = retry_request(method, url=endpoint, headers=headers, data=data)
    # retry_request returns None when every attempt raised; report it as a query
    # failure instead of crashing on ``None.ok``.
    if response is None:
        raise click.UsageError("OncoKb query failed. Reason: no response received")
    if not response.ok:
        raise click.UsageError(f"OncoKb query failed. Reason: {response.reason}")
    return json.loads(response.content)


def get_oncokb_treatments(df, genomic_cols, tumor_type):
    """Annotate every variant of ``df`` with OncoKB and return the raw API answer.

    Args:
        df: DataFrame with one variant per row.
        genomic_cols: Five column names, in this order: chromosome, start, end,
            reference allele, alternate allele.
        tumor_type: Value sent as ``tumorType`` for every variant.

    Returns:
        The decoded JSON returned by ``oncokb_query`` for the ``annotate``
        endpoint. No column is added to ``df``.

    Raises:
        ValueError: If ``genomic_cols`` does not contain exactly five names.
    """
    chrom_col, start_col, end_col, ref_col, alt_col = genomic_cols
    location_cols = [chrom_col, start_col, end_col, ref_col, alt_col]
    # Iterate over rows positionally: going through ``df.T.to_dict()`` keyed the
    # rows by index label, so rows sharing a label were silently dropped.
    data = [
        {
            "genomicLocation": ",".join(map(str, row)),
            "tumorType": tumor_type,
            "id": "",
            "evidenceTypes": [],
        }
        for row in df[location_cols].itertuples(index=False, name=None)
    ]
    annotated_variants = oncokb_query(
        ONCOKB_ENDPOINTS["annotate"], method="post", data=json.dumps(data)
    )
    return annotated_variants


def annotate_variants(df):
    """Annotate MAF-style variants with OncoKB.

    Args:
        df: DataFrame with the columns ``Chromosome``, ``Start_Position``,
            ``End_Position``, ``Reference_Allele``, ``Variant_Allele`` and
            ``Cancer_Type``. Positions must be convertible with ``int``.

    Returns:
        One ``[knownEffect, oncogenic, vus, hotspot]`` list per annotated variant.
        If the query or the parsing of its answer fails, one
        ``['None', 'None', 'None', 'None']`` placeholder per input row is returned
        instead, so the result can still be lined up with ``df``.
    """
    needed_cols = [
        "Chromosome", "Start_Position", "End_Position",
        "Reference_Allele", "Variant_Allele", "Cancer_Type",
    ]
    # "records" walks the rows positionally; ``df.T.to_dict()`` keyed them by index
    # label and therefore lost rows sharing a label.
    data = [
        {
            "genomicLocation": "{},{},{},{},{}".format(
                v["Chromosome"],
                int(v["Start_Position"]),
                int(v["End_Position"]),
                v["Reference_Allele"],
                v["Variant_Allele"],
            ),
            "tumorType": v["Cancer_Type"],
            "id": "",
            "evidenceTypes": [],
        }
        for v in df[needed_cols].to_dict("records")
    ]
    try:
        annotated_variants = oncokb_query(
            ONCOKB_ENDPOINTS["annotate"], method="post", data=json.dumps(data)
        )
        return [
            [v['mutationEffect']['knownEffect'], v['oncogenic'], v['vus'], v['hotspot']]
            for v in annotated_variants
        ]
    except Exception as error:
        # Best effort: keep going with placeholders, but say why. Unlike a bare
        # ``except``, this lets KeyboardInterrupt stop a long run.
        click.secho(f"OncoKB annotation failed: {error}", fg="yellow", err=True)
        return [['None', 'None', 'None', 'None'] for _ in data]

def get_oncokb_genes():
    """Query the OncoKB ``cancer_genes`` endpoint and return the decoded answer."""
    cancer_genes_url = ONCOKB_ENDPOINTS["cancer_genes"]
    return oncokb_query(cancer_genes_url)


### Use Chunks and Multiprocessing to download faster

In [105]:
# Load the merged cohort and discard the variants missing any genomic-location field.
maf_cohort = (
    pd.read_pickle(data_path + 'merged_data/maf_cohort.pkl')
    .dropna(subset=["Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Variant_Allele"])
)

In [41]:
%%time
# Annotate the whole cohort in one call: annotate_variants sends every row of the
# frame in a single request body.
query = annotate_variants(maf_cohort)

[32mQuerying to https://www.oncokb.org/api/v1/annotate/mutations/byGenomicChange[0m
CPU times: user 23.1 s, sys: 553 ms, total: 23.7 s
Wall time: 24 s


In [66]:
# One row per annotated variant, one column per field extracted by annotate_variants.
new_data = pd.DataFrame(
    data=query,
    columns=['mutationEffect', 'oncogenic', 'vus', 'hotspot'],
)

new_data

Unnamed: 0,mutationEffect,oncogenic,vus,hotspot
0,Gain-of-function,Oncogenic,False,True
1,Gain-of-function,Oncogenic,False,True


In [None]:
import time

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items."""
    starts = range(0, len(lst), n)
    for start in starts:
        stop = start + n
        yield lst[start:stop]


def fill_with_queries(df: pd.DataFrame, chunk_size: int):
    """Annotate ``df`` chunk by chunk and save every annotated chunk to disk.

    Each chunk of ``chunk_size`` rows is sent to ``annotate_variants``; the four
    annotation columns are written next to the chunk's rows and the chunk is
    pickled under ``<data_path>annotation_bis/`` (the folder is created if needed).

    Args:
        df: Variants to annotate, one per row. It is not modified.
        chunk_size: Number of rows sent per query; must be at least 1.

    Returns:
        A new DataFrame with the rows of ``df`` (same index, same order) and the
        columns ``mutationEffect``, ``oncogenic``, ``vus`` and ``hotspot`` filled
        in. ``df`` itself is returned when it has no rows.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    annotation_columns = ['mutationEffect', 'oncogenic', 'vus', 'hotspot']
    maf_chunks = list(chunks(df, chunk_size))
    if not maf_chunks:
        return df
    offsets = [chunk_size * x for x in range(len(maf_chunks))]

    os.makedirs(data_path + 'annotation_bis', exist_ok=True)

    annotated_chunks = []
    for i, chunk in tqdm(zip(offsets, maf_chunks), total=len(maf_chunks)):
        query = annotate_variants(chunk)
        # Pad/truncate to the chunk length so a short answer (e.g. a failure
        # placeholder) leaves the remaining rows empty instead of raising.
        new_data = pd.DataFrame(query, columns=annotation_columns).reindex(range(len(chunk)))

        # Work on a copy (the chunk is a slice of ``df``) and assign by position:
        # ``new_data`` is indexed 0..n-1 whereas the chunk keeps the cohort's own
        # labels, so a label-aligned assignment would produce NaN.
        df_chunk = chunk.copy()
        for column in annotation_columns:
            df_chunk[column] = new_data[column].values

        df_chunk.to_pickle(data_path + 'annotation_bis/maf_cohort_annotated_{}_{}.pkl'.format(i, i + chunk_size))
        annotated_chunks.append(df_chunk)

    return pd.concat(annotated_chunks)

In [None]:
%%time
maf_cohort = pd.read_pickle(data_path + 'merged_data/maf_cohort.pkl')

# Drop the variants without a complete genomic location: real missing values first,
# then the literal 'NA' placeholders.
genomic_cols = ["Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Variant_Allele"]
maf_cohort = maf_cohort.dropna(subset=genomic_cols)

# Build one mask on the frame it is applied to. Chaining maf_cohort[m1][m2]... applied
# masks computed on the unfiltered frame and relied on pandas re-aligning them.
has_location = pd.Series(True, index=maf_cohort.index)
for genomic_col in genomic_cols:
    has_location &= maf_cohort[genomic_col] != 'NA'

# Copy before adding columns so the assignments do not target a filtered view.
maf_cohort = maf_cohort[has_location].copy()
for annotation_col in ['mutationEffect', 'oncogenic', 'vus', 'hotspot']:
    maf_cohort[annotation_col] = 'NaN'  # string placeholder, not a real missing value

# Try the pipeline on the first variant only.
maf_cohort_mini = maf_cohort.iloc[0:1]
fill_with_queries(maf_cohort_mini, 1)

In [78]:
# Genomic-location columns of a 100-row positional window of the cohort.
test = maf_cohort[97500:97600][["Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Variant_Allele"]]

# Rows whose genomic location contains no 'NA' placeholder. The five conditions are
# combined into one mask instead of chaining maf_cohort[m1][m2]..., which applied
# masks built on the full frame to already-filtered frames.
maf_cohort[
    (maf_cohort['Chromosome'] != 'NA')
    & (maf_cohort['Start_Position'] != 'NA')
    & (maf_cohort['End_Position'] != 'NA')
    & (maf_cohort['Reference_Allele'] != 'NA')
    & (maf_cohort['Variant_Allele'] != 'NA')
]

Unnamed: 0,Chromosome,Start_Position,End_Position,Reference_Allele,Variant_Allele
97946,2,25982440.0,25982440.0,G,A
97947,9,98270529.0,98270530.0,-,C
97948,6,106553389.0,106553389.0,G,A
97949,4,66356133.0,66356133.0,C,T
97950,7,152345760.0,152345760.0,A,-
97951,17,78919466.0,78919466.0,G,T
97952,17,15967388.0,15967388.0,G,A
97953,16,30129462.0,30129462.0,C,T
97954,16,339494.0,339494.0,C,T
97955,23,44918328.0,44918328.0,C,T


In [134]:
# Save the current `maf_cohort` frame under merged_data/.
# NOTE(review): which annotation values it holds depends on the cells run before this one — confirm.
maf_cohort.to_pickle(data_path + 'merged_data/maf_cohort_annotated.pkl')

In [138]:
# Now we have to merge all the files
from os import listdir
from os.path import isfile, join
# listdir() returns entries in arbitrary order: sort them so that the merged frame is
# built in the same order on every run.
files = [
    data_path + 'annotation/' + f
    for f in sorted(listdir(data_path + 'annotation'))
    if isfile(join(data_path + 'annotation', f))
]
# NOTE(review): read_pickle executes whatever the file contains; only keep files
# produced by this notebook in that folder.
data = [pd.read_pickle(f) for f in files]

In [139]:
# Stack the frames loaded into `data` into one DataFrame (each keeps its own index labels).
maf_annotated_total = pd.concat(data)

In [140]:
# Persist the merged frame directly under the data folder.
maf_annotated_total.to_pickle(data_path + 'maf_cohort_annotated.pkl')

In [None]:
# Preview the 0:10 slice of the merged frame as a quick sanity check.
maf_annotated_total[0:10]