# <span style='font-family:"Times New Roman"'>**CBIOPORTAL EXTRACTION OF MSK IMPACT MUTATIONS**</span>
## <span style='font-family:"Times New Roman"'>*Emile Cohen*</span>
 *March 2020*

**Goal:** In this notebook, we call the cBioPortal API to download the mutations of the entire MSK-IMPACT cohort and save them as a single table.

---

# Building a usable DataFrame from the cBioPortal datasets

In [4]:
# Run the shared project setup script. The -i flag executes it in this
# notebook's interactive namespace, so names it defines stay available here.
# NOTE(review): `pd` is used by later cells without an import in this
# notebook, so it is presumably provided by this script -- confirm.
%run -i '../../utils/setup_environment.ipy'

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Root folder for input and output files; later cells build file paths by
# appending to this string, so the trailing slash is required.
data_path = '../../data/'

import json
import os
import pprint
import time

import click
import requests

Setup environment... done!


<span style="color:green">✅ Working on **mskimpact_env** conda environment.</span>

## Extract the dataset from the cBioPortal API
First, we load the sample list from the clinical data table that was previously downloaded from cBioPortal.

In [5]:
# Load the clinical data table (tab-separated) that was downloaded earlier;
# its 'Sample ID' column is what the mutation query below is built from.
samples = pd.read_csv(
    data_path + 'cbioportal/raw/mskimpact_clinical_data-2.tsv',
    delimiter='\t',
)

In [10]:
%%time

def retry_request(method, **kwargs):
    """Send an HTTP request through ``requests``, retrying transient failures.

    An attempt counts as failed when the call raises
    ``requests.exceptions.RequestException`` or when the response status code
    starts with "50". Any other response (including 4xx errors) is returned
    immediately without retrying.

    Args:
        method: Name of the ``requests`` function to call, e.g. ``'get'`` or
            ``'post'``.
        **kwargs: Keyword arguments forwarded to that function (``url``,
            ``data``, ``headers``, ...). ``verify`` defaults to ``False`` when
            the caller does not supply it.

    Returns:
        The response of the last attempt, or ``None`` if the last attempt
        raised a ``RequestException``. Callers must check for ``None`` and for
        error status codes themselves.
    """
    url = kwargs.get('url') or ""
    # SECURITY: certificate verification is off by default, as in the original
    # implementation. Pass verify=True (or a CA bundle path) to turn it on.
    kwargs.setdefault('verify', False)
    click.secho(f"Querying to {url}", fg="green")

    # Pause (in seconds) before each retry; there is one attempt per entry.
    delays = [0.2, 1, 5, 10, 60, 90, 120, 300]
    response = None
    for attempt, delay in enumerate(delays, start=1):
        error = None
        try:
            response = getattr(requests, method)(**kwargs)
        except requests.exceptions.RequestException as request_error:  # pragma: no cover
            error = request_error
            response = None

        if response is not None and not str(response.status_code).startswith("50"):
            break

        # Report the exception if there was one, otherwise the 5xx status.
        reason = error if error is not None else f"HTTP {response.status_code}"
        if attempt == len(delays):
            # No attempt follows, so do not sleep before giving up.
            msg = f"Request to {url} failed with error: {reason}, giving up after {attempt} attempts."
            click.secho(msg, fg="red", err=True)
            break
        msg = f"Request to {url} failed with error: {reason}, retrying in {delay}s..."
        click.secho(msg, fg="yellow", err=True)
        time.sleep(delay)
    return response

# cBioPortal REST endpoints, keyed by the resource they return.
# NOTE(review): '{pid}' in the 'samples' URL looks like a placeholder for a
# patient identifier to be filled in by the caller -- confirm before use.
endpoints = dict([
    ('samples', 'https://cbioportal.mskcc.org/api/studies/mskimpact/patients/{pid}/samples?projection=DETAILED'),
    ('mutations', 'https://cbioportal.mskcc.org/api/molecular-profiles/mskimpact_mutations/mutations/fetch?projection=DETAILED'),
    ('segments', 'https://cbioportal.mskcc.org/api/copy-number-segments/fetch?projection=DETAILED'),
])

# --- Authentication ---------------------------------------------------------
# The bearer token is read from the CBIOPORTAL_TOKEN environment variable so
# the credential is never stored in the notebook.
# SECURITY: a token was previously hard-coded in this cell; it should be
# treated as leaked and revoked.
api_token = os.environ.get("CBIOPORTAL_TOKEN")
if not api_token:
    raise RuntimeError(
        "CBIOPORTAL_TOKEN is not set. Export a valid cBioPortal data access "
        "token in the environment before running this cell."
    )

headers = {
    "Authorization": f"Bearer {api_token}",
    "Content-Type": "application/json",
}

# --- Fetch mutations for every sample of the clinical table -----------------
# Missing identifiers are dropped: json.dumps would serialise NaN as the bare
# token NaN, which is not valid JSON.
sampleIds = samples['Sample ID'].dropna().tolist()

# Defined up front so the following cells still work when there is nothing to
# query (they then get an empty table instead of a NameError).
mutations = []

if sampleIds:
    data = {'sampleIds': sampleIds}
    response = retry_request(
        'post',
        url=endpoints['mutations'],
        data=json.dumps(data),
        headers=headers
    )
    # retry_request returns None when the last attempt raised, and returns
    # error responses unchanged, so both cases are checked before parsing.
    if response is None:
        raise RuntimeError(
            f"No response received from {endpoints['mutations']} after all retries."
        )
    response.raise_for_status()
    # Flatten each record: the fields of the nested "gene" object are copied to
    # the top level (overriding same-named keys); the "gene" key itself is kept.
    mutations = [{**d, **d["gene"]} for d in response.json()]

Querying to https://cbioportal.mskcc.org/api/molecular-profiles/mskimpact_mutations/mutations/fetch?projection=DETAILED




CPU times: user 12.9 s, sys: 1.5 s, total: 14.4 s
Wall time: 47.3 s


'if samples.any().any():\n    sampleIds = [sample[\'Sample ID\'] for sample in samples]\n    #click.secho(f\'Patient {pid} has samples: {", ".join(sampleIds)}\')\n    # Get Mutations for all DMP Samples\n    data = {\'sampleIds\': sampleIds}\n    response = retry_request(\n        \'post\',\n        url=endpoints[\'mutations\'], \n        data=json.dumps(data),\n        headers=headers\n    )'

In [11]:
# One row per mutation record returned by the API call above; the columns are
# the keys of the flattened dictionaries.
mut_table = pd.DataFrame(data=mutations)
mut_table

In [None]:
# Persist the cohort-wide mutation table next to the other raw cBioPortal files.
mut_table.to_pickle(
    path=data_path + 'cbioportal/raw/mutations_cohort.pkl'
)