### cBioPortal REST API test

List of studies:
- https://github.com/cBioPortal/datahub/tree/master/public
    - if needed you could parse the name of studies from the website and feed them into here
    
    
Issues:
- is this the proper REST API usage?
- I can only query 1000 rows at a time 
    - ie. pageSize=1000 parameter

#### The raw data from REST API

In [1]:
import pandas as pd

pd.read_json('http://cbioportal-rc.herokuapp.com/api/studies/acyc_mskcc_2013/clinical-data?projection=SUMMARY&pageNumber=0&pageSize=1000&direction=ASC').head(8)

Unnamed: 0,clinicalAttributeId,entityId,value
0,CANCER_TYPE,ACYC-MSKCC_09_12352,Salivary Gland Cancer
1,CANCER_TYPE_DETAILED,ACYC-MSKCC_09_12352,Adenoid Cystic Carcinoma
2,METASTATIC_SITE,ACYC-MSKCC_09_12352,Bone
3,METASTATIC_TUMOR_INDICATOR,ACYC-MSKCC_09_12352,No
4,ONCOTREE_CODE,ACYC-MSKCC_09_12352,ACYC
5,PERINEURAL_INVASION,ACYC-MSKCC_09_12352,Microscopic
6,PLATFORM,ACYC-MSKCC_09_12352,WGS/WES
7,PRIMARY_SITE,ACYC-MSKCC_09_12352,Salivary gland


In [2]:
def json_to_df(df):
    # get the unique entities - study subjects
    unique_ids = set(df.entityId)
    # get unique attributes - attributes in this study
    unique_attrib = set(df.clinicalAttributeId)
    # create a dataframe from (study subjects x attributes)
    new_df = pd.DataFrame(index = unique_ids, columns = unique_attrib)
    # go through every id
    for i in unique_ids:
        # get the attribute value
        for j in unique_attrib:
            try:
                # extract value
                value = df.loc[(df['entityId']== i)&(df['clinicalAttributeId'] == j)]
                # set the value in dataframe
                new_df.set_value(index = i, col = j, value = value.values[0,2])
            # in case the value isn't there which is quite often
            except IndexError:
                pass
    return new_df



In [9]:
df1 = json_to_df(pd.read_json('http://cbioportal-rc.herokuapp.com/api/studies/acyc_mskcc_2013/clinical-data?projection=SUMMARY&pageNumber=0&pageSize=1000&direction=ASC'))
df2 = json_to_df(pd.read_json('http://cbioportal-rc.herokuapp.com/api/studies/acbc_mskcc_2015/clinical-data?projection=SUMMARY&pageNumber=0&direction=ASC'))
df3 = json_to_df(pd.read_json('http://cbioportal-rc.herokuapp.com/api/studies/ov_tcga/clinical-data?projection=SUMMARY&pageNumber=0&direction=ASC'))
df4 = json_to_df(pd.read_json('http://cbioportal-rc.herokuapp.com/api/studies/acyc_sanger_2013/clinical-data?projection=SUMMARY&pageNumber=0&direction=ASC'))
df5 = json_to_df(pd.read_json('http://cbioportal-rc.herokuapp.com/api/studies/egc_tmucih_2015/clinical-data?projection=SUMMARY&pageNumber=0&direction=ASC'))
df6 = json_to_df(pd.read_json('http://cbioportal-rc.herokuapp.com/api/studies/ccrcc_utokyo_2013/clinical-data?projection=SUMMARY&pageNumber=0&direction=ASC'))
df7 = json_to_df(pd.read_json('http://cbioportal-rc.herokuapp.com/api/studies/lgg_tcga/clinical-data?projection=SUMMARY&pageNumber=0&direction=ASC'))
df8 = json_to_df(pd.read_json('http://cbioportal-rc.herokuapp.com/api/studies/lihc_amc_prv/clinical-data?projection=SUMMARY&pageNumber=0&direction=ASC'))


df1.head(5)

Unnamed: 0,SAMPLE_TYPE,CANCER_TYPE_DETAILED,ONCOTREE_CODE,PRIMARY_SITE,METASTATIC_TUMOR_INDICATOR,TUMOR_TISSUE_SITE,CANCER_TYPE,PERINEURAL_INVASION,METASTATIC_SITE,TUMOR_STAGE,PLATFORM
ACYC-MSKCC_000236,Primary,Adenoid Cystic Carcinoma,ACYC,Head and Neck,No,Maxilla,Salivary Gland Cancer,Yes,,Advanced,WGS/WES
ACYC-MSKCC_2000136,Primary,Adenoid Cystic Carcinoma,ACYC,Oral cavity,No,Hard palate,Salivary Gland Cancer,Yes,,Advanced,WGS/WES
ACYC-MSKCC_002282,Primary,Adenoid Cystic Carcinoma,ACYC,Oral cavity,No,Hard palate,Salivary Gland Cancer,Yes,,Early,WGS/WES
ACYC-MSKCC_2453,Primary,Adenoid Cystic Carcinoma,ACYC,Oral cavity,No,Palate,Salivary Gland Cancer,Yes,,Advanced,WGS/WES
ACYC-MSKCC_80872,Primary,Adenoid Cystic Carcinoma,ACYC,Head and Neck,No,Thyroid,Salivary Gland Cancer,No,,Advanced,WGS/WES


### Jaccardi similarity between study attributes

In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter



def similarity(df1, df2):
    # get the unique attributes
    study1_attrs=set(df1.columns.values)
    study2_attrs=set(df2.columns.values)

    
    #########################
    # JACCARD COEFFICIENT   #
    #########################
    
    # create a dataframe of desired dimension
    jaccard = pd.DataFrame(index=study1_attrs, columns=study2_attrs)
    jaccard = jaccard.fillna(0.0)

    df1 = df1.reset_index()
    df2 = df2.reset_index()

    
    # loop over unique values by column per dataframe
    # first go over unique values in dataframe1 columns
    for i in study1_attrs:
        unique_vals_1 = set(df1[i].unique())
        # then go over unique values in dataframe2 columns
        for j in study2_attrs:
            # get unique vals for dataframe2
            unique_vals_2 = set(df2[j].unique())
            # get the cardinality
            intersection_cardinality = float(len(set.intersection(unique_vals_1,unique_vals_2)))
            union_cardinality = float(len(set.union(unique_vals_1,unique_vals_2)))
            # jaccard coefficient
            jaccard_coef = float(intersection_cardinality/union_cardinality)
            # set the value into pandas dataframe
            jaccard.set_value(i, j, jaccard_coef)

    # if you print 'df' you can actually get the jaccard coefficient between each column per study
    
    #########################
    # COSINE SIMILARITY   #
    #########################
    
    cosine = pd.DataFrame(index=study1_attrs, columns=study2_attrs)
    cosine = cosine.fillna(0.0)
    
    df1 = df1.reset_index()
    df2 = df2.reset_index()


    for i in study1_attrs:
        # get all values for dataframe1 column i
        values_1 = df1[i].values.astype('str').tolist()
        # get counts per value values for dataframe1
        a_vals = Counter(values_1)

        # then go over values in dataframe2 columns
        for j in study2_attrs:
            # get all values for dataframe2 column j
            values_2 = df2[j].values.astype('str').tolist()            
            # count word occurrences
            b_vals = Counter(values_2)

            # convert to word-vectors
            words  = list(a_vals.keys() | b_vals.keys())
            a_vect = [a_vals.get(word, 0) for word in words]       
            b_vect = [b_vals.get(word, 0) for word in words]       

            # find cosine
            # length of each vector
            len_a  = sum(av*av for av in a_vect) ** 0.5           
            len_b  = sum(bv*bv for bv in b_vect) ** 0.5          
            # dot product of vectors
            dot    = sum(av*bv for av,bv in zip(a_vect, b_vect)) 
            # get the score
            cosine_score = dot / (len_a * len_b)
            # set the value in the place where we track it
            cosine.set_value(i, j, cosine_score)


    # now format the presentation of similar attributes by jaccard
    similar_attributes_value_jacardi = ''
    for i in (jaccard.index.values):
        for j in (jaccard.columns.values):
            if(jaccard.get_value(i,j) > 0.5):
                similar_attributes_value_jacardi += (i +' and ' + j + ' with Jacardi coef = ' + str(jaccard.get_value(i,j)) + ', ')


    # now format the presentation of similar attributes by jaccard
    similar_attributes_value_cosine = ''
    for i in (cosine.index.values):
        for j in (cosine.columns.values):
            if(cosine.get_value(i,j) > 0.5):
                similar_attributes_value_cosine += (i +' and ' + j + ' with Cosine score = ' + str(cosine.get_value(i,j)) + ', ')

    
    
    # get the same named columns
    same_attributes = study1_attrs.intersection(study2_attrs)
    # these are unique attributes in each study
    unique_attrs_study1 = study1_attrs-same_attributes
    unique_attrs_study2 = study2_attrs-same_attributes
                
    print("Study1 attributes:", study1_attrs)
    print()
    print("Study2 attributes:", study2_attrs)
    print()
    print("\n-----------------------------------------------------------------------------------------\n")
    print("Same attributesin both studies:",same_attributes)
    print("\n-------------------------------------\n")
    print("Similar attributes based on Jaccardi coefficient:",similar_attributes_value_jacardi)
    print("\n-------------------------------------\n")
    print("Similar attributes based on Cosine score:",similar_attributes_value_cosine)
    print("\n-------------------------------------\n")



    return 


In [7]:
similarity(df1,df2)

Study1 attributes: {'SAMPLE_TYPE', 'CANCER_TYPE_DETAILED', 'ONCOTREE_CODE', 'PRIMARY_SITE', 'METASTATIC_TUMOR_INDICATOR', 'TUMOR_TISSUE_SITE', 'CANCER_TYPE', 'PERINEURAL_INVASION', 'METASTATIC_SITE', 'TUMOR_STAGE', 'PLATFORM'}

Study2 attributes: {'SAMPLE_TYPE', 'CANCER_TYPE_DETAILED', 'ONCOTREE_CODE', 'PRIMARY_SITE', 'METASTATIC_TUMOR_INDICATOR', 'TUMOR_TISSUE_SITE', 'CANCER_TYPE', 'MYB_NFIB_NONSYNONYMOUS_COUNT', 'METASTATIC_SITE', 'ER_STATUS_BY_IHC', 'TYPE_OF_SURGERY', 'IHC_HER2', 'TUMOR_SIZE', 'PR_STATUS_BY_IHC', 'MYB_NFIB_CNA', 'TUMOR_STAGE', 'PLATFORM'}


-----------------------------------------------------------------------------------------

Same attributesin both studies: {'SAMPLE_TYPE', 'CANCER_TYPE_DETAILED', 'ONCOTREE_CODE', 'PRIMARY_SITE', 'METASTATIC_TUMOR_INDICATOR', 'TUMOR_TISSUE_SITE', 'CANCER_TYPE', 'METASTATIC_SITE', 'TUMOR_STAGE', 'PLATFORM'}

-------------------------------------

Similar attributes based on Jaccardi coefficient: SAMPLE_TYPE and SAMPLE_TYPE with Ja

In [8]:
similarity(df3,df4)

Study1 attributes: {'SAMPLE_TYPE', 'VIAL_NUMBER', 'IS_FFPE', 'PATHOLOGY_REPORT_UUID', 'OTHER_SAMPLE_ID', 'SAMPLE_TYPE_ID', 'SPECIMEN_SECOND_LONGEST_DIMENSION', 'SHORTEST_DIMENSION', 'PATHOLOGY_REPORT_FILE_NAME', 'LONGEST_DIMENSION'}

Study2 attributes: {'VHL_MUTATION_CODON', 'CANCER_TYPE_DETAILED', 'SARCOMATOID_COMPONENT', 'CANCER_TYPE', 'GENE_EXPRESSION_CLUSTER', 'METASTATIC_SITE', 'VHL_MUTATION_AA', 'STAGE_AT_DIAGNOSIS', 'GRADE', 'GENE_PANEL'}


-----------------------------------------------------------------------------------------

Same attributesin both studies: set()

-------------------------------------

Similar attributes based on Jaccardi coefficient: 

-------------------------------------

Similar attributes based on Cosine score: 

-------------------------------------



In [10]:
similarity(df5,df8)

Study1 attributes: {'CDH1_STATUS', 'CANCER_TYPE_DETAILED', 'DNA_REPAIR_STATUS', 'CLONAL_LABEL', 'CANCER_TYPE', 'ARID1A_MUTATION', 'TP53_MUTATION', 'GRADE', 'PLATFORM'}

Study2 attributes: {'GRADE', 'CANCER_TYPE', 'CANCER_TYPE_DETAILED'}


-----------------------------------------------------------------------------------------

Same attributesin both studies: {'GRADE', 'CANCER_TYPE', 'CANCER_TYPE_DETAILED'}

-------------------------------------

Similar attributes based on Jaccardi coefficient: 

-------------------------------------

Similar attributes based on Cosine score: 

-------------------------------------



In [14]:
similarity(df2,df6)

Study1 attributes: {'SAMPLE_TYPE', 'CANCER_TYPE_DETAILED', 'ONCOTREE_CODE', 'PRIMARY_SITE', 'METASTATIC_TUMOR_INDICATOR', 'TUMOR_TISSUE_SITE', 'CANCER_TYPE', 'MYB_NFIB_NONSYNONYMOUS_COUNT', 'METASTATIC_SITE', 'ER_STATUS_BY_IHC', 'TYPE_OF_SURGERY', 'IHC_HER2', 'TUMOR_SIZE', 'PR_STATUS_BY_IHC', 'MYB_NFIB_CNA', 'TUMOR_STAGE', 'PLATFORM'}

Study2 attributes: {'VHL_MUTATION_CODON', 'CANCER_TYPE_DETAILED', 'SARCOMATOID_COMPONENT', 'CANCER_TYPE', 'GENE_EXPRESSION_CLUSTER', 'METASTATIC_SITE', 'VHL_MUTATION_AA', 'STAGE_AT_DIAGNOSIS', 'GRADE', 'GENE_PANEL'}


-----------------------------------------------------------------------------------------

Same attributesin both studies: {'METASTATIC_SITE', 'CANCER_TYPE', 'CANCER_TYPE_DETAILED'}

-------------------------------------

Similar attributes based on Jaccardi coefficient: 

-------------------------------------

Similar attributes based on Cosine score: METASTATIC_SITE and VHL_MUTATION_CODON with Cosine score = 0.969331267486, METASTATIC_SIT

In [18]:
similarity(df3,df7)

Study1 attributes: {'SAMPLE_TYPE', 'VIAL_NUMBER', 'IS_FFPE', 'PATHOLOGY_REPORT_UUID', 'OTHER_SAMPLE_ID', 'SAMPLE_TYPE_ID', 'SPECIMEN_SECOND_LONGEST_DIMENSION', 'SHORTEST_DIMENSION', 'PATHOLOGY_REPORT_FILE_NAME', 'LONGEST_DIMENSION'}

Study2 attributes: {'SAMPLE_TYPE', 'VIAL_NUMBER', 'OCT_EMBEDDED', 'IS_FFPE', 'INITIAL_WEIGHT', 'PATHOLOGY_REPORT_UUID', 'SAMPLE_TYPE_ID', 'OTHER_SAMPLE_ID', 'PATHOLOGY_REPORT_FILE_NAME', 'DAYS_TO_COLLECTION'}


-----------------------------------------------------------------------------------------

Same attributesin both studies: {'SAMPLE_TYPE', 'VIAL_NUMBER', 'IS_FFPE', 'PATHOLOGY_REPORT_UUID', 'SAMPLE_TYPE_ID', 'OTHER_SAMPLE_ID', 'PATHOLOGY_REPORT_FILE_NAME'}

-------------------------------------

Similar attributes based on Jaccardi coefficient: 

-------------------------------------

Similar attributes based on Cosine score: SAMPLE_TYPE and SAMPLE_TYPE with Cosine score = 0.998245144081, VIAL_NUMBER and VIAL_NUMBER with Cosine score = 0.99940374550