In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import islice
import os
from sklearn.decomposition import PCA

# !cat /proc/sys/vm/overcommit_memory
# !echo 1 > /proc/sys/vm/overcommit_memory

In [2]:
#Read Genomic data

In [3]:
gen_data=pd.read_csv("./data/genomic_features.tsv",sep="\t",nrows=10)

#Optimize datatypes to save memory
gene_dict=gen_data.dtypes.apply(lambda x: x.name).to_dict()
new_gene_dict=gene_dict.copy()
vlist=['COSMIC_ID','Sample Name','TISSUE_FACTOR']
i=1
for k in new_gene_dict.keys():
        if k in vlist:
            i=0
        else:
            new_gene_dict[k]='int16'
#         print(k,new_wes_dict[k])

genomic_features = pd.read_csv("./data/genomic_features.tsv",sep = "\t",dtype=new_gene_dict)

gen_data.head(2)

Unnamed: 0,COSMIC_ID,Sample Name,TISSUE_FACTOR,MSI_FACTOR,ABCB1_mut,ABL2_mut,ACACA_mut,ACVR1B_mut,ACVR2A_mut,AFF4_mut,...,BAZ2B_mut,BCLAF1_mut,BCOR_mut,BCR-ABL_mut,BLM_mut,BMPR2_mut,BNC2_mut,BPTF_mut,BRAF_mut,TP53_mut
0,1287381,201T,lung_NSCLC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,924100,22RV1,prostate,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# Cell line details

In [5]:
celld=pd.read_csv('./data/Cell_Lines_Details.csv')
celld.head(2)

Unnamed: 0,Sample Name,COSMIC identifier,Whole Exome Sequencing (WES),Copy Number Alterations (CNA),Gene Expression,Methylation,Drug\nResponse,GDSC\nTissue descriptor 1,GDSC\nTissue\ndescriptor 2,Cancer Type\n(matching TCGA label),Microsatellite \ninstability Status (MSI),Screen Medium,Growth Properties
0,A253,906794.0,Y,Y,Y,Y,Y,aero_dig_tract,head and neck,,MSS/MSI-L,D/F12,Adherent
1,BB30-HNC,753531.0,Y,Y,Y,Y,Y,aero_dig_tract,head and neck,HNSC,MSS/MSI-L,D/F12,Adherent


In [6]:
# Read in drug data
drug_d=pd.read_csv("./data/Screened_Compounds.csv",sep=',')
drug_d.head()

Unnamed: 0,DRUG_ID,DRUG_NAME,SYNONYMS,TARGET,TARGET_PATHWAY
0,1,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,EGFR signaling
1,3,Rapamycin,"AY-22989, Sirolimus, WY-090217, Torisel, Rapamune",MTORC1,PI3K/MTOR signaling
2,5,Sunitinib,"Sutent, Sunitinib Malate, SU-11248","PDGFR, KIT, VEGFR, FLT3, RET, CSF1R",RTK signaling
3,6,PHA-665752,"PHA665752, PHA 665752",MET,RTK signaling
4,9,MG-132,"LLL cpd, MG 132, MG132","Proteasome, CAPN1",Protein stability and degradation


In [7]:
# Read in drug response data
dose_response_df_all = pd.read_csv('./data/v17.3_fitted_dose_response_binary.csv', index_col='IC50_RESULTS_ID')
dose_response_df=pd.merge(dose_response_df_all,drug_d,how='left',on='DRUG_ID')
dose_response_df['TARGET_PATHWAY']=dose_response_df['TARGET_PATHWAY'].fillna("UNK")
dose_response_df.head(2)

Unnamed: 0,DATASET_VERSION,COSMIC_ID,CELL_LINE_NAME,DRUG_ID,DRUG_NAME_x,PUTATIVE_TARGET,MAX_CONC_MICROMOLAR,MIN_CONC_MICROMOLAR,LN_IC50,AUC,RMSE,Z_SCORE,BINARY_RESPONSE,DRUG_NAME_y,SYNONYMS,TARGET,TARGET_PATHWAY
0,17.3,683665,MC-CAR,1,Erlotinib,EGFR,2.0,0.007812,2.453524,0.98261,0.021678,-0.015505,R,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,EGFR signaling
1,17.3,684055,ES3,1,Erlotinib,EGFR,2.0,0.007812,3.376592,0.985169,0.029915,0.779999,R,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,EGFR signaling


In [8]:
#Read in cancer labels, clean data
tcga_labels=pd.read_csv("./data/TCGA_Labels.csv")
celld=pd.read_csv('./data/Cell_Lines_Details.csv')
celld=celld[['COSMIC identifier','GDSC\nTissue descriptor 1','GDSC\nTissue\ndescriptor 2','Cancer Type\n(matching TCGA label)']]
celld_desc=pd.merge(celld,tcga_labels,how='outer',left_on='Cancer Type\n(matching TCGA label)',right_on="Sigle",indicator=True)
celld_desc['Program']=celld_desc['Program'].fillna('UNK')
celld_desc=celld_desc.rename(columns={'COSMIC identifier':'COSMIC_ID','GDSC\nTissue descriptor 1':'GDSC_DESC1','GDSC\nTissue\ndescriptor 2':'GDSC_DESC2','Cancer Type\n(matching TCGA label)':'TCGA_Label', 'Program':'Cancer_Type'                                     })
dose_resp_caname=pd.merge(dose_response_df,celld_desc,how='left',on='COSMIC_ID')
dose_response_trimmed_df = dose_resp_caname[['COSMIC_ID', 'DRUG_ID', 'LN_IC50', 'BINARY_RESPONSE','TCGA_Label','Cancer_Type','TARGET_PATHWAY']]
dose_response_trimmed_df.head(2)

Unnamed: 0,COSMIC_ID,DRUG_ID,LN_IC50,BINARY_RESPONSE,TCGA_Label,Cancer_Type,TARGET_PATHWAY
0,683665,1,2.453524,R,MM,Multiple Myeloma,EGFR signaling
1,684055,1,3.376592,R,,UNK,EGFR signaling


In [9]:
# WES prep

In [10]:
# Read Wes data first few rows
wes_scored_temp = pd.read_csv("./data/data2/wes_scored_transposed.tsv",sep = "\t",nrows=10)
wes_scored_temp.head()

Unnamed: 0.1,Unnamed: 0,1,2,9,10,12,13,14,15,16,...,101060321,101927546,101927722,101928638,102724473,102724928,105375355,105378803,107403068,109731405
0,907268,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,907269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,907270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,907271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,907272,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Read full WES data with right data types
wes_dict=wes_scored_temp.dtypes.apply(lambda x: x.name).to_dict()
new_wes_dict=wes_dict.copy()
i=1
for k in new_wes_dict.keys():
        if k=='Unnamed: 0':
            i=0
        else:
            new_wes_dict[k]='int16'
#         print(k,new_wes_dict[k])

wes_scored = pd.read_csv("./data/data2/wes_scored_transposed.tsv",sep = "\t",dtype=new_wes_dict)

In [12]:
wes_scored=wes_scored.rename(columns={"Unnamed: 0":"COSMIC_ID"})

In [13]:
#Subset one pathway RTK signaling and lung cancer - THIS MAY NEED TO BE PARAMETERS
rtk=dose_resp_caname[(dose_resp_caname['TARGET_PATHWAY']=='DNA replication') & (dose_resp_caname['GDSC_DESC1']=='aero_dig_tract')]
# rtk['Cancer_Type'].value_counts()
rtk.shape

(1540, 23)

In [14]:
rtk_wes_joined_df = pd.merge(wes_scored, rtk,how='inner', on='COSMIC_ID')
print(rtk_wes_joined_df.shape)
print(rtk.shape)

(1540, 18404)
(1540, 23)


In [15]:
rtk_wes_joined_df['COSMIC_DRUG_ID']=rtk_wes_joined_df['COSMIC_ID'].map(str)+"_"+rtk_wes_joined_df['DRUG_ID'].map(str)
wes_subset=rtk_wes_joined_df[['COSMIC_DRUG_ID','1292','79091','6239','127623','790','64241','6328','1357','3671','5798','153643','343099','322','675','2648','4439','4668','6433','6670','7220','8139','8170','9779','9902','10075','23191','23469','25917','51168','55832','56884','122618']]

In [16]:
#CNA

In [17]:
cna_scored_temp = pd.read_csv("./data/data2/cna_scored_transposed.tsv",sep = "\t", nrows=10)
cna_scored_temp.head()

Unnamed: 0,COSMIC_ID,1,2,9,10,12,13,14,15,16,...,107133486,107133502,107133524,107161145,107985535,107986809,107987337,107987341,109731405,112441434
0,1240121,0.0,0.0,-1.0,-1.0,0.0,1.584963,0.0,0.0,0.0,...,0.0,-4.32,0.0,0.0,0.0,0.0,-4.32,-4.32,0.0,0.0
1,1240122,0.0,0.584963,0.0,0.0,0.0,1.0,0.0,0.0,0.584963,...,0.584963,0.0,0.0,0.584963,0.0,0.584963,-4.32,-4.32,0.584963,0.584963
2,1240123,0.0,0.0,0.0,0.0,0.0,0.321928,0.0,0.0,0.321928,...,-0.415037,-4.32,0.0,0.0,0.0,0.0,-4.32,-4.32,0.321928,-0.415037
3,1240124,0.0,0.584963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-4.32,0.0,0.0,0.0,0.0,-4.32,-4.32,0.0,0.0
4,1240125,0.0,0.0,0.0,0.0,-0.415037,0.0,0.0,0.0,-1.0,...,-0.415037,-2.0,0.0,0.321928,0.0,0.321928,-2.0,-2.0,-1.0,-0.415037


In [18]:
cna_dict=cna_scored_temp.dtypes.apply(lambda x: x.name).to_dict()
new_cna_dict=cna_dict.copy()
i=1
for k in new_cna_dict.keys():
        if k=='COSMIC_ID':
            i=0
        else:
            new_cna_dict[k]='float16'
#         print(k,new_cna_dict[k])

In [19]:
cna_scored = pd.read_csv("./data/data2/cna_scored_transposed.tsv",sep = "\t", dtype=new_cna_dict)
cna_scored.head()

Unnamed: 0,COSMIC_ID,1,2,9,10,12,13,14,15,16,...,107133486,107133502,107133524,107161145,107985535,107986809,107987337,107987341,109731405,112441434
0,1240121,0.0,0.0,-1.0,-1.0,0.0,1.584961,0.0,0.0,0.0,...,0.0,-4.320312,0.0,0.0,0.0,0.0,-4.320312,-4.320312,0.0,0.0
1,1240122,0.0,0.584961,0.0,0.0,0.0,1.0,0.0,0.0,0.584961,...,0.584961,0.0,0.0,0.584961,0.0,0.584961,-4.320312,-4.320312,0.584961,0.584961
2,1240123,0.0,0.0,0.0,0.0,0.0,0.322021,0.0,0.0,0.322021,...,-0.415039,-4.320312,0.0,0.0,0.0,0.0,-4.320312,-4.320312,0.322021,-0.415039
3,1240124,0.0,0.584961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-4.320312,0.0,0.0,0.0,0.0,-4.320312,-4.320312,0.0,0.0
4,1240125,0.0,0.0,0.0,0.0,-0.415039,0.0,0.0,0.0,-1.0,...,-0.415039,-2.0,0.0,0.322021,0.0,0.322021,-2.0,-2.0,-1.0,-0.415039


In [20]:
rtk=rtk.drop(['_merge'],axis=1)
rtk_cna_joined_df = pd.merge(cna_scored, rtk, how='inner', on='COSMIC_ID',indicator=True)
rtk_cna_joined_df['COSMIC_DRUG_ID']=rtk_cna_joined_df['COSMIC_ID'].map(str)+"_"+rtk_cna_joined_df['DRUG_ID'].map(str)

#Make a copy with original labels and other fields
rtk_cna_joined_df_copy_with_label=rtk_cna_joined_df.copy()

rtk_cna_joined_df=rtk_cna_joined_df.drop({"LN_IC50","COSMIC_ID","CELL_LINE_NAME","DRUG_ID","_merge","Sigle","GDSC_DESC1","GDSC_DESC2","TCGA_Label","Cancer_Type",'MAX_CONC_MICROMOLAR','MIN_CONC_MICROMOLAR','AUC','RMSE','Z_SCORE','DATASET_VERSION','PUTATIVE_TARGET','DRUG_NAME_x','DRUG_NAME_y','SYNONYMS','TARGET','TARGET_PATHWAY','BINARY_RESPONSE'},axis=1)
rtk_cna_joined_df=rtk_cna_joined_df.set_index("COSMIC_DRUG_ID")


In [21]:
rtk_cna_joined_df.head(2)

Unnamed: 0_level_0,1,2,9,10,12,13,14,15,16,18,...,107133486,107133502,107133524,107161145,107985535,107986809,107987337,107987341,109731405,112441434
COSMIC_DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1240121_133,0.0,0.0,-1.0,-1.0,0.0,1.584961,0.0,0.0,0.0,0.0,...,0.0,-4.320312,0.0,0.0,0.0,0.0,-4.320312,-4.320312,0.0,0.0
1240121_133,0.0,0.0,-1.0,-1.0,0.0,1.584961,0.0,0.0,0.0,0.0,...,0.0,-4.320312,0.0,0.0,0.0,0.0,-4.320312,-4.320312,0.0,0.0


In [23]:
#Apply PCA on CNA test
from sklearn.externals import joblib

# Load from file
joblib_file = "model2_cna_pca.pkl"
joblib_pca = joblib.load(joblib_file)

# Transform test file
devcnapca=joblib_pca.transform(rtk_cna_joined_df)  

i=1
varlist=[]
for i in range(1, 51):
    var="cna_princ"+str(i)
    varlist.append(var)
    
cna_subset=pd.DataFrame(data=devcnapca,columns=varlist,index=rtk_cna_joined_df.index)
cna_subset=cna_subset.reset_index()
cna_subset.head()

Unnamed: 0,COSMIC_DRUG_ID,cna_princ1,cna_princ2,cna_princ3,cna_princ4,cna_princ5,cna_princ6,cna_princ7,cna_princ8,cna_princ9,...,cna_princ41,cna_princ42,cna_princ43,cna_princ44,cna_princ45,cna_princ46,cna_princ47,cna_princ48,cna_princ49,cna_princ50
0,1240121_133,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,-6.487453,-4.156025,-3.84717,-4.248751,0.089578,-10.973423,7.944318,2.843812,-5.619458,-5.506473
1,1240121_133,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,-6.487453,-4.156025,-3.84717,-4.248751,0.089578,-10.973423,7.944318,2.843812,-5.619458,-5.506473
2,1240121_134,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,-6.487453,-4.156025,-3.84717,-4.248751,0.089578,-10.973423,7.944318,2.843812,-5.619458,-5.506473
3,1240121_134,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,-6.487453,-4.156025,-3.84717,-4.248751,0.089578,-10.973423,7.944318,2.843812,-5.619458,-5.506473
4,1240121_135,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,-6.487453,-4.156025,-3.84717,-4.248751,0.089578,-10.973423,7.944318,2.843812,-5.619458,-5.506473


In [24]:
#Join the WES and CNA, split COSMIC_DRUG_ID to COSMIC_ID and DRUG_ID
cna_wes_joined=pd.merge(cna_subset,wes_subset,how='inner',on='COSMIC_DRUG_ID')

cna_wes_joined["DRUG_ID"]=(cna_wes_joined['COSMIC_DRUG_ID'].str.extract(pat = '(["_"].+)'))
cna_wes_joined["DRUG_ID"]=cna_wes_joined["DRUG_ID"].str.replace('_','').astype(int)

cna_wes_joined["COSMIC_ID"]=(cna_wes_joined['COSMIC_DRUG_ID'].str.extract(pat = '(.+["_"])'))
cna_wes_joined["COSMIC_ID"]=cna_wes_joined["COSMIC_ID"].str.replace('_','').astype(int)

cna_wes_joined.head()

Unnamed: 0,COSMIC_DRUG_ID,cna_princ1,cna_princ2,cna_princ3,cna_princ4,cna_princ5,cna_princ6,cna_princ7,cna_princ8,cna_princ9,...,10075,23191,23469,25917,51168,55832,56884,122618,DRUG_ID,COSMIC_ID
0,1240121_133,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,0,0,0,0,0,0,0,0,133,1240121
1,1240121_133,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,0,0,0,0,0,0,0,0,133,1240121
2,1240121_133,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,0,0,0,0,0,0,0,0,133,1240121
3,1240121_133,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,0,0,0,0,0,0,0,0,133,1240121
4,1240121_134,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,0,0,0,0,0,0,0,0,134,1240121


In [25]:
#Transform drug_id to indicator columns
cna_wes_joined = pd.concat([cna_wes_joined, pd.get_dummies(cna_wes_joined['DRUG_ID'], prefix="drug_id_" )],axis=1)

In [26]:
#Import number of targets with drug id
drug_target=pd.read_csv("./data/drugs_num_target.csv")
drug_target.head(3)

Unnamed: 0,DRUG_ID,Num_Targets
0,1,1
1,3,1
2,5,6


In [27]:
#Import drugs with target indicator
drug_with_target=pd.read_csv("./data/drugs_with_target.csv")
drug_with_target=drug_with_target.drop(['num_targets'],axis=1)
drug_with_target.head(3)

Unnamed: 0,DRUG_ID,ABL,ABL(T315I),ADCK4,AKT1,AKT2,AKT3,ALK,ALK4,ALK5,...,WEE1,XIAP,c-FGR,dsDNAbreakinduction,gamma-secretase,notdefined,others,p38,p38alpha,p38beta
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
#Join all 
cna_wes_gen_joined=pd.merge(cna_wes_joined,genomic_features,how='inner',on='COSMIC_ID')
all_joined=pd.merge(cna_wes_gen_joined,drug_target,how='inner',on='DRUG_ID')
all_joined1=pd.merge(all_joined,drug_with_target,how='inner',on='DRUG_ID')
all_joined2=all_joined1.drop(['Sample Name','TISSUE_FACTOR','DRUG_ID'],axis=1)
all_joined2.head()

Unnamed: 0,COSMIC_DRUG_ID,cna_princ1,cna_princ2,cna_princ3,cna_princ4,cna_princ5,cna_princ6,cna_princ7,cna_princ8,cna_princ9,...,WEE1,XIAP,c-FGR,dsDNAbreakinduction,gamma-secretase,notdefined,others,p38,p38alpha,p38beta
0,1240121_133,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1240121_133,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1240121_133,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1240121_133,-31.606542,-5.181886,-1.091571,-1.189097,0.751733,-9.555195,-22.575763,30.862199,4.737867,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1240122_133,17.593993,49.914069,14.912065,-0.78764,0.816889,-13.254492,2.553275,-19.072798,-8.723795,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
all_joined2.shape
all_joined2.to_csv("./data/model2_to_score.csv")

In [35]:
# Score with model

In [36]:
%run -i model2_scoring.py ./data/model2_to_score.csv scored_model2.csv



In [37]:
scoreddf=pd.read_csv("./scored_model2.csv")

scored=pd.concat([all_joined2,scoreddf],axis=1)[['COSMIC_DRUG_ID','Prediction']]
scored["DRUG_ID"]=(scored['COSMIC_DRUG_ID'].str.extract(pat = '(["_"].+)'))
scored["DRUG_ID"]=scored["DRUG_ID"].str.replace('_','').astype(int)

scored["COSMIC_ID"]=(scored['COSMIC_DRUG_ID'].str.extract(pat = '(.+["_"])'))
scored["COSMIC_ID"]=scored["COSMIC_ID"].str.replace('_','').astype(int)
scored.head()

Unnamed: 0,COSMIC_DRUG_ID,Prediction,DRUG_ID,COSMIC_ID
0,1240121_133,-2.22276,133,1240121
1,1240121_133,-2.22276,133,1240121
2,1240121_133,-2.22276,133,1240121
3,1240121_133,-2.22276,133,1240121
4,1240122_133,-1.919224,133,1240122


In [38]:
scored.shape

(3058, 4)

In [39]:
tdf=pd.read_csv("./Drug_Response_threshold.csv")
tdf=tdf.drop(['Screened Compounds:'],axis=1)

In [40]:
scored_resp=pd.merge(scored,tdf,how='left',on='DRUG_ID')
scored_resp=scored_resp.rename(columns={'Prediction':'Predicted_IC50'})
scored_resp['Predicted_Resp']=np.where(scored_resp['Predicted_IC50']>=scored_resp['IC50_Threshold'],"S","R")
scored_resp['Predicted_Resp']=np.where(scored_resp['IC50_Threshold'].isnull(),"UNK",scored_resp['Predicted_Resp'])
scored_resp.head()

Unnamed: 0,COSMIC_DRUG_ID,Predicted_IC50,DRUG_ID,COSMIC_ID,IC50_Threshold,Predicted_Resp
0,1240121_133,-2.22276,133,1240121,-3.9565,S
1,1240121_133,-2.22276,133,1240121,-3.9565,S
2,1240121_133,-2.22276,133,1240121,-3.9565,S
3,1240121_133,-2.22276,133,1240121,-3.9565,S
4,1240122_133,-1.919224,133,1240122,-3.9565,S


In [45]:
scored_resp[scored_resp['COSMIC_ID']==907319]

Unnamed: 0,COSMIC_DRUG_ID,Predicted_IC50,DRUG_ID,COSMIC_ID,IC50_Threshold,Predicted_Resp
202,907319_133,-2.735652,133,907319,-3.9565,S
203,907319_133,-2.735652,133,907319,-3.9565,S
204,907319_133,-2.735652,133,907319,-3.9565,S
205,907319_133,-2.735652,133,907319,-3.9565,S
480,907319_134,0.28602,134,907319,-1.2198,S
481,907319_134,0.28602,134,907319,-1.2198,S
482,907319_134,0.28602,134,907319,-1.2198,S
483,907319_134,0.28602,134,907319,-1.2198,S
758,907319_135,-3.221859,135,907319,-5.9903,S
759,907319_135,-3.221859,135,907319,-5.9903,S
