## Lab 2, Part 1 - Code & Comments
Elysa Strunin  
October 2018  

In [215]:
import pandas as pd
import numpy as np
import seaborn as sns
from pandas.io.json import json_normalize

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

import matplotlib
import matplotlib.pyplot as plt

### Data Preparation

In [2]:
dose_response = pd.read_csv('v17_fitted_dose_response.csv')
dose_response.head()

Unnamed: 0,Dataset_version,IC50_Results_ID,COSMIC_ID,DRUG_ID,MAX_CONC_MICROMOLAR,LN_IC50,AUC,RMSE
0,17,335,924100,1026,1.0,0.72,0.9,0.11
1,17,336,924100,1028,2.0,2.66,0.96,0.18
2,17,337,924100,1029,2.0,3.34,0.97,0.08
3,17,338,924100,1030,10.0,5.16,0.98,0.09
4,17,339,924100,1031,0.2,-4.33,0.51,0.09


In [3]:
expr = pd.read_table('Cell_line_COSMIC_ID_gene_expression_transposed_clean.tsv', header=None)
header = pd.read_table('Cell_line_RMA_proc_basalExp_transposed.tsv', nrows = 2)

more_info = header
more_info

Unnamed: 0,GENE_SYMBOLS,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,LINC00526,PPY2,Unnamed: 17730,Unnamed: 17731,KRT18P55,Unnamed: 17733,POLRMTP1,UBL5P2,TBC1D3P5,Unnamed: 17737
0,GENE_title,tetraspanin 6 [Source:HGNC Symbol;Acc:11858],tenomodulin [Source:HGNC Symbol;Acc:17757],dolichyl-phosphate mannosyltransferase polypep...,SCY1-like 3 (S. cerevisiae) [Source:HGNC Symbo...,chromosome 1 open reading frame 112 [Source:HG...,Gardner-Rasheed feline sarcoma viral (v-fgr) o...,complement factor H [Source:HGNC Symbol;Acc:4883],"fucosidase, alpha-L- 2, plasma [Source:HGNC Sy...","glutamate-cysteine ligase, catalytic subunit [...",...,long intergenic non-protein coding RNA 526 [So...,pancreatic polypeptide 2 [Source:HGNC Symbol;A...,,,keratin 18 pseudogene 55 [Source:HGNC Symbol;A...,hsa-mir-5195 [Source:miRBase;Acc:MI0018174],polymerase (RNA) mitochondrial (DNA directed) ...,ubiquitin-like 5 pseudogene 2 [Source:HGNC Sym...,"TBC1 domain family, member 3 pseudogene 5 [Sou...",
1,DATA.906826,7.63202317146339,2.96458512058924,10.3795526353077,3.61479404843988,3.38068143582194,3.32469189081249,3.56635031211478,8.20453043072163,5.2351175492262,...,6.78692491488045,2.99705393632274,3.109774,7.882377,3.33113425699397,2.85253711034634,3.13069614031282,9.98661583035146,3.07372352861326,7.284733


In [4]:
expr.iloc[:,0] = expr.iloc[:,0].astype(int)
expr.columns = header.columns
expr = expr.rename(index=str, columns={"GENE_SYMBOLS": "COSMIC_ID"})

In [5]:
def hack(x):
        return train_target.corr(x)

### PCA & Linear Regression

In [7]:
# There are 265 different drugs - goal is to form 265 different models

# Format the data frame to aggregate the gene expression info (left)
# with the drug response info (right), where applicable 

# Note many NaNs throughout because not all drugs tested on all cell lines

rmse_all = []

for drug in dose_response['DRUG_ID'].unique():
    subset = dose_response[ dose_response['DRUG_ID']== drug ].loc[:,['COSMIC_ID','LN_IC50']] # 'AUC', 'RMSE' 
    expre = expr.merge(subset, on='COSMIC_ID', how='left', sort=False)

    # remove the id's 
    expre = expre.iloc[:,1:]
    expr_data = expre.copy() 
    expr_target = expre.loc[:,'LN_IC50']
    del expr_data['LN_IC50']

    # remove na's
    mask = np.array(~expr_target.isnull())
    expr_target = expr_target[mask]
    expr_data = pd.DataFrame(expr_data)[mask]
    
    # CV k= 5
    rmse_drug = []
    
    for i in np.arange(0,5) :
        
        train_data, test_data, train_target, test_target = train_test_split( expr_data, expr_target, test_size=0.2)
        # rownames are cosmic ids
        # colnames are gene expression
        
        #auc and rmse filters
        #auc_limit, rmse_limit = train_data[['AUC', 'RMSE']].quantile(0.5)
        #filter_mask =  (train_data['AUC'] >= auc_limit)  & (train_data['RMSE'] <= rmse_limit) 
        #train_data = train_data[filter_mask]
        #train_target = train_target[ filter_mask ]
        
        #remove auc, rmse after filtering
        #del train_data['AUC']
        #del train_data['RMSE']
        #del test_data['AUC']
        #del test_data['RMSE']
        
        #filter by correlation
        #corr = train_data.apply(hack)
        #top_corr = corr.quantile(0.8)
        #low_corr = corr.quantile(0.2)
        #Bool = (corr >= top_corr) | (corr <= low_corr)
        #train_data = train_data[train_data.columns[Bool]]
        #test_data = test_data[test_data.columns[Bool]]

        if train_data.shape[1] ==0 or test_data.shape[1] ==0 :
            rmse_drug.append(np.nan)
            
        else:

            # scale the data
            scaled_train = preprocessing.scale(train_data)
            scaled_test = preprocessing.scale(test_data)
            pca = PCA(.90)  

            # fit the pca dim reduction on training data
            train_img = pca.fit_transform(scaled_train)
            
            # apply it to testing data
            test_img = pca.transform(scaled_test)

            train_target = np.ravel(train_target)

            linRegr = LinearRegression()
            linRegr.fit(train_img, train_target)

            pred = linRegr.predict(test_img)

            mask = np.array(~test_target.isnull())
            test_targ = test_target[mask]
            pred = pd.DataFrame(pred)[mask]

            if test_targ.shape[0] ==0 or pred.shape[0] ==0 : 
                rmse_drug.append(np.nan)

            else:        
                rmse_drug.append( np.sqrt(mean_squared_error(test_targ, pred) ))
    
    rmse_all.append(np.mean(rmse_drug)) 
    

### Model Comparisons (RMSE)

In [9]:
# best RMSE by drug
rmse_all = pd.Series(rmse_all)
rmse_all.index = dose_response['DRUG_ID'].unique()[:]
rmse_all.sort_values().head(10)

1262    0.446368
266     0.458985
150     0.472007
1264    0.524581
91      0.548536
341     0.552194
1502    0.555846
205     0.641457
1018    0.647145
193     0.649277
dtype: float64

In [11]:
# worst RMSE by drug
rmse_all.sort_values().tail(10)

3       2.060696
299     2.071485
302     2.077077
344     2.083248
51      2.157260
190     2.214239
346     2.237624
268     2.247033
1248    2.513362
135     2.550817
dtype: float64

### Random Forests

In [219]:
accuracy_all = []
f1_all = []
auc_all = []

for drug in dose_response['DRUG_ID'].unique():
    subset = dose_response[ dose_response['DRUG_ID']== drug ].loc[:,['COSMIC_ID','LN_IC50']] #, 'AUC', 'RMSE'
    expre = expr.merge(subset, on='COSMIC_ID', how='left', sort=False)

    # remove the id's 
    expre = expre.iloc[:,1:]
    expr_data = expre.copy() 
    expr_target = expre.loc[:,'LN_IC50'] 
    del expr_data['LN_IC50']

    # remove na's
    mask = np.array(~expr_target.isnull())
    expr_target = expr_target[mask]
    expr_data = pd.DataFrame(expr_data)[mask]
    
    # CV k= 5 - cross validation is not necessary for random forests because of the averaging throughout!
    # Remove cv loops in future
    accuracy_drug = []
    f1_drug = []
    auc_drug = []

    for i in np.arange(0,5) :
        train_data, test_data, train_target, test_target = train_test_split( expr_data, expr_target, test_size=0.2)
        
        #remove null values
        mask = np.array(~train_target.isnull())
        train_target = train_target[mask]
        train_data = train_data[mask]
        
        #set the SIR categories
        sir_one_third = train_target.quantile(1/3)
        sir_two_third = train_target.quantile(2/3)
        
        #assign SIR classification
        train_target = pd.DataFrame(train_target > sir_two_third)*2 + pd.DataFrame(train_target < sir_one_third)*1 
        train_target = train_target.astype(str)
        
        #auc and rmse filters
        #auc_limit, rmse_limit = train_data[['AUC', 'RMSE']].quantile(0.5)
        #filter_mask =  (train_data['AUC'] >= auc_limit)  & (train_data['RMSE'] <= rmse_limit) 
        #train_data = train_data[filter_mask]
        #train_target = train_target[ filter_mask ]
        
        #remove auc, rmse after filtering
        #del train_data['AUC']
        #del train_data['RMSE']
        #del test_data['AUC']
        #del test_data['RMSE']
        
        #filter by correlation
        #corr = train_data.apply(hack)
        #top_corr = corr.quantile(0.8)
        #low_corr = corr.quantile(0.2)
        #Bool = (corr >= top_corr) | (corr <= low_corr)
        #train_data = train_data[train_data.columns[Bool]]
        #test_data = test_data[test_data.columns[Bool]]

        if train_data.shape[1] ==0 or test_data.shape[1] ==0 :
            accuracy_drug.append(np.nan)
            f1_drug.append(np.nan)
            
        else:

            #PCA for dim reduction - fit on training data only
            #scaled_train = preprocessing.scale(train_data)
            #scaled_test = preprocessing.scale(test_data)
            #pca = PCA(.90)  
            #train_img = pca.fit_transform(scaled_train)
            #test_img = pca.transform(scaled_test)

            train_target = np.ravel(train_target)

            rf = RandomForestClassifier(n_estimators=30, oob_score=True, random_state=0)
            rf.fit(train_data, train_target)
            
            mask = np.array(~test_target.isnull())
            test_target = test_target[mask]
            test_data = test_data[mask]
            
            #assign SIR classification
            test_target = pd.DataFrame(test_target > sir_two_third)*2 + pd.DataFrame(test_target < sir_one_third)*1 
            test_target = test_target.astype(str)

            pred = rf.predict(test_data)
            
            if test_target.shape[0] ==0 or pred.shape[0] ==0 : 
                accuracy_drug.append(np.nan)
                f1_drug.append(np.nan)

            else:        
                            
                test_target_bin = label_binarize(test_target, classes=['0','1','2'])
                pred_bin = label_binarize(pred, classes=['0','1','2'])

                fpr = dict()
                tpr = dict()
                roc_auc = [0,0,0]
                for i in range(3):
                    fpr[i], tpr[i], _ = roc_curve(test_target_bin[:, i], pred_bin[:, i])
                    roc_auc[i] = np.round( auc(fpr[i], tpr[i]), 2 )
                
                accuracy_drug.append( accuracy_score(test_target, pred) )
                f1_drug.append( f1_score(test_target, pred, average='macro') )
                auc_drug.append(roc_auc)

    accuracy_all.append(np.mean(accuracy_drug)) 
    f1_all.append(np.mean(f1_drug)) 
    auc_all.append(np.mean(auc_drug, axis=0)) 


### Model Comparisons (Accuracy, F1, AUC)

In [221]:
# best accuracy by drug

accuracy = pd.Series(accuracy_all)
f1 = pd.Series(f1_all)
auc = pd.Series(auc_all)

new = pd.DataFrame({'accuracy': accuracy.round(3), 'f1': f1.round(3), 'auc[0,1,2]': auc })
new.index = dose_response['DRUG_ID'].unique()[:]
new.sort_values(by='accuracy', ascending=False).head(10)

Unnamed: 0,accuracy,f1,"auc[0,1,2]"
1526,0.596,0.59,"[0.628, 0.6799999999999999, 0.78]"
1372,0.593,0.589,"[0.612, 0.682, 0.79]"
211,0.558,0.558,"[0.592, 0.766, 0.6439999999999999]"
1498,0.549,0.55,"[0.588, 0.682, 0.7220000000000001]"
1014,0.549,0.547,"[0.582, 0.6860000000000002, 0.724]"
1377,0.539,0.536,"[0.612, 0.726, 0.622]"
1047,0.536,0.525,"[0.558, 0.74, 0.6479999999999999]"
252,0.531,0.529,"[0.584, 0.6940000000000001, 0.668]"
253,0.528,0.532,"[0.5599999999999999, 0.7619999999999999, 0.618..."
290,0.521,0.526,"[0.5599999999999999, 0.75, 0.608]"


In [222]:
# worst accuracy by drug

new.sort_values(by='accuracy', ascending=False).tail(10)

Unnamed: 0,accuracy,f1,"auc[0,1,2]"
204,0.371,0.37,"[0.508, 0.522, 0.5599999999999999]"
63,0.37,0.366,"[0.534, 0.538, 0.5199999999999999]"
197,0.37,0.371,"[0.512, 0.5199999999999999, 0.554]"
186,0.369,0.367,"[0.51, 0.52, 0.5519999999999999]"
110,0.365,0.359,"[0.5079999999999999, 0.544, 0.522]"
157,0.364,0.361,"[0.48600000000000004, 0.508, 0.574]"
1053,0.359,0.356,"[0.5, 0.5599999999999999, 0.5]"
207,0.358,0.357,"[0.496, 0.51, 0.5519999999999999]"
202,0.35,0.348,"[0.4880000000000001, 0.506, 0.5399999999999999]"
35,0.344,0.339,"[0.488, 0.55, 0.48999999999999994]"
