In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [22]:
celllines_wes_efgr = pd.read_csv("../data/wes_ic50_Erlotinib.tsv",sep = "\t", index_col="COSMIC_ID")

# split data
train_data_set, dev_data_set, train_labels, dev_labels = train_test_split(celllines_wes_efgr, celllines_wes_efgr['LN_IC50'], test_size=0.33, random_state=0)


In [23]:
train_data_set = train_data_set.drop('LN_IC50', axis=1)
dev_data_set = dev_data_set.drop('LN_IC50', axis=1)

In [24]:
regr = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=100)
regr.fit(train_data_set, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [10]:
print(regr.feature_importances_)

[0.00778297 0.         0.         ... 0.         0.         0.        ]


In [25]:
pred = regr.predict(dev_data_set)

In [8]:
print(pred)

[ 2.49879381  2.52930409  2.40586345  1.42165075  3.45666201  3.45666201
  0.93954498  2.49879381  2.49879381  2.49879381  2.49879381  3.45666201
  0.95766854  2.68857434  3.45666201  3.45666201  2.49879381  3.45666201
  2.49879381  2.29575606  2.49879381  2.49879381  2.25616353  2.49879381
  2.49879381  2.49879381  1.75993051  2.27008816  2.49879381  2.53862785
  0.77359558  0.93954498  2.0273553   0.9028792   2.40586345  2.40586345
  2.49879381  1.99749846  3.45666201  3.45666201  3.45666201  2.96830016
  2.44710561  1.6480081   0.95766854  2.49879381  0.42336172  2.16393689
  3.45666201  2.49879381  2.96830016  3.45666201  0.33059374  3.02568955
  0.93954498  2.49879381  0.33059374  2.49879381  3.45666201  3.45666201
  3.45666201  2.49879381  0.95766854  2.49879381  2.48772786  3.45666201
  2.49879381  3.45666201  3.45666201  2.49879381  2.49879381  1.42165075
  3.45666201  2.49879381  2.49879381  3.3714539   0.93954498  3.45666201
  3.45666201  2.24013514  3.45666201  2.49879381  1

In [26]:
print(pred- dev_labels)

COSMIC_ID
908159     0.287704
688015    -0.156401
1331040    0.776131
688013     1.270555
1330995   -1.005985
949154    -1.274244
930082     0.681028
949173     0.099090
907791     0.281968
909249    -0.114316
1327771    0.202022
909712    -0.941431
1330987    1.030455
906765    -0.043311
1303897   -0.532534
907275    -0.725366
905984     0.039524
910911    -1.146328
906872    -0.013598
909742     1.083821
906834     0.246582
907067     0.666276
907044     1.049601
909260    -0.280947
949153     0.352523
910688     0.188817
907043     0.087016
753595     0.993453
688026     0.438543
907280    -0.892502
             ...   
906849     5.049910
687562    -0.123531
908451    -0.579613
910691    -1.563406
1331025   -2.370404
909785    -1.223001
949174    -0.963926
684055    -0.619175
1327774    1.302900
1330933    0.689936
910692     0.764167
907050    -0.111550
1330947    1.495431
907311    -0.040644
687997    -0.781356
908142    -0.020372
949170    -0.963086
910905    -0.208946
753622    

In [27]:

def train_rf(df):
    
    # split data
    train_data_set, dev_data_set, train_labels, dev_labels = train_test_split(df, df['LN_IC50'], test_size=0.20, random_state=0)
    
    #drop labels
    train_data_set = train_data_set.drop('LN_IC50', axis=1)
    dev_data_set = dev_data_set.drop('LN_IC50', axis=1)
    
    regr = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=100)
    regr.fit(train_data_set, train_labels)
    
    print(regr.feature_importances_)
    pred = regr.predict(dev_data_set)
    
    #print(pred)
    #print(pred- dev_labels)
    print(np.mean(pred - dev_labels))

In [14]:
celllines_wes_non_bin_efgr = pd.read_csv("../data/continuous/wes_ic50_Erlotinib.tsv",sep = "\t", index_col="COSMIC_ID")
celllines_wes_efgr = pd.read_csv("../data/wes_ic50_Erlotinib.tsv",sep = "\t", index_col="COSMIC_ID")




In [28]:
train_rf(celllines_wes_non_bin_efgr)
train_rf(celllines_wes_efgr)


[4.27583300e-03 0.00000000e+00 6.67593064e-07 ... 0.00000000e+00
 0.00000000e+00 0.00000000e+00]
0.15951560747323176
[0.00457405 0.         0.         ... 0.         0.         0.        ]
0.10208208987523698


In [None]:

celllines_wes_binary_efgr = pd.read_csv("../data/wes_ic50_binary_all.tsv",sep = "\t", index_col="COSMIC_ID")


In [28]:
def train_rf_class(df):
    
    # split data
    train_data_set, dev_data_set, train_labels, dev_labels = train_test_split(df, df['BINARY_RESPONSE'], test_size=0.20, random_state=0)
    
    #drop labels
    train_data_set = train_data_set.drop('BINARY_RESPONSE', axis=1)
    dev_data_set = dev_data_set.drop('BINARY_RESPONSE', axis=1)

    train_data_set = train_data_set.drop('LN_IC50', axis=1)
    dev_data_set = dev_data_set.drop('LN_IC50', axis=1)

    regr = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=10)
    regr.fit(train_data_set, train_labels)
    
    #print(regr.feature_importances_)
    pred = regr.predict(dev_data_set)
    
    wrong_predictions = (pred != dev_labels)
    print(dev_labels[wrong_predictions])
    print(confusion_matrix(dev_labels, pred))
    


In [29]:
train_rf_class(celllines_wes_binary_efgr)

COSMIC_ID
1330987    S
946355     S
1240183    S
906861     S
949177     S
910697     S
909250     S
905967     S
907795     S
909743     R
753583     S
909754     R
Name: BINARY_RESPONSE, dtype: object
[[62  2]
 [10  1]]
