In [1]:
import argparse
import pandas as pd
import csv
import numpy as np
import json
import os
import sys
sys.path.insert(0, '..')  # Add path from parent folder
sys.path.insert(0, '.')  # Add path from current folder
from evaluation import *
from function import *
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.externals import joblib
from sklearn.grid_search import ParameterGrid
import copy

# Single seed for the notebook: it seeds NumPy's global RNG here and is also
# passed as ``random_state`` to every forest built by SKLearn_RandomForest.
rnd_state = 1337
np.random.seed(rnd_state)

class SKLearn_RandomForest(object):
    """Train and evaluate one random-forest classifier per label.

    The instance is configured from a ``conf`` dict. The keys read here are
    ``label_names``, ``enrichment_factor.ratio_list``, ``params`` (a parameter
    grid) and ``useVal``. Models are kept in ``self.model_dict``, keyed by
    label name; column ``i`` of every label matrix is treated as belonging to
    ``self.label_names[i]``.
    """

    # Only this many leading labels get a classifier and a predicted score.
    _N_CLASSIFICATION_TASKS = 3

    def __init__(self, conf):
        """Pick one parameter set from the grid and store the configuration.

        The grid entry is selected by the integer in the ``process``
        environment variable (0 when the variable is unset).

        Raises:
            ValueError: if ``process`` is not an integer, or if it does not
                index an entry of ``ParameterGrid(conf['params'])``.
        """
        self.conf = conf
        self.input_layer_dimension = 1024  # not read anywhere in this class
        self.label_names = conf['label_names']
        self.EF_ratio_list = conf['enrichment_factor']['ratio_list']

        self.process_id = int(os.environ.get('process', 0))
        print('process id: {}'.format(self.process_id))

        grid = list(ParameterGrid(conf['params']))
        if not 0 <= self.process_id < len(grid):
            # Without this check the attributes below would simply be missing
            # and the failure would surface later as an AttributeError.
            raise ValueError(
                'process id {} is outside the parameter grid (size {})'.format(
                    self.process_id, len(grid)))
        param = grid[self.process_id]

        # NOTE: these hard-coded values override whatever the grid entry held,
        # so every process id currently trains the same configuration.
        param['n_estimators'] = 100
        param['max_features'] = 'log2'
        param['min_samples_leaf'] = 1
        param['class_weight'] = 'balanced'
        self.param = param

        self.n_estimators = param['n_estimators']
        self.max_features = param['max_features']
        self.min_samples_leaf = param['min_samples_leaf']
        self.class_weight = param['class_weight']
        print('Testing set: {}'.format(param))

        # The literal string "None" stands for Python's None.
        if self.max_features == "None":
            self.max_features = None
        if self.class_weight == "None":
            self.class_weight = None

        self.model_dict = {}
        self._useVal = bool(conf['useVal'])

    @property
    def useVal(self):
        """True when the validation split is kept separate from training."""
        return self._useVal

    @useVal.setter
    def useVal(self, value):
        # Explicit setter so ``obj.useVal = ...`` keeps working for callers.
        self._useVal = bool(value)

    def get_prediction_info(self, X, y_true):
        """Return ``(y_true, y_pred)`` for the fitted models.

        Args:
            X: feature matrix passed unchanged to each model's
                ``predict_proba``.
            y_true: 2-D label array, one column per entry of
                ``self.label_names``; may contain NaN for missing labels.

        Returns:
            A copy of ``y_true`` with NaN replaced by -1, and an array of the
            same shape holding column 1 of ``predict_proba`` for the first
            ``_N_CLASSIFICATION_TASKS`` labels (zeros for any other column).
            The caller's ``y_true`` is left untouched.

        Raises:
            KeyError: if a label has no entry in ``self.model_dict``.
        """
        # Work on a copy so the caller's array keeps its NaN markers; training
        # relies on NaN to drop unlabeled rows.
        y_true = np.array(y_true)
        y_pred = np.zeros(shape=y_true.shape)

        for i, label in enumerate(self.label_names):
            model = self.model_dict[label]
            y_true[np.isnan(y_true[:, i]), i] = -1
            if i < self._N_CLASSIFICATION_TASKS:
                y_pred[:, i] = model.predict_proba(X)[:, 1]

        return y_true, y_pred

    def setup_model(self):
        """Create a fresh, unfitted classifier for each classification label."""
        for label in self.label_names[:self._N_CLASSIFICATION_TASKS]:
            self.model_dict[label] = RandomForestClassifier(
                n_estimators=self.n_estimators,
                max_features=self.max_features,
                min_samples_leaf=self.min_samples_leaf,
                n_jobs=3,
                class_weight=self.class_weight,
                random_state=rnd_state,
                oob_score=False,
                verbose=1)

    def train_and_predict(self,
                          X_train, y_train,
                          X_val, y_val,
                          X_test, y_test,
                          model_file):
        """Fit one model per label on the (shuffled) training data.

        When ``useVal`` is false the validation split is appended to the
        training split first. Rows whose label is NaN are dropped separately
        for each label. Despite the name, no prediction happens here;
        ``X_test``, ``y_test`` and ``model_file`` are currently unused (model
        persistence is disabled).
        """
        self.setup_model()

        if not self.useVal:
            X_train = np.concatenate((X_train, X_val))
            y_train = np.concatenate((y_train, y_val))

        # Shuffle with NumPy's global RNG, which the notebook seeds up front.
        p = np.random.permutation(len(X_train))
        X_train = X_train[p, :]
        y_train = y_train[p, :]

        for i, label in enumerate(self.label_names):
            print('testing {}, {}'.format(i, label))
            y = y_train[:, i]
            labeled = ~np.isnan(y)
            self.model_dict[label].fit(X_train[labeled], y[labeled])

    def _print_metrics(self, split, y_true, y_pred, tasks):
        """Print precision / ROC / BEDROC for ``tasks``, then a blank line.

        The three metric functions are not defined in this class; presumably
        they are supplied by the notebook's ``from evaluation import *``.
        """
        print('{} precision: {}'.format(
            split, precision_auc_multi(y_true, y_pred, tasks, np.mean)))
        print('{} roc: {}'.format(
            split, roc_auc_multi(y_true, y_pred, tasks, np.mean)))
        print('{} bedroc: {}'.format(
            split, bedroc_auc_multi(y_true, y_pred, tasks, np.mean)))
        print('')

    def predict_with_existing(self,
                              X_train, y_train,
                              X_val, y_val,
                              X_test, y_test):
        """Score the already-fitted models and print the metrics.

        Metrics are printed first across all label columns and then for the
        first label alone. Validation metrics appear only when ``useVal`` is
        true; otherwise the validation split is merged into training, the same
        way ``train_and_predict`` does it.
        """
        if self.useVal:
            y_val, y_pred_on_val = self.get_prediction_info(X_val, y_val)
        else:
            X_train = np.concatenate((X_train, X_val))
            y_train = np.concatenate((y_train, y_val))

        y_train, y_pred_on_train = self.get_prediction_info(X_train, y_train)
        y_test, y_pred_on_test = self.get_prediction_info(X_test, y_test)
        for arr in (y_train, y_pred_on_train, y_test, y_pred_on_test):
            print(arr.shape)
        print('')

        # Metrics across every label column.
        self._print_metrics('train', y_train, y_pred_on_train,
                            range(y_train.shape[1]))
        if self.useVal:
            self._print_metrics('val', y_val, y_pred_on_val,
                                range(y_val.shape[1]))
        self._print_metrics('test', y_test, y_pred_on_test,
                            range(y_test.shape[1]))

        # Metrics restricted to column 0, i.e. the first label.
        print('Below is for {}'.format(self.label_names[0]))
        print('')
        self._print_metrics('train', y_train, y_pred_on_train, range(1))
        self._print_metrics('test', y_test, y_pred_on_test, range(1))

    def save_model_params(self, config_csv_file):
        """Write ``str(self.param)`` to ``config_csv_file``, overwriting it."""
        with open(config_csv_file, 'w') as out:
            out.write(str(self.param))
print('done')

done


# Test 1: folds 0-2 as train, fold 3 as val, fold 4 as test

In [2]:
# Load the experiment configuration and the label names it declares.
config_json_file = '../../json/sklearn_randomforest_keck_pria_only.json'
with open(config_json_file, 'r') as f:
    conf = json.load(f)

label_name_list = conf['label_names']
print(' '.join(['label_name_list ', str(label_name_list)]))

# specify dataset: k fold files under the fold_k directory
k = 5
directory = '../../dataset/fixed_dataset/fold_{}/'.format(k)
file_list = ['file_{}.csv'.format(i) for i in range(k)]

labels = label_name_list

# Split: files 0-2 -> train, file 3 -> val, file 4 -> test.
output_file_list = [directory + f_ for f_ in file_list]
train_pd = read_merged_data(output_file_list[:3])
val_pd = read_merged_data(output_file_list[3:4])
test_pd = read_merged_data(output_file_list[4:5])

# Pull the 'Fingerprints' features and the label columns out of each split.
X_train, y_train = extract_feature_and_label(
    train_pd, feature_name='Fingerprints', label_name_list=labels)

X_val, y_val = extract_feature_and_label(
    val_pd, feature_name='Fingerprints', label_name_list=labels)

X_test, y_test = extract_feature_and_label(
    test_pd, feature_name='Fingerprints', label_name_list=labels)

print('done')

label_name_list  [u'Keck_Pria_AS_Retest', u'Keck_Pria_FP_data', u'Keck_RMI_cdd']
(43452, 3)
(14485, 3)
(14486, 3)
done


In [3]:
# Build the task from the loaded configuration; the constructor prints the
# process id and the parameter set it selected.
task = SKLearn_RandomForest(conf=conf)

('process id:', 0)
('Testing set:', {u'max_features': 'log2', u'n_estimators': 100, u'min_samples_leaf': 1, u'class_weight': 'balanced'})


In [4]:
# Fit the per-label random forests on this split.
task.train_and_predict(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    'model_file',
)
print('done')

testing 0, Keck_Pria_AS_Retest


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.3s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    2.8s finished


testing 1, Keck_Pria_FP_data


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    2.2s finished


testing 2, Keck_RMI_cdd


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.2s


done


[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    2.7s finished


In [5]:
# Score the fitted models and print the train/test metrics.
task.predict_with_existing(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
)
print('done')

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.2s finished


(57937, 3)
(57937, 3)
(14486, 3)
(14486, 3)

train precision: 1.0
train roc: 1.0
train bedroc: 0.990234270073

test precision: 0.085280718087
test roc: 0.735687139602
test bedroc: 0.478756107257

Below is for Keck_Pria_AS_Retest

train precision: 1.0
train roc: 1.0
train bedroc: 0.99466830683

test precision: 0.0815249940382
test roc: 0.733316776088
test bedroc: 0.502638413819

done


# Test 2: folds 2-4 as train, fold 1 as val, fold 0 as test

In [6]:
# Load the experiment configuration and the label names it declares.
config_json_file = '../../json/sklearn_randomforest_keck_pria_only.json'
with open(config_json_file, 'r') as f:
    conf = json.load(f)

label_name_list = conf['label_names']
print(' '.join(['label_name_list ', str(label_name_list)]))

# specify dataset: k fold files under the fold_k directory
k = 5
directory = '../../dataset/fixed_dataset/fold_{}/'.format(k)
file_list = ['file_{}.csv'.format(i) for i in range(k)]

labels = label_name_list

# Split: files 2-4 -> train, file 1 -> val, file 0 -> test.
output_file_list = [directory + f_ for f_ in file_list]
train_pd = read_merged_data(output_file_list[2:5])
val_pd = read_merged_data(output_file_list[1:2])
test_pd = read_merged_data(output_file_list[0:1])

# Pull the 'Fingerprints' features and the label columns out of each split.
X_train, y_train = extract_feature_and_label(
    train_pd, feature_name='Fingerprints', label_name_list=labels)

X_val, y_val = extract_feature_and_label(
    val_pd, feature_name='Fingerprints', label_name_list=labels)

X_test, y_test = extract_feature_and_label(
    test_pd, feature_name='Fingerprints', label_name_list=labels)

print('done')

label_name_list  [u'Keck_Pria_AS_Retest', u'Keck_Pria_FP_data', u'Keck_RMI_cdd']
(43455, 3)
(14482, 3)
(14486, 3)
done


In [7]:
# Build a fresh task for the second split; the constructor prints the process
# id and the parameter set it selected.
task = SKLearn_RandomForest(conf=conf)

('process id:', 0)
('Testing set:', {u'max_features': 'log2', u'n_estimators': 100, u'min_samples_leaf': 1, u'class_weight': 'balanced'})


In [8]:
# Fit the per-label random forests on this split.
task.train_and_predict(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    'model_file',
)
print('done')

testing 0, Keck_Pria_AS_Retest


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.4s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    3.0s finished


testing 1, Keck_Pria_FP_data


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.9s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    2.0s finished


testing 2, Keck_RMI_cdd


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.1s


done


[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    2.6s finished


In [9]:
# Score the fitted models and print the train/test metrics.
task.predict_with_existing(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
)
print('done')

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished


(57937, 3)
(57937, 3)
(14486, 3)
(14486, 3)

train precision: 1.0
train roc: 1.0
train bedroc: 0.990205375155

test precision: 0.0707712372813
test roc: 0.676025687288
test bedroc: 0.39818311044

Below is for Keck_Pria_AS_Retest

train precision: 1.0
train roc: 1.0
train bedroc: 0.99466830683

test precision: 0.13760671283
test roc: 0.799790514858
test bedroc: 0.623657645847

done
