In [1]:
import argparse
import pandas as pd
import csv
import numpy as np
import json
import sys
sys.path.insert(0, '..')  # Add path from parent folder
sys.path.insert(0, '.')  # Add path from current folder
from evaluation import *
from function import *
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.externals import joblib
from sklearn.grid_search import ParameterGrid
import copy

# Fixed seed: used for NumPy's global RNG here and passed as `random_state`
# to every RandomForestClassifier built by SKLearn_RandomForest.setup_model.
rnd_state = 1337
np.random.seed(rnd_state)

class SKLearn_RandomForest(object):
    """Per-label random-forest classifiers driven by a configuration dict.

    One ``RandomForestClassifier`` is built for each of the first
    ``_N_CLASSIFICATION_LABELS`` entries of ``conf['label_names']``.  The
    hyper-parameter set is picked from ``ParameterGrid(conf['params'])`` by
    position, using the integer stored in the ``process`` environment
    variable (0 when the variable is absent).

    Attributes:
        useVal: plain boolean attribute.  When False (the default) the
            validation split is concatenated onto the training split before
            fitting and before reporting train metrics.
        model_dict: maps label name -> classifier, filled by ``setup_model``.
    """

    # Only the first three labels get a classifier (was a literal [0, 1, 2]).
    _N_CLASSIFICATION_LABELS = 3

    def __init__(self, conf):
        """Read the configuration and select one hyper-parameter set.

        Args:
            conf: dict providing ``label_names``,
                ``enrichment_factor.ratio_list`` and ``params``.

        Raises:
            ValueError: if the ``process`` index is not a valid position in
                the parameter grid (or is not an integer string).
        """
        # Local import: the visible module header never imports `os` itself.
        import os

        self.conf = conf
        self.input_layer_dimension = 1024
        self.label_names = conf['label_names']
        self.EF_ratio_list = conf['enrichment_factor']['ratio_list']

        # Missing variable -> first grid entry.
        self.process_id = int(os.environ.get('process', 0))
        print('process id: {}'.format(self.process_id))

        selected = None
        for index, candidate in enumerate(ParameterGrid(conf['params'])):
            if index == self.process_id:
                selected = candidate
                break
        if selected is None:
            raise ValueError(
                'process id {} is outside the parameter grid'.format(
                    self.process_id))

        # NOTE: these assignments deliberately override whatever the grid
        # supplied, so every process currently runs the same configuration.
        selected['n_estimators'] = 4000
        selected['max_features'] = 'log2'
        selected['min_samples_leaf'] = 1
        selected['class_weight'] = 'balanced'
        self.param = selected

        self.n_estimators = selected['n_estimators']
        self.max_features = selected['max_features']
        self.min_samples_leaf = selected['min_samples_leaf']
        self.class_weight = selected['class_weight']
        print('Testing set: {}'.format(selected))

        # JSON cannot express Python's None for these options, so the string
        # "None" is mapped back to the real value.
        if self.max_features == "None":
            self.max_features = None
        if self.class_weight == "None":
            self.class_weight = None

        self.model_dict = {}
        self.useVal = False

    def get_prediction_info(self, X, y_true):
        """Return ``(y_true, y_pred)`` for every label column.

        NaN entries of ``y_true`` are replaced by -1 in a *copy*; the
        caller's array is left untouched.  ``y_pred`` holds
        ``predict_proba(X)[:, 1]`` for the classification labels and zeros
        for any remaining column.

        Raises:
            KeyError: if a label has no entry in ``model_dict``.
        """
        # Copy so that NaN markers survive in the caller's data (they are
        # what train_and_predict uses to drop unlabeled rows).
        y_true = y_true.copy()
        y_pred = np.zeros(shape=y_true.shape)

        for i, label in enumerate(self.label_names):
            model = self.model_dict[label]
            y_true[np.isnan(y_true[:, i]), i] = -1
            if i < self._N_CLASSIFICATION_LABELS:
                y_pred[:, i] = model.predict_proba(X)[:, 1]

        return y_true, y_pred

    def setup_model(self):
        """Create a fresh, unfitted classifier for each classification label."""
        for label in self.label_names[:self._N_CLASSIFICATION_LABELS]:
            self.model_dict[label] = RandomForestClassifier(
                n_estimators=self.n_estimators,
                max_features=self.max_features,
                min_samples_leaf=self.min_samples_leaf,
                n_jobs=-1,
                class_weight=self.class_weight,
                random_state=rnd_state,
                oob_score=False,
                verbose=1)

    def train_and_predict(self,
                          X_train, y_train,
                          X_val, y_val,
                          X_test, y_test,
                          model_file):
        """Fit one classifier per label on shuffled training data.

        When ``useVal`` is False the validation split is appended to the
        training split first.  Rows whose label is NaN are dropped
        separately for each label.  ``X_test``, ``y_test`` and
        ``model_file`` are accepted but not used by the current body.
        """
        self.setup_model()

        if not self.useVal:
            X_train = np.concatenate((X_train, X_val))
            y_train = np.concatenate((y_train, y_val))

        order = np.random.permutation(len(X_train))
        X_train = X_train[order, :]
        y_train = y_train[order, :]

        for i, label in enumerate(self.label_names):
            print('testing {}, {}'.format(i, label))
            y = y_train[:, i]
            labeled = ~np.isnan(y)
            self.model_dict[label].fit(X_train[labeled], y[labeled])

            #joblib.dump(self.model_dict[label], model_file+'_'+label+'.pkl', compress = 1)

    def _print_metrics(self, split_name, y_true, y_pred, columns):
        """Print the three ``*_multi`` scores for one split, then a blank line."""
        scorers = (('precision', precision_auc_multi),
                   ('roc', roc_auc_multi),
                   ('bedroc', bedroc_auc_multi))
        for metric_name, scorer in scorers:
            print('{} {}: {}'.format(
                split_name, metric_name,
                scorer(y_true, y_pred, columns, np.mean)))
        print('')

    def predict_with_existing(self,
                              X_train, y_train,
                              X_val, y_val,
                              X_test, y_test):
        """Score the already-fitted models and print the results.

        Prints array shapes, then the scores over all label columns for
        train (validation merged in unless ``useVal``), optionally val, and
        test, followed by the same train/test scores restricted to the
        first label column.
        """
        if self.useVal:
            y_val, y_pred_on_val = self.get_prediction_info(X_val, y_val)
        else:
            X_train = np.concatenate((X_train, X_val))
            y_train = np.concatenate((y_train, y_val))

        y_train, y_pred_on_train = self.get_prediction_info(X_train, y_train)
        y_test, y_pred_on_test = self.get_prediction_info(X_test, y_test)
        for array in (y_train, y_pred_on_train, y_test, y_pred_on_test):
            print(array.shape)
        print('')

        self._print_metrics('train', y_train, y_pred_on_train,
                            range(y_train.shape[1]))
        if self.useVal:
            self._print_metrics('val', y_val, y_pred_on_val,
                                range(y_val.shape[1]))
        self._print_metrics('test', y_test, y_pred_on_test,
                            range(y_test.shape[1]))

        # Column 0 corresponds to the first configured label.
        print('Below is for {}'.format(self.label_names[0]))
        print('')
        first_column = range(1)
        self._print_metrics('train', y_train, y_pred_on_train, first_column)
        self._print_metrics('test', y_test, y_pred_on_test, first_column)

        # NOTE: NEF-AUC reporting (nef_auc over self.EF_ratio_list) used to
        # live here inside a string literal and is currently disabled.

    def save_model_params(self, config_csv_file):
        """Write ``str(self.param)`` to ``config_csv_file``, overwriting it."""
        with open(config_csv_file, 'w') as csvfile:
            csvfile.write(str(self.param))
# Marker showing that the definitions cell ran to completion.
print('done')

done




# Test 1

In [2]:
# Load the experiment configuration (label names, parameter grid, ...).
config_json_file = '../../json/sklearn_randomforest.json'
with open(config_json_file, 'r') as f:
    conf = json.load(f)

label_name_list = conf['label_names']
print('label_name_list  {}'.format(label_name_list))

# specify dataset: k CSV files named file_0.csv ... file_{k-1}.csv
k = 5
directory = '../../dataset/fixed_dataset/fold_{}/'.format(k)
file_list = ['file_{}.csv'.format(i) for i in range(k)]

labels = label_name_list

# Split for this test: files 0-2 train, file 3 validates, file 4 tests.
output_file_list = [directory + f_ for f_ in file_list]
train_pd = read_merged_data(output_file_list[:3])
val_pd = read_merged_data(output_file_list[3:4])
test_pd = read_merged_data(output_file_list[4:5])

# Same feature/label extraction for each of the three frames.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    extract_feature_and_label(frame,
                              feature_name='Fingerprints',
                              label_name_list=labels)
    for frame in (train_pd, val_pd, test_pd)
]

print('done')

label_name_list  [u'Keck_Pria_AS_Retest', u'Keck_Pria_FP_data', u'Keck_RMI_cdd']
done


In [3]:
# Build the task from the loaded config; the constructor prints the process
# id and the parameter set it selected (see the output below).
task = SKLearn_RandomForest(conf=conf)

('process id:', 3)
('Testing set:', {u'max_features': 'log2', u'n_estimators': 4000, u'min_samples_leaf': 1, u'class_weight': 'balanced'})


In [4]:
# Fit one forest per label; 'model_file' is only a placeholder name.
task.train_and_predict(X_train, y_train,
                       X_val, y_val,
                       X_test, y_test,
                       model_file='model_file')
print('done')

testing 0, Keck_Pria_AS_Retest


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   42.9s finished


testing 1, Keck_Pria_FP_data


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   36.9s finished


testing 2, Keck_RMI_cdd


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   35.9s


done


[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   46.2s finished


In [5]:
# Score the fitted forests on the train and test splits and print the metrics.
task.predict_with_existing(X_train, y_train,
                           X_val, y_val,
                           X_test, y_test)
print('done')

[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.3s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.9s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    1.8s
[Parallel(n_jobs=32)]: Done 1186 tasks      | elapsed:    2.8s
[Parallel(n_jobs=32)]: Done 1736 tasks      | elapsed:    4.2s
[Parallel(n_jobs=32)]: Done 2386 tasks      | elapsed:    5.6s
[Parallel(n_jobs=32)]: Done 3136 tasks      | elapsed:    7.5s
[Parallel(n_jobs=32)]: Done 4000 out of 4000 | elapsed:    9.6s finished
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.3s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.9s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    1.6s
[Parallel(n_jobs=32)]: Done 1186 tasks      | elapsed:    2.6s
[Parallel(n_jobs=32)]: Done 1736 tasks      | elapsed:    3.9s
[Parallel(n_jobs=32)]: Done 2386 tasks      | elapsed:    5.3s
[Parallel(n_jobs=32)]: Done 3136 tasks      | elapsed:    6.9s
[Parallel(n_jobs=32)]: Done 4000 out of 4000 | elap

(57937, 3)
(57937, 3)
(14486, 3)
(14486, 3)

train precision: 1.0
train roc: 1.0
train bedroc: 0.990234270073

test precision: 0.100943048104
test roc: 0.796127296245
test bedroc: 0.508345964473

Below is for Keck_Pria_AS_Retest

train precision: 1.0
train roc: 1.0
train bedroc: 0.99466830683

test precision: 0.0971574455977
test roc: 0.848060642709
test bedroc: 0.573435781742

done


# Test 2

In [6]:
# Load the experiment configuration (label names, parameter grid, ...).
config_json_file = '../../json/sklearn_randomforest.json'
with open(config_json_file, 'r') as f:
    conf = json.load(f)

label_name_list = conf['label_names']
print('label_name_list  {}'.format(label_name_list))

# specify dataset: k CSV files named file_0.csv ... file_{k-1}.csv
k = 5
directory = '../../dataset/fixed_dataset/fold_{}/'.format(k)
file_list = ['file_{}.csv'.format(i) for i in range(k)]

labels = label_name_list

# Split for this test: files 2-4 train, file 0 validates, file 1 tests.
output_file_list = [directory + f_ for f_ in file_list]
train_pd = read_merged_data(output_file_list[2:5])
val_pd = read_merged_data(output_file_list[0:1])
test_pd = read_merged_data(output_file_list[1:2])

# Same feature/label extraction for each of the three frames.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    extract_feature_and_label(frame,
                              feature_name='Fingerprints',
                              label_name_list=labels)
    for frame in (train_pd, val_pd, test_pd)
]

print('done')

label_name_list  [u'Keck_Pria_AS_Retest', u'Keck_Pria_FP_data', u'Keck_RMI_cdd']
done


In [7]:
# Fresh task for the second split; the constructor prints the process id and
# the parameter set it selected (see the output below).
task = SKLearn_RandomForest(conf=conf)

('process id:', 3)
('Testing set:', {u'max_features': 'log2', u'n_estimators': 4000, u'min_samples_leaf': 1, u'class_weight': 'balanced'})


In [8]:
# Fit one forest per label; 'model_file' is only a placeholder name.
task.train_and_predict(X_train, y_train,
                       X_val, y_val,
                       X_test, y_test,
                       model_file='model_file')
print('done')

testing 0, Keck_Pria_AS_Retest


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   34.6s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   45.1s finished


testing 1, Keck_Pria_FP_data


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   30.2s finished


testing 2, Keck_RMI_cdd


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   32.1s


done


[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   42.2s finished


In [9]:
# Score the fitted forests on the train and test splits and print the metrics.
task.predict_with_existing(X_train, y_train,
                           X_val, y_val,
                           X_test, y_test)
print('done')

[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.3s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.9s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    1.6s
[Parallel(n_jobs=32)]: Done 1186 tasks      | elapsed:    2.8s
[Parallel(n_jobs=32)]: Done 1736 tasks      | elapsed:    4.2s
[Parallel(n_jobs=32)]: Done 2386 tasks      | elapsed:    5.8s
[Parallel(n_jobs=32)]: Done 3136 tasks      | elapsed:    7.6s
[Parallel(n_jobs=32)]: Done 4000 out of 4000 | elapsed:    9.6s finished
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.3s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.8s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    1.5s
[Parallel(n_jobs=32)]: Done 1186 tasks      | elapsed:    2.4s
[Parallel(n_jobs=32)]: Done 1736 tasks      | elapsed:    3.6s
[Parallel(n_jobs=32)]: Done 2386 tasks      | elapsed:    4.9s
[Parallel(n_jobs=32)]: Done 3136 tasks      | elapsed:    6.5s
[Parallel(n_jobs=32)]: Done 4000 out of 4000 | elap

(57941, 3)
(57941, 3)
(14482, 3)
(14482, 3)

train precision: 1.0
train roc: 1.0
train bedroc: 0.990067859348

test precision: 0.118431387229
test roc: 0.743033770703
test bedroc: 0.466435691803

Below is for Keck_Pria_AS_Retest

train precision: 1.0
train roc: 1.0
train bedroc: 0.994582995534

test precision: 0.246995408166
test roc: 0.91400658971
test bedroc: 0.816401412397

done


In [10]:
def _fold_indices(running_index, k):
    """Translate a running index into (train_index, val_index, test_index).

    The test fold is ``running_index // (k - 1)``; the validation fold is the
    ``running_index % (k - 1)``-th of the remaining folds; every other fold
    is used for training.

    Raises:
        ValueError: if ``running_index`` is outside ``[0, k * (k - 1))``.
    """
    n_val_choices = k - 1
    if not 0 <= running_index < k * n_val_choices:
        raise ValueError(
            'running_index must be in [0, {}), got {}'.format(
                k * n_val_choices, running_index))

    # Floor division keeps the index an int on both Python 2 and 3.
    test_index = running_index // n_val_choices
    offset = running_index % n_val_choices
    # Skip over the test fold when counting validation candidates.
    val_index = offset + int(offset >= test_index)

    folds = np.arange(k)
    train_index = folds[(folds != test_index) & (folds != val_index)]
    return train_index, val_index, test_index


def run(running_index):
    """Train and evaluate the random forests on one train/val/test split.

    Args:
        running_index: integer in ``[0, 20)`` selecting the split among the
            five dataset files (see ``_fold_indices``).

    Raises:
        ValueError: if ``running_index`` is out of range.
    """
    k = 5
    # Validate before touching any file so a bad index fails fast.
    train_index, val_index, test_index = _fold_indices(running_index, k)

    config_json_file = '../../json/sklearn_randomforest.json'
    with open(config_json_file, 'r') as f:
        conf = json.load(f)

    label_name_list = conf['label_names']
    print('label_name_list  {}'.format(label_name_list))

    # specify dataset
    directory = '../../dataset/fixed_dataset/fold_{}/'.format(k)
    file_list = np.array(
        ['{}file_{}.csv'.format(directory, i) for i in range(k)])
    labels = label_name_list

    print('{} {} {}'.format(train_index, val_index, test_index))

    train_file_list = file_list[train_index]
    val_file_list = file_list[val_index:val_index + 1]
    test_file_list = file_list[test_index:test_index + 1]

    print('train files  {}'.format(train_file_list))
    print('val files  {}'.format(val_file_list))
    print('test files  {}'.format(test_file_list))

    train_pd = read_merged_data(train_file_list)
    val_pd = read_merged_data(val_file_list)
    test_pd = read_merged_data(test_file_list)

    X_train, y_train = extract_feature_and_label(train_pd,
                                                 feature_name='Fingerprints',
                                                 label_name_list=labels)

    X_val, y_val = extract_feature_and_label(val_pd,
                                             feature_name='Fingerprints',
                                             label_name_list=labels)

    X_test, y_test = extract_feature_and_label(test_pd,
                                               feature_name='Fingerprints',
                                               label_name_list=labels)

    print('done')
    task = SKLearn_RandomForest(conf=conf)
    task.train_and_predict(X_train, y_train, X_val, y_val, X_test, y_test, 'model_file')
    task.predict_with_existing(X_train, y_train, X_val, y_val, X_test, y_test)
    print('done')

In [11]:
# running_index 0: file 0 tests, file 1 validates, files 2-4 train (see printed indices).
run(0)

label_name_list  [u'Keck_Pria_AS_Retest', u'Keck_Pria_FP_data', u'Keck_RMI_cdd']
[2 3 4] 1 0
train files  ['../../dataset/fixed_dataset/fold_5/file_2.csv'
 '../../dataset/fixed_dataset/fold_5/file_3.csv'
 '../../dataset/fixed_dataset/fold_5/file_4.csv']
val files  ['../../dataset/fixed_dataset/fold_5/file_1.csv']
test files  ['../../dataset/fixed_dataset/fold_5/file_0.csv']
done
('process id:', 3)
('Testing set:', {u'max_features': 'log2', u'n_estimators': 4000, u'min_samples_leaf': 1, u'class_weight': 'balanced'})
testing 0, Keck_Pria_AS_Retest


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   49.7s finished


testing 1, Keck_Pria_FP_data


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   29.8s finished


testing 2, Keck_RMI_cdd


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   36.7s finished
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.3s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.9s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    1.7s
[Parallel(n_jobs=32)]: Done 1186 tasks      | elapsed:    2.8s
[Parallel(n_jobs=32)]: Done 1736 tasks      | elapsed:    4.2s
[Parallel(n_jobs=32)]: Done 2386 tasks      | elapsed:    5.9s
[Parallel(n_jobs=32)]: Done 3136 tasks      | elapsed:    7.8s
[Parallel(n_jobs=32)]: Done 4000 out of 4000 | elap

(57937, 3)
(57937, 3)
(14486, 3)
(14486, 3)

train precision: 1.0
train roc: 1.0
train bedroc: 0.990205375155

test precision: 0.069406686327
test roc: 0.771620934683
test bedroc: 0.502867352061

Below is for Keck_Pria_AS_Retest

train precision: 1.0
train roc: 1.0
train bedroc: 0.99466830683

test precision: 0.0974344178009
test roc: 0.837005442294
test bedroc: 0.671730699013

done


In [12]:
# running_index 4: file 1 tests, file 0 validates, files 2-4 train (see printed indices).
run(4)

label_name_list  [u'Keck_Pria_AS_Retest', u'Keck_Pria_FP_data', u'Keck_RMI_cdd']
[2 3 4] 0 1
train files  ['../../dataset/fixed_dataset/fold_5/file_2.csv'
 '../../dataset/fixed_dataset/fold_5/file_3.csv'
 '../../dataset/fixed_dataset/fold_5/file_4.csv']
val files  ['../../dataset/fixed_dataset/fold_5/file_0.csv']
test files  ['../../dataset/fixed_dataset/fold_5/file_1.csv']
done
('process id:', 3)
('Testing set:', {u'max_features': 'log2', u'n_estimators': 4000, u'min_samples_leaf': 1, u'class_weight': 'balanced'})
testing 0, Keck_Pria_AS_Retest


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   48.2s finished


testing 1, Keck_Pria_FP_data


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   32.1s finished


testing 2, Keck_RMI_cdd


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   37.7s finished
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.3s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.9s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    1.6s
[Parallel(n_jobs=32)]: Done 1186 tasks      | elapsed:    2.7s
[Parallel(n_jobs=32)]: Done 1736 tasks      | elapsed:    4.1s
[Parallel(n_jobs=32)]: Done 2386 tasks      | elapsed:    5.7s
[Parallel(n_jobs=32)]: Done 3136 tasks      | elapsed:    7.6s
[Parallel(n_jobs=32)]: Done 4000 out of 4000 | elap

(57941, 3)
(57941, 3)
(14482, 3)
(14482, 3)

train precision: 1.0
train roc: 1.0
train bedroc: 0.990067859348

test precision: 0.122368007166
test roc: 0.729306943519
test bedroc: 0.446469623534

Below is for Keck_Pria_AS_Retest

train precision: 1.0
train roc: 1.0
train bedroc: 0.994582995534

test precision: 0.258548337181
test roc: 0.893106149628
test bedroc: 0.823595417936

done


In [13]:
# running_index 8: file 2 tests, file 0 validates, files 1, 3, 4 train (see printed indices).
run(8)

label_name_list  [u'Keck_Pria_AS_Retest', u'Keck_Pria_FP_data', u'Keck_RMI_cdd']
[1 3 4] 0 2
train files  ['../../dataset/fixed_dataset/fold_5/file_1.csv'
 '../../dataset/fixed_dataset/fold_5/file_3.csv'
 '../../dataset/fixed_dataset/fold_5/file_4.csv']
val files  ['../../dataset/fixed_dataset/fold_5/file_0.csv']
test files  ['../../dataset/fixed_dataset/fold_5/file_2.csv']
done
('process id:', 3)
('Testing set:', {u'max_features': 'log2', u'n_estimators': 4000, u'min_samples_leaf': 1, u'class_weight': 'balanced'})
testing 0, Keck_Pria_AS_Retest


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   38.2s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   49.5s finished


testing 1, Keck_Pria_FP_data


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   35.0s finished


testing 2, Keck_RMI_cdd


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   38.1s finished
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.3s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    1.0s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    1.8s
[Parallel(n_jobs=32)]: Done 1186 tasks      | elapsed:    2.8s
[Parallel(n_jobs=32)]: Done 1736 tasks      | elapsed:    4.1s
[Parallel(n_jobs=32)]: Done 2386 tasks      | elapsed:    5.6s
[Parallel(n_jobs=32)]: Done 3136 tasks      | elapsed:    7.5s
[Parallel(n_jobs=32)]: Done 4000 out of 4000 | elap

(57939, 3)
(57939, 3)
(14484, 3)
(14484, 3)

train precision: 1.0
train roc: 1.0
train bedroc: 0.990095964004

test precision: 0.132892411445
test roc: 0.826814853281
test bedroc: 0.581823642654

Below is for Keck_Pria_AS_Retest

train precision: 1.0
train roc: 1.0
train bedroc: 0.994668490214

test precision: 0.248512246681
test roc: 0.934361608377
test bedroc: 0.83403922981

done


In [14]:
# running_index 12: file 3 tests, file 0 validates, files 1, 2, 4 train (see printed indices).
run(12)

label_name_list  [u'Keck_Pria_AS_Retest', u'Keck_Pria_FP_data', u'Keck_RMI_cdd']
[1 2 4] 0 3
train files  ['../../dataset/fixed_dataset/fold_5/file_1.csv'
 '../../dataset/fixed_dataset/fold_5/file_2.csv'
 '../../dataset/fixed_dataset/fold_5/file_4.csv']
val files  ['../../dataset/fixed_dataset/fold_5/file_0.csv']
test files  ['../../dataset/fixed_dataset/fold_5/file_3.csv']
done
('process id:', 3)
('Testing set:', {u'max_features': 'log2', u'n_estimators': 4000, u'min_samples_leaf': 1, u'class_weight': 'balanced'})
testing 0, Keck_Pria_AS_Retest


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   50.0s finished


testing 1, Keck_Pria_FP_data


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   35.0s finished


testing 2, Keck_RMI_cdd


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   28.7s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   36.8s finished
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.3s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.8s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    1.6s
[Parallel(n_jobs=32)]: Done 1186 tasks      | elapsed:    2.8s
[Parallel(n_jobs=32)]: Done 1736 tasks      | elapsed:    4.1s
[Parallel(n_jobs=32)]: Done 2386 tasks      | elapsed:    5.7s
[Parallel(n_jobs=32)]: Done 3136 tasks      | elapsed:    7.4s
[Parallel(n_jobs=32)]: Done 4000 out of 4000 | elap

(57938, 3)
(57938, 3)
(14485, 3)
(14485, 3)

train precision: 1.0
train roc: 1.0
train bedroc: 0.990165204217

test precision: 0.149709308406
test roc: 0.866872405916
test bedroc: 0.620102298674

Below is for Keck_Pria_AS_Retest

train precision: 1.0
train roc: 1.0
train bedroc: 0.994668398524

test precision: 0.379615907271
test roc: 0.943178087636
test bedroc: 0.866226902673

done


In [15]:
# running_index 16: file 4 tests, file 0 validates, files 1-3 train (see printed indices).
run(16)

label_name_list  [u'Keck_Pria_AS_Retest', u'Keck_Pria_FP_data', u'Keck_RMI_cdd']
[1 2 3] 0 4
train files  ['../../dataset/fixed_dataset/fold_5/file_1.csv'
 '../../dataset/fixed_dataset/fold_5/file_2.csv'
 '../../dataset/fixed_dataset/fold_5/file_3.csv']
val files  ['../../dataset/fixed_dataset/fold_5/file_0.csv']
test files  ['../../dataset/fixed_dataset/fold_5/file_4.csv']
done
('process id:', 3)
('Testing set:', {u'max_features': 'log2', u'n_estimators': 4000, u'min_samples_leaf': 1, u'class_weight': 'balanced'})
testing 0, Keck_Pria_AS_Retest


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   47.3s finished


testing 1, Keck_Pria_FP_data


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   30.4s finished


testing 2, Keck_RMI_cdd


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   38.5s finished
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.3s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.9s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    1.8s
[Parallel(n_jobs=32)]: Done 1186 tasks      | elapsed:    2.8s
[Parallel(n_jobs=32)]: Done 1736 tasks      | elapsed:    4.2s
[Parallel(n_jobs=32)]: Done 2386 tasks      | elapsed:    5.8s
[Parallel(n_jobs=32)]: Done 3136 tasks      | elapsed:    7.6s
[Parallel(n_jobs=32)]: Done 4000 out of 4000 | elap

(57937, 3)
(57937, 3)
(14486, 3)
(14486, 3)

train precision: 1.0
train roc: 1.0
train bedroc: 0.990234270073

test precision: 0.101716290184
test roc: 0.801515262866
test bedroc: 0.539762116634

Below is for Keck_Pria_AS_Retest

train precision: 1.0
train roc: 1.0
train bedroc: 0.99466830683

test precision: 0.0964912719995
test roc: 0.787709485142
test bedroc: 0.555535014689

done
