In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import vis.classification_vis as cvis
from Evals import *
from Data_Provider import *
import util.classification_utils as util
import util.data_utils as dutil
import util.label_utils as lutil


import numpy as np
from scipy import interp
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import os

from itertools import product
from multiprocessing import Pool


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload




In [3]:
def train_and_save_best_classifier(results,x,y,configs):
    """Refit a random forest with the best hyperparameters and persist it.

    The hyperparameter triple is obtained from
    ``util.get_best_hyperparas(results, 'AVG PR')`` (presumably the row of
    ``results`` with the best 'AVG PR' value -- confirm in the util module).
    The forest is fitted on the full ``x``/``y`` passed in, a decision
    threshold is obtained from ``util.get_optimal_threshold`` on the same
    data, and both are handed to ``dutil.save_classifier`` under the tag 'RF'.

    Parameters
    ----------
    results : table of search results containing an 'AVG PR' column.
    x, y : training features and labels.
    configs : run configuration forwarded to ``dutil.save_classifier``.

    Returns
    -------
    None
    """
    n_trees, depth, n_feats = util.get_best_hyperparas(results, 'AVG PR')

    # The values are cast to int before being given to scikit-learn
    # (presumably they come back as floats from the results table).
    forest_params = dict(
        n_estimators=int(n_trees),
        max_depth=int(depth),
        max_features=int(n_feats),
        random_state=0,
    )
    forest = RandomForestClassifier(**forest_params)
    forest.fit(x, y)

    # Threshold is chosen on the same data the forest was fitted on.
    threshold = util.get_optimal_threshold(forest, x, y, go_after_pr=True)
    dutil.save_classifier(forest, threshold, configs, 'RF')
    
    

# Run classifier with cross-validation
def calc_results_and_save(x, y, configs, shuff):
    """Random-search random-forest hyperparameters with 10-fold stratified CV.

    Fifty random (n_estimators, max_depth, max_features) combinations are
    drawn from NumPy's global random state, each one is scored with
    ``util.get_auc_score(..., go_after_pr=True)``, and the scores are printed
    and collected. The table is then saved through ``dutil.save_results``
    under the tag 'RF_Shuffle' when ``shuff`` is true and 'RF' otherwise.

    Parameters
    ----------
    x : array with a ``.shape`` attribute
        Training features. ``min(x.shape)`` is the exclusive upper bound for
        the sampled ``max_features`` values.
    y : training labels used for stratification and scoring.
    configs : run configuration forwarded to ``dutil.save_results``.
    shuff : bool
        Whether the cross-validation folds are shuffled.

    Returns
    -------
    pandas.DataFrame
        One row per sampled combination with the columns
        'Number Estimators', 'Max Depth', 'Max Features' and 'AVG PR'.
    """
    # A seed is only meaningful (and only accepted by recent scikit-learn
    # releases) when the folds are shuffled; passing random_state together
    # with shuffle=False raises a ValueError there.
    cv = StratifiedKFold(n_splits=10, shuffle=shuff,
                         random_state=1 if shuff else None)

    # Table collecting one row per evaluated combination.
    results = pd.DataFrame(columns=('Number Estimators','Max Depth','Max Features','AVG PR'))

    # Random search: draw the candidate values for each hyperparameter.
    n_candidates = 50
    est = np.random.choice(np.arange(5, 130), n_candidates)
    max_d = np.random.choice(np.arange(1, 60), n_candidates)
    max_f = np.random.choice(np.arange(1, min(x.shape)), n_candidates)

    # Score every sampled combination with cross-validation.
    for idx, (c, d, f) in enumerate(zip(est, max_d, max_f)):
        classifier = RandomForestClassifier(n_estimators=c, max_depth=d,
                                            max_features=f, random_state=0)
        auc_pr = util.get_auc_score(classifier, cv, x, y, go_after_pr=True)
        results.loc[idx] = [c, d, f, auc_pr]
        print('Number Estimators= %d, Max Depth = %d, Max Feat = %d, AUC PR %.3f' % (c,d,f,auc_pr))

    # Save results for later use; shuffled runs get their own tag.
    dutil.save_results(results, configs, 'RF_Shuffle' if shuff else 'RF')
    return results


def vis_results(x,y, x_ev, y_ev, configs):
    """Evaluate the saved 'RF' classifier on train and eval data and plot it.

    Loads the classifier and its decision threshold through
    ``dutil.load_classifier(configs, 'RF')``, predicts both data sets with
    ``util.get_prediction``, prints a precision/recall/F1 table and draws
    confusion matrices, ROC curves and precision-recall curves for each set.

    Parameters
    ----------
    x, y : training features and labels.
    x_ev, y_ev : evaluation features and labels.
    configs : run configuration identifying the saved classifier.

    Returns
    -------
    None
    """
    classifier, best_thr = dutil.load_classifier(configs, 'RF')

    # Thresholded predictions for both data sets.
    y_pred = util.get_prediction(classifier, x, best_thr)
    y_pred_ev = util.get_prediction(classifier, x_ev, best_thr)

    # Scores on the train set and on the eval set.
    f1_tr = get_f1(y_pred, y)
    f1_ev = get_f1(y_pred_ev, y_ev)
    prec_tr, recall_tr = get_precision_recall(y_pred, y)
    prec_ev, recall_ev = get_precision_recall(y_pred_ev, y_ev)

    df_res = pd.DataFrame(index =['Train','Eval'],columns = ['Precision','Recall','F1']).astype('float')
    df_res.loc['Train'] = [prec_tr, recall_tr, f1_tr]
    df_res.loc['Eval'] = [prec_ev, recall_ev, f1_ev]
    cvis.print_results(df_res)

    # Confusion matrices: train first, then eval.
    cvis.conf_mat(y_pred, y)
    cvis.conf_mat(y_pred_ev, y_ev)

    cvis.plot_roc(x, y, classifier, 'Random Forest ROC on Train Set')
    cvis.plot_roc(x_ev, y_ev, classifier, 'Random Forest ROC on Eval Set')
    cvis.plot_pr_curve(x, y, classifier, 'Random Forest Pr-Re curve on Train Set')
    # Title fixed: this is the precision-recall curve, not a second ROC plot.
    cvis.plot_pr_curve(x_ev, y_ev, classifier, 'Random Forest Pr-Re curve on Eval Set')

    
def do_all(file, cut, shuffled=False, random = False, reload= False):
    """Run the full random-forest pipeline for one data file and cutoff.

    Steps: build the run configuration, load train/eval data, obtain the
    hyperparameter search results (computed or reloaded), refit and save the
    best classifier, then print and plot its evaluation.

    Parameters
    ----------
    file : str
        Name of the post-processed data file; passed with ``cut`` to
        ``dutil.generate_configs_from_file``.
    cut : float
        Cutoff value stored in the generated configuration.
    shuffled : bool, default False
        Forwarded to ``calc_results_and_save`` to shuffle the CV folds. It
        also selects which saved result table is read when ``reload`` is set.
    random : bool, default False
        When true, train and eval labels are replaced by random labels with
        the same number of positives (a chance-level baseline).
    reload : bool, default False
        When true, load previously saved search results instead of running
        the search again.

    Returns
    -------
    None
    """
    provider = DataProvider()
    configs = dutil.generate_configs_from_file(file, cut)
    print(configs)
    x, y, x_ev, y_ev = provider.get_data(configs)
    print(x.shape, y.shape, x_ev.shape, y_ev.shape)

    if random:
        print('yes, random')
        # Re-seed from fresh entropy so that worker processes do not all
        # draw the same "random" labels from an inherited RNG state.
        np.random.seed()
        y = randomize_labels(y)
        y_ev = randomize_labels(y_ev)

    if reload:
        # Use the same tag calc_results_and_save writes for this setting.
        res = dutil.load_results(configs, 'RF_Shuffle' if shuffled else 'RF')
    else:
        res = calc_results_and_save(x, y, configs, shuffled)

    train_and_save_best_classifier(res, x, y, configs)
    vis_results(x, y, x_ev, y_ev, configs)
    
def randomize_labels(y):
    """Return a random 0/1 label vector with as many ones as ``y`` sums to.

    Parameters
    ----------
    y : sequence of numbers
        Original labels; only its length and its sum are used.

    Returns
    -------
    numpy.ndarray
        Float array of ``len(y)`` entries that are 0 everywhere except at
        ``int(sum(y))`` distinct positions drawn from NumPy's global random
        state, which are set to 1.
    """
    n_positive = int(np.sum(y))
    n_total = len(y)

    # Draw the positions of the ones without replacement.
    positive_idx = np.random.choice(n_total, n_positive, replace=False)

    randomized = np.zeros(n_total)
    randomized[positive_idx] = 1
    return randomized
    
    

In [4]:
# Post-processed recordings to evaluate: names must contain 'shuffle_True'
# and the character '3'.
files = [
    name
    for name in os.listdir('/home/emil/OpenMindv2/data/postprocessing')
    if 'shuffle_True' in name and '3' in name
]
cuts = [.05, .1, .2, .3]
all_elements = [files, cuts]

# Every (file, cut) pair becomes one argument tuple for do_all.
file_cut_combos = list(product(*all_elements))



In [5]:
# Display the selected file names to check the selection before launching the runs.
files

['patient_cb46fd46_days_[3, 4, 5, 6, 7]_wsize_5_sliding_False_expvar_90_ratio_0.8_shuffle_True',
 'patient_cb46fd46_days_[3, 4, 5, 6, 7]_wsize_100_sliding_False_expvar_90_ratio_0.8_shuffle_True',
 'patient_cb46fd46_days_[3, 4, 5, 6, 7]_wsize_50_sliding_False_expvar_90_ratio_0.8_shuffle_True']

In [None]:
# Single-configuration run, handy for debugging:
# do_all(files[2],.1)

# Run every (file, cut) combination in 7 worker processes. Pool is the name
# imported at the top of the notebook; the with-block shuts the workers down
# once starmap has returned, also when a worker raised.
with Pool(7) as pool:
    pool.starmap(do_all, file_cut_combos)

del pool

{'patient': 'cb46fd46', 'days': [3, 4, 5, 6, 7], 'wsize': 5, 'sliding': False, 'expvar': 90, 'ratio': 0.8, 'shuffle': True, 'cutoff': 0.05}
{'patient': 'cb46fd46', 'days': [3, 4, 5, 6, 7], 'wsize': 5, 'sliding': False, 'expvar': 90, 'ratio': 0.8, 'shuffle': True, 'cutoff': 0.1}
{'patient': 'cb46fd46', 'days': [3, 4, 5, 6, 7], 'wsize': 5, 'sliding': False, 'expvar': 90, 'ratio': 0.8, 'shuffle': True, 'cutoff': 0.2}
{'patient': 'cb46fd46', 'days': [3, 4, 5, 6, 7], 'wsize': 100, 'sliding': False, 'expvar': 90, 'ratio': 0.8, 'shuffle': True, 'cutoff': 0.1}
{'patient': 'cb46fd46', 'days': [3, 4, 5, 6, 7], 'wsize': 100, 'sliding': False, 'expvar': 90, 'ratio': 0.8, 'shuffle': True, 'cutoff': 0.2}
{'patient': 'cb46fd46', 'days': [3, 4, 5, 6, 7], 'wsize': 5, 'sliding': False, 'expvar': 90, 'ratio': 0.8, 'shuffle': True, 'cutoff': 0.3}
/home/emil/OpenMindv2/data/postprocessing/patient_cb46fd46_days_[3, 4, 5, 6, 7]_wsize_5_sliding_False_expvar_90_ratio_0.8_shuffle_True
/home/emil/OpenMindv2/data

Number Estimators= 118, Max Depth = 29, Max Feat = 10, AUC PR 0.275
Number Estimators= 122, Max Depth = 2, Max Feat = 13, AUC PR 0.530
Number Estimators= 122, Max Depth = 2, Max Feat = 13, AUC PR 0.616
Number Estimators= 23, Max Depth = 39, Max Feat = 6, AUC PR 0.425
Number Estimators= 23, Max Depth = 39, Max Feat = 6, AUC PR 0.532
Number Estimators= 122, Max Depth = 2, Max Feat = 13, AUC PR 0.517
Number Estimators= 23, Max Depth = 39, Max Feat = 6, AUC PR 0.271
Number Estimators= 33, Max Depth = 54, Max Feat = 13, AUC PR 0.403
Number Estimators= 33, Max Depth = 54, Max Feat = 13, AUC PR 0.556
Number Estimators= 39, Max Depth = 2, Max Feat = 19, AUC PR 0.509
Number Estimators= 33, Max Depth = 54, Max Feat = 13, AUC PR 0.285
Number Estimators= 39, Max Depth = 2, Max Feat = 19, AUC PR 0.631
Number Estimators= 31, Max Depth = 35, Max Feat = 9, AUC PR 0.391
Number Estimators= 39, Max Depth = 2, Max Feat = 19, AUC PR 0.538
Number Estimators= 31, Max Depth = 31, Max Feat = 3, AUC PR 0.420
Nu

Number Estimators= 104, Max Depth = 18, Max Feat = 17, AUC PR 0.184
Number Estimators= 112, Max Depth = 46, Max Feat = 5, AUC PR 0.175
Number Estimators= 85, Max Depth = 2, Max Feat = 18, AUC PR 0.409
Number Estimators= 125, Max Depth = 7, Max Feat = 7, AUC PR 0.484
Number Estimators= 30, Max Depth = 58, Max Feat = 3, AUC PR 0.185
Number Estimators= 125, Max Depth = 7, Max Feat = 7, AUC PR 0.375
Number Estimators= 47, Max Depth = 35, Max Feat = 15, AUC PR 0.183
Number Estimators= 21, Max Depth = 53, Max Feat = 22, AUC PR 0.473
Number Estimators= 21, Max Depth = 53, Max Feat = 22, AUC PR 0.358
Number Estimators= 95, Max Depth = 3, Max Feat = 12, AUC PR 0.264
Number Estimators= 26, Max Depth = 28, Max Feat = 7, AUC PR 0.460
Number Estimators= 26, Max Depth = 28, Max Feat = 7, AUC PR 0.355
Number Estimators= 13, Max Depth = 39, Max Feat = 12, AUC PR 0.209
Number Estimators= 34, Max Depth = 37, Max Feat = 9, AUC PR 0.186
Number Estimators= 91, Max Depth = 49, Max Feat = 5, AUC PR 0.175
Num

Number Estimators= 59, Max Depth = 55, Max Feat = 5, AUC PR 0.485
Number Estimators= 111, Max Depth = 53, Max Feat = 1, AUC PR 0.442
Number Estimators= 47, Max Depth = 12, Max Feat = 15, AUC PR 0.350
Number Estimators= 47, Max Depth = 12, Max Feat = 15, AUC PR 0.440
Number Estimators= 103, Max Depth = 56, Max Feat = 15, AUC PR 0.254
Number Estimators= 43, Max Depth = 9, Max Feat = 20, AUC PR 0.345
Number Estimators= 10, Max Depth = 47, Max Feat = 19, AUC PR 0.350
Number Estimators= 43, Max Depth = 9, Max Feat = 20, AUC PR 0.455
Number Estimators= 10, Max Depth = 47, Max Feat = 19, AUC PR 0.449
Number Estimators= 54, Max Depth = 32, Max Feat = 23, AUC PR 0.261
Number Estimators= 77, Max Depth = 56, Max Feat = 44, AUC PR 0.195
Number Estimators= 105, Max Depth = 21, Max Feat = 24, AUC PR 0.354
Number Estimators= 105, Max Depth = 21, Max Feat = 24, AUC PR 0.451
Number Estimators= 104, Max Depth = 3, Max Feat = 16, AUC PR 0.496
{'patient': 'cb46fd46', 'days': [3, 4, 5, 6, 7], 'wsize': 50, 

Number Estimators= 40, Max Depth = 23, Max Feat = 43, AUC PR 0.293
Number Estimators= 40, Max Depth = 23, Max Feat = 43, AUC PR 0.197
Number Estimators= 117, Max Depth = 52, Max Feat = 34, AUC PR 0.231
Number Estimators= 117, Max Depth = 52, Max Feat = 34, AUC PR 0.337
Number Estimators= 41, Max Depth = 54, Max Feat = 44, AUC PR 0.239
Number Estimators= 117, Max Depth = 52, Max Feat = 34, AUC PR 0.294
Number Estimators= 41, Max Depth = 54, Max Feat = 44, AUC PR 0.343
Number Estimators= 41, Max Depth = 54, Max Feat = 44, AUC PR 0.301
Number Estimators= 117, Max Depth = 52, Max Feat = 34, AUC PR 0.195
Number Estimators= 95, Max Depth = 31, Max Feat = 27, AUC PR 0.232
Number Estimators= 95, Max Depth = 31, Max Feat = 27, AUC PR 0.333
Number Estimators= 109, Max Depth = 51, Max Feat = 9, AUC PR 0.234
Number Estimators= 109, Max Depth = 51, Max Feat = 9, AUC PR 0.347
Number Estimators= 41, Max Depth = 54, Max Feat = 44, AUC PR 0.197
Number Estimators= 95, Max Depth = 31, Max Feat = 27, AUC 

Number Estimators= 105, Max Depth = 17, Max Feat = 73, AUC PR 0.337
Number Estimators= 10, Max Depth = 44, Max Feat = 57, AUC PR 0.358
Number Estimators= 8, Max Depth = 50, Max Feat = 72, AUC PR 0.364
Number Estimators= 54, Max Depth = 22, Max Feat = 38, AUC PR 0.197
Number Estimators= 105, Max Depth = 17, Max Feat = 73, AUC PR 0.300
Number Estimators= 81, Max Depth = 2, Max Feat = 77, AUC PR 0.299
Number Estimators= 10, Max Depth = 44, Max Feat = 57, AUC PR 0.317
Number Estimators= 74, Max Depth = 35, Max Feat = 32, AUC PR 0.343
Number Estimators= 8, Max Depth = 50, Max Feat = 72, AUC PR 0.318
Number Estimators= 105, Max Depth = 17, Max Feat = 73, AUC PR 0.233
Number Estimators= 10, Max Depth = 44, Max Feat = 57, AUC PR 0.249
Number Estimators= 81, Max Depth = 50, Max Feat = 18, AUC PR 0.338
Number Estimators= 37, Max Depth = 26, Max Feat = 57, AUC PR 0.196
Number Estimators= 8, Max Depth = 50, Max Feat = 72, AUC PR 0.254
Number Estimators= 74, Max Depth = 35, Max Feat = 32, AUC PR 0.