## Prediction with Random Forest

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,\
recall_score,roc_curve,auc

#import expectation_reflection as ER
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from function import split_train_test,make_data_balance

In [2]:
np.random.seed(1)

First of all, the processed data are imported.

In [3]:
data_list = np.loadtxt('data_list.txt',dtype='str')

print(data_list)

['1paradox' '2peptide' '3stigma' '4nki' '5mental' '6smoking' '7anemia'
 '8language' '9coag' '10tazamia' '11hepato' '12heat' '13ef' '14cervix'
 '15heart' '16liver' '17nwosu' '18school' '19ibs' '21survival' '101kidney'
 '102breast_cancer' '103diabetes_niddk' '104diabetic_retinopathy']


In [4]:
def read_data(data_id):    
    data_name = data_list[data_id]
    print('data_name:',data_name)
    Xy = np.loadtxt('%s/data_processed.dat'%data_name) 
    X = Xy[:,:-1]
    y = Xy[:,-1]

    print(np.unique(y,return_counts=True))

    X,y = make_data_balance(X,y)

    print(np.unique(y,return_counts=True))

    X, y = shuffle(X, y, random_state=1)

    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5,random_state = 1)
    
    sc = MinMaxScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    return X_train,X_test,y_train,y_test

In [5]:
def measure_performance(X_train,X_test,y_train,y_test):
    model = RandomForestClassifier(random_state = 1)
        
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]

    # Number of features to consider at every split
    max_features = ['auto']

    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
    #max_depth.append(None)

    # Minimum number of samples required to split a node
    min_samples_split = [5, 10, 15, 20]

    # Minimum number of samples required at each leaf node
    min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 5, num = 5)]

    # Method of selecting samples for training each tree
    bootstrap = [True, False]
        
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    
    random_search = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 100, 
                               cv = 4, verbose=2, random_state=1, n_jobs = -1)
    
    random_search.fit(X_train, y_train)

    # best hyper parameters
    print(random_search.best_params_)

    # performance:
    #y_test_pred,p_test_pred = ER.predict(X_test,h0[il2_opt],w[il2_opt,:])
    y_test_pred = random_search.best_estimator_.predict(X_test)
    acc = accuracy_score(y_test,y_test_pred)
    #print('Accuracy:', acc)
    
    p_test_pred = random_search.best_estimator_.predict_proba(X_test) # prob of [0,1]
    p_test_pred = p_test_pred[:,1] # prob of 1    
    fp,tp,thresholds = roc_curve(y_test, p_test_pred, drop_intermediate=False)
    roc_auc = auc(fp,tp)
    #print('AUC:', roc_auc)

    precision = precision_score(y_test,y_test_pred)
    #print('Precision:',precision)

    recall = recall_score(y_test,y_test_pred)
    #print('Recall:',recall)

    return acc,roc_auc,precision,recall

In [6]:
n_data = len(data_list)
roc_auc = np.zeros(n_data)   ; acc = np.zeros(n_data)
precision = np.zeros(n_data) ; recall = np.zeros(n_data)

#data_id = 0
for data_id in range(n_data):
    X_train,X_test,y_train,y_test = read_data(data_id)
    acc[data_id],roc_auc[data_id],precision[data_id],recall[data_id] =\
           measure_performance(X_train,X_test,y_train,y_test)

data_name: 1paradox
(array([0., 1.]), array([169,  60]))
(array([0., 1.]), array([60, 60]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    2.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 70, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 7, 'bootstrap': False}
data_name: 2peptide
(array([0., 1.]), array([675,  23]))
(array([0., 1.]), array([23, 23]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    0.7s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.3s finished


{'n_estimators': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 2, 'bootstrap': True}
data_name: 3stigma
(array([0., 1.]), array([2725, 7940]))
(array([0., 1.]), array([2725, 2725]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    1.2s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    2.1s finished


{'n_estimators': 30, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': True}
data_name: 4nki
(array([0., 1.]), array([195,  77]))
(array([0., 1.]), array([77, 77]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    1.0s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 10, 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 4, 'bootstrap': True}
data_name: 5mental
(array([0., 1.]), array([616, 147]))
(array([0., 1.]), array([147, 147]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    0.9s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 80, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 4, 'bootstrap': False}
data_name: 6smoking
(array([0., 1.]), array([852, 722]))
(array([0., 1.]), array([722, 722]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 70, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 7, 'bootstrap': False}
data_name: 7anemia
(array([0., 1.]), array([193,  43]))
(array([0., 1.]), array([43, 43]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 100, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 2, 'bootstrap': True}
data_name: 8language
(array([0., 1.]), array([896, 267]))
(array([0., 1.]), array([267, 267]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 90, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': True}
data_name: 9coag
(array([0., 1.]), array([504, 994]))
(array([0., 1.]), array([504, 504]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    1.0s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 90, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': 2, 'bootstrap': True}
data_name: 10tazamia
(array([0., 1.]), array([547, 124]))
(array([0., 1.]), array([124, 124]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    0.8s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 40, 'min_samples_split': 20, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 3, 'bootstrap': False}
data_name: 11hepato
(array([0., 1.]), array([63, 99]))
(array([0., 1.]), array([63, 63]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    0.8s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.2s finished


{'n_estimators': 70, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 1, 'bootstrap': False}
data_name: 12heat
(array([0., 1.]), array([2492,   83]))
(array([0., 1.]), array([83, 83]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    0.7s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 40, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 5, 'bootstrap': True}
data_name: 13ef
(array([0., 1.]), array([ 93, 572]))
(array([0., 1.]), array([93, 93]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 2, 'bootstrap': True}
data_name: 14cervix
(array([0., 1.]), array([834,  24]))
(array([0., 1.]), array([24, 24]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    0.7s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 90, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 3, 'bootstrap': False}
data_name: 15heart
(array([0., 1.]), array([138, 165]))
(array([0., 1.]), array([138, 138]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    0.8s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 100, 'min_samples_split': 20, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 1, 'bootstrap': False}
data_name: 16liver
(array([0., 1.]), array([167, 416]))
(array([0., 1.]), array([167, 167]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    0.8s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 80, 'min_samples_split': 20, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 3, 'bootstrap': True}
data_name: 17nwosu
(array([0., 1.]), array([59, 92]))
(array([0., 1.]), array([59, 59]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    0.7s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.2s finished


{'n_estimators': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 2, 'bootstrap': True}
data_name: 18school
(array([0., 1.]), array([  68, 3879]))
(array([0., 1.]), array([68, 68]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    0.6s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 90, 'min_samples_split': 20, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 6, 'bootstrap': True}
data_name: 19ibs
(array([0., 1.]), array([ 33, 138]))
(array([0., 1.]), array([33, 33]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    0.7s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 10, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 4, 'bootstrap': True}
data_name: 21survival
(array([0., 1.]), array([1945,  123]))
(array([0., 1.]), array([123, 123]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 70, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 7, 'bootstrap': False}
data_name: 101kidney
(array([0., 1.]), array([149, 223]))
(array([0., 1.]), array([149, 149]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 90, 'min_samples_split': 20, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 6, 'bootstrap': True}
data_name: 102breast_cancer
(array([0., 1.]), array([357, 212]))
(array([0., 1.]), array([212, 212]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 80, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 4, 'bootstrap': False}
data_name: 103diabetes_niddk
(array([0., 1.]), array([481, 252]))
(array([0., 1.]), array([252, 252]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    0.8s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


{'n_estimators': 30, 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 8, 'bootstrap': True}
data_name: 104diabetic_retinopathy
(array([0., 1.]), array([536, 611]))
(array([0., 1.]), array([536, 536]))
Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 136 out of 400 | elapsed:    1.0s remaining:    2.0s


{'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': True}


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.7s finished


In [7]:
np.savetxt('result_RF.dat',(roc_auc,acc,precision,recall),fmt='%f')