## Random Forest

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,\
recall_score,roc_curve,auc

#import expectation_reflection as ER
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from function import split_train_test,make_data_balance

In [2]:
# Fix NumPy's global RNG so that a fresh "Restart & Run All" repeats the same run.
np.random.seed(seed=1)

First, we load the list of processed data set names that will be evaluated.

In [3]:
# Names of the data sets to evaluate, read as strings from a text file.
# To run on a subset, assign a list of names to `data_list` instead.
data_list = np.loadtxt('data_list_30sets.txt', dtype=str)

print(data_list)

['1paradox' '2peptide' '3stigma' '4nki' '5mental' '6smoking' '7anemia'
 '8language' '9coag' '10tazamia' '11hepato' '12heat' '13ef' '14cervix'
 '15heart' '16liver' '17nwosu' '18school' '19ibs' '21survival'
 '29parkinson' '30paradox2' '31renal' '33svr' '35pcos' '36probiotic'
 '101kidney' '102breast_cancer' '103diabetes_niddk'
 '104diabetic_retinopathy']


In [4]:
def read_data(data_id):
    """Load one data set and return a scaled 50/50 train/test split.

    Parameters
    ----------
    data_id : int
        Index into the module-level ``data_list`` of data set names.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Features scaled with a ``MinMaxScaler`` fitted on the training half
        only, plus the matching label vectors.

    Notes
    -----
    The last column of the loaded file is used as the label; all other
    columns are features. ``make_data_balance`` is a project helper whose
    implementation is not shown here -- presumably it equalises the class
    counts (the counts are printed right after the call so this can be
    checked).
    """
    data_name = data_list[data_id]
    print('data_name:',data_name)

    Xy = np.loadtxt('../classification_data/%s/data_processed_mean.dat'%data_name)
    features, labels = Xy[:, :-1], Xy[:, -1]

    features, labels = make_data_balance(features, labels)
    print(np.unique(labels,return_counts=True))

    # Fixed seeds keep both the shuffle and the split identical between runs.
    features, labels = shuffle(features, labels, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.5, random_state=1)

    # Fit the scaler on the training half only, then apply it to both halves.
    scaler = MinMaxScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test), y_train, y_test

In [5]:
def measure_performance(X_train,X_test,y_train,y_test):
    """Tune a random forest by grid search and score it on the test split.

    A ``RandomForestClassifier`` is tuned with 4-fold ``GridSearchCV`` on the
    training data; the refitted best estimator is then evaluated on the test
    data. The selected hyper-parameters are printed.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test labels. Binary labels with ``1`` as the positive
        class are assumed (scikit-learn's default ``pos_label``) -- confirm
        if the label encoding changes.

    Returns
    -------
    acc, roc_auc, precision, recall, f1_score : float
        Test accuracy, area under the ROC curve, precision, recall and F1.
        F1 is reported as ``0.0`` when precision and recall are both zero.
    """
    model = RandomForestClassifier()

    hyper_parameters = {
        # Number of trees in the forest.
        'n_estimators': [10, 50, 100],
        # Features considered at each split. 'sqrt' is what the former 'auto'
        # alias meant for classifiers; 'auto' is rejected by newer scikit-learn.
        'max_features': ['sqrt'],
        # Maximum number of levels in each tree.
        'max_depth': [2, 4, 6, 8, 10],
        # Minimum number of samples required to split a node.
        'min_samples_split': [5, 10, 15, 20],
        # Minimum number of samples required at each leaf node.
        'min_samples_leaf': list(range(1, 6)),
        # Method of selecting samples for training each tree.
        'bootstrap': [True],
    }

    # Exhaustive search with 4-fold cross validation. The `iid` argument is
    # not passed: it was removed from GridSearchCV and now raises TypeError.
    clf = GridSearchCV(model, hyper_parameters, cv=4)
    best_model = clf.fit(X_train, y_train)

    print('best_hyper_parameters:',best_model.best_params_)

    estimator = best_model.best_estimator_

    # Hard-label metrics.
    y_test_pred = estimator.predict(X_test)
    acc = accuracy_score(y_test,y_test_pred)
    precision = precision_score(y_test,y_test_pred)
    recall = recall_score(y_test,y_test_pred)

    # ROC/AUC from the probability of the second class in `classes_`
    # (classes are sorted, so this is the larger label).
    p_test_pred = estimator.predict_proba(X_test)[:,1]
    fp,tp,thresholds = roc_curve(y_test, p_test_pred, drop_intermediate=False)
    roc_auc = auc(fp,tp)

    # Guard the harmonic mean: with no true positives both terms are 0 and
    # the plain formula would yield NaN, which would poison later averages.
    pr_sum = precision + recall
    f1_score = 2*precision*recall/pr_sum if pr_sum > 0 else 0.0

    return acc,roc_auc,precision,recall,f1_score

In [6]:
n_data = len(data_list)

# One slot per data set for every metric; later cells read these names.
acc, roc_auc, precision, recall, f1_score = (np.zeros(n_data) for _ in range(5))

for data_id in range(n_data):
    X_train, X_test, y_train, y_test = read_data(data_id)
    scores = measure_performance(X_train, X_test, y_train, y_test)

    # measure_performance returns (acc, roc_auc, precision, recall, f1_score).
    (acc[data_id], roc_auc[data_id], precision[data_id],
     recall[data_id], f1_score[data_id]) = scores

    print(data_id, *(metric[data_id]
                     for metric in (acc, roc_auc, precision, recall, f1_score)))

data_name: 1paradox
(array([-1.,  1.]), array([60, 60]))
best_hyper_parameters: {'bootstrap': True, 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
0 0.75 0.9071680376028202 0.6333333333333333 0.8260869565217391 0.7169811320754716
data_name: 2peptide
(array([-1.,  1.]), array([23, 23]))
best_hyper_parameters: {'bootstrap': True, 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
1 0.9565217391304348 1.0 1.0 0.9090909090909091 0.9523809523809523
data_name: 3stigma
(array([-1.,  1.]), array([2725, 2725]))
best_hyper_parameters: {'bootstrap': True, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
2 1.0 1.0000000000000002 1.0 1.0 1.0
data_name: 4nki
(array([-1.,  1.]), array([77, 77]))
best_hyper_parameters: {'bootstrap': True, 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n



best_hyper_parameters: {'bootstrap': True, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 10}
25 0.9 1.0 0.75 1.0 0.8571428571428571
data_name: 101kidney
(array([-1.,  1.]), array([149, 149]))
best_hyper_parameters: {'bootstrap': True, 'max_depth': 6, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 10}
26 0.9798657718120806 0.9985554351751533 0.9857142857142858 0.971830985915493 0.9787234042553192
data_name: 102breast_cancer
(array([-1.,  1.]), array([212, 212]))
best_hyper_parameters: {'bootstrap': True, 'max_depth': 6, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
27 0.9481132075471698 0.9842775820005422 0.9821428571428571 0.9243697478991597 0.9523809523809523
data_name: 103diabetes_niddk
(array([-1.,  1.]), array([267, 267]))
best_hyper_parameters: {'bootstrap': True, 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_sp

In [7]:
# Average of each metric over all data sets.
for label, values in (('acc_mean:', acc),
                      ('roc_mean:', roc_auc),
                      ('precision:', precision),
                      ('recall:', recall),
                      ('f1_score:', f1_score)):
    print(label, values.mean())

acc_mean: 0.8379074846630751
roc_mean: 0.8891390512800414
precision: 0.8433674714419143
recall: 0.8316621188546226
f1_score: 0.8331130257266882


In [8]:
# One row per metric, in this order: roc_auc, acc, precision, recall, f1_score;
# one column per data set.
metric_rows = np.vstack((roc_auc, acc, precision, recall, f1_score))
np.savetxt('result_mean_RF.dat', metric_rows, fmt='%f')