## Prediction with Random Forest

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,\
recall_score,roc_curve,auc

#import expectation_reflection as ER
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from function import split_train_test,make_data_balance

In [2]:
# Seed NumPy's global random number generator so repeated runs start from the
# same state. The scikit-learn calls further down additionally pass their own
# explicit random_state=1.
np.random.seed(1)

First, we list the processed datasets to evaluate; each one is loaded from disk by `read_data` below.

In [3]:
# Names of the datasets to evaluate; read_data() looks each one up under ../data/.
# Alternative: load the dataset names from a text file instead of hard-coding them.
#data_list = np.loadtxt('data_list.txt',dtype='str')
data_list = ['1paradox']

# Echo the selection so the notebook output records which datasets were run.
print(data_list)

['1paradox']


In [4]:
def read_data(data_id):
    """Load one dataset and return a balanced, scaled 50/50 train/test split.

    The dataset name is looked up in the module-level ``data_list`` and the
    matrix is read from ``../data/<name>/data_processed.dat``. The last
    column is used as the label and all preceding columns as features.

    The class counts are printed before and after ``make_data_balance`` is
    applied (presumably it equalizes the class sizes -- confirm against the
    helper in ``function.py``).

    Parameters
    ----------
    data_id : int
        Index into ``data_list``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Features min-max scaled with statistics from the training half only,
        together with the matching label vectors.
    """
    data_name = data_list[data_id]
    print('data_name:',data_name)

    # Last column is the target; everything before it is a feature.
    table = np.loadtxt('../data/%s/data_processed.dat'%data_name)
    features,labels = table[:,:-1],table[:,-1]
    print(np.unique(labels,return_counts=True))

    features,labels = make_data_balance(features,labels)
    print(np.unique(labels,return_counts=True))

    # Deterministic shuffle followed by a deterministic half/half split.
    features,labels = shuffle(features,labels,random_state=1)
    X_train,X_test,y_train,y_test = train_test_split(
        features,labels,test_size=0.5,random_state=1)

    # Learn the scaling range on the training half only, then apply it to both
    # halves so no test-set statistics enter the preprocessing.
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train,X_test,y_train,y_test

In [5]:
def measure_performance(X_train,X_test,y_train,y_test):
    """Tune a random forest by grid search and score it on the test split.

    An exhaustive 4-fold cross-validated grid search over the random-forest
    hyper-parameters is run on the training split. The best parameter
    combination is printed, and the corresponding refitted estimator
    (``best_estimator_``) is evaluated on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Label vectors. The ROC/precision/recall computations assume a binary
        problem where column 1 of ``predict_proba`` is the positive class.

    Returns
    -------
    acc : float
        Accuracy on the test split.
    roc_auc : float
        Area under the ROC curve on the test split.
    precision : float
        Precision on the test split.
    recall : float
        Recall on the test split.
    """
    model = RandomForestClassifier(random_state = 1)

    # Exhaustive search grid (every combination below is evaluated).
    hyper_parameters = {
        # Number of trees in the forest: 10, 20, ..., 100
        'n_estimators': list(range(10, 101, 10)),
        # Number of features to consider at every split. 'sqrt' is what the
        # former 'auto' option meant for classifiers; 'auto' itself was
        # deprecated in scikit-learn 1.1 and removed in 1.3, where it raises
        # an error for every candidate.
        'max_features': ['sqrt'],
        # Maximum number of levels in each tree: 1, 2, ..., 10
        'max_depth': list(range(1, 11)),
        # Minimum number of samples required to split a node
        'min_samples_split': [5, 10, 15, 20],
        # Minimum number of samples required at each leaf node: 1, ..., 5
        'min_samples_leaf': list(range(1, 6)),
        # Whether each tree is trained on a bootstrap sample
        'bootstrap': [True, False],
    }

    model_search = GridSearchCV(model,hyper_parameters,cv=4,verbose=0,n_jobs = -1)
    model_search.fit(X_train, y_train)

    # best hyper parameters
    print(model_search.best_params_)

    # performance of the best estimator on the held-out data:
    best_model = model_search.best_estimator_

    y_test_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test,y_test_pred)

    # predict_proba returns one column per class; keep the probability of class 1
    p_test_pred = best_model.predict_proba(X_test)[:,1]
    fp,tp,thresholds = roc_curve(y_test, p_test_pred, drop_intermediate=False)
    roc_auc = auc(fp,tp)

    precision = precision_score(y_test,y_test_pred)
    recall = recall_score(y_test,y_test_pred)

    return acc,roc_auc,precision,recall

In [6]:
# One result slot per dataset for each metric returned by measure_performance.
n_data = len(data_list)
roc_auc = np.zeros(n_data)
acc = np.zeros(n_data)
precision = np.zeros(n_data)
recall = np.zeros(n_data)

# Load, split and evaluate every dataset in turn, storing its four scores.
for data_id in range(n_data):
    X_train,X_test,y_train,y_test = read_data(data_id)
    (acc[data_id],
     roc_auc[data_id],
     precision[data_id],
     recall[data_id]) = measure_performance(X_train,X_test,y_train,y_test)

data_name: 1paradox
(array([0., 1.]), array([169,  60]))
(array([0., 1.]), array([60, 60]))
{'bootstrap': True, 'max_depth': 3, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 60}




In [7]:
#np.savetxt('result_RF.dat',(roc_auc,acc,precision,recall),fmt='%f')