## Prediction with Random Forest

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,\
recall_score,roc_curve,auc

#import expectation_reflection as ER
#from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from function import split_train_test,make_data_balance

In [2]:
np.random.seed(1)

Import data containing features and target.

In [3]:
Xy = np.loadtxt('data_processed.dat') 
X = Xy[:,:-1]
y = Xy[:,-1]

In [4]:
X,y = make_data_balance(X,y)

Shuffle the data.

In [5]:
X, y = shuffle(X, y, random_state=1)

We split data into training and test sets, then we use the training to train our model, use the test set to evaluate the performance of our method. The size of test set can be changed by `test_size`.

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5,random_state = 1)

We rescale the data.

In [7]:
sc = MinMaxScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

We now train model with the `X_train` and `y_train`. Please note that our model has one hyper parameter `l2`. We will use cross validation to find the optimal value of `l2`. This process will also split the training set again into `X_train1` and `X_val`. The test set is assumed to be unknown.

In [8]:
model = RandomForestClassifier(random_state = 1)

In [9]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]

# Number of features to consider at every split
max_features = ['auto']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
#max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [5, 10, 15, 20]

# Minimum number of samples required at each leaf node
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 5, num = 5)]

# Method of selecting samples for training each tree
bootstrap = [True, False]

In [10]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [11]:
random_search = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 100, 
                               cv = 4, verbose=2, random_state=1, n_jobs = -1)

random_search.fit(X_train, y_train)

Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    2.1s finished


RandomizedSearchCV(cv=4, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_features': ['auto'], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [5, 10, 15, 20], 'min_samples_leaf': [1, 2, 3, 4, 5], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [12]:
# best hyper parameters
random_search.best_params_

{'n_estimators': 100,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 8,
 'bootstrap': True}

In [13]:
y_pred = random_search.best_estimator_.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print(accuracy)

1.0


In [14]:
p_test_pred = random_search.best_estimator_.predict_proba(X_test)

In [15]:
fp,tp,thresholds = roc_curve(y_test, p_test_pred, drop_intermediate=False)

ValueError: bad input shape (149, 2)

In [16]:
p_test_pred

array([[9.20122683e-01, 7.98773167e-02],
       [2.70000000e-01, 7.30000000e-01],
       [9.99718310e-01, 2.81690141e-04],
       [9.96575453e-01, 3.42454728e-03],
       [9.97423687e-02, 9.00257631e-01],
       [9.93670691e-01, 6.32930919e-03],
       [0.00000000e+00, 1.00000000e+00],
       [1.06013986e-02, 9.89398601e-01],
       [2.60439560e-02, 9.73956044e-01],
       [9.94384977e-01, 5.61502347e-03],
       [4.62676768e-02, 9.53732323e-01],
       [9.99718310e-01, 2.81690141e-04],
       [9.99718310e-01, 2.81690141e-04],
       [9.93884977e-01, 6.11502347e-03],
       [9.93289738e-01, 6.71026157e-03],
       [9.97039738e-01, 2.96026157e-03],
       [9.85075453e-01, 1.49245473e-02],
       [8.91557596e-01, 1.08442404e-01],
       [9.93242119e-01, 6.75788062e-03],
       [0.00000000e+00, 1.00000000e+00],
       [9.99718310e-01, 2.81690141e-04],
       [6.93351968e-01, 3.06648032e-01],
       [9.94384977e-01, 5.61502347e-03],
       [0.00000000e+00, 1.00000000e+00],
       [9.988849