In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

#import ER_multiclass as ER

#from sklearn.linear_model import LogisticRegression
#from sklearn.naive_bayes import GaussianNB
#from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
np.random.seed(1)

In [3]:
X = np.loadtxt('../gen_cleaned_X.txt')
y = np.loadtxt('../gen_cleaned_y.txt')

In [4]:
ipred = 0
train_size = 0.8

X_train0,X_test,y_train0,y_test = train_test_split(X,y,test_size=0.2,random_state = ipred)

idx_train = np.random.choice(len(y_train0),size=int(train_size*len(y)),replace=False)
X_train,y_train = X_train0[idx_train],y_train0[idx_train]

In [5]:
## Optimize hyper parameters by RandomizedSearchCV
model = RandomForestClassifier(random_state = 1)

In [6]:
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [7]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]

# Number of features to consider at every split
max_features = ['auto']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
#max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [5, 10, 15, 20]

# Minimum number of samples required at each leaf node
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 5, num = 5)]

# Method of selecting samples for training each tree
bootstrap = [True, False]

In [8]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [9]:
random_grid

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
 'max_features': ['auto'],
 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 'min_samples_split': [5, 10, 15, 20],
 'min_samples_leaf': [1, 2, 3, 4, 5],
 'bootstrap': [True, False]}

In [10]:
random_search = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 100, 
                               cv = 3, verbose=2, random_state=1, n_jobs = -1)

random_search.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    8.8s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_features': ['auto'], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [5, 10, 15, 20], 'min_samples_leaf': [1, 2, 3, 4, 5], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [11]:
# best hyper parameters
random_search.best_params_

{'n_estimators': 90,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': False}

In [12]:
y_pred = random_search.best_estimator_.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print(accuracy)

0.9622047244094488
