# Random Forest Classification - GridSearchCV

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

## Importing the dataset

In [2]:
# Read the raw table; every column except the last is used as a feature
# and the last column is used as the target.
df = pd.read_csv('Classifiers.csv')
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

## Feature Scaling

In [4]:
# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

## Preparing the parameter grid

In [5]:
# Candidate values for each hyperparameter explored by the grid search.
n_estimators = list(range(10, 301, 10))    # 10, 20, ..., 300
criterion = ['gini', 'entropy']
max_features = ['sqrt', 'log2']
max_depth = list(range(10, 101, 10))       # 10, 20, ..., 100
min_samples_split = [2, 4, 8, 16]
min_samples_leaf = [1, 2, 4, 8, 16]
bootstrap = [True, False]

# Full Cartesian grid: 30 * 2 * 2 * 10 * 4 * 5 * 2 = 48,000 combinations.
param_grid = dict(
    n_estimators = n_estimators,
    criterion = criterion,
    max_features = max_features,
    max_depth = max_depth,
    min_samples_split = min_samples_split,
    min_samples_leaf = min_samples_leaf,
    bootstrap = bootstrap,
)

## Finding the best parameters

In [6]:
# Seed the base estimator: an unseeded forest draws different bootstrap
# samples and feature subsets on every run, so the cross-validation scores
# (and therefore the selected best parameters) would not be reproducible.
# The seed matches the one used for the final model trained from these parameters.
rf_classifier = RandomForestClassifier(random_state = 42)
 
# Exhaustive search over every combination in `param_grid`, scored with 3-fold
# cross-validation. `n_jobs` means parallel jobs to run -> -1 means using all processors
grid_search = GridSearchCV(rf_classifier, param_grid, cv = 3, verbose = 1, n_jobs = -1)
 
# `fit` returns the fitted search object itself, so `grid_fit` and
# `grid_search` refer to the same object.
grid_fit = grid_search.fit(X_train, y_train)
# Last expression of the cell, so the notebook displays the winning combination.
grid_fit.best_params_

Fitting 3 folds for each of 48000 candidates, totalling 144000 fits


{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 100,
 'max_features': 'log2',
 'min_samples_leaf': 16,
 'min_samples_split': 2,
 'n_estimators': 10}

## Using the best parameters to train the RF classifier

In [7]:
# Build a fresh forest from the winning combination found by the grid search.
# `best_params_` maps each searched hyperparameter name to its selected value,
# so it can be unpacked directly as keyword arguments; the seed is fixed separately.
grid_classifier = RandomForestClassifier(random_state = 42, **grid_fit.best_params_)
grid_classifier.fit(X_train, y_train)

## Making the Confusion Matrix

In [8]:
# Predict on the held-out test features and tabulate true vs. predicted labels.
y_pred = grid_classifier.predict(X_test)
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm)
# Last expression of the cell, so the notebook displays the accuracy.
accuracy_score(y_true = y_test, y_pred = y_pred)

[[47  5]
 [ 2 26]]


0.9125