In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [2]:
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [3]:
X = dataset.iloc[:, 2:4].values
y = dataset.iloc[:, -1].values
print(X.shape)
print(y.shape)

(400, 2)
(400,)


In [4]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(300, 2)
(300,)
(100, 2)
(100,)


In [5]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [6]:
sc.fit_transform(X_train)
sc.transform(X_test)

array([[ 0.812419  , -1.39920777],
       [ 2.0889839 ,  0.52871943],
       [-0.95513241, -0.75656537],
       [ 1.0088136 ,  0.76240757],
       [-0.85693511, -1.22394166],
       [-0.75873781, -0.23076704],
       [ 0.9106163 ,  1.08372877],
       [-0.85693511,  0.38266434],
       [ 0.2232352 ,  0.14897619],
       [ 0.4196298 , -0.14313399],
       [-0.2677513 , -0.14313399],
       [ 1.4998001 , -1.04867555],
       [-1.44611891, -0.6397213 ],
       [-1.74071081, -1.36999675],
       [-0.75873781,  0.49950841],
       [-0.2677513 ,  1.11293979],
       [ 1.4016028 , -0.93183148],
       [ 0.812419  ,  0.11976517],
       [ 0.1250379 , -0.8149874 ],
       [ 1.794392  , -0.28918908],
       [-1.54431621, -1.25315268],
       [-0.85693511,  0.29503128],
       [ 0.9106163 , -1.36999675],
       [ 2.0889839 ,  0.17818721],
       [-1.83890811, -1.48684082],
       [ 1.3034055 , -1.36999675],
       [ 0.4196298 ,  0.29503128],
       [-0.0713567 , -0.49366621],
       [ 1.6961947 ,

In [7]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10,
    criterion='entropy',
    max_depth=10,
    #min_samples_split=2,
    #min_samples_leaf=1,
    #min_weight_fraction_leaf=0.0,
    #max_features='auto',
    max_leaf_nodes=8,
    #min_impurity_decrease=0.0,
    #min_impurity_split=None,
    #bootstrap=True,
    #oob_score=False,
    n_jobs=-1,
    random_state=42,
    verbose=1)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

print(confusion_matrix(y_test, pred))
print(accuracy_score(y_test, pred))

cross_val = cross_val_score(clf, X, y, cv=10, scoring='accuracy').mean()

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.0s finished


[[57  6]
 [ 1 36]]
0.93


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.2s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Paralle

In [8]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [9]:
est = RandomForestClassifier(n_jobs=-1)
params = {'n_estimators':range(0,550,50),
    'criterion':['entropy','gini'],
    'max_depth': range(0,20,4),
    #min_samples_split=2,
    'min_samples_leaf': range(1,4),
    #min_weight_fraction_leaf=0.0,
    'max_features':range(1,3),
    'max_leaf_nodes':range(8,40,8),
    #min_impurity_decrease=0.0,
    #min_impurity_split=None,
    'bootstrap':[True, False],
    #oob_score=False,
    }

In [10]:
def hypertuning_rfclf(classifier, params, iterations, dataset_X, dataset_y):
    rdSearch = RandomizedSearchCV(classifier,
                                  params,
                                 n_jobs=-1,
                                 n_iter=iterations,
                                 cv=9)
    rdSearch.fit(dataset_X, dataset_y)
    best_params = rdSearch.best_params_
    best_score = rdSearch.best_score_
    return best_params, best_score    

In [11]:
rf_params, rf_ht_score = hypertuning_rfclf(est, params, 40, X, y)

In [12]:
rf_params

{'n_estimators': 500,
 'min_samples_leaf': 1,
 'max_leaf_nodes': 8,
 'max_features': 1,
 'max_depth': 12,
 'criterion': 'entropy',
 'bootstrap': True}

In [13]:
rf_ht_score

0.9095398428731762

In [16]:
# now that we've been provided with the best values, we create the classifier 
# again with these values and make a prediction after fine-tuning the hyper-parameters

classifier = RandomForestClassifier(
n_estimators=500,
    criterion='entropy',
    max_depth=12,
    min_samples_leaf=1,
    max_features=2,
    max_leaf_nodes=8,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

classifier.fit(X_train, y_train)
pred_new = classifier.predict(X_test)

print(confusion_matrix(y_test, pred_new))
print(accuracy_score(y_test, pred_new))


[[57  6]
 [ 1 36]]
0.93


In [17]:
cross_val = cross_val_score(clf, X, y, cv=10, scoring='accuracy').mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Paralle