## Random Forest parameters tuning

This notebook is dedicated to finding the best model's hyperparameters for training with RandomOverSampling.

In [1]:
import sklearn
import pandas as pd
import numpy as np
#import scikitplot as skplt
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, make_scorer
from datetime import datetime

from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score

from imblearn.over_sampling import RandomOverSampler

## Data

In [8]:
df = pd.read_csv('../data/train.csv', '|')

X, y = df.drop('fraud', axis=1), df['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

ros = RandomOverSampler()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_res, y_res = ros.fit_resample(X_train, y_train)

## random grid

In [9]:
# Number of trees in random forest
n_estimators = [1000]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

## Without tuning



In [10]:
clf = RandomForestClassifier()
clf.fit(X_res, y_res)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Stats (without tuning)

In [11]:
get_model_stats(clf)

Accuracy: 0.9707446808510638, precision: 0.8125, recall: 0.6190476190476191, f1: 0.7027027027027026, cost: -9000


## Random tuning

In [12]:
clf_random = RandomForestClassifier()
clf_random = RandomizedSearchCV(estimator = clf_random, 
                               param_distributions = random_grid, 
                               n_iter = 200, 
                               cv = 5, 
                               verbose=5, 
                               random_state=42, 
                               n_jobs = -1,
                               scoring = 'f1',
                               return_train_score = True)
clf_random.fit(X_res, y_res)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  5.3min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=200, n_jobs=-1,
          param_distributions={'n_estimators': [1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scoring='f1', verbose=5)

## Stats (random tuning)

In [13]:
get_model_stats(clf_random)

Accuracy: 0.7894736842105262, precision: 0.8823529411764706, recall: 0.7142857142857143, f1: 0.7894736842105262, cost: -8975


## Best params (random tuning)

In [14]:
clf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': False}

## Grid tuning 1

In [15]:
# Number of trees in random forest
#n_estimators = [int(x) for x in np.linspace(start = 100, stop = 400, num = 10)]
n_estimators = [1000]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = ([int(x) for x in np.linspace(20, 50, num = 11)])
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [8,9,10,11]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2]
# Method of selecting samples for training each tree
bootstrap = [False]

# Create the detail grid
detail_grid1 = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [16]:
clf_g1 = RandomForestClassifier()
clf_g1 = GridSearchCV(estimator = clf_g1, param_grid = detail_grid1, 
                          cv = 5, n_jobs = -1, verbose = 5, scoring = 'f1')
clf_g1.fit(X_res, y_res)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  2.8min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [1000], 'max_features': ['sqrt'], 'max_depth': [20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, None], 'min_samples_split': [8, 9, 10, 11], 'min_samples_leaf': [1, 2], 'bootstrap': [False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=5)

## Stats grid tuning 1

In [17]:
get_model_stats(clf_g1)

Accuracy: 0.7894736842105262, precision: 0.8823529411764706, recall: 0.7142857142857143, f1: 0.7894736842105262, cost: -8975


## Best params - grid tuning 1

In [19]:
clf_g1.best_params_

{'bootstrap': False,
 'max_depth': 47,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 1000}

## Grid tuning 2

In [20]:
# Number of trees in random forest
#n_estimators = [int(x) for x in np.linspace(start = 100, stop = 400, num = 10)]
n_estimators = [1000]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [21,22,23,24,25]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [9,10,11,12,13,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2]
# Method of selecting samples for training each tree
bootstrap = [False]

# Create the detail grid
detail_grid2 = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [21]:
clf_g2 = RandomForestClassifier()
clf_g2 = GridSearchCV(estimator = clf_g2, param_grid = detail_grid2, 
                      cv = 5, n_jobs = -1, verbose = 5, scoring = 'f1')
clf_g2.fit(X_res, y_res)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   49.8s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  2.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [1000], 'max_features': ['sqrt'], 'max_depth': [21, 22, 23, 24, 25, None], 'min_samples_split': [9, 10, 11, 12, 13, 14], 'min_samples_leaf': [1, 2], 'bootstrap': [False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=5)

## Stats grid tuning 2

In [22]:
get_model_stats(clf_g2)

Accuracy: 0.7567567567567567, precision: 0.875, recall: 0.6666666666666666, f1: 0.7567567567567567, cost: -9000


## Best params grid tuning 2

In [23]:
clf_g2.best_params_

{'bootstrap': False,
 'max_depth': 24,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 1000}

### Helper functions

In [24]:
def get_model_stats(clf):
    pred = clf.predict(X_test)
    acc = clf.score(X_test, y_test)
    prec = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    cost = profit_function(y_test, pred)
    print('Accuracy: {}, precision: {}, recall: {}, f1: {}, cost: {}'.format(acc, prec, rec, f1, cost))

In [25]:
def profit_function(y,y_pred):
    y_true = y
    confusion_m = np.array(confusion_matrix(y_true, y_pred))
    price_matrix = [[0,-25],[-5,5]]
    profit_matrix = np.matmul(confusion_m,price_matrix)
    return profit_matrix.sum()