In [7]:
import sklearn
import pandas as pd
import numpy as np
import scikitplot as skplt
import matplotlib
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE

In [3]:
df = pd.read_csv('../data/train.csv', '|')

X, y = df.drop('fraud', axis=1), df['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

sm = SMOTE(random_state=123)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [5]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [46]:
clf = RandomForestClassifier()
clf_random = RandomizedSearchCV(estimator = clf, 
                               param_distributions = random_grid, 
                               n_iter = 200, 
                               cv = 5, 
                               verbose=5, 
                               random_state=42, 
                               n_jobs = -1,
                               scoring = 'recall',
                               return_train_score = True)
clf_random.fit(X_res, y_res)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 17.0min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 19.2min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=200, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scoring='recall', verbose=5)

In [47]:
clf_random.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 1800}

In [48]:
clf2 = RandomForestClassifier(bootstrap=False,
                              max_depth=80,
                              max_features='sqrt',
                              min_samples_leaf=2,
                              min_samples_split=5,
                              n_estimators=400)

clf2.fit(X_train, y_train)

pred = clf2.predict(X_test)
acc = clf2.score(X_test, y_test)
prec = precision_score(y_test, pred)
rec = recall_score(y_test, pred)

print('Accuracy: {}, precision: {}, recall: {}'.format(acc, prec, rec))

Accuracy: 0.9813829787234043, precision: 0.8823529411764706, recall: 0.75
