In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [None]:
# Load the training DataFrame (it must contain a 'label' column, used below as the target).
# NOTE(review): unpickling can execute arbitrary code — only load "training_df2"
# if it comes from a trusted source.
df = pd.read_pickle("training_df2")

In [None]:
def paint_confusion_matrix_and_report(model, X0_test, y0_test):
    """Plot the confusion matrix of ``model`` on a test set and print its classification report.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X0_test : features handed to ``model.predict``.
    y0_test : ground-truth labels matching ``X0_test``.

    Returns
    -------
    None. A heatmap is drawn on a new matplotlib figure and the
    precision/recall report is printed to stdout.
    """
    y_pred = model.predict(X0_test)
    cm2 = confusion_matrix(y0_test, y_pred.round())
    # Explicit figure/axes: plt.subplot() would reuse the current figure's axes,
    # so a second call in the same cell would draw over the previous plot.
    _fig, ax = plt.subplots()
    sns.heatmap(cm2, annot=True, ax=ax, fmt="d", cmap="YlGnBu")
    # labels, title and ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    # NOTE(review): assumes the two classes sort as (no-conversion, conversion),
    # e.g. 0/1 labels — confirm against the dataset.
    ax.xaxis.set_ticklabels(['no-conversion', 'conversion'])
    ax.yaxis.set_ticklabels(['no-conversion', 'conversion'])
    # classification_report expects (y_true, y_pred); passing them reversed
    # swaps precision and recall in the printed report.
    prec_rec = classification_report(y0_test, y_pred, target_names=['no-conversion', 'conversion'])
    print(prec_rec)

In [None]:
# Separate the feature columns from the target column.
X = df.drop(['label'], axis=1)
y = df['label']  # Labels
# 70% training and 30% test. The fixed seed makes the split reproducible, and
# stratifying on `y` keeps the class ratio identical in both partitions, which
# matters because the label is imbalanced (it is undersampled further below).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
def fit_model(model, X0_train, y0_train):
    """Fit ``model`` on the given training data and report how long it took.

    Parameters
    ----------
    model : object exposing ``fit(X, y)``.
    X0_train : training features passed to ``fit``.
    y0_train : training labels passed to ``fit``.

    Returns
    -------
    The same ``model`` object, after ``fit`` has been called on it.
    """
    started_at = time()
    model.fit(X0_train, y0_train)
    elapsed = time() - started_at
    print('Training time = %.3f seconds' % elapsed)
    return model

In [None]:
# Balance the training set by randomly dropping samples of the majority class.
# Only the training partition is resampled; the test set keeps its original
# class distribution.
print("Before undersampling: ", Counter(y_train))
# Seeded so the same majority-class rows are kept on every run.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)
print("After undersampling: ", Counter(y_train_under))

In [None]:
# Candidate values for the randomized hyperparameter search.

# Number of trees in random forest: 10 evenly spaced values from 50 to 400.
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 400, num = 10)]
# Number of features to consider at every split.
# 'auto' was removed in scikit-learn 1.3 and, for classifiers, was only an
# alias of 'sqrt', so the old ['auto', 'sqrt'] list held two identical options.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited depth).
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [None]:
# Assemble the search space for the randomized search: each key is a
# RandomForestClassifier parameter name, each value its list of candidates.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [None]:
# Use the random grid to search for the best hyperparameters.
# Base model to tune. It is seeded so that, together with the search's own
# random_state, repeated runs select the same best parameters.
rf = RandomForestClassifier(random_state=42)
# Random search of parameters, using 3-fold cross validation,
# across 100 different combinations, using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Let's try first with the resampled (undersampled) dataset.
# fit_model fits the search and prints the elapsed training time.
rf_random = fit_model(rf_random, X_train_under, y_train_under)

In [None]:
# Show the hyperparameter combination that scored best in the randomized search.
rf_random.best_params_

In [None]:
# Random forest with hard-coded hyperparameters.
# NOTE(review): the values are presumably copied from an earlier
# `rf_random.best_params_` output — confirm they still match the current search.
# Seeded so the reported metrics are reproducible between runs.
best_model = RandomForestClassifier(n_estimators= 88,
 min_samples_split= 10,
 min_samples_leaf= 4,
 max_features= 'sqrt',
 max_depth= 80,
 bootstrap= True,
 random_state=42,
 n_jobs=4)
# Train on the undersampled data, evaluate on the untouched test split.
bm = fit_model(best_model, X_train_under, y_train_under)
paint_confusion_matrix_and_report(bm, X_test, y_test)

In [None]:
# Repeating the same configuration as for the other dataset.
# max_features='auto' was removed in scikit-learn 1.3; for classifiers it meant
# 'sqrt', so 'sqrt' keeps the same model while running on current versions.
# Seeded so the reported metrics are reproducible between runs.
best_model = RandomForestClassifier(n_estimators= 361,
 min_samples_split= 5,
 min_samples_leaf= 4,
 max_features= 'sqrt',
 max_depth= 10,
 bootstrap= False,
 random_state=42,
 n_jobs=4)
# Train on the undersampled data, evaluate on the untouched test split.
bm = fit_model(best_model, X_train_under, y_train_under)
paint_confusion_matrix_and_report(bm, X_test, y_test)

In [None]:
# Create the parameter grid based on the results of random search.
# Every combination of these values is evaluated by the grid search.
param_grid = {
    'bootstrap': [False],
    'max_depth': [5, 10, 20],
    # 'auto' was removed in scikit-learn 1.3; for classifiers it was an alias
    # of 'sqrt', so this keeps the same behaviour on current versions.
    'max_features': ['sqrt'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [4, 5, 10],
    'n_estimators': [200, 300, 400]
}

In [None]:
# Create a base model. It is seeded so every grid point is evaluated with the
# same forest randomness and the selected parameters are reproducible.
rf = RandomForestClassifier(random_state=42)
# Instantiate the grid search model: exhaustive search over param_grid with
# 3-fold cross validation, using all available cores.
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train_under, y_train_under)
# Last expression of the cell: display the best parameter combination found.
grid_search.best_params_

In [None]:
# Final random forest with hard-coded hyperparameters.
# NOTE(review): the values are presumably copied from an earlier
# `grid_search.best_params_` output — confirm they still match the current search.
# max_features='auto' was removed in scikit-learn 1.3; for classifiers it meant
# 'sqrt', so 'sqrt' keeps the same model while running on current versions.
# Seeded so the reported metrics are reproducible between runs.
final_model = RandomForestClassifier(n_estimators= 400,
 min_samples_split= 10,
 min_samples_leaf= 3,
 max_features= 'sqrt',
 max_depth= 20,
 bootstrap= False,
 random_state=42,
 n_jobs=4)
# Train on the undersampled data, evaluate on the untouched test split.
fm = fit_model(final_model, X_train_under, y_train_under)
paint_confusion_matrix_and_report(fm, X_test, y_test)

# Conclusion
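Hyperparameter tuning (randomized search followed by grid search) on the undersampled training set did not yield much improvement.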