In [1]:
! pip install pandas_profiling
! pip install pycaret
! pip install imbalanced-learn









In [1]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import pandas_profiling

from pycaret.classification import *
from imblearn.combine import SMOTEENN

from pycaret.utils import check_metric

from datetime import datetime as dt


MLflow support for Python 3.6 is deprecated and will be dropped in an upcoming release. At that point, existing Python 3.6 workflows that use MLflow will continue to work without modification, but Python 3.6 users will no longer get access to the latest MLflow features and bugfixes. We recommend that you upgrade to Python 3.7 or newer.



# 2. Train Models

## 2.1 Final Clean & Partition

In [2]:
# Working from saved CSV
data = pd.read_csv('orange_small_train trimmed v4.csv')
data = data.set_index('Unnamed: 0')

In [3]:
# Some cleanup that we'll move to the EDA & Prep module once we're further along. Get rid of the junk
# data = data.loc[:, data.columns!='Var132']

# Trimming variable set based on Feature Importance data from an offline Random Forest train
data = data.loc[:, ['Var126','Var218','Var211','Var227','Var225','Var194','Var72','Var144','Var28','Var7','Var81',
                    'Var94','Var153','Var38','Var83','Var73','Var6','Var229','Var123','Var113','Var65','Var133',
                    'Var57','Var109','Var112','Var119','Var76','Var13','Var163','Var21','Var143','Var149','Var85',
                    'Var160','Var25','Var24','Var125','Var74',
                   'appetency']]    

In [4]:
data_seen = data.sample(frac=0.90, random_state=42)
data_unseen = data.drop(data_seen.index)
data_seen.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(data_seen.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (40492, 39)
Unseen Data For Predictions: (4499, 39)


In [5]:
st = dt.now()
preproc = setup(data = data_seen, target = 'appetency', session_id=42, 
                normalize = True, 
                transformation = True, 
                ignore_low_variance = True,
                remove_multicollinearity = True, multicollinearity_threshold = 0.95,
                log_experiment = True, experiment_name = 'PreprocOnly',
                preprocess = True,
                train_size = 0.8,
                silent = True, fold = 10)

print("Cell time: ",dt.now()-st)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,appetency
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(40492, 39)"
5,Missing Values,True
6,Numeric Features,28
7,Categorical Features,10
8,Ordinal Features,False
9,High Cardinality Features,False


Cell time:  0:00:07.313241


Now pull out the preprocessed training data, balance it, and put it back <br>
**DON'T** call ```setup()``` again - no need to. And calling it with ```preprocess = False``` just erases all <br>
our good work on normalizing, standardizing, etc for the datasets we're not balancing. 

In [31]:
def print_config(): 
    print("X: " + str(get_config('X').shape))
    y = get_config('y')
    print("y: " + str(y.shape))
    print("y target:")
    print(str(y.value_counts()))
    print("")
    print("X_train: " + str(get_config('X_train').shape))
    y = get_config('y_train')
    print("y_train: " + str(y.shape))
    print("y_train target:")
    print(str(y.value_counts()))
    print("")
    print("X_test: " + str(get_config('X_test').shape))
    y = get_config('y_test')
    print("y_test: " + str(y.shape))
    print("y_test target:")
    print(str(y.value_counts()))

In [6]:
train_X = get_config('X_train')
train_y = get_config('y_train')

In [36]:
smote_enn = SMOTEENN(random_state=0, sampling_strategy = 0.2)

st=dt.now()
X_resampled, y_resampled = smote_enn.fit_resample(train_X, train_y)
print("Fit_sample time: ",dt.now()-st)

set_config('X_train', X_resampled)
set_config('y_train', y_resampled)

### 2.5 Tune Model

In [8]:
def get_importance(rf, colList):
    imp_val = rf.feature_importances_

    imp = pd.DataFrame(imp_val, colList)
    imp.columns = ['Importance']
    imp.sort_values('Importance', ascending = False, inplace = True)
    return(imp)

In [11]:
smote3 = pd.read_csv('X_Resample_smote_0.3.csv')
smote3.set_index('Unnamed: 0', inplace = True)

In [20]:
frf2 = load_model('final_rf_smote_0.2')['trained_model']

Transformation Pipeline and Model Successfully Loaded


In [6]:
frf3 = load_model('final_rf_smote_0.3')['trained_model']
frf4 = load_model('final_rf_smote_0.4')['trained_model']

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


In [7]:
print(frf2)
print(frf3)
print(frf4)

RandomForestClassifier(class_weight={}, max_depth=9, max_features='sqrt',
                       min_impurity_decrease=0, min_samples_leaf=6,
                       min_samples_split=5, n_estimators=120, n_jobs=-1,
                       random_state=42)
RandomForestClassifier(class_weight={}, max_depth=9, max_features='sqrt',
                       min_impurity_decrease=0, min_samples_leaf=6,
                       min_samples_split=5, n_estimators=120, n_jobs=-1,
                       random_state=42)
RandomForestClassifier(class_weight={}, max_depth=9, max_features='sqrt',
                       min_impurity_decrease=0, min_samples_leaf=6,
                       min_samples_split=5, n_estimators=120, n_jobs=-1,
                       random_state=42)


In [10]:
smote2 = pd.read_csv('X_Resample_smote_0.2.csv')
smote2.set_index('Unnamed: 0', inplace = True)

smote3 = pd.read_csv('X_Resample_smote_0.3.csv')
smote3.set_index('Unnamed: 0', inplace = True)

smote4 = pd.read_csv('X_Resample_smote_0.4.csv')
smote4.set_index('Unnamed: 0', inplace = True)

print('Smote 2 shape', str(smote2.shape))
print('Smote 3 shape', str(smote3.shape))
print('Smote 4 shape', str(smote4.shape))

Smote 2 shape (32485, 86)
Smote 3 shape (35253, 86)
Smote 4 shape (38056, 86)


In [11]:
imp2 = get_importance(frf2, smote3.columns)
imp3 = get_importance(frf3, smote3.columns)
imp4 = get_importance(frf4, smote3.columns)

imp2.head()

Unnamed: 0,Importance
Var126,0.20168
Var218_cJvF,0.118346
Var211_L84s,0.112414
Var144_9.0,0.046455
Var227_RAYp,0.035312


In [12]:
imp3.head()

Unnamed: 0,Importance
Var126,0.20503
Var218_cJvF,0.129944
Var211_L84s,0.11165
Var225_ELof,0.039057
Var144_0.0,0.03582


In [13]:
imp4.head()

Unnamed: 0,Importance
Var126,0.19429
Var218_cJvF,0.133989
Var211_L84s,0.114521
Var227_RAYp,0.038575
Var144_9.0,0.038497


In [12]:
predict_preproc = setup(data = data_unseen, target = 'appetency', session_id=42, 
                normalize = True, 
                transformation = True, 
                ignore_low_variance = True,
                remove_multicollinearity = True, multicollinearity_threshold = 0.95,
                log_experiment = True, experiment_name = 'PreprocOnly',
                preprocess = True,
                silent = True, fold = 2)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,appetency
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(4499, 39)"
5,Missing Values,True
6,Numeric Features,28
7,Categorical Features,10
8,Ordinal Features,False
9,High Cardinality Features,False


In [30]:
# Quick check to see how that's doing against the unseen dataset
data_unseen_X = get_config('X')
data_unseen_y = get_config('y')


# Missing some of the one-hot encoded categorical columns. Easy enough - add them with values of 0
for col in smote3.columns:
    if col not in data_unseen_X.columns:
        # print(col)
        data_unseen_X[col] = 0

# And vice versa - remove any extra one-hots that snuck into the unseen data 
for col in data_unseen_X.columns:
    if col not in smote3.columns:
        del data_unseen_X[col]

In [49]:
def getModelMetrics(model, modelName, threshold):
    probs03 = pd.DataFrame(model.predict_proba(data_unseen_X))

    probs03['Pred'] = -1
    probs03.loc[probs03[1] >= threshold, 'Pred'] = 1
    probs03 = pd.concat([probs03.reset_index(drop = True), data_unseen_y], axis = 1)

    results = pd.DataFrame([[modelName, threshold,
                check_metric(probs03['Pred'], probs03['appetency'] , metric = 'Accuracy'),
                0.5,
                check_metric(probs03['Pred'], probs03['appetency'] , metric = 'Precision'),
                check_metric(probs03['Pred'], probs03['appetency'] , metric = 'Recall'),
                check_metric(probs03['Pred'], probs03['appetency'] , metric = 'F1'),
                check_metric(probs03['Pred'], probs03['appetency'] , metric = 'Kappa'),
                0.5]],            
               columns = ['Model', 'Threshold', 'Accuracy', 'AUC', 'Precision', 'Recall', 'F1','Kappa', 'MCC'])
    return results

# check_metric(probs03['Pred'], probs03['appetency'] , metric = 'MCC')]]
# check_metric(probs03['Pred'], probs03['appetency'] , metric = 'AUC')

In [22]:
for j in range(6,15):
    print(j*5)

30
35
40
45
50
55
60
65
70


In [37]:
allMetrics = getModelMetrics(frf2, 'Final RF 0.2', 0.2)

In [38]:
allMetrics

Unnamed: 0,Model,Threshold,Accuracy,AUC,Precision,Recall,F1,Kappa,MCC
0,Final RF 0.2,0.2,0.8251,0.5211,0.5395,0.0517,0.0944,0.0656,0.1249


In [None]:
# for i in range(2,8):
for i in range(2,8):
    print(i)
    frf = load_model('final_rf_smote_0.' + str(i))['trained_model']
    
    for j in range(6,15):
        allMetrics = allMetrics.append(getModelMetrics(frf, ('Final RF 0.' + str(2)), (j*0.05)), ignore_index= True)
        

In [36]:
allMetrics        

Unnamed: 0,Model,Threshold,Accuracy,AUC,Precision,Recall,F1,Kappa,MCC
0,Final RF 0.2,0.2,0.8251,0.5211,0.5395,0.0517,0.0944,0.0656,0.1249
1,Final RF 0.3,0.3,0.8995,0.5298,0.4079,0.0708,0.1206,0.0946,0.1373
2,Final RF 0.3,0.35,0.9238,0.5319,0.3158,0.0762,0.1228,0.0982,0.1263
3,Final RF 0.3,0.4,0.9482,0.5399,0.2368,0.0933,0.1338,0.1123,0.1255
4,Final RF 0.3,0.45,0.9735,0.5662,0.1184,0.1475,0.1314,0.1181,0.1189
5,Final RF 0.3,0.5,0.9811,0.4915,0.0,0.0,0.0,-0.0036,-0.0059
6,Final RF 0.3,0.55,0.9829,0.4916,0.0,0.0,0.0,-0.0004,-0.002
7,Final RF 0.2,0.3,0.9113,0.5348,0.4079,0.0805,0.1345,0.1094,0.151
8,Final RF 0.2,0.35,0.9329,0.5344,0.2895,0.0815,0.1272,0.1035,0.1266
9,Final RF 0.2,0.4,0.9589,0.544,0.1842,0.1022,0.1315,0.1122,0.1173


In [None]:
# for i in range(2,8):
# for i in range(2,4):
i = 3
print(i)
frf = load_model('final_rf_smote_0.' + str(i))['trained_model']

In [32]:
frf

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={},
                       criterion='gini', max_depth=9, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0, min_impurity_split=None,
                       min_samples_leaf=6, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=120,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [None]:
for j in range(6,15):
    print(j)
    allMetrics = allMetrics.append(getModelMetrics(frf, ('Final RF 0.' + str(i)), (j*0.05)), ignore_index= True)

In [43]:
frf2 = load_model('final_rf_smote_0.2')['trained_model']
frf3 = load_model('final_rf_smote_0.3')['trained_model']
frf4 = load_model('final_rf_smote_0.4')['trained_model']
frf5 = load_model('final_rf_smote_0.5')['trained_model']
frf6 = load_model('final_rf_smote_0.6')['trained_model']
frf7 = load_model('final_rf_smote_0.7')['trained_model']

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


In [55]:
frf8 = load_model('final_rf_smote_0.8')['trained_model']

Transformation Pipeline and Model Successfully Loaded


In [52]:
allMetrics = getModelMetrics(frf2, 'Final RF 0.2', 0.2)

allMetrics = allMetrics.append(getModelMetrics(frf2, 'Final RF 0.2', 0.3), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf2, 'Final RF 0.2', 0.35), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf2, 'Final RF 0.2', 0.4), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf2, 'Final RF 0.2', 0.45), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf2, 'Final RF 0.2', 0.5), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf2, 'Final RF 0.2', 0.55), ignore_index= True)
# allMetrics = allMetrics.append(getModelMetrics(frf2, 'Final RF 0.2', 0.6), ignore_index= True)

allMetrics = allMetrics.append(getModelMetrics(frf3, 'Final RF 0.3', 0.3), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf3, 'Final RF 0.3', 0.35), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf3, 'Final RF 0.3', 0.4), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf3, 'Final RF 0.3', 0.45), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf3, 'Final RF 0.3', 0.5), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf3, 'Final RF 0.3', 0.55), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf3, 'Final RF 0.3', 0.6), ignore_index= True)

allMetrics = allMetrics.append(getModelMetrics(frf4, 'Final RF 0.4', 0.3), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf4, 'Final RF 0.4', 0.35), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf4, 'Final RF 0.4', 0.4), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf4, 'Final RF 0.4', 0.45), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf4, 'Final RF 0.4', 0.5), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf4, 'Final RF 0.4', 0.55), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf4, 'Final RF 0.4', 0.6), ignore_index= True)

allMetrics = allMetrics.append(getModelMetrics(frf5, 'Final RF 0.5', 0.3), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf5, 'Final RF 0.5', 0.35), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf5, 'Final RF 0.5', 0.4), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf5, 'Final RF 0.5', 0.45), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf5, 'Final RF 0.5', 0.5), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf5, 'Final RF 0.5', 0.55), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf5, 'Final RF 0.5', 0.6), ignore_index= True)

allMetrics = allMetrics.append(getModelMetrics(frf6, 'Final RF 0.6', 0.3), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf6, 'Final RF 0.6', 0.35), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf6, 'Final RF 0.6', 0.4), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf6, 'Final RF 0.6', 0.45), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf6, 'Final RF 0.6', 0.5), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf6, 'Final RF 0.6', 0.55), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf6, 'Final RF 0.6', 0.6), ignore_index= True)

allMetrics = allMetrics.append(getModelMetrics(frf7, 'Final RF 0.7', 0.3), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf7, 'Final RF 0.7', 0.35), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf7, 'Final RF 0.7', 0.4), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf7, 'Final RF 0.7', 0.45), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf7, 'Final RF 0.7', 0.5), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf7, 'Final RF 0.7', 0.55), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf7, 'Final RF 0.7', 0.6), ignore_index= True)



In [56]:
allMetrics = allMetrics.append(getModelMetrics(frf8, 'Final RF 0.8', 0.3), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf8, 'Final RF 0.8', 0.35), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf8, 'Final RF 0.8', 0.4), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf8, 'Final RF 0.8', 0.45), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf8, 'Final RF 0.8', 0.5), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf8, 'Final RF 0.8', 0.55), ignore_index= True)
allMetrics = allMetrics.append(getModelMetrics(frf8, 'Final RF 0.8', 0.6), ignore_index= True)

In [57]:

allMetrics

Unnamed: 0,Model,Threshold,Accuracy,AUC,Precision,Recall,F1,Kappa,MCC
0,Final RF 0.2,0.2,0.8251,0.5,0.5395,0.0517,0.0944,0.0656,0.5
1,Final RF 0.2,0.3,0.9113,0.5,0.4079,0.0805,0.1345,0.1094,0.5
2,Final RF 0.2,0.35,0.9329,0.5,0.2895,0.0815,0.1272,0.1035,0.5
3,Final RF 0.2,0.4,0.9589,0.5,0.1842,0.1022,0.1315,0.1122,0.5
4,Final RF 0.2,0.45,0.9744,0.5,0.0789,0.1176,0.0945,0.082,0.5
5,Final RF 0.2,0.5,0.982,0.5,0.0,0.0,0.0,-0.0021,0.5
6,Final RF 0.2,0.55,0.9829,0.5,0.0,0.0,0.0,-0.0004,0.5
7,Final RF 0.3,0.3,0.8995,0.5,0.4079,0.0708,0.1206,0.0946,0.5
8,Final RF 0.3,0.35,0.9238,0.5,0.3158,0.0762,0.1228,0.0982,0.5
9,Final RF 0.3,0.4,0.9482,0.5,0.2368,0.0933,0.1338,0.1123,0.5


In [58]:
allMetrics.to_csv("All Metrics SMOTE 0.2-0.8 2022-05-19.csv")

In [21]:
print('Accuracy: ', check_metric(probs03['Pred'], probs03['appetency'] , metric = 'Accuracy'))
print('AUC: ', check_metric(probs03['Pred'], probs03['appetency'] , metric = 'AUC'))
print('Precision: ', check_metric(probs03['Pred'], probs03['appetency'] , metric = 'Precision'))
print('Recall: ', check_metric(probs03['Pred'], probs03['appetency'] , metric = 'Recall'))
print('F1: ', check_metric(probs03['Pred'], probs03['appetency'] , metric = 'F1'))
print('Kappa: ', check_metric(probs03['Pred'], probs03['appetency'] , metric = 'Kappa'))
print('MCC: ', check_metric(probs03['Pred'], probs03['appetency'] , metric = 'MCC'))

print(probs03['Pred'].value_counts())

Accuracy:  0.9113
AUC:  0.5348
Precision:  0.4079
Recall:  0.0805
F1:  0.1345
Kappa:  0.1094
MCC:  0.151
-1    4114
 1     385
Name: Pred, dtype: int64


Other plots: 
- 'pr' - precision vs. recall
- 'auc' = area under curve
- 'feature' or 'feature_all' = feature importance from RF
- 'confusion_matrix'


In [48]:
plot_model(tuned_rf, plot = 'feature')

In [5]:
# tuned_rf = load_model('tuned rf v1 2022-05-18')['trained_model']

Transformation Pipeline and Model Successfully Loaded


In [None]:
# Interactive version? 
evaluate_model(tuned_rf)

In [36]:
# This is just the final tuned model vs. our test holdout set 
tuned_results = predict_model(tuned_rf)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.961,0.8104,0.1654,0.0909,0.1173,0.0991,0.1038


In [33]:
type(tuned_results)

pandas.core.frame.DataFrame

In [None]:
# Re-train the tuned_rf against the entire "seen" dataset
final_rf = finalize_model(tuned_rf)

In [34]:
# lastly, try that final model against the UNSEEN dataset 
unseen_predictions = predict_model(final_rf, data=data_unseen)
# unseen_predictions.head()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9831,0.8075,0.0,0.0,0.0,0.0,0.0


## 2.3 Bagging Time!

Wen we balance an imbalanced dataset like that, we often end up with over-trained models. What to do? <br>
BAG IT! <br> 
Train/finalize 10 unique models (stick with RF) using our established hyper-parameters, each one trained on <br>
A different 80% slice of the overall "seen" dataset. <br>
Get the prediction from EACH of those models, then take the average across them all. (possibly the MAX) <br>
That should nicely reduce any impact of overtraining 

In [19]:
'X_Resample_smote_' + str(round(0.2 + i/10,1))

'X_Resample_smote0.4'

In [None]:
# Loading this so we have the right hyperparameters 
tuned_rf = load_model('Initial RF Model smote 0.3 2022-05-17')['trained_model']

In [None]:
# for i in range(0,3):
i = 0

smote_level = round(0.2 + i/10,1)

print(dt.now(), ' Starting Loop for SMOTE Level: ', str(smote_level))

#   SMOTE Level: ', str(smote_level)) data_loop = data_seen.sample(frac=0.80, random_state=42)
#     data_loop.reset_index(inplace=True, drop=True)

loopSetup = setup(data = data_seen, target = 'appetency', session_id=42, 
                  normalize = True, 
                  transformation = True, 
                  ignore_low_variance = True,
                  remove_multicollinearity = True, multicollinearity_threshold = 0.95,
                  log_experiment = True, experiment_name = 'Bagging',
                  preprocess = True,
                  train_size = 0.8,
                  silent = True, fold = 10)

print(dt.now(), ' Preproc setup complete')

train_X = get_config('X_train')
train_y = get_config('y_train')

smote_enn = SMOTEENN(random_state=0, sampling_strategy = smote_level)
X_resampled, y_resampled = smote_enn.fit_resample(train_X, train_y)

X_resampled.to_csv('X_Resample_smote_' + str(smote_level) + '.csv')
y_resampled.to_csv('y_Resample_smote_' + str(smote_level) + '.csv')

print(dt.now(), ' SMOTE resample complete')

set_config('X_train', X_resampled)
set_config('y_train', y_resampled)

loop_rf = create_model('rf')
print(dt.now(), ' Initial model created')

tuned_rf = tune_model(loop_rf, optimize = 'MCC')
print(dt.now(), ' Model tuning complete')
final_rf = finalize_model(tuned_rf)

print(dt.now(), ' Model finalized')

save_model(final_rf, ('final_rf_smote_' + str(smote_level)))

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.871,0.9848,0.3381,1.0,0.5053,0.4513,0.5398
1,0.9988,1.0,0.9953,0.9984,0.9968,0.9961,0.9961
2,0.9997,1.0,0.9984,1.0,0.9992,0.999,0.999
3,0.9994,1.0,0.9984,0.9984,0.9984,0.998,0.998
4,0.9988,1.0,0.9968,0.9968,0.9968,0.9961,0.9961
5,0.9978,1.0,0.9889,1.0,0.9944,0.9931,0.9931
6,0.9991,1.0,0.9968,0.9984,0.9976,0.9971,0.9971
7,0.9982,0.9999,0.9968,0.9937,0.9953,0.9941,0.9941
8,0.9982,1.0,0.9921,0.9984,0.9952,0.9941,0.9941
9,0.9994,1.0,0.9968,1.0,0.9984,0.998,0.998


2022-05-19 16:46:02.302376  Initial model created


IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Fold,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [21]:
predict_model(rf03)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9818,0.8001,0.033,0.1176,0.0515,0.0452,0.0549


Unnamed: 0,Var126,Var28,Var81,Var94,Var153,Var38,Var83,Var73,Var6,Var123,...,Var65_45.0,Var65_54.0,Var65_63.0,Var65_72.0,Var65_9.0,Var65_90.0,Var65_99.0,appetency,Label,Score
0,0.227750,-0.332419,1.691459,0.422480,0.688928,-0.404556,-0.049269,-1.402178,-1.562186,-0.738142,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1,-1,0.02
1,0.080775,-2.440444,0.944765,0.372157,0.612014,0.727620,-0.521199,1.625160,-0.398528,0.345621,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1,-1,0.00
2,0.080775,-1.102822,-0.963627,-1.604679,-1.068791,-0.958005,-0.049269,0.527775,0.442054,-0.319183,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,0.02
3,0.425918,0.451323,1.267934,-1.402946,1.000059,0.994907,0.654137,-0.837514,-0.775375,0.603339,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1,-1,0.01
4,-1.905673,-2.201504,-1.325485,-1.597454,-1.322253,-1.028975,-1.089414,-1.327799,-1.404796,-0.738142,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1,-1,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12143,1.244986,0.086270,-1.003200,-1.078994,-0.886187,-1.018017,-0.049269,0.298569,1.271485,-0.132497,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,0.00
12144,0.227750,-0.332419,0.387084,0.422480,-0.578543,-0.913973,1.115026,-0.904268,-0.540626,0.716068,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,0.07
12145,0.982651,0.451323,-0.156372,1.958001,0.947213,1.554600,1.115026,1.660571,0.890987,1.156805,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,0.01
12146,0.227750,-0.332419,0.806299,0.422480,1.383801,0.825299,-0.049269,-1.182272,-0.324165,-0.520799,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1,-1,0.01


In [150]:
imp_val = rf04.feature_importances_
imp_label = train_X.columns

imp04 = pd.DataFrame(imp_val, imp_label)
imp04.columns = ['Importance']
imp04.sort_values('Importance', ascending = False, inplace = True)
imp04.head()

Unnamed: 0,Importance
Var126,0.187917
Var218_cJvF,0.106704
Var211_L84s,0.059164
Var227_RAYp,0.040636
Var225_kG3k,0.031123


## 2.3 Testing initial results

**Those results are suspiciously good...** <br>
Did we over-balance that thing? Let's do a quick check against the "unseen" data  <br>
Have to run it through ```setup()``` first so we get the same pre-proc that the model was trained on 


In [151]:
predict_preproc = setup(data = data_unseen, target = 'appetency', session_id=42, 
                normalize = True, 
                transformation = True, 
                ignore_low_variance = True,
                remove_multicollinearity = True, multicollinearity_threshold = 0.95,
                log_experiment = True, experiment_name = 'PreprocOnly',
                preprocess = True,
                silent = True, fold = 2)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,appetency
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(4499, 39)"
5,Missing Values,True
6,Numeric Features,28
7,Categorical Features,10
8,Ordinal Features,False
9,High Cardinality Features,False


In [152]:
# Quick check to see how that's doing against the unseen dataset
data_unseen_X = get_config('X')
data_unseen_y = get_config('y')

Got to make some adjustments on the **unseen** data - there are some one-hot encoding of categorical <br>
fields that didn't show up in the **seen** data, so we've go to add them to the unseen data as empty <br>
columns. And vice versa - there are a couple one-hots that are in the unseen that aren't in the seen (training <br>
Just delete those columns - model can't handle fields that it hasn't seen before. 

In [153]:
# Missing some of the one-hot encoded categorical columns. Easy enough - add them with values of 0
for col in train_X.columns:
    if col not in data_unseen_X.columns:
        # print(col)
        data_unseen_X[col] = 0

# And vice versa - remove any extra one-hots that snuck into the unseen data 
for col in data_unseen_X.columns:
    if col not in train_X.columns:
        del data_unseen_X[col]

In [154]:
probs03 = pd.DataFrame(rf03.predict_proba(data_unseen_X))
probs04 = pd.DataFrame(rf04.predict_proba(data_unseen_X))
# preds

threshold = 0.3

probs03['Pred'] = -1
probs03.loc[probs03[1] >= threshold, 'Pred'] = 1
probs03 = pd.concat([probs03.reset_index(drop = True), data_unseen_y], axis = 1)

probs04['Pred'] = -1
probs04.loc[probs04[1] >= threshold, 'Pred'] = 1
probs04 = pd.concat([probs04.reset_index(drop = True), data_unseen_y], axis = 1)

print('Accuracy: ', check_metric(probs03['Pred'], probs03['appetency'] , metric = 'Accuracy')
                  , check_metric(probs04['Pred'], probs04['appetency'] , metric = 'Accuracy'))
print('AUC: ', check_metric(probs03['Pred'], probs03['appetency'] , metric = 'AUC')
             , check_metric(probs04['Pred'], probs04['appetency'] , metric = 'AUC'))
print('Precision: ', check_metric(probs['Pred'], probs03['appetency'] , metric = 'Precision')
                   , check_metric(probs['Pred'], probs04['appetency'] , metric = 'Precision'))
print('Recall: ', check_metric(probs03['Pred'], probs['appetency'] , metric = 'Recall')
                , check_metric(probs04['Pred'], probs['appetency'] , metric = 'Recall'))
print('F1: ', check_metric(probs03['Pred'], probs['appetency'] , metric = 'F1')
             , check_metric(probs04['Pred'], probs['appetency'] , metric = 'F1'))
print('Kappa: ', check_metric(probs03['Pred'], probs['appetency'] , metric = 'Kappa')
             , check_metric(probs04['Pred'], probs['appetency'] , metric = 'Kappa'))
print('MCC: ', check_metric(probs03['Pred'], probs['appetency'] , metric = 'MCC')
             , check_metric(probs04['Pred'], probs['appetency'] , metric = 'MCC'))

print(probs03['Pred'].value_counts())

print(probs04['Pred'].value_counts())

Accuracy:  0.9729 0.9615
AUC:  0.5487 0.543
Precision:  0.0112 0.0112
Recall:  0.1163 0.1038
F1:  0.0758 0.1128
Kappa:  0.0637 0.0933
MCC:  0.0681 0.0937
-1    4456
 1      43
Name: Pred, dtype: int64
-1    4393
 1     106
Name: Pred, dtype: int64


## Metrics against unseen

**Test 1**

| Factor | Test 1 |
|:-----|:----:|
|SMOTE | 0.4|
|Threshold | 0.4|
|Accuracy | 0.9793|
|AUC: | 0.5735|
|Precision: | 0.0112|
|Recall:|  0.1667|
|F1: | 0.0211|
|Kappa: | 0.0186|
|MCC: | 0.0385|


## 2.4 - Tuning Selected Algorithm