In [6]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, auc, f1_score, accuracy_score, precision_score, recall_score, roc_curve, precision_recall_curve 
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

# Full modelling table. It is loaded here but not referenced by the other
# cells visible in this notebook.
data = pd.read_csv('data/model_data.csv')

# Pre-split, already transformed features and targets stored as CSV files.
X_train = pd.read_csv('data/X_train_transformed.csv')
X_test = pd.read_csv('data/X_test_transformed.csv')
y_train = pd.read_csv('data/y_train_transformed.csv')
y_test = pd.read_csv('data/y_test_transformed.csv')

# This step is necessary because we pass the already split data to the pycaret
# setup, and the indexes of the train and test data sets must not overlap.

# Combine the balanced train data back into a single df.
# assign() returns a new frame, so X_train stays features-only; the previous
# `train_data = X_train` alias silently added the target column to X_train.
train_data = X_train.assign(subscribed=y_train)

# Combine the test data back into a single df (again without mutating X_test)
# and renumber its rows so they continue right after the last train row.
n_train = len(X_train)
test_data = (
    X_test
    .assign(subscribed=y_test)
    .set_index(np.arange(n_train, n_train + len(X_test)))
)

# Feature matrix / target vector views of the training frame.
X = train_data.drop(columns=['subscribed'])
y = train_data['subscribed']

In [7]:
# NOTE(review): the star import hides where setup/create_model/predict_model/...
# come from; consider importing the used names explicitly once it is confirmed
# which PyCaret functions the whole notebook relies on.
from pycaret.classification import *

# Initialise the PyCaret classification experiment on the pre-split frames:
# train_data is used for fitting / 5-fold cross-validation and test_data is
# supplied separately; 'subscribed' is the target column in both.
# session_id is fixed so repeated runs use the same seed (per PyCaret docs).
s = setup(
    data=train_data,
    test_data=test_data,
    target='subscribed',
    fold=5,
    session_id=420,
)

Unnamed: 0,Description,Value
0,Session id,420
1,Target,subscribed
2,Target type,Binary
3,Original data shape,"(63546, 54)"
4,Transformed data shape,"(63546, 54)"
5,Transformed train set shape,"(51190, 54)"
6,Transformed test set shape,"(12356, 54)"
7,Numeric features,53
8,Preprocess,True
9,Imputation type,simple


In [8]:
from sklearn.ensemble import RandomForestClassifier

# Small, shallow forest: 64 trees of depth 3; min_samples_split=0.1 is a float,
# which scikit-learn interprets as a fraction of the samples. Seeded for
# reproducibility and fitted on all available cores.
clf = RandomForestClassifier(
    n_estimators=64,
    max_depth=3,
    min_samples_split=0.1,
    n_jobs=-1,
    random_state=420,
)

# Train and cross-validate the forest inside the PyCaret experiment.
rf = create_model(clf)

# Hyper-parameter search is currently disabled; the call below needs a
# `parameters` grid that is not defined in the visible cells.
# tuned_rf = tune_model(rf, n_iter=5, custom_grid=parameters)

# Build a probability-calibrated version of the same model, then open the
# interactive evaluation widget for the uncalibrated forest.
calibrated_rf = calibrate_model(rf)
evaluate_model(rf)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7289,0.7923,0.656,0.7679,0.7075,0.4577,0.4626
1,0.7635,0.829,0.7103,0.7949,0.7502,0.5271,0.5301
2,0.7545,0.8196,0.7005,0.7854,0.7405,0.5091,0.5121
3,0.7587,0.8266,0.6974,0.7949,0.743,0.5175,0.5214
4,0.7568,0.8223,0.697,0.7917,0.7413,0.5136,0.5173
Mean,0.7525,0.818,0.6922,0.787,0.7365,0.505,0.5087
Std,0.0122,0.0133,0.0188,0.0101,0.0149,0.0244,0.0238


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7282,0.7916,0.6654,0.7609,0.71,0.4563,0.46
1,0.758,0.8299,0.7156,0.7819,0.7472,0.5159,0.5178
2,0.7503,0.8198,0.7056,0.7749,0.7387,0.5007,0.5027
3,0.754,0.8268,0.7091,0.779,0.7424,0.5079,0.51
4,0.7519,0.821,0.7068,0.7769,0.7402,0.5038,0.5059
Mean,0.7485,0.8178,0.7005,0.7747,0.7357,0.4969,0.4993
Std,0.0105,0.0136,0.0179,0.0073,0.0132,0.0209,0.0203


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [9]:
# Score the uncalibrated random forest. No `data` argument is passed, so per
# the PyCaret docs the predictions are made on the hold-out set (presumably
# the test_data frame given to setup -- confirm).
# NOTE(review): the recorded output shows hold-out Prec. 0.3043 / F1 0.4134
# versus cross-validation means of 0.787 / 0.7365. The training frame is
# described as balanced, so the CV scores are likely not comparable to the
# hold-out scores -- confirm before reporting the CV numbers.
rf_holdout_pred = predict_model(rf)

# Calibration plots for both models, currently disabled.
# plot_model(rf, plot = 'calibration')
# plot_model(calibrated_rf, plot = 'calibration')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7919,0.7813,0.6444,0.3043,0.4134,0.3062,0.338


In [10]:
# Same scoring call as for the uncalibrated forest, but for calibrated_rf, so
# the two recorded metric tables can be compared side by side.
rf_calibrated_holdout_pred = predict_model(calibrated_rf)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7795,0.7816,0.6586,0.2921,0.4047,0.2933,0.3299


In [None]:
# Model interpretation plot for the uncalibrated forest with PyCaret's default
# settings (SHAP-based per the PyCaret docs; presumably computed on the
# hold-out data -- confirm).
interpret_model(rf)

In [None]:
# Same interpretation plot, but with use_train_data=True so it is computed on
# the training data instead; useful for comparing against the default plot.
interpret_model(rf, use_train_data=True)