## Import necessary libraries


In [1]:
import sys
import os

# Set the working directory
sys.path.append(os.getcwd() + os.sep + ".." + os.sep + "..")

# Import the necessary libraries
import pycaret.classification as pc
import pandas as pd
import src.scripts.mapping_answers_dict as map_dict
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from math import sqrt


## Dataset loading

In [2]:
import re

# Load the merged dataset
dataset = pd.read_csv("../../data/processed/GYTS_dataset.csv")

ordinal_columns = ["SmokingFriends", "SeenSmokerInPublicPlace", "SeenSmokerInEnclosedPlace", "SeenSmokerInHome", "HarmfulPassiveSmoke", "HardQuitSmoke"]
dataset[ordinal_columns] = dataset[ordinal_columns].astype('category')

# Convert categorical columns
categorical_columns = ["State", "Gender", "Age", "AttractiveSmoker", "SmokerConfidentInCelebrations", "SchoolWarnings",
                       "SeenHealthWarnings", "AntiTobaccoInEvents"]
dataset[categorical_columns] = dataset[categorical_columns].astype('category')
# remove non letter, non number, non space characters from the categorical columns with regex

dataset[categorical_columns] = dataset[categorical_columns].applymap(lambda x: re.sub(r'[^\w\s\d]', '', x))


# Convert boolean columns
boolean_columns = ["Smoke", "SeenSmokerInSchool", "ParentWarnings", "AntiTobaccoInMedia",
                   "BanTobaccoOutdoors", "SmokingFather", "SmokingMother", "WorkingFather",
                   "WorkingMother"]
dataset[boolean_columns] = dataset[boolean_columns].astype('int')

# Comparing models

In [3]:
from sklearn.preprocessing import OneHotEncoder
# import sklearn.preprocessing as ce
import re

setup = pc.setup(data=dataset,
                 target='Smoke',
                 index=False,
                 train_size=0.8,
                 session_id=42,
                 categorical_features=categorical_columns,
                 ordinal_features={
                     "SmokingFriends": map_dict.OR46_dict.values(),
                     "SeenSmokerInPublicPlace": map_dict.CR21_dict.values(),
                     "SeenSmokerInEnclosedPlace": map_dict.CR20_dict.values(),
                     "SeenSmokerInHome": map_dict.CR19_dict.values(),
                     "HarmfulPassiveSmoke": map_dict.CR23_dict.values(),
                     "HardQuitSmoke": map_dict.CR41_dict.values(),
                 },
                 imputation_type=None,
                 normalize=False,
                 max_encoding_ohe=0,
                 encoding_method=OneHotEncoder(dtype=int, sparse_output=False),
                 n_jobs=10
                 )
X_test_df = pc.get_config('X_test_transformed').copy()
X_test_df

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Smoke
2,Target type,Binary
3,Original data shape,"(20731, 23)"
4,Transformed data shape,"(20731, 42)"
5,Transformed train set shape,"(16584, 42)"
6,Transformed test set shape,"(4147, 42)"
7,Ordinal features,6
8,Numeric features,8
9,Categorical features,8


Unnamed: 0,Age_11 years old or younger,Age_12 years old,Age_13 years old,Age_14 years old,Age_15 years old,Age_16 years old,Age_17 years old or older,Gender,SmokingFriends,SeenSmokerInSchool,...,BanTobaccoOutdoors,HarmfulPassiveSmoke,State_Italy,State_Poland,State_Portugal,State_Romania,SmokingFather,SmokingMother,WorkingFather,WorkingMother
16584,0,1,0,0,0,0,0,1.0,0.0,0,...,1,0.0,0,0,1,0,0,0,1,0
16585,0,0,0,1,0,0,0,1.0,0.0,0,...,1,2.0,0,1,0,0,1,1,1,1
16586,0,0,0,1,0,0,0,1.0,1.0,1,...,0,3.0,0,0,1,0,1,0,1,0
16587,0,0,0,0,0,1,0,0.0,2.0,1,...,0,3.0,0,1,0,0,0,0,1,1
16588,0,0,1,0,0,0,0,1.0,1.0,0,...,1,3.0,0,0,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20726,0,0,0,1,0,0,0,0.0,1.0,1,...,1,2.0,0,1,0,0,0,0,1,1
20727,0,0,0,1,0,0,0,1.0,0.0,1,...,0,3.0,0,0,1,0,1,0,1,1
20728,0,0,0,0,1,0,0,0.0,1.0,0,...,1,2.0,0,0,0,1,0,0,1,1
20729,0,0,0,0,1,0,0,0.0,1.0,1,...,0,1.0,0,1,0,0,0,0,1,1


In [4]:
pc.get_config('pipeline')
X_test_df.dtypes



Age_11 years old or younger                                                    int32
Age_12 years old                                                               int32
Age_13 years old                                                               int32
Age_14 years old                                                               int32
Age_15 years old                                                               int32
Age_16 years old                                                               int32
Age_17 years old or older                                                      int32
Gender                                                                       float64
SmokingFriends                                                               float64
SeenSmokerInSchool                                                              int8
SeenSmokerInPublicPlace                                                      float64
SeenSmokerInEnclosedPlace                                        

# Compute the class weights

In [5]:

classes = dataset['Smoke'].unique()

class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=pc.get_config("y_train_transformed"))
sqrt_weights = [sqrt(weight) for weight in class_weights]

class_weights = dict(zip(classes, class_weights))
sqrt_weights = dict(zip(classes, sqrt_weights))

print(class_weights)
print(sqrt_weights)


{1: 3.0440528634361232, 0: 0.5982683982683983}
{1: 1.7447214286057597, 0: 0.7734781175110246}


Find the best model within the class-weight supporting ones

In [32]:
#all_models = [ 'lr', 'knn', 'nb', 'dt', 'svm', 'ridge', 'rf', 'qda', 'ada', 'gbc', 'lda', 'et', 'xgboost', 'lightgbm', 'catboost'] #'rbfsvm', 'gpc', 'mlp'

# Models that support class weights
threshold_optimized_model =[ 'lr', 'dt', 'svm' , 'ridge', 'rf', 'et', 'lightgbm'] # 'rbfsvm'
models = {}
predicts = pd.DataFrame()

for model_name in threshold_optimized_model:
    try:
        model = pc.create_model(model_name, verbose=True, class_weight=sqrt_weights)
        models[model_name] = model
        pc.predict_model(model)
        predict = pc.pull()
        predicts = pd.concat([predicts, predict])
    except Exception as e:
        print(e)

# Sort the models by MCC
predicts = predicts.sort_values('Accuracy', ascending=False)

predicts

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8354,0.0,0.5128,0.5,0.5063,0.4076,0.4077
1,0.8421,0.0,0.5788,0.518,0.5467,0.4514,0.4525
2,0.8427,0.0,0.5678,0.5201,0.5429,0.4481,0.4487
3,0.8294,0.0,0.5495,0.4839,0.5146,0.4116,0.4128
4,0.8516,0.0,0.5699,0.5458,0.5576,0.4685,0.4686
5,0.8426,0.0,0.5441,0.5193,0.5314,0.4369,0.4371
6,0.851,0.0,0.5956,0.5418,0.5674,0.4777,0.4785
7,0.8366,0.0,0.5809,0.5016,0.5383,0.4397,0.4414
8,0.8366,0.0,0.5037,0.5018,0.5028,0.405,0.405
9,0.848,0.0,0.5699,0.5345,0.5516,0.4602,0.4606


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.8592,0.8751,0.6123,0.5658,0.5882,0.5034,0.504


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7975,0.0,0.4286,0.3939,0.4105,0.2885,0.2889
1,0.8047,0.0,0.3956,0.4045,0.4,0.2834,0.2834
2,0.8192,0.0,0.4615,0.4516,0.4565,0.3481,0.3481
3,0.8156,0.0,0.4066,0.4353,0.4205,0.3109,0.3112
4,0.7973,0.0,0.4081,0.3881,0.3978,0.2761,0.2762
5,0.7913,0.0,0.3934,0.3715,0.3821,0.2567,0.2569
6,0.8124,0.0,0.4118,0.4259,0.4187,0.3069,0.307
7,0.7986,0.0,0.4081,0.3908,0.3993,0.2783,0.2784
8,0.8058,0.0,0.386,0.4038,0.3947,0.2791,0.2792
9,0.8106,0.0,0.4596,0.4281,0.4433,0.3293,0.3296


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.8052,0.6527,0.4258,0.4102,0.4179,0.3009,0.301


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8391,0.0,0.5348,0.5105,0.5224,0.4257,0.4258
1,0.8403,0.0,0.5348,0.5141,0.5242,0.4283,0.4284
2,0.8517,0.0,0.359,0.5799,0.4434,0.3633,0.3772
3,0.8391,0.0,0.2527,0.5227,0.3407,0.2615,0.284
4,0.854,0.0,0.364,0.5893,0.45,0.3712,0.3856
5,0.8263,0.0,0.5735,0.4756,0.52,0.4151,0.4178
6,0.8366,0.0,0.6618,0.5014,0.5705,0.472,0.4789
7,0.8359,0.0,0.625,0.5,0.5556,0.4565,0.4608
8,0.7612,0.0,0.7022,0.3775,0.491,0.3529,0.3819
9,0.8444,0.0,0.5368,0.5252,0.5309,0.4376,0.4377


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.7239,0.7752,0.8517,0.3571,0.5033,0.3537,0.4178


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8415,0.0,0.5055,0.5188,0.5121,0.4174,0.4175
1,0.8463,0.0,0.5568,0.5315,0.5438,0.4515,0.4516
2,0.8445,0.0,0.5604,0.5258,0.5426,0.449,0.4493
3,0.8342,0.0,0.5495,0.4967,0.5217,0.4218,0.4226
4,0.851,0.0,0.5551,0.5451,0.5501,0.4608,0.4609
5,0.8498,0.0,0.5331,0.5431,0.538,0.4484,0.4484
6,0.8522,0.0,0.5625,0.5484,0.5554,0.4668,0.4668
7,0.8438,0.0,0.5699,0.5219,0.5448,0.4508,0.4514
8,0.8353,0.0,0.489,0.4981,0.4935,0.3952,0.3952
9,0.848,0.0,0.5551,0.5355,0.5451,0.4539,0.454


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.8623,0.7589,0.605,0.577,0.5907,0.508,0.5082


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.871,0.0,0.3077,0.7706,0.4398,0.3817,0.4335
1,0.868,0.0,0.293,0.7547,0.4222,0.3636,0.4158
2,0.8794,0.0,0.348,0.812,0.4872,0.431,0.481
3,0.8758,0.0,0.3333,0.7913,0.4691,0.4117,0.4613
4,0.8655,0.0,0.2757,0.7426,0.4021,0.3438,0.3979
5,0.8739,0.0,0.3199,0.7838,0.4543,0.397,0.4483
6,0.8764,0.0,0.3309,0.7965,0.4675,0.4108,0.4618
7,0.8727,0.0,0.3456,0.7402,0.4712,0.4095,0.4481
8,0.8691,0.0,0.3051,0.7477,0.4334,0.3739,0.4222
9,0.8758,0.0,0.3419,0.775,0.4745,0.4158,0.4608


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8828,0.8873,0.3642,0.8239,0.5051,0.4497,0.4982


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.865,0.0,0.3004,0.713,0.4227,0.3603,0.4037
1,0.8734,0.0,0.3114,0.7944,0.4474,0.3909,0.446
2,0.8831,0.0,0.381,0.8062,0.5174,0.4604,0.5025
3,0.8758,0.0,0.3333,0.7913,0.4691,0.4117,0.4613
4,0.8703,0.0,0.3125,0.7522,0.4416,0.382,0.4295
5,0.8697,0.0,0.3015,0.7593,0.4316,0.3731,0.4243
6,0.8758,0.0,0.3199,0.8056,0.4579,0.4021,0.4573
7,0.8739,0.0,0.3529,0.7442,0.4788,0.4173,0.455
8,0.8667,0.0,0.3088,0.7179,0.4319,0.3697,0.4121
9,0.8776,0.0,0.3566,0.776,0.4887,0.4298,0.4719


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.883,0.8857,0.37,0.8182,0.5096,0.4537,0.5


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8547,0.0,0.5458,0.5602,0.5529,0.4662,0.4662
1,0.8602,0.0,0.5788,0.5745,0.5766,0.4929,0.4929
2,0.8632,0.0,0.619,0.5788,0.5982,0.5159,0.5163
3,0.8457,0.0,0.5421,0.5305,0.5362,0.4437,0.4437
4,0.8625,0.0,0.5404,0.588,0.5632,0.4818,0.4824
5,0.8565,0.0,0.5699,0.5616,0.5657,0.4797,0.4797
6,0.8607,0.0,0.5956,0.5724,0.5838,0.5002,0.5003
7,0.8546,0.0,0.5956,0.5529,0.5735,0.486,0.4865
8,0.854,0.0,0.5699,0.5536,0.5616,0.4741,0.4741
9,0.8577,0.0,0.5809,0.5643,0.5725,0.4871,0.4872


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8789,0.8945,0.652,0.6262,0.6388,0.5662,0.5663


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8756,0.0,0.4148,0.7067,0.5226,0.4566,0.4781,0.163
rf,Random Forest Classifier,0.8743,0.0,0.3443,0.759,0.4734,0.4134,0.4553,0.233
catboost,CatBoost Classifier,0.8736,0.0,0.4141,0.6931,0.5182,0.4507,0.4707,3.457
gbc,Gradient Boosting Classifier,0.8722,0.0,0.3979,0.6945,0.5056,0.4384,0.4611,0.298
et,Extra Trees Classifier,0.8715,0.0,0.3352,0.7409,0.461,0.3997,0.4407,0.306
knn,K Neighbors Classifier,0.8617,0.0,0.3645,0.6381,0.4637,0.3912,0.4115,0.09
ada,Ada Boost Classifier,0.859,0.0,0.3311,0.6372,0.4353,0.3639,0.3896,0.165
lda,Linear Discriminant Analysis,0.858,0.0,0.3744,0.6106,0.4639,0.3875,0.4031,0.07
lr,Logistic Regression,0.8578,0.0,0.3183,0.6338,0.4235,0.3525,0.3799,0.072
svm,SVM - Linear Kernel,0.8524,0.0,0.2217,0.6499,0.3173,0.2611,0.3098,0.086


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.883,0.8857,0.37,0.8182,0.5096,0.4537,0.5
0,Random Forest Classifier,0.8828,0.8873,0.3642,0.8239,0.5051,0.4497,0.4982
0,Light Gradient Boosting Machine,0.8789,0.8945,0.652,0.6262,0.6388,0.5662,0.5663
0,Ridge Classifier,0.8623,0.7589,0.605,0.577,0.5907,0.508,0.5082
0,Logistic Regression,0.8592,0.8751,0.6123,0.5658,0.5882,0.5034,0.504
0,Decision Tree Classifier,0.8052,0.6527,0.4258,0.4102,0.4179,0.3009,0.301
0,SVM - Linear Kernel,0.7239,0.7752,0.8517,0.3571,0.5033,0.3537,0.4178


Choosing best model


In [34]:
lgbm_model = models['lightgbm']
lgbm_tuned_model = pc.tune_model(lgbm_model, optimize='Accuracy', search_library='optuna', search_algorithm='tpe')
lgbm_ensemble_model = pc.ensemble_model(lgbm_tuned_model, method='Bagging', optimize='Accuracy')

print("Base LightGBM Model performance on test data")
pc.predict_model(lgbm_model)
print("Tuned LightGBM Model performance on test data")
pc.predict_model(lgbm_tuned_model)
print("Ensemble LightGBM Model performance on test data")
pc.predict_model(lgbm_ensemble_model)


et_model = models['et']
et_tuned_model = pc.tune_model(et_model, optimize='Accuracy', search_library='optuna', search_algorithm='tpe')
et_ensemble_model = pc.ensemble_model(et_tuned_model, method='Bagging', optimize='Accuracy')

print("Base Extra Trees Model performance on test data")
pc.predict_model(et_model)
print("Tuned Extra Trees Model performance on test data")
pc.predict_model(et_tuned_model)
print("Ensemble Extra Trees Model performance on test data")
pc.predict_model(et_ensemble_model)

rf_model = models['rf']
rf_tuned_model = pc.tune_model(rf_model, optimize='Accuracy', search_library='optuna', search_algorithm='tpe')
rf_ensemble_model = pc.ensemble_model(rf_tuned_model, method='Bagging', optimize='Accuracy', probability_threshold=0.35)

print("Base Random Forest Model performance on test data")
pc.predict_model(rf_model)
print("Tuned Random Forest Model performance on test data")
pc.predict_model(rf_tuned_model)
print("Ensemble Random Forest Model performance on test data")
pc.predict_model(rf_ensemble_model)


blended_model = pc.blend_models(estimator_list=[lgbm_ensemble_model, et_ensemble_model, rf_ensemble_model], optimize='Accuracy') 
print("Blended Model performance on test data")
pc.predict_model(blended_model)


# pc.plot_model(blended_model, plot='threshold')
pc.evaluate_model(blended_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.871,0.0,0.3773,0.7007,0.4905,0.4241,0.4508
1,0.865,0.0,0.3626,0.6644,0.4692,0.3994,0.4235
2,0.8734,0.0,0.3956,0.7059,0.507,0.441,0.4653
3,0.8596,0.0,0.3626,0.6266,0.4594,0.3852,0.4043
4,0.8631,0.0,0.3125,0.68,0.4282,0.3623,0.3978
5,0.8758,0.0,0.386,0.7292,0.5048,0.4414,0.4706
6,0.8727,0.0,0.3493,0.7364,0.4738,0.4117,0.4489
7,0.8697,0.0,0.3824,0.6842,0.4906,0.4227,0.4462
8,0.8673,0.0,0.375,0.6711,0.4811,0.412,0.4349
9,0.877,0.0,0.3676,0.7576,0.495,0.4344,0.4714


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8686,0.0,0.359,0.695,0.4734,0.407,0.436
1,0.8662,0.0,0.3553,0.6783,0.4663,0.3983,0.4256
2,0.8704,0.0,0.3846,0.6908,0.4941,0.4266,0.4507
3,0.8596,0.0,0.359,0.6282,0.4569,0.383,0.4029
4,0.8637,0.0,0.3162,0.6825,0.4322,0.3663,0.4015
5,0.8739,0.0,0.3713,0.7266,0.4915,0.428,0.4595
6,0.8721,0.0,0.3419,0.7381,0.4673,0.4056,0.4445
7,0.8691,0.0,0.3713,0.6871,0.4821,0.4147,0.4405
8,0.8703,0.0,0.375,0.6939,0.4869,0.4201,0.4462
9,0.8752,0.0,0.3603,0.7481,0.4864,0.425,0.4619


Base LightGBM Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8789,0.8945,0.652,0.6262,0.6388,0.5662,0.5663


Tuned LightGBM Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8814,0.8712,0.4273,0.7405,0.5419,0.4793,0.5033


Ensemble LightGBM Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8811,0.8702,0.4229,0.7423,0.5388,0.4764,0.5013




Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8722,0.0,0.3553,0.7293,0.4778,0.4147,0.4497
1,0.8674,0.0,0.3187,0.719,0.4416,0.3788,0.4194
2,0.8788,0.0,0.3736,0.7727,0.5037,0.4441,0.4823
3,0.8716,0.0,0.359,0.7206,0.4792,0.4152,0.4481
4,0.8661,0.0,0.2978,0.7232,0.4219,0.3607,0.4064
5,0.8788,0.0,0.3603,0.784,0.4937,0.4354,0.478
6,0.8752,0.0,0.3456,0.7642,0.4759,0.4163,0.4588
7,0.8733,0.0,0.3676,0.7246,0.4878,0.4242,0.4561
8,0.8697,0.0,0.3382,0.7188,0.46,0.3967,0.4332
9,0.8733,0.0,0.3419,0.75,0.4697,0.409,0.4499


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8668,0.0,0.2857,0.75,0.4138,0.3553,0.4083
1,0.8698,0.0,0.2821,0.7938,0.4162,0.3611,0.4229
2,0.8807,0.0,0.3553,0.8151,0.4949,0.4388,0.4877
3,0.8734,0.0,0.3187,0.7838,0.4531,0.3956,0.4472
4,0.8703,0.0,0.2721,0.8132,0.4077,0.3546,0.4224
5,0.8752,0.0,0.3088,0.8155,0.448,0.3933,0.4528
6,0.8739,0.0,0.2904,0.8316,0.4305,0.3777,0.4444
7,0.8727,0.0,0.3309,0.7563,0.4604,0.4005,0.4447
8,0.8667,0.0,0.2868,0.7429,0.4138,0.3548,0.4064
9,0.8727,0.0,0.3199,0.7699,0.4519,0.3935,0.4425


Base Extra Trees Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.883,0.8857,0.37,0.8182,0.5096,0.4537,0.5


Tuned Extra Trees Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.883,0.8857,0.37,0.8182,0.5096,0.4537,0.5


Ensemble Extra Trees Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.8821,0.8926,0.3451,0.8453,0.4901,0.4364,0.4928


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8722,0.0,0.3773,0.7103,0.4928,0.4275,0.4555
1,0.8644,0.0,0.326,0.6846,0.4417,0.3754,0.409
2,0.8764,0.0,0.3846,0.7394,0.506,0.4433,0.4744
3,0.871,0.0,0.3773,0.7007,0.4905,0.4241,0.4508
4,0.8709,0.0,0.3309,0.7377,0.4569,0.3954,0.4366
5,0.8788,0.0,0.3787,0.763,0.5061,0.4458,0.4815
6,0.8788,0.0,0.3676,0.7752,0.4988,0.4396,0.4793
7,0.8745,0.0,0.386,0.7192,0.5024,0.438,0.4658
8,0.8661,0.0,0.3493,0.6786,0.4612,0.3936,0.4219
9,0.8727,0.0,0.3493,0.7364,0.4738,0.4117,0.4489


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.868,0.0,0.4982,0.6239,0.554,0.4776,0.4818
1,0.8692,0.0,0.4908,0.6321,0.5526,0.4774,0.4826
2,0.8716,0.0,0.5311,0.6304,0.5765,0.5015,0.5041
3,0.8602,0.0,0.5092,0.5865,0.5451,0.463,0.4646
4,0.8697,0.0,0.4779,0.6373,0.5462,0.472,0.4786
5,0.8703,0.0,0.5,0.6326,0.5585,0.4837,0.4883
6,0.8818,0.0,0.5404,0.6743,0.6,0.5316,0.5361
7,0.8643,0.0,0.5074,0.6026,0.5509,0.4717,0.4741
8,0.8637,0.0,0.4853,0.6055,0.5388,0.4599,0.4638
9,0.877,0.0,0.5184,0.6589,0.5802,0.5094,0.5144


Base Random Forest Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8828,0.8873,0.3642,0.8239,0.5051,0.4497,0.4982


Tuned Random Forest Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8828,0.8873,0.3642,0.8239,0.5051,0.4497,0.4982


Ensemble Random Forest Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8879,0.8954,0.5727,0.6915,0.6265,0.5612,0.5647


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8746,0.0,0.3004,0.8283,0.4409,0.3872,0.4509
1,0.8674,0.0,0.2821,0.7624,0.4118,0.3544,0.4105
2,0.8782,0.0,0.3297,0.8257,0.4712,0.4164,0.4728
3,0.8728,0.0,0.3077,0.7925,0.4433,0.3868,0.4424
4,0.8703,0.0,0.2868,0.7879,0.4205,0.3649,0.4245
5,0.8776,0.0,0.3125,0.8416,0.4558,0.4027,0.466
6,0.8758,0.0,0.3015,0.8367,0.4432,0.3903,0.4553
7,0.8764,0.0,0.3493,0.7724,0.481,0.422,0.465
8,0.8679,0.0,0.2978,0.7431,0.4252,0.3657,0.4148
9,0.8788,0.0,0.3272,0.8318,0.4697,0.4155,0.4736


Blended Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8823,0.8956,0.3451,0.8484,0.4906,0.4372,0.4941




interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [38]:
# pc.optimize_threshold(rf_ensemble_model)
# pc.plot_model(blended_model, plot='threshold')
tuned_blended_model = pc.tune_model(blended_model, optimize='Accuracy', search_library='optuna', search_algorithm='tpe')
# print(blended_model.get_params())

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
from typing import Any, Dict, Optional
from shap import sample
from explainerdashboard import ClassifierExplainer, ExplainerDashboard
from pycaret.utils.generic import get_label_encoder


def dashboard(
        estimator,
        display_format: str = "dash",
        dashboard_kwargs: Optional[Dict[str, Any]] = None,
        run_kwargs: Optional[Dict[str, Any]] = None,
        
        **kwargs,
    ):
        """
        This function generates the interactive dashboard for a trained model. The
        dashboard is implemented using ExplainerDashboard (explainerdashboard.readthedocs.io)


        Example
        -------
        >>> from pycaret.datasets import get_data
        >>> juice = get_data('juice')
        >>> from pycaret.classification import *
        >>> exp_name = setup(data = juice,  target = 'Purchase')
        >>> lr = create_model('lr')
        >>> dashboard(lr)


        estimator: scikit-learn compatible object
            Trained model object


        display_format: str, default = 'dash'
            Render mode for the dashboard. The default is set to ``dash`` which will
            render a dashboard in browser. There are four possible options:

            - 'dash' - displays the dashboard in browser
            - 'inline' - displays the dashboard in the jupyter notebook cell.
            - 'jupyterlab' - displays the dashboard in jupyterlab pane.
            - 'external' - displays the dashboard in a separate tab. (use in Colab)


        dashboard_kwargs: dict, default = {} (empty dict)
            Dictionary of arguments passed to the ``ExplainerDashboard`` class.


        run_kwargs: dict, default = {} (empty dict)
            Dictionary of arguments passed to the ``run`` method of ``ExplainerDashboard``.


        **kwargs:
            Additional keyword arguments to pass to the ``ClassifierExplainer`` or
            ``RegressionExplainer`` class.


        Returns:
            ExplainerDashboard
        """

        dashboard_kwargs = dashboard_kwargs or {}
        run_kwargs = run_kwargs or {}

        le = get_label_encoder(pc.get_config("pipeline"))
        if le:
            labels_ = list(le.classes_)
        else:
            labels_ = None

        # Replacing chars which dash doesn't accept for column name `.` , `{`, `}`
        X_test_df = pc.get_config('X_test_transformed').copy().head(100)
        X_test_df.columns = [
            col.replace(".", "__").replace("{", "__").replace("}", "__")
            for col in X_test_df.columns
        ]
        print(X_test_df.dtypes)
        
        onehotencoded = categorical_columns.copy().remove("Gender")
        y_test_df = pc.get_config('y_test_transformed').copy().head(100)
        explainer = ClassifierExplainer(
            estimator, X_test_df, y_test_df, labels=labels_, n_jobs=10, **kwargs,
            cats=onehotencoded
        )
        
        explainer_dashboard = ExplainerDashboard(
            explainer, mode=display_format, **dashboard_kwargs
        )
        explainer_dashboard.run(**run_kwargs)
        return explainer_dashboard

X_test_df = pc.get_config('X_test_transformed').copy().head(100)
# blended_model.predict_proba(X_test_df)
# X_test_df.dtypes
explainer_dashboard = dashboard(estimator=blended_model, display_format='external', shap='kernel', dashboard_kwargs={"port": 8100})

Age_11 years old or younger                                                    int32
Age_12 years old                                                               int32
Age_13 years old                                                               int32
Age_14 years old                                                               int32
Age_15 years old                                                               int32
Age_16 years old                                                               int32
Age_17 years old or older                                                      int32
Gender                                                                       float64
SmokingFriends                                                               float64
SeenSmokerInSchool                                                              int8
SeenSmokerInPublicPlace                                                      float64
SeenSmokerInEnclosedPlace                                        

  0%|          | 0/100 [00:00<?, ?it/s]

Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating dependencies...
Calculating permutation importances (if slow, try setting n_jobs parameter)...
Calculating pred_percentiles...
Calculating predictions...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard on http://10.201.25.7:8100
You can terminate the dashboard with ExplainerDashboard.terminate(8100)


Dash app running on http://127.0.0.1:8100/


In [None]:
explainer = explainer_dashboard.explainer
explainer.dump('blended_model_explainer.dill')

In [None]:
explainer2 = ClassifierExplainer.from_file("blended_model_explainer.dill")
explainer_dashboard2 = ExplainerDashboard(explainer2, mode='dash', port=8091)

explainer_dashboard2.run()


Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating dependencies...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard on http://10.201.25.7:8091


ConnectionError: HTTPConnectionPool(host='0.0.0.0', port=8091): Max retries exceeded with url: /_alive_9cbf1ce8-4d03-4594-9fb6-d984667f9790 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001500D6852D0>: Failed to establish a new connection: [WinError 10049] Indirizzo richiesto non valido nel proprio contesto'))

In [None]:
ExplainerDashboard.terminate(8084)