## Import necessary libraries


In [8]:
import sys
import os

# Make the directory two levels above the current working directory importable,
# so that the `src.scripts` package imported below can be resolved.
# (This extends the module search path; it does not change the working directory.)
sys.path.append(os.getcwd() + os.sep + ".." + os.sep + "..")

# Import the necessary libraries
import pycaret.classification as pc
import pandas as pd
import src.scripts.mapping_answers_dict as map_dict
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from math import sqrt


## Dataset loading

In [2]:
import re

# Load the merged dataset
dataset = pd.read_csv("../../data/processed/GYTS_dataset.csv")

# Ordinal columns: cast to `category`. Their level order is not defined here;
# it is supplied separately when the PyCaret experiment is set up.
ordinal_columns = ["SmokingFriends", "SeenSmokerInPublicPlace", "SeenSmokerInEnclosedPlace", "SeenSmokerInHome", "HarmfulPassiveSmoke", "HardQuitSmoke"]
dataset[ordinal_columns] = dataset[ordinal_columns].astype('category')

# Nominal categorical columns
categorical_columns = ["State", "Gender", "Age", "AttractiveSmoker", "SmokerConfidentInCelebrations", "SchoolWarnings",
                       "SeenHealthWarnings", "AntiTobaccoInEvents"]

# Remove every character that is not a word character or whitespace from the
# labels of the categorical columns.
# The clean-up runs BEFORE the `category` cast and column by column:
#   * the final dtype is `category` regardless of how an element-wise mapping
#     treats categorical data in the installed pandas version;
#   * it avoids `DataFrame.applymap`, which is deprecated in recent pandas.
# Values are expected to be strings; a non-string value (e.g. NaN) makes
# `re.sub` raise, which surfaces unexpected data early.
non_word_pattern = re.compile(r'[^\w\s\d]')
for column in categorical_columns:
    dataset[column] = (
        dataset[column]
        .map(lambda value: non_word_pattern.sub('', value))
        .astype('category')
    )

# Convert boolean columns (including the `Smoke` target) to integers
boolean_columns = ["Smoke", "SeenSmokerInSchool", "ParentWarnings", "AntiTobaccoInMedia",
                   "BanTobaccoOutdoors", "SmokingFather", "SmokingMother", "WorkingFather",
                   "WorkingMother"]
dataset[boolean_columns] = dataset[boolean_columns].astype('int')

# Comparing models

In [4]:
from sklearn.preprocessing import OneHotEncoder

# Ordered levels of every ordinal feature, taken from the answer-mapping
# dictionaries of the project. The views returned by `.values()` are turned
# into plain lists, the form used in PyCaret's `ordinal_features` examples;
# the order of the levels is unchanged.
ordinal_levels = {
    "SmokingFriends": list(map_dict.OR46_dict.values()),
    "SeenSmokerInPublicPlace": list(map_dict.CR21_dict.values()),
    "SeenSmokerInEnclosedPlace": list(map_dict.CR20_dict.values()),
    "SeenSmokerInHome": list(map_dict.CR19_dict.values()),
    "HarmfulPassiveSmoke": list(map_dict.CR23_dict.values()),
    "HardQuitSmoke": list(map_dict.CR41_dict.values()),
}

# Initialise the PyCaret classification experiment:
#   * 80/20 train/test split with a fixed session id for reproducibility;
#   * no imputation and no normalisation;
#   * `max_encoding_ohe=0` together with an explicit `encoding_method`, so the
#     categorical features are encoded by the OneHotEncoder given below.
setup = pc.setup(data=dataset,
                 target='Smoke',
                 index=False,
                 train_size=0.8,
                 session_id=42,
                 categorical_features=categorical_columns,
                 ordinal_features=ordinal_levels,
                 imputation_type=None,
                 normalize=False,
                 max_encoding_ohe=0,
                 encoding_method=OneHotEncoder(dtype=int, sparse_output=False),
                 n_jobs=10
                 )

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Smoke
2,Target type,Binary
3,Original data shape,"(20731, 23)"
4,Transformed data shape,"(20731, 42)"
5,Transformed train set shape,"(16584, 42)"
6,Transformed test set shape,"(4147, 42)"
7,Ordinal features,6
8,Numeric features,8
9,Categorical features,8


# Compute the class weights

In [11]:

# Target labels, in order of first appearance in the full dataset
classes = dataset['Smoke'].unique()
y_train_target = pc.get_config("y_train_transformed")

# "balanced" weights computed on the training target; the resulting array is
# paired positionally with `classes` below
balanced_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train_target)

# label -> weight, and label -> square root of the weight
class_weights = {label: weight for label, weight in zip(classes, balanced_weights)}
sqrt_weights = {label: sqrt(weight) for label, weight in zip(classes, balanced_weights)}

print(class_weights)
print(sqrt_weights)


{1: 3.0440528634361232, 0: 0.5982683982683983}
{1: 1.7447214286057597, 0: 0.7734781175110246}


Find the best model among those that support class weights

In [12]:
#all_models = [ 'lr', 'knn', 'nb', 'dt', 'svm', 'ridge', 'rf', 'qda', 'ada', 'gbc', 'lda', 'et', 'xgboost', 'lightgbm', 'catboost'] #'rbfsvm', 'gpc', 'mlp'

# Models that support class weights
threshold_optimized_model =[ 'lr', 'dt', 'svm' , 'ridge', 'rf', 'et', 'lightgbm'] # 'rbfsvm'
models = {}
# One single-row metrics frame per successfully evaluated model; concatenated
# once after the loop instead of growing a DataFrame at every iteration.
holdout_scores = []

for model_name in threshold_optimized_model:
    try:
        # Train with the square-rooted class weights computed earlier
        model = pc.create_model(model_name, verbose=True, class_weight=sqrt_weights)
        models[model_name] = model
        # Score the model, then grab the metrics grid PyCaret just displayed
        pc.predict_model(model)
        holdout_scores.append(pc.pull())
    except Exception as e:
        # Best effort: keep going with the other models, but say which one failed
        print(f"{model_name}: {e}")

# Rank the models by Accuracy (best first). Guarded so that a run in which
# every model failed yields an empty table instead of a KeyError.
predicts = pd.concat(holdout_scores) if holdout_scores else pd.DataFrame()
if not predicts.empty:
    predicts = predicts.sort_values('Accuracy', ascending=False)

predicts

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8354,0.0,0.5128,0.5,0.5063,0.4076,0.4077
1,0.8421,0.0,0.5788,0.518,0.5467,0.4514,0.4525
2,0.8427,0.0,0.5678,0.5201,0.5429,0.4481,0.4487
3,0.8294,0.0,0.5495,0.4839,0.5146,0.4116,0.4128
4,0.8516,0.0,0.5699,0.5458,0.5576,0.4685,0.4686
5,0.8426,0.0,0.5441,0.5193,0.5314,0.4369,0.4371
6,0.851,0.0,0.5956,0.5418,0.5674,0.4777,0.4785
7,0.8366,0.0,0.5809,0.5016,0.5383,0.4397,0.4414
8,0.8366,0.0,0.5037,0.5018,0.5028,0.405,0.405
9,0.848,0.0,0.5699,0.5345,0.5516,0.4602,0.4606


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.8592,0.8751,0.6123,0.5658,0.5882,0.5034,0.504


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7975,0.0,0.4286,0.3939,0.4105,0.2885,0.2889
1,0.8047,0.0,0.3956,0.4045,0.4,0.2834,0.2834
2,0.8192,0.0,0.4615,0.4516,0.4565,0.3481,0.3481
3,0.8156,0.0,0.4066,0.4353,0.4205,0.3109,0.3112
4,0.7973,0.0,0.4081,0.3881,0.3978,0.2761,0.2762
5,0.7913,0.0,0.3934,0.3715,0.3821,0.2567,0.2569
6,0.8124,0.0,0.4118,0.4259,0.4187,0.3069,0.307
7,0.7986,0.0,0.4081,0.3908,0.3993,0.2783,0.2784
8,0.8058,0.0,0.386,0.4038,0.3947,0.2791,0.2792
9,0.8106,0.0,0.4596,0.4281,0.4433,0.3293,0.3296


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.8052,0.6527,0.4258,0.4102,0.4179,0.3009,0.301


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8391,0.0,0.5348,0.5105,0.5224,0.4257,0.4258
1,0.8403,0.0,0.5348,0.5141,0.5242,0.4283,0.4284
2,0.8517,0.0,0.359,0.5799,0.4434,0.3633,0.3772
3,0.8391,0.0,0.2527,0.5227,0.3407,0.2615,0.284
4,0.854,0.0,0.364,0.5893,0.45,0.3712,0.3856
5,0.8263,0.0,0.5735,0.4756,0.52,0.4151,0.4178
6,0.8366,0.0,0.6618,0.5014,0.5705,0.472,0.4789
7,0.8359,0.0,0.625,0.5,0.5556,0.4565,0.4608
8,0.7612,0.0,0.7022,0.3775,0.491,0.3529,0.3819
9,0.8444,0.0,0.5368,0.5252,0.5309,0.4376,0.4377


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.7239,0.7752,0.8517,0.3571,0.5033,0.3537,0.4178


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8415,0.0,0.5055,0.5188,0.5121,0.4174,0.4175
1,0.8463,0.0,0.5568,0.5315,0.5438,0.4515,0.4516
2,0.8445,0.0,0.5604,0.5258,0.5426,0.449,0.4493
3,0.8342,0.0,0.5495,0.4967,0.5217,0.4218,0.4226
4,0.851,0.0,0.5551,0.5451,0.5501,0.4608,0.4609
5,0.8498,0.0,0.5331,0.5431,0.538,0.4484,0.4484
6,0.8522,0.0,0.5625,0.5484,0.5554,0.4668,0.4668
7,0.8438,0.0,0.5699,0.5219,0.5448,0.4508,0.4514
8,0.8353,0.0,0.489,0.4981,0.4935,0.3952,0.3952
9,0.848,0.0,0.5551,0.5355,0.5451,0.4539,0.454


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.8623,0.7589,0.605,0.577,0.5907,0.508,0.5082


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.871,0.0,0.3077,0.7706,0.4398,0.3817,0.4335
1,0.868,0.0,0.293,0.7547,0.4222,0.3636,0.4158
2,0.8794,0.0,0.348,0.812,0.4872,0.431,0.481
3,0.8758,0.0,0.3333,0.7913,0.4691,0.4117,0.4613
4,0.8655,0.0,0.2757,0.7426,0.4021,0.3438,0.3979
5,0.8739,0.0,0.3199,0.7838,0.4543,0.397,0.4483
6,0.8764,0.0,0.3309,0.7965,0.4675,0.4108,0.4618
7,0.8727,0.0,0.3456,0.7402,0.4712,0.4095,0.4481
8,0.8691,0.0,0.3051,0.7477,0.4334,0.3739,0.4222
9,0.8758,0.0,0.3419,0.775,0.4745,0.4158,0.4608


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8828,0.8873,0.3642,0.8239,0.5051,0.4497,0.4982


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.865,0.0,0.3004,0.713,0.4227,0.3603,0.4037
1,0.8734,0.0,0.3114,0.7944,0.4474,0.3909,0.446
2,0.8831,0.0,0.381,0.8062,0.5174,0.4604,0.5025
3,0.8758,0.0,0.3333,0.7913,0.4691,0.4117,0.4613
4,0.8703,0.0,0.3125,0.7522,0.4416,0.382,0.4295
5,0.8697,0.0,0.3015,0.7593,0.4316,0.3731,0.4243
6,0.8758,0.0,0.3199,0.8056,0.4579,0.4021,0.4573
7,0.8739,0.0,0.3529,0.7442,0.4788,0.4173,0.455
8,0.8667,0.0,0.3088,0.7179,0.4319,0.3697,0.4121
9,0.8776,0.0,0.3566,0.776,0.4887,0.4298,0.4719


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.883,0.8857,0.37,0.8182,0.5096,0.4537,0.5


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8547,0.0,0.5458,0.5602,0.5529,0.4662,0.4662
1,0.8602,0.0,0.5788,0.5745,0.5766,0.4929,0.4929
2,0.8632,0.0,0.619,0.5788,0.5982,0.5159,0.5163
3,0.8457,0.0,0.5421,0.5305,0.5362,0.4437,0.4437
4,0.8625,0.0,0.5404,0.588,0.5632,0.4818,0.4824
5,0.8565,0.0,0.5699,0.5616,0.5657,0.4797,0.4797
6,0.8607,0.0,0.5956,0.5724,0.5838,0.5002,0.5003
7,0.8546,0.0,0.5956,0.5529,0.5735,0.486,0.4865
8,0.854,0.0,0.5699,0.5536,0.5616,0.4741,0.4741
9,0.8577,0.0,0.5809,0.5643,0.5725,0.4871,0.4872


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8789,0.8945,0.652,0.6262,0.6388,0.5662,0.5663


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.883,0.8857,0.37,0.8182,0.5096,0.4537,0.5
0,Random Forest Classifier,0.8828,0.8873,0.3642,0.8239,0.5051,0.4497,0.4982
0,Light Gradient Boosting Machine,0.8789,0.8945,0.652,0.6262,0.6388,0.5662,0.5663
0,Ridge Classifier,0.8623,0.7589,0.605,0.577,0.5907,0.508,0.5082
0,Logistic Regression,0.8592,0.8751,0.6123,0.5658,0.5882,0.5034,0.504
0,Decision Tree Classifier,0.8052,0.6527,0.4258,0.4102,0.4179,0.3009,0.301
0,SVM - Linear Kernel,0.7239,0.7752,0.8517,0.3571,0.5033,0.3537,0.4178


Choosing best model


In [13]:
def tune_and_bag(model_id, display_name, **ensemble_kwargs):
    """
    Tune one of the models trained above, wrap the result in a Bagging
    ensemble and report the three stages on the test data.

    model_id: str
        Key of the base model in the notebook-level ``models`` dict.

    display_name: str
        Human-readable model name used in the printed headings.

    **ensemble_kwargs:
        Extra keyword arguments forwarded to ``pc.ensemble_model``.

    Returns:
        Tuple ``(base_model, tuned_model, ensemble_model)``.

    Raises:
        KeyError if ``model_id`` is not in ``models``.
    """
    base_model = models[model_id]
    tuned_model = pc.tune_model(base_model, optimize='Accuracy', search_library='optuna', search_algorithm='tpe')
    ensemble_model = pc.ensemble_model(tuned_model, method='Bagging', optimize='Accuracy', **ensemble_kwargs)

    # Same reporting order as before: base, tuned, ensemble
    for stage, estimator in (("Base", base_model), ("Tuned", tuned_model), ("Ensemble", ensemble_model)):
        print(f"{stage} {display_name} Model performance on test data")
        pc.predict_model(estimator)

    return base_model, tuned_model, ensemble_model


lgbm_model, lgbm_tuned_model, lgbm_ensemble_model = tune_and_bag('lightgbm', "LightGBM")

et_model, et_tuned_model, et_ensemble_model = tune_and_bag('et', "Extra Trees")

# NOTE(review): only the Random Forest ensemble uses a custom probability
# threshold (0.35); the other two keep the default. Confirm this asymmetry is intended.
rf_model, rf_tuned_model, rf_ensemble_model = tune_and_bag('rf', "Random Forest", probability_threshold=0.35)


# Combine the three bagged ensembles into a single voting model
blended_model = pc.blend_models(estimator_list=[lgbm_ensemble_model, et_ensemble_model, rf_ensemble_model], optimize='Accuracy')
print("Blended Model performance on test data")
pc.predict_model(blended_model)


# pc.plot_model(blended_model, plot='threshold')
pc.evaluate_model(blended_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8499,0.0,0.0952,0.9286,0.1728,0.1466,0.27
1,0.8523,0.0,0.1099,0.9375,0.1967,0.168,0.2924
2,0.8565,0.0,0.1465,0.8889,0.2516,0.215,0.3262
3,0.8541,0.0,0.1209,0.9429,0.2143,0.1838,0.3082
4,0.8522,0.0,0.1176,0.8649,0.2071,0.1747,0.2859
5,0.8498,0.0,0.0919,0.9259,0.1672,0.1418,0.2647
6,0.8583,0.0,0.1397,0.9744,0.2444,0.2119,0.3396
7,0.8534,0.0,0.114,0.9394,0.2033,0.174,0.2984
8,0.8492,0.0,0.0919,0.8929,0.1667,0.1403,0.2579
9,0.8595,0.0,0.1507,0.9535,0.2603,0.2256,0.3478


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.862,0.0,0.5311,0.5894,0.5588,0.4772,0.4781
1,0.865,0.0,0.5495,0.5976,0.5725,0.4925,0.4931
2,0.871,0.0,0.5934,0.6113,0.6022,0.5253,0.5254
3,0.862,0.0,0.5495,0.5859,0.5671,0.4851,0.4855
4,0.8679,0.0,0.5294,0.6128,0.568,0.4906,0.4924
5,0.8631,0.0,0.5478,0.5889,0.5676,0.4864,0.4869
6,0.8758,0.0,0.5956,0.6279,0.6113,0.5374,0.5377
7,0.8625,0.0,0.5735,0.5821,0.5778,0.4957,0.4957
8,0.8577,0.0,0.5368,0.5703,0.553,0.4685,0.4688
9,0.8637,0.0,0.5588,0.5891,0.5736,0.4925,0.4928


Base LightGBM Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8789,0.8945,0.652,0.6262,0.6388,0.5662,0.5663


Tuned LightGBM Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8789,0.8945,0.652,0.6262,0.6388,0.5662,0.5663


Ensemble LightGBM Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8843,0.8964,0.6314,0.6525,0.6418,0.5728,0.5729


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8692,0.0,0.337,0.7188,0.4589,0.3953,0.4322
1,0.8656,0.0,0.3223,0.6984,0.4411,0.3763,0.4128
2,0.8758,0.0,0.3553,0.7638,0.485,0.4249,0.4653
3,0.8668,0.0,0.3516,0.6857,0.4649,0.3977,0.4267
4,0.8649,0.0,0.2941,0.7143,0.4167,0.3549,0.3999
5,0.8806,0.0,0.3676,0.7937,0.5025,0.4448,0.4876
6,0.8709,0.0,0.3162,0.7544,0.4456,0.3861,0.4332
7,0.8745,0.0,0.364,0.7388,0.4877,0.4255,0.4602
8,0.8691,0.0,0.3309,0.72,0.4534,0.3904,0.4287
9,0.8758,0.0,0.3382,0.7797,0.4718,0.4136,0.4602


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8668,0.0,0.2857,0.75,0.4138,0.3553,0.4083
1,0.8698,0.0,0.2821,0.7938,0.4162,0.3611,0.4229
2,0.8807,0.0,0.3553,0.8151,0.4949,0.4388,0.4877
3,0.8734,0.0,0.3187,0.7838,0.4531,0.3956,0.4472
4,0.8703,0.0,0.2721,0.8132,0.4077,0.3546,0.4224
5,0.8752,0.0,0.3088,0.8155,0.448,0.3933,0.4528
6,0.8739,0.0,0.2904,0.8316,0.4305,0.3777,0.4444
7,0.8727,0.0,0.3309,0.7563,0.4604,0.4005,0.4447
8,0.8667,0.0,0.2868,0.7429,0.4138,0.3548,0.4064
9,0.8727,0.0,0.3199,0.7699,0.4519,0.3935,0.4425


Base Extra Trees Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.883,0.8857,0.37,0.8182,0.5096,0.4537,0.5


Tuned Extra Trees Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.883,0.8857,0.37,0.8182,0.5096,0.4537,0.5


Ensemble Extra Trees Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.8821,0.8926,0.3451,0.8453,0.4901,0.4364,0.4928


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.877,0.0,0.381,0.7482,0.5049,0.443,0.476
1,0.8674,0.0,0.3297,0.7087,0.45,0.3858,0.4225
2,0.8819,0.0,0.3919,0.781,0.522,0.4629,0.4988
3,0.871,0.0,0.359,0.7153,0.478,0.4136,0.4457
4,0.8661,0.0,0.3088,0.7119,0.4308,0.368,0.4095
5,0.877,0.0,0.3566,0.7698,0.4874,0.428,0.4691
6,0.8794,0.0,0.364,0.7857,0.4975,0.4392,0.4814
7,0.8782,0.0,0.386,0.75,0.5097,0.4482,0.4805
8,0.8679,0.0,0.3382,0.7023,0.4566,0.3917,0.4257
9,0.8758,0.0,0.3456,0.7705,0.4772,0.418,0.4615


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8674,0.0,0.4725,0.6293,0.5397,0.4641,0.4706
1,0.8656,0.0,0.4579,0.625,0.5285,0.4523,0.4598
2,0.8716,0.0,0.5128,0.6364,0.568,0.4936,0.4975
3,0.8583,0.0,0.4725,0.5864,0.5233,0.4413,0.4448
4,0.8637,0.0,0.4375,0.6198,0.5129,0.4364,0.4454
5,0.8752,0.0,0.4926,0.6601,0.5642,0.4931,0.5003
6,0.8806,0.0,0.5184,0.6779,0.5875,0.5191,0.5255
7,0.8613,0.0,0.5,0.5913,0.5418,0.4608,0.463
8,0.8607,0.0,0.4816,0.5928,0.5314,0.4506,0.454
9,0.8685,0.0,0.4816,0.6298,0.5458,0.4706,0.4763


Base Random Forest Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8828,0.8873,0.3642,0.8239,0.5051,0.4497,0.4982


Tuned Random Forest Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8835,0.8898,0.4097,0.775,0.536,0.4766,0.5083


Ensemble Random Forest Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.885,0.8917,0.5272,0.6984,0.6008,0.5352,0.5424


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8704,0.0,0.3846,0.6908,0.4941,0.4266,0.4507
1,0.8704,0.0,0.3773,0.6959,0.4893,0.4225,0.4485
2,0.8843,0.0,0.4286,0.7647,0.5493,0.4889,0.5159
3,0.874,0.0,0.4176,0.6951,0.5217,0.4543,0.4739
4,0.8745,0.0,0.3676,0.7353,0.4902,0.4276,0.4611
5,0.8733,0.0,0.3897,0.7067,0.5024,0.4367,0.4621
6,0.8854,0.0,0.4338,0.7662,0.554,0.494,0.5203
7,0.8782,0.0,0.4118,0.7273,0.5258,0.462,0.4867
8,0.8703,0.0,0.3897,0.6839,0.4965,0.4284,0.4508
9,0.8782,0.0,0.3971,0.7397,0.5167,0.4542,0.4831


Blended Model performance on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8898,0.8983,0.4552,0.7828,0.5757,0.5174,0.5425


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [23]:
# Persist the blended model; the call stays the last expression of the cell so
# that its return value is still displayed.
final_model_path = "../../data/models/final_model"
pc.save_model(blended_model, final_model_path)

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('ordinal_encoding',
                  TransformerWrapper(exclude=None,
                                     include=['Gender', 'SmokingFriends',
                                              'SeenSmokerInPublicPlace',
                                              'SeenSmokerInEnclosedPlace',
                                              'SeenSmokerInHome',
                                              'HarmfulPassiveSmoke',
                                              'HardQuitSmoke'],
                                     transformer=OrdinalEncoder(cols=['Gender',
                                                                      'SmokingFriends',
                                                                      'SeenSmokerInPublicPlace',
                                                                      'SeenSmokerInEnclosed...
                                                                                          

In [None]:
from typing import Any, Dict, Optional
from shap import sample
from explainerdashboard import ClassifierExplainer, ExplainerDashboard
from pycaret.utils.generic import get_label_encoder

def dashboard(
    estimator,
    display_format: str = "dash",
    dashboard_kwargs: Optional[Dict[str, Any]] = None,
    run_kwargs: Optional[Dict[str, Any]] = None,
    dump: bool = False,
    **kwargs,
):
    """
    Build and run an interactive ExplainerDashboard
    (explainerdashboard.readthedocs.io) for a trained classifier.

    The function reads the active PyCaret experiment through ``pc.get_config``
    (pipeline, seed, transformed test set) and the notebook-level
    ``categorical_columns`` list.


    Example
    -------
    >>> explainer_dashboard = dashboard(estimator=blended_model, display_format='external')


    estimator: scikit-learn compatible object
        Trained model object.


    display_format: str, default = 'dash'
        Render mode, passed to ``ExplainerDashboard`` as ``mode``:

        - 'dash' - displays the dashboard in browser
        - 'inline' - displays the dashboard in the jupyter notebook cell.
        - 'jupyterlab' - displays the dashboard in jupyterlab pane.
        - 'external' - displays the dashboard in a separate tab. (use in Colab)


    dashboard_kwargs: dict, default = None (treated as an empty dict)
        Arguments passed to the ``ExplainerDashboard`` class.


    run_kwargs: dict, default = None (treated as an empty dict)
        Arguments passed to the ``run`` method of ``ExplainerDashboard``.


    dump: bool, default = False
        When True, the explainer is dumped to ``blended_model_explainer.dill``
        (a relative path, i.e. in the current working directory).


    **kwargs:
        Additional keyword arguments passed to ``ClassifierExplainer``.


    Returns:
        ExplainerDashboard
    """

    if not dashboard_kwargs:
        dashboard_kwargs = {}
    if not run_kwargs:
        run_kwargs = {}

    # Class labels come from the pipeline's label encoder, when there is one
    label_encoder = get_label_encoder(pc.get_config("pipeline"))
    labels_ = list(label_encoder.classes_) if label_encoder else None

    seed = pc.get_config("seed")

    # Features and target are both sampled with 1000 and the same seed
    # (presumably so that their rows stay aligned - verify against shap.sample).
    X_test_df = sample(pc.get_config('X_test_transformed').copy(), 1000, random_state=seed)
    y_test_df = sample(pc.get_config('y_test_transformed').copy(), 1000, random_state=seed)

    # Dash doesn't accept `.`, `{` or `}` in column names: each becomes `__`
    forbidden_chars = str.maketrans({".": "__", "{": "__", "}": "__"})
    X_test_df.columns = [col.translate(forbidden_chars) for col in X_test_df.columns]

    # Columns to be grouped back as categorical features in the explainer.
    # "Gender" is excluded (the saved pipeline lists it under ordinal encoding,
    # so it is presumably not one-hot encoded - confirm).
    onehotencoded = list(categorical_columns)
    onehotencoded.remove("Gender")

    explainer = ClassifierExplainer(
        model=estimator,
        X=X_test_df,
        y=y_test_df,
        labels=labels_,
        n_jobs=10,
        cats=onehotencoded,
        **kwargs,
    )
    if dump:
        explainer.dump('blended_model_explainer.dill')

    explainer_dashboard = ExplainerDashboard(
        explainer, mode=display_format, **dashboard_kwargs
    )
    explainer_dashboard.run(**run_kwargs)
    return explainer_dashboard

# Launch the dashboard for the blended model in a separate tab on port 8100,
# using SHAP's kernel explainer, and dump the explainer object to disk.
explainer_dashboard = dashboard(
    estimator=blended_model,
    display_format='external',
    dashboard_kwargs={"port": 8100},
    dump=True,
    shap='kernel',
)

Exporting the dashboard to a YAML file

In [22]:
# Write the dashboard configuration next to the saved model and dump the
# explainer alongside it under the given file name.
explainer_dashboard.to_yaml(
    "../../data/models/dashboard_config.yaml",
    dump_explainer=True,
    explainerfile="blended_model_explainer.dill",
)

Dumping configuration .yaml to c:\Users\miaob\Desktop\tobacco-analysis\src\notebooks\..\..\data\models\dashboard_config.yaml...
Dumping explainer to c:\Users\miaob\Desktop\tobacco-analysis\src\notebooks\..\..\data\models\blended_model_explainer.dill...


# Importing the model
Since training the model takes a lot of time and resources, the model has been saved to a .pkl file and is imported in this section of the notebook, so that it can then be used to make predictions on the test dataset. The dashboard can also be imported to visualize the results obtained.

In [5]:
# Reload the previously saved pipeline + model instead of retraining it,
# then open PyCaret's interactive evaluation widget on it.
saved_model_path = "../../data/models/final_model"
final_model = pc.load_model(saved_model_path)
pc.evaluate_model(final_model)

Transformation Pipeline and Model Successfully Loaded


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [12]:
from explainerdashboard import ExplainerDashboard

# Rebuild the dashboard from the dumped explainer and the exported YAML
# configuration, then serve it on port 5500.
explainer_dashboard2 = ExplainerDashboard.from_config(
    "../../data/models/blended_model_explainer.dill",
    "../../data/models/dashboard_config.yaml",
)
explainer_dashboard2.run(port=5500)

Building ExplainerDashboard..
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating dependencies...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard on http://192.168.188.168:5500
You can terminate the dashboard with ExplainerDashboard.terminate(5500)


Dash app running on http://127.0.0.1:5500/
