## Import necessary libraries


In [1]:
import sys
import os

# Set the working directory
sys.path.append(os.getcwd() + os.sep + ".." + os.sep + "..")

# Import the necessary libraries
import pycaret.classification as pc
import pandas as pd
import src.scripts.mapping_answers_dict as map_dict
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from math import sqrt


## Dataset loading

In [2]:
import re

# Load the merged dataset
dataset = pd.read_csv("../../data/processed/GYTS_dataset.csv")

ordinal_columns = ["SmokingFriends", "SeenSmokerInPublicPlace", "SeenSmokerInEnclosedPlace", "SeenSmokerInHome", "HarmfulPassiveSmoke", "HardQuitSmoke"]
dataset[ordinal_columns] = dataset[ordinal_columns].astype('category')

# Convert categorical columns
categorical_columns = ["State", "Gender", "Age", "AttractiveSmoker", "SmokerConfidentInCelebrations", "SchoolWarnings",
                       "SeenHealthWarnings", "AntiTobaccoInEvents"]
dataset[categorical_columns] = dataset[categorical_columns].astype('category')
# remove non letter, non number, non space characters from the categorical columns with regex

dataset[categorical_columns] = dataset[categorical_columns].applymap(lambda x: re.sub(r'[^\w\s\d]', '', x))


# Convert boolean columns
boolean_columns = ["Smoke", "SeenSmokerInSchool", "ParentWarnings", "AntiTobaccoInMedia",
                   "BanTobaccoOutdoors", "SmokingFather", "SmokingMother", "WorkingFather",
                   "WorkingMother"]
dataset[boolean_columns] = dataset[boolean_columns].astype('bool')

# Comparing models

In [3]:
from sklearn.preprocessing import OneHotEncoder
# import sklearn.preprocessing as ce
import re

setup = pc.setup(data=dataset,
                 target='Smoke',
                 index=False,
                 train_size=0.8,
                 session_id=42,
                 categorical_features=categorical_columns,
                 ordinal_features={
                     "SmokingFriends": map_dict.OR46_dict.values(),
                     "SeenSmokerInPublicPlace": map_dict.CR21_dict.values(),
                     "SeenSmokerInEnclosedPlace": map_dict.CR20_dict.values(),
                     "SeenSmokerInHome": map_dict.CR19_dict.values(),
                     "HarmfulPassiveSmoke": map_dict.CR23_dict.values(),
                     "HardQuitSmoke": map_dict.CR41_dict.values(),
                 },
                 imputation_type=None,
                 normalize=False,
                 max_encoding_ohe=0,
                 encoding_method=OneHotEncoder(dtype=bool, sparse_output=False),
                 n_jobs=10
                 )
X_test_df = pc.get_config('X_test_transformed').copy()
X_test_df

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Smoke
2,Target type,Binary
3,Original data shape,"(20731, 23)"
4,Transformed data shape,"(20731, 42)"
5,Transformed train set shape,"(16584, 42)"
6,Transformed test set shape,"(4147, 42)"
7,Ordinal features,6
8,Categorical features,8
9,Preprocess,True


Unnamed: 0,Age_11 years old or younger,Age_12 years old,Age_13 years old,Age_14 years old,Age_15 years old,Age_16 years old,Age_17 years old or older,Gender,SmokingFriends,SeenSmokerInSchool,...,BanTobaccoOutdoors,HarmfulPassiveSmoke,State_Italy,State_Poland,State_Portugal,State_Romania,SmokingFather,SmokingMother,WorkingFather,WorkingMother
16584,False,True,False,False,False,False,False,1.0,0.0,False,...,True,0.0,False,False,True,False,False,False,True,False
16585,False,False,False,True,False,False,False,1.0,0.0,False,...,True,2.0,False,True,False,False,True,True,True,True
16586,False,False,False,True,False,False,False,1.0,1.0,True,...,False,3.0,False,False,True,False,True,False,True,False
16587,False,False,False,False,False,True,False,0.0,2.0,True,...,False,3.0,False,True,False,False,False,False,True,True
16588,False,False,True,False,False,False,False,1.0,1.0,False,...,True,3.0,False,False,True,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20726,False,False,False,True,False,False,False,0.0,1.0,True,...,True,2.0,False,True,False,False,False,False,True,True
20727,False,False,False,True,False,False,False,1.0,0.0,True,...,False,3.0,False,False,True,False,True,False,True,True
20728,False,False,False,False,True,False,False,0.0,1.0,False,...,True,2.0,False,False,False,True,False,False,True,True
20729,False,False,False,False,True,False,False,0.0,1.0,True,...,False,1.0,False,True,False,False,False,False,True,True


In [4]:
pc.get_config('pipeline')
X_test_df.dtypes



Age_11 years old or younger                                                     bool
Age_12 years old                                                                bool
Age_13 years old                                                                bool
Age_14 years old                                                                bool
Age_15 years old                                                                bool
Age_16 years old                                                                bool
Age_17 years old or older                                                       bool
Gender                                                                       float64
SmokingFriends                                                               float64
SeenSmokerInSchool                                                              bool
SeenSmokerInPublicPlace                                                      float64
SeenSmokerInEnclosedPlace                                        

# Compute the class weights

In [5]:

classes = dataset['Smoke'].unique()

class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=pc.get_config("y_train_transformed"))
sqrt_weights = [sqrt(weight) for weight in class_weights]

class_weights = dict(zip(classes, class_weights))
sqrt_weights = dict(zip(classes, sqrt_weights))

print(class_weights)
print(sqrt_weights)


{True: 3.0440528634361232, False: 0.5982683982683983}
{True: 1.7447214286057597, False: 0.7734781175110246}


Find the best model within the class-weight supporting ones

In [6]:
#all_models = [ 'lr', 'knn', 'nb', 'dt', 'svm', 'ridge', 'rf', 'qda', 'ada', 'gbc', 'lda', 'et', 'xgboost', 'lightgbm', 'catboost'] #'rbfsvm', 'gpc', 'mlp'

# Models that support class weights
threshold_optimized_model =[ 'lr', 'dt', 'svm' , 'ridge', 'rf', 'et', 'lightgbm'] # 'rbfsvm'
models = {}
predicts = pd.DataFrame()

for model_name in threshold_optimized_model:
    try:
        model = pc.create_model(model_name, verbose=False, class_weight=sqrt_weights)
        models[model_name] = model
        pc.predict_model(model)
        predict = pc.pull()
        predicts = pd.concat([predicts, predict])
    except Exception as e:
        print(e)

# Sort the models by MCC
predicts = predicts.sort_values('MCC', ascending=False)
predicts


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.8592,0.8751,0.6123,0.5658,0.5882,0.5034,0.504


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.8052,0.6527,0.4258,0.4102,0.4179,0.3009,0.301


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.7239,0.7752,0.8517,0.3571,0.5033,0.3537,0.4178


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.8623,0.7589,0.605,0.577,0.5907,0.508,0.5082


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8828,0.8873,0.3642,0.8239,0.5051,0.4497,0.4982


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.883,0.8857,0.37,0.8182,0.5096,0.4537,0.5


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8789,0.8945,0.652,0.6262,0.6388,0.5662,0.5663


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8789,0.8945,0.652,0.6262,0.6388,0.5662,0.5663
0,Ridge Classifier,0.8623,0.7589,0.605,0.577,0.5907,0.508,0.5082
0,Logistic Regression,0.8592,0.8751,0.6123,0.5658,0.5882,0.5034,0.504
0,Extra Trees Classifier,0.883,0.8857,0.37,0.8182,0.5096,0.4537,0.5
0,Random Forest Classifier,0.8828,0.8873,0.3642,0.8239,0.5051,0.4497,0.4982
0,SVM - Linear Kernel,0.7239,0.7752,0.8517,0.3571,0.5033,0.3537,0.4178
0,Decision Tree Classifier,0.8052,0.6527,0.4258,0.4102,0.4179,0.3009,0.301


Choosing best model


In [7]:

# model = best
rf_model = models['rf']
rf_ensemble_model = pc.ensemble_model(rf_model, method='Bagging', optimize='MCC', probability_threshold=0.35)

pc.predict_model(rf_model)
pc.predict_model(rf_ensemble_model)

lgbm_model = models['lightgbm']
lgbm_ensemble_model = pc.ensemble_model(lgbm_model, method='Bagging', optimize='MCC')

pc.predict_model(lgbm_model)
pc.predict_model(lgbm_ensemble_model)

blended_model = pc.blend_models(estimator_list=[rf_model, lgbm_model], optimize='MCC', probability_threshold=0.4) 
pc.predict_model(blended_model)


# pc.plot_model(blended_model, plot='threshold')
pc.evaluate_model(blended_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.868,0.0,0.4982,0.6239,0.554,0.4776,0.4818
1,0.8692,0.0,0.4908,0.6321,0.5526,0.4774,0.4826
2,0.8716,0.0,0.5311,0.6304,0.5765,0.5015,0.5041
3,0.8602,0.0,0.5092,0.5865,0.5451,0.463,0.4646
4,0.8697,0.0,0.4779,0.6373,0.5462,0.472,0.4786
5,0.8703,0.0,0.5,0.6326,0.5585,0.4837,0.4883
6,0.8818,0.0,0.5404,0.6743,0.6,0.5316,0.5361
7,0.8643,0.0,0.5074,0.6026,0.5509,0.4717,0.4741
8,0.8637,0.0,0.4853,0.6055,0.5388,0.4599,0.4638
9,0.877,0.0,0.5184,0.6589,0.5802,0.5094,0.5144


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8828,0.8873,0.3642,0.8239,0.5051,0.4497,0.4982


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8879,0.8954,0.5727,0.6915,0.6265,0.5612,0.5647


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.862,0.0,0.5311,0.5894,0.5588,0.4772,0.4781
1,0.865,0.0,0.5495,0.5976,0.5725,0.4925,0.4931
2,0.871,0.0,0.5934,0.6113,0.6022,0.5253,0.5254
3,0.862,0.0,0.5495,0.5859,0.5671,0.4851,0.4855
4,0.8679,0.0,0.5294,0.6128,0.568,0.4906,0.4924
5,0.8631,0.0,0.5478,0.5889,0.5676,0.4864,0.4869
6,0.8758,0.0,0.5956,0.6279,0.6113,0.5374,0.5377
7,0.8625,0.0,0.5735,0.5821,0.5778,0.4957,0.4957
8,0.8577,0.0,0.5368,0.5703,0.553,0.4685,0.4688
9,0.8637,0.0,0.5588,0.5891,0.5736,0.4925,0.4928


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8789,0.8945,0.652,0.6262,0.6388,0.5662,0.5663


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8843,0.8964,0.6314,0.6525,0.6418,0.5728,0.5729


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8559,0.0,0.5641,0.562,0.5631,0.4768,0.4768
1,0.8626,0.0,0.5824,0.5824,0.5824,0.5002,0.5002
2,0.8686,0.0,0.6227,0.5965,0.6093,0.5304,0.5306
3,0.8451,0.0,0.5604,0.5276,0.5435,0.4503,0.4506
4,0.8637,0.0,0.5662,0.5878,0.5768,0.4956,0.4957
5,0.8583,0.0,0.5699,0.5678,0.5688,0.484,0.484
6,0.8601,0.0,0.5956,0.5704,0.5827,0.4987,0.4989
7,0.8589,0.0,0.5919,0.5669,0.5791,0.4944,0.4946
8,0.8522,0.0,0.5809,0.5467,0.5633,0.4744,0.4748
9,0.8607,0.0,0.5809,0.5745,0.5777,0.4943,0.4943


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8773,0.8959,0.6476,0.6211,0.6341,0.5604,0.5606


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [17]:
from typing import Any, Dict, Optional
from shap import sample
from explainerdashboard import ClassifierExplainer, ExplainerDashboard
from pycaret.utils.generic import get_label_encoder


def dashboard(
        estimator,
        display_format: str = "dash",
        dashboard_kwargs: Optional[Dict[str, Any]] = None,
        run_kwargs: Optional[Dict[str, Any]] = None,
        
        **kwargs,
    ):
        """
        This function generates the interactive dashboard for a trained model. The
        dashboard is implemented using ExplainerDashboard (explainerdashboard.readthedocs.io)


        Example
        -------
        >>> from pycaret.datasets import get_data
        >>> juice = get_data('juice')
        >>> from pycaret.classification import *
        >>> exp_name = setup(data = juice,  target = 'Purchase')
        >>> lr = create_model('lr')
        >>> dashboard(lr)


        estimator: scikit-learn compatible object
            Trained model object


        display_format: str, default = 'dash'
            Render mode for the dashboard. The default is set to ``dash`` which will
            render a dashboard in browser. There are four possible options:

            - 'dash' - displays the dashboard in browser
            - 'inline' - displays the dashboard in the jupyter notebook cell.
            - 'jupyterlab' - displays the dashboard in jupyterlab pane.
            - 'external' - displays the dashboard in a separate tab. (use in Colab)


        dashboard_kwargs: dict, default = {} (empty dict)
            Dictionary of arguments passed to the ``ExplainerDashboard`` class.


        run_kwargs: dict, default = {} (empty dict)
            Dictionary of arguments passed to the ``run`` method of ``ExplainerDashboard``.


        **kwargs:
            Additional keyword arguments to pass to the ``ClassifierExplainer`` or
            ``RegressionExplainer`` class.


        Returns:
            ExplainerDashboard
        """

        dashboard_kwargs = dashboard_kwargs or {}
        run_kwargs = run_kwargs or {}

        le = get_label_encoder(pc.get_config("pipeline"))
        if le:
            labels_ = list(le.classes_)
        else:
            labels_ = None

        # Replacing chars which dash doesn't accept for column name `.` , `{`, `}`
        X_test_df = pc.get_config('X_test_transformed').copy().head(10)
        X_test_df.columns = [
            col.replace(".", "__").replace("{", "__").replace("}", "__")
            for col in X_test_df.columns
        ]
        for column in X_test_df.columns:
            if X_test_df[column].dtype == 'float64':
                X_test_df[column] = X_test_df[column].astype(bool)
        print(X_test_df.dtypes)
            
        x_train_df = pc.get_config('X_train_transformed').copy().head(10)
        for column in x_train_df.columns:
            if x_train_df[column].dtype == 'float64':
                x_train_df[column] = x_train_df[column].astype(bool)
            print(X_test_df[column].dtype)
        
        y_test_df = pc.get_config('y_test_transformed').copy().head(10)
        explainer = ClassifierExplainer(
            estimator, X_test_df, y_test_df, labels=labels_, n_jobs=10, **kwargs
        )
        
        explainer_dashboard = ExplainerDashboard(
            explainer, mode=display_format, **dashboard_kwargs
        )
        explainer_dashboard.run(**run_kwargs)
        return explainer_dashboard

X_test_df = pc.get_config('X_test_transformed').copy().head(100)
# blended_model.predict_proba(X_test_df)
# X_test_df.dtypes
explainer_dashboard = dashboard(estimator=blended_model, display_format='external', shap='kernel', dashboard_kwargs={"port": 8100})

Age_11 years old or younger                                                  bool
Age_12 years old                                                             bool
Age_13 years old                                                             bool
Age_14 years old                                                             bool
Age_15 years old                                                             bool
Age_16 years old                                                             bool
Age_17 years old or older                                                    bool
Gender                                                                       bool
SmokingFriends                                                               bool
SeenSmokerInSchool                                                           bool
SeenSmokerInPublicPlace                                                      bool
SeenSmokerInEnclosedPlace                                                    bool
SeenSmokerInHome

  0%|          | 0/10 [00:00<?, ?it/s]

Calculating pred_percentiles...
Calculating prediction probabilities...
Calculating predictions...


In [None]:
explainer = explainer_dashboard.explainer
explainer.dump('blended_model_explainer.dill')

In [None]:
explainer2 = ClassifierExplainer.from_file("blended_model_explainer.dill")
explainer_dashboard2 = ExplainerDashboard(explainer2, mode='dash', port=8091)

explainer_dashboard2.run()


Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating dependencies...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard on http://192.168.188.168:8091


ConnectionError: HTTPConnectionPool(host='0.0.0.0', port=8091): Max retries exceeded with url: /_alive_dcdae75c-0d61-447c-aedd-5354035e8b0f (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001D0E9F7E7D0>: Failed to establish a new connection: [WinError 10049] Indirizzo richiesto non valido nel proprio contesto'))

In [None]:
ExplainerDashboard.terminate(8084)

Trying to shut down dashboard on port 8084...
