## Import necessary libraries


In [None]:
import sys
import os

# Make the directory two levels above the current working directory importable,
# so that the `src.*` package imported below can be resolved.
project_root = os.sep.join([os.getcwd(), "..", ".."])
sys.path.append(project_root)

# Import the necessary libraries
import pycaret.classification as pc
import pandas as pd
import src.scripts.mapping_answers_dict as map_dict
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from math import sqrt


## Dataset loading

In [None]:
import re

# Load the merged dataset
dataset = pd.read_csv("../../data/processed/GYTS_dataset.csv")

# Ordinal survey answers are stored as pandas categoricals.
ordinal_columns = ["SmokingFriends", "SeenSmokerInPublicPlace", "SeenSmokerInEnclosedPlace", "SeenSmokerInHome", "HarmfulPassiveSmoke", "HardQuitSmoke"]
dataset[ordinal_columns] = dataset[ordinal_columns].astype('category')

# Nominal (unordered) categorical columns
categorical_columns = ["State", "Gender", "Age", "AttractiveSmoker", "SmokerConfidentInCelebrations", "SchoolWarnings",
                       "SeenHealthWarnings", "AntiTobaccoInEvents"]

# Remove every character that is neither a word character nor whitespace.
# The cleaning runs BEFORE the category cast so that the resulting dtype is
# always `category` (an element-wise map over an already-categorical frame may
# fall back to `object`, depending on the pandas version), and so that labels
# that become equal after cleaning collapse into a single category.
# Non-string cells (e.g. NaN or numbers) are passed through untouched instead
# of making `re.sub` raise a TypeError.
non_word_pattern = re.compile(r'[^\w\s\d]')
for column in categorical_columns:
    cleaned = dataset[column].map(
        lambda value: non_word_pattern.sub('', value) if isinstance(value, str) else value
    )
    dataset[column] = cleaned.astype('category')


# Convert boolean columns to int
boolean_columns = ["Smoke", "SeenSmokerInSchool", "ParentWarnings", "AntiTobaccoInMedia",
                   "BanTobaccoOutdoors", "SmokingFather", "SmokingMother", "WorkingFather",
                   "WorkingMother"]
dataset[boolean_columns] = dataset[boolean_columns].astype('int')

# Comparing models

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Ordered answer levels for every ordinal question, taken from the answer
# mapping dictionaries of the project.
# NOTE(review): the level order is whatever order the values have inside each
# mapping dict - confirm that this matches the intended ranking.
ordinal_levels = {
    "SmokingFriends": map_dict.OR46_dict.values(),
    "SeenSmokerInPublicPlace": map_dict.CR21_dict.values(),
    "SeenSmokerInEnclosedPlace": map_dict.CR20_dict.values(),
    "SeenSmokerInHome": map_dict.CR19_dict.values(),
    "HarmfulPassiveSmoke": map_dict.CR23_dict.values(),
    "HardQuitSmoke": map_dict.CR41_dict.values(),
}

# Dense, integer-valued one-hot encoder handed to PyCaret as encoding method.
# NOTE(review): with max_encoding_ohe=0 this encoder is presumably applied to
# all nominal features - confirm against the PyCaret documentation.
one_hot_encoder = OneHotEncoder(dtype=int, sparse_output=False)

# Initialise the PyCaret classification experiment: 80/20 split, fixed seed,
# no imputation.
setup = pc.setup(
    data=dataset,
    target='Smoke',
    index=False,
    train_size=0.8,
    session_id=42,
    categorical_features=categorical_columns,
    ordinal_features=ordinal_levels,
    imputation_type=None,
    max_encoding_ohe=0,
    encoding_method=one_hot_encoder,
    n_jobs=10,
)

# Compute the class weights

In [None]:

# Training labels as exposed by the PyCaret experiment.
# NOTE(review): assumed to be a pandas Series - confirm for the installed PyCaret version.
y_train = pc.get_config("y_train_transformed")

# Derive the class labels from the very vector the weights are computed on.
# Taking them from the raw dataset instead makes `compute_class_weight` raise
# as soon as the two label sets differ (e.g. a class missing from the training
# split, or a target that was re-encoded by the pipeline).
classes = y_train.drop_duplicates().sort_values().to_numpy()

# "balanced" weights, returned in the same order as `classes`
balanced_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

# label -> weight mappings; the square-root variant is a softened version of
# the balanced weights and is the one passed to the models below.
class_weights = dict(zip(classes, balanced_weights))
sqrt_weights = {label: sqrt(weight) for label, weight in class_weights.items()}

print(class_weights)
print(sqrt_weights)

Find the best model among those that support class weights

In [None]:
from IPython.display import display, HTML
#all_models = [ 'lr', 'knn', 'nb', 'dt', 'svm', 'ridge', 'rf', 'qda', 'ada', 'gbc', 'lda', 'et', 'xgboost', 'lightgbm', 'catboost'] #'rbfsvm', 'gpc', 'mlp'

# Models that support class weights
weighted_model_id = ['lr', 'dt', 'svm', 'ridge', 'rf', 'et', 'lightgbm']  # 'rbfsvm'
models = {}

# The model catalogue does not change between iterations: fetch it once.
model_catalogue = pc.models()

# Collect the per-model frames in lists and concatenate once at the end,
# instead of growing a DataFrame inside the loop.
cv_rows = []
prediction_rows = []
failed_models = {}

for model_id in weighted_model_id:
    model_name = model_id  # fallback used in the error message if the lookup fails
    try:
        # get model name from setup
        model_name = model_catalogue.loc[model_id].Name

        display(HTML(f"<h2>Training {model_name}</h2>"))
        model = pc.create_model(model_id, verbose=True, class_weight=sqrt_weights)
        models[model_id] = model

        # Keep only the cross-validation mean row, indexed by the model name
        cv = pc.pull().loc['Mean'].to_frame().transpose()
        cv.index = [model_name]
        cv_rows.append(cv)

        display(HTML(f"<h3>Predicting {model_name}</h3>"))
        pc.predict_model(model)
        prediction_rows.append(pc.pull())
    except Exception as e:
        # Best effort: one failing model must not abort the whole comparison,
        # but the failure has to be attributable to a specific model.
        failed_models[model_id] = e
        print(f"[{model_id}] {model_name} failed: {type(e).__name__}: {e}")

if not cv_rows:
    # Without this guard the sort below would fail with an opaque KeyError on 'MCC'.
    raise RuntimeError("None of the candidate models could be trained; see the errors printed above.")

# Sort the models by MCC
cv_results = pd.concat(cv_rows).sort_values('MCC', ascending=False)
predicts = (
    pd.concat(prediction_rows).sort_values('MCC', ascending=False)
    if prediction_rows
    else pd.DataFrame()
)

display(HTML(f"<h2>Cross validation mean results</h2>{cv_results.to_html()}"))
display(HTML(f"<h2>Predictions result</h2>{predicts.to_html()}"))

Choosing the best model


In [None]:
# Metric used for tuning, bagging and blending.
# NOTE(review): the comparison above ranks models by MCC while this step
# optimises Accuracy - confirm this is intended for the class-weighted setting.
OPTIMIZE_METRIC = 'Accuracy'


def _tune_and_bag(model_id, label, **ensemble_kwargs):
    """Tune a trained base model, bag the tuned model and report all three on test data.

    Parameters
    ----------
    model_id : str
        Key of the already trained base model in the global ``models`` dict.
    label : str
        Human readable model name used in the printed headers.
    **ensemble_kwargs
        Extra keyword arguments forwarded to ``pc.ensemble_model``
        (e.g. ``probability_threshold``).

    Returns
    -------
    tuple
        ``(base_model, tuned_model, ensemble_model)``.

    Raises
    ------
    KeyError
        If ``model_id`` is not in ``models`` (e.g. its training failed earlier).
    """
    base_model = models[model_id]
    tuned_model = pc.tune_model(base_model, optimize=OPTIMIZE_METRIC, search_library='optuna', search_algorithm='tpe')
    ensemble_model = pc.ensemble_model(tuned_model, method='Bagging', optimize=OPTIMIZE_METRIC, **ensemble_kwargs)

    for stage, estimator in (("Base", base_model), ("Tuned", tuned_model), ("Ensemble", ensemble_model)):
        print(f"{stage} {label} Model performance on test data")
        pc.predict_model(estimator)

    return base_model, tuned_model, ensemble_model


lgbm_model, lgbm_tuned_model, lgbm_ensemble_model = _tune_and_bag('lightgbm', "LightGBM")
et_model, et_tuned_model, et_ensemble_model = _tune_and_bag('et', "Extra Trees")
# Only the random forest ensemble uses a custom decision threshold.
rf_model, rf_tuned_model, rf_ensemble_model = _tune_and_bag('rf', "Random Forest", probability_threshold=0.35)


# Blend the three bagged ensembles into the final model.
blended_model = pc.blend_models(estimator_list=[lgbm_ensemble_model, et_ensemble_model, rf_ensemble_model], optimize=OPTIMIZE_METRIC)
print("Blended Model performance on test data")
pc.predict_model(blended_model)


# pc.plot_model(blended_model, plot='threshold')
pc.evaluate_model(blended_model)

In [None]:
# Persist the blended model so that it can be reloaded without retraining.
pc.save_model(
    blended_model,
    "../../data/models/final_model",
)

In [None]:
from typing import Any, Dict, Optional
from shap import sample
from explainerdashboard import ClassifierExplainer, ExplainerDashboard
from pycaret.utils.generic import get_label_encoder

def dashboard(
    estimator,
    display_format: str = "dash",
    dashboard_kwargs: Optional[Dict[str, Any]] = None,
    run_kwargs: Optional[Dict[str, Any]] = None,
    n_samples: int = 1000,
    **kwargs,
):
    """
    Build and run an interactive ExplainerDashboard for a trained classifier.

    The explainer is fed with a random sample of the transformed test set of
    the active PyCaret experiment (explainerdashboard.readthedocs.io).


    Example
    -------
    >>> from pycaret.datasets import get_data
    >>> juice = get_data('juice')
    >>> from pycaret.classification import *
    >>> exp_name = setup(data = juice,  target = 'Purchase')
    >>> lr = create_model('lr')
    >>> dashboard(lr)


    estimator: scikit-learn compatible object
        Trained model object


    display_format: str, default = 'dash'
        Render mode for the dashboard. The default is set to ``dash`` which will
        render a dashboard in browser. There are four possible options:

        - 'dash' - displays the dashboard in browser
        - 'inline' - displays the dashboard in the jupyter notebook cell.
        - 'jupyterlab' - displays the dashboard in jupyterlab pane.
        - 'external' - displays the dashboard in a separate tab. (use in Colab)


    dashboard_kwargs: dict, default = {} (empty dict)
        Dictionary of arguments passed to the ``ExplainerDashboard`` class.


    run_kwargs: dict, default = {} (empty dict)
        Dictionary of arguments passed to the ``run`` method of ``ExplainerDashboard``.


    n_samples: int, default = 1000
        Maximum number of test rows handed to the explainer. When the test set
        has no more rows than this, all of them are used.


    **kwargs:
        Additional keyword arguments to pass to the ``ClassifierExplainer``
        class. They take precedence over the defaults set here
        (``labels``, ``n_jobs``, ``cats``).


    Returns:
        ExplainerDashboard
    """

    dashboard_kwargs = dashboard_kwargs or {}
    run_kwargs = run_kwargs or {}

    # Human readable class labels, when the pipeline label-encodes the target
    le = get_label_encoder(pc.get_config("pipeline"))
    labels_ = list(le.classes_) if le else None

    seed = pc.get_config("seed")
    X_test = pc.get_config('X_test_transformed')
    y_test = pc.get_config('y_test_transformed')

    # Draw ONE set of row positions and apply it to both X and y, so that the
    # features and the targets are row-aligned by construction rather than by
    # two independent sampling calls happening to agree.
    positions = sample(pd.Series(range(len(X_test))), n_samples, random_state=seed).to_numpy()
    X_test_df = X_test.iloc[positions].copy()
    y_test_df = y_test.iloc[positions].copy()

    # Replacing chars which dash doesn't accept for column name `.` , `{`, `}`
    X_test_df.columns = [
        col.replace(".", "__").replace("{", "__").replace("}", "__")
        for col in X_test_df.columns
    ]

    # Features whose one-hot columns are grouped back together in the dashboard.
    # NOTE(review): "Gender" is excluded, presumably because it is not expanded
    # into several one-hot columns - confirm against the transformed data.
    onehotencoded = [column for column in categorical_columns if column != "Gender"]

    # Defaults first, so that caller supplied kwargs override them instead of
    # triggering a duplicate keyword argument error.
    explainer_kwargs = {
        "labels": labels_,
        "n_jobs": 10,
        "cats": onehotencoded,
        **kwargs,
    }
    explainer = ClassifierExplainer(
        model=estimator,
        X=X_test_df,
        y=y_test_df,
        **explainer_kwargs,
    )

    explainer_dashboard = ExplainerDashboard(
        explainer, mode=display_format, **dashboard_kwargs
    )

    explainer_dashboard.run(**run_kwargs)
    return explainer_dashboard

# Build and launch the dashboard for the blended model; `shap='kernel'` is
# forwarded to the ClassifierExplainer through **kwargs.
explainer_dashboard = dashboard(
    estimator=blended_model,
    display_format='external',
    shap='kernel',
    dashboard_kwargs={"port": 8100},
)

Exporting the dashboard to a YAML file

In [None]:
# Write the dashboard configuration and dump the explainer alongside it.
# NOTE(review): `explainerfile` is given without a directory - confirm that it
# ends up next to the YAML file, where the loading cell expects it.
explainer_dashboard.to_yaml(
    "../../data/models/dashboard_config.yaml",
    dump_explainer=True,
    explainerfile="final_model_explainer.dill",
)

# Importing the model
Since training the model takes a lot of time and resources, the model was saved to a .pkl file and is loaded in this section of the notebook so that it can be used to make predictions on the test dataset. The dashboard can also be loaded to visualize the results obtained.

In [None]:
# Reload the persisted model instead of retraining it.
# NOTE(review): evaluate_model presumably needs an initialised experiment
# (pc.setup) in the running kernel - confirm before running this cell alone.
final_model_path = "../../data/models/final_model"
final_model = pc.load_model(final_model_path)
pc.evaluate_model(final_model)

In [None]:
from explainerdashboard import ExplainerDashboard
# Rebuild the dashboard from the dumped explainer and its YAML configuration,
# then serve it on a dedicated port.
explainer_dashboard2 = ExplainerDashboard.from_config(
    "../../data/models/final_model_explainer.dill",
    "../../data/models/dashboard_config.yaml",
)
explainer_dashboard2.run(port=5502)