## Import necessary libraries


In [3]:
import sys
import os

# Set the working directory
sys.path.append(os.getcwd() + os.sep + ".." + os.sep + "..")

# Import the necessary libraries
import pycaret.classification as pc
import pandas as pd
import src.scripts.mapping_answers_dict as map_dict
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from math import sqrt


## Dataset loading

In [None]:
# Columns that are cast to the pandas 'category' dtype
categorical_columns = [
    "State", "Gender", "Age", "SmokingFriends",
    "SeenSmokerInPublicPlace", "SeenSmokerInEnclosedPlace", "SeenSmokerInHome",
    "AttractiveSmoker", "HardQuitSmoke", "SmokerConfidentInCelebrations",
    "SchoolWarnings", "SeenHealthWarnings", "AntiTobaccoInEvents",
    "HarmfulPassiveSmoke",
]

# Columns that are cast to 'bool'
boolean_columns = [
    "Smoke", "SeenSmokerInSchool", "ParentWarnings", "AntiTobaccoInMedia",
    "BanTobaccoOutdoors", "SmokingFather", "SmokingMother",
    "WorkingFather", "WorkingMother",
]

# One column -> dtype mapping so both casts are applied in a single astype call
column_dtypes = dict.fromkeys(categorical_columns, 'category')
column_dtypes.update(dict.fromkeys(boolean_columns, 'bool'))

# Load the merged dataset and apply the dtypes declared above
dataset = pd.read_csv("../../data/processed/GYTS_dataset.csv").astype(column_dtypes)

## Comparing models

In [None]:
# Ordinal features and the mapping dictionary that supplies each one's categories.
# NOTE(review): the category order is whatever order the dict values iterate in;
# this assumes the dictionaries in map_dict are declared in ascending order - confirm.
ordinal_sources = (
    ("SmokingFriends", map_dict.OR46_dict),
    ("SeenSmokerInPublicPlace", map_dict.CR21_dict),
    ("SeenSmokerInEnclosedPlace", map_dict.CR20_dict),
    ("SeenSmokerInHome", map_dict.CR19_dict),
    ("HarmfulPassiveSmoke", map_dict.CR23_dict),
    ("HardQuitSmoke", map_dict.CR41_dict),
)
ordinal_feature_levels = {
    feature: answers.values() for feature, answers in ordinal_sources
}

# Initialise the PyCaret classification experiment with 'Smoke' as the target,
# a train_size of 0.8 and a fixed session_id.
setup = pc.setup(
    data=dataset,
    target='Smoke',
    index=False,
    train_size=0.8,
    session_id=42,
    ordinal_features=ordinal_feature_levels,
    transformation=True,
    max_encoding_ohe=0,
    n_jobs=10,
)

## Compute the class weights

In [None]:

# Distinct target labels, in order of first appearance in the dataset
classes = dataset['Smoke'].unique()

# "balanced" weights computed on the training target held by the PyCaret experiment;
# the returned array is aligned with `classes`
train_target = pc.get_config("y_train_transformed")
balanced_weights = compute_class_weight(class_weight="balanced", classes=classes, y=train_target)

# label -> weight, plus a second variant using the square root of each weight
class_weights = {label: weight for label, weight in zip(classes, balanced_weights)}
sqrt_weights = {label: sqrt(weight) for label, weight in class_weights.items()}

print(class_weights)
print(sqrt_weights)


## Find the best model among those that support class weights

In [None]:
# Full list of candidate model IDs, kept for reference:
# all_models = ['lr', 'knn', 'nb', 'dt', 'svm', 'ridge', 'rf', 'qda', 'ada', 'gbc', 'lda',
#               'et', 'xgboost', 'lightgbm', 'catboost']  # 'rbfsvm', 'gpc', 'mlp'

# Models that support class weights
threshold_optimized_model = ['lr', 'dt', 'svm', 'ridge', 'rf', 'et', 'lightgbm']  # 'rbfsvm'

# model ID -> fitted model, for every model that could be created with class weights
models = {}
# One score frame per successfully evaluated model; concatenated once after the loop
# instead of growing a DataFrame on every iteration.
score_frames = []

for model_name in threshold_optimized_model:
    try:
        # Train with the square-root class weights
        model = pc.create_model(model_name, verbose=False, class_weight=sqrt_weights)
        models[model_name] = model
        # Score the model, then pull the score grid PyCaret just produced
        pc.predict_model(model)
        score_frames.append(pc.pull())
    except Exception as e:
        print(e)
        # Report the model ID: `model` may be unbound (first iteration) or still
        # refer to the previous iteration's estimator when create_model fails.
        print(f"Model {model_name} cannot have weights")

# Combine the per-model scores; stays an empty frame when every model failed
predicts = pd.concat(score_frames) if score_frames else pd.DataFrame()

# Sort the models by MCC (skipped when there is nothing to sort, since an empty
# frame has no 'MCC' column)
if not predicts.empty:
    predicts = predicts.sort_values('MCC', ascending=False)
predicts


## Choosing the best model


In [None]:

# Settings shared by the ensembling calls below
selection_metric = 'MCC'
ensemble_method = 'Bagging'

# Random forest: bag the weighted model (custom probability threshold), then score both
rf_model = models['rf']
rf_ensemble_model = pc.ensemble_model(
    rf_model,
    method=ensemble_method,
    optimize=selection_metric,
    probability_threshold=0.35,
)
for candidate in (rf_model, rf_ensemble_model):
    pc.predict_model(candidate)

# LightGBM: same procedure, without a custom probability threshold
lgbm_model = models['lightgbm']
lgbm_ensemble_model = pc.ensemble_model(
    lgbm_model,
    method=ensemble_method,
    optimize=selection_metric,
)
for candidate in (lgbm_model, lgbm_ensemble_model):
    pc.predict_model(candidate)

# Blend the two base models and score the blend
blended_model = pc.blend_models(
    estimator_list=[rf_model, lgbm_model],
    optimize=selection_metric,
    probability_threshold=0.4,
)
pc.predict_model(blended_model)

# Threshold plot followed by the evaluation view for the blended model
pc.plot_model(blended_model, plot='threshold')
pc.evaluate_model(blended_model)

In [None]:
# Open the PyCaret dashboard for the blended model built in the previous cell.

pc.dashboard(blended_model)