## Import necessary libraries


In [None]:
# Import the standard-library modules first: `os` and `sys` are needed by the
# path tweak below, so they must be bound before it runs.
import os
import sys
from math import sqrt

# Add the directory two levels above the current working directory to the
# module search path (this does not change the working directory itself).
# Presumably this is what lets the `src` package import below resolve, so it
# has to happen before that import.
sys.path.append(os.getcwd() + os.sep + ".." + os.sep + "..")

# Third-party libraries
import pycaret.classification as pc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

# Project-local modules
import src.scripts.mapping_answers_dict as map_dict


## Dataset loading

In [None]:
# Read the merged dataset from disk
dataset = pd.read_csv("../../data/processed/GYTS_dataset.csv")

# Columns that are cast to the pandas 'category' dtype
categorical_columns = [
    "State",
    "Gender",
    "Age",
    "SmokingFriends",
    "SeenSmokerInPublicPlace",
    "SeenSmokerInEnclosedPlace",
    "SeenSmokerInHome",
    "AttractiveSmoker",
    "HardQuitSmoke",
    "SmokerConfidentInCelebrations",
    "SchoolWarnings",
    "SeenHealthWarnings",
    "AntiTobaccoInEvents",
    "HarmfulPassiveSmoke",
]

# Columns that are cast to 'bool'
boolean_columns = [
    "Smoke",
    "SeenSmokerInSchool",
    "ParentWarnings",
    "AntiTobaccoInMedia",
    "BanTobaccoOutdoors",
    "SmokingFather",
    "SmokingMother",
    "WorkingFather",
    "WorkingMother",
]

# Apply the casts one column at a time
for column_name in categorical_columns:
    dataset[column_name] = dataset[column_name].astype('category')
for column_name in boolean_columns:
    dataset[column_name] = dataset[column_name].astype('bool')

# Comparing models

In [None]:
# Answer dictionaries whose values give the level order of each ordinal
# feature. The order used is the insertion order of each dictionary
# (assumes the dictionaries list the answers from lowest to highest level -
# TODO confirm against mapping_answers_dict).
ordinal_answer_maps = {
    "SmokingFriends": map_dict.OR46_dict,
    "SeenSmokerInPublicPlace": map_dict.CR21_dict,
    "SeenSmokerInEnclosedPlace": map_dict.CR20_dict,
    "SeenSmokerInHome": map_dict.CR19_dict,
    "HarmfulPassiveSmoke": map_dict.CR23_dict,
    "HardQuitSmoke": map_dict.CR41_dict,
}

# Materialise every `dict.values()` view as a real list: PyCaret documents
# `ordinal_features` as a mapping of column name -> list of levels, and a
# `dict_values` view can neither be indexed nor pickled.
ordinal_features = {
    column: list(answers.values())
    for column, answers in ordinal_answer_maps.items()
}

# Initialise the PyCaret classification experiment with 'Smoke' as target,
# an 80 % training split and a fixed session id (42).
setup = pc.setup(
    data=dataset,
    target='Smoke',
    index=False,
    train_size=0.8,
    session_id=42,
    ordinal_features=ordinal_features,
    transformation=True,
    max_encoding_ohe=0,
    n_jobs=10,
)

# Compute the class weights

In [None]:

# Distinct target labels, in the order they appear in the dataset
classes = dataset['Smoke'].unique()

# "balanced" weights computed on the transformed training target
y_train_transformed = pc.get_config("y_train_transformed")
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train_transformed,
)

# label -> weight, and label -> square root of that weight
class_weights = {label: weight for label, weight in zip(classes, balanced_weights)}
sqrt_weights = {label: sqrt(weight) for label, weight in class_weights.items()}

print(class_weights)
print(sqrt_weights)


Find the best model among those that support class weights

In [None]:
#all_models = [ 'lr', 'knn', 'nb', 'dt', 'svm', 'ridge', 'rf', 'qda', 'ada', 'gbc', 'lda', 'et', 'xgboost', 'lightgbm', 'catboost'] #'rbfsvm', 'gpc', 'mlp'

# Models that support class weights
threshold_optimized_model = ['lr', 'dt', 'svm', 'ridge', 'rf', 'et', 'lightgbm']  # 'rbfsvm'
models = {}
score_frames = []  # one score frame per successfully evaluated model

for model_name in threshold_optimized_model:
    try:
        # Train with the square-rooted class weights, then score the model;
        # `pc.pull()` fetches the score frame produced by `predict_model`.
        model = pc.create_model(model_name, verbose=False, class_weight=sqrt_weights)
        models[model_name] = model
        pc.predict_model(model)
        score_frames.append(pc.pull())
    except Exception as e:
        # Best-effort loop: report the failure and carry on with the next
        # candidate. The message uses `model_name` because `model` is either
        # unbound (first iteration) or left over from the previous iteration
        # when `create_model` fails.
        print(e)
        print(f"Model {model_name} cannot have weights")

# Concatenate once instead of growing a DataFrame inside the loop
predicts = pd.concat(score_frames) if score_frames else pd.DataFrame()

# Sort the models by MCC (skipped when every candidate failed, since the
# empty frame has no 'MCC' column)
if not predicts.empty:
    predicts = predicts.sort_values('MCC', ascending=False)
predicts


Choosing the best model


In [None]:

# model = best
# Random forest: take the model stored in `models` and build a bagging
# ensemble from it (optimising MCC, probability threshold 0.35).
rf_model = models['rf']
rf_ensemble_model = pc.ensemble_model(rf_model, method='Bagging', optimize='MCC', probability_threshold=0.35)

# Score the plain and the bagged random forest so they can be compared.
pc.predict_model(rf_model)
pc.predict_model(rf_ensemble_model)

# LightGBM: same procedure, but no explicit probability threshold is passed.
lgbm_model = models['lightgbm']
lgbm_ensemble_model = pc.ensemble_model(lgbm_model, method='Bagging', optimize='MCC')

# Score the plain and the bagged LightGBM model.
pc.predict_model(lgbm_model)
pc.predict_model(lgbm_ensemble_model)

# Blend the two base (non-bagged) models, optimising MCC with a probability
# threshold of 0.4, and score the blend.
blended_model = pc.blend_models(estimator_list=[rf_model, lgbm_model], optimize='MCC', probability_threshold=0.4) 
pc.predict_model(blended_model)

# Inspect the blend: threshold plot first, then the evaluation view.
pc.plot_model(blended_model, plot='threshold')
pc.evaluate_model(blended_model)

In [None]:
# pc.evaluate_model(ensemble_model)

# Open the PyCaret dashboard for the blended model.
pc.dashboard(blended_model)
# print(tuned_weighted_model)

In [None]:
### Data analysis (Move to data_analysis.ipynb)

# Columns that get a count plot, in plotting order
list_of_columns = [
    "State",
    "Gender",
    "Smoke",
    "SmokingFriends",
    "SmokingFather",
    "SmokingMother",
    "WorkingFather",
    "WorkingMother",
    "SeenSmokerInSchool",
    "SeenSmokerInPublicPlace",
    "SeenSmokerInEnclosedPlace",
    "SeenSmokerInHome",
    "ParentWarnings",
    "AttractiveSmoker",
    "HardQuitSmoke",
    "SmokerConfidentInCelebrations",
    "SchoolWarnings",
    "SeenHealthWarnings",
    "AntiTobaccoInEvents",
    "AntiTobaccoInMedia",
    "BanTobaccoOutdoors",
    "HarmfulPassiveSmoke",
]

def bar_perc(plot, dataframe, feature, y_offset=100):
    """Annotate each bar of *plot* with its height as a share of ``len(feature)``.

    Args:
        plot: Object exposing ``patches`` (each with ``get_x``, ``get_y``,
            ``get_width`` and ``get_height``) and ``annotate``; the caller
            passes the axes returned by ``sns.countplot``.
        dataframe: Unused; kept so existing three-argument calls keep working.
        feature: Sized collection whose length is the 100 % reference.
        y_offset: Distance added above the top of the bar, in the same units
            as the bar heights. Defaults to 100, the previously fixed value.

    Returns:
        None. The labels are added to *plot* in place.
    """
    total = len(feature)
    if total == 0:
        # Nothing to relate the bar heights to; avoids a division by zero.
        return

    for patch in plot.patches:
        height = patch.get_height()
        # Skip bars without a positive height. The comparison is also False
        # for NaN, so empty or placeholder patches no longer produce
        # "0.0%" / "nan%" labels.
        if not height > 0:
            continue

        share = 100 * height / total
        # Single-digit percentages are shorter, so they need a smaller shift
        # to the left to look centred over the bar.
        x_shift = 0.05 if share < 10 else 0.07
        x = patch.get_x() + patch.get_width() / 2 - x_shift
        y = patch.get_y() + height + y_offset
        plot.annotate('{:.1f}%'.format(share), (x, y), size=12)
        
# Work on a copy so the relabelling below does not touch `dataset`.
dataset2 = dataset.copy()
# Shorten the longest answer label of 'SeenHealthWarnings'; the other two
# labels map to themselves.
dataset2['SeenHealthWarnings'] = dataset2['SeenHealthWarnings'].map(
    {"Yes, and they led me to think about quitting smoking or not starting smoking": "Yes, they made me consider quitting or avoiding smoking",
     "Yes, but I didn't think much of them": "Yes, but I didn't think much of them",
     "No": "No",})

# Folder the figures are written to. It is created up front because
# `plt.savefig` fails when the target folder does not exist. The folder name
# is kept exactly as it was so existing outputs keep their location.
output_dir = "../../data/processed/multivariate_analyisis"
os.makedirs(output_dir, exist_ok=True)

# One count plot per column: bars grouped by 'Smoke' and coloured by the
# column, labelled with percentages, saved to disk and then shown.
for column in list_of_columns:
    ax = sns.countplot(x='Smoke', hue=column, data=dataset2, palette='rainbow', dodge=True)
    bar_perc(ax, dataset, dataset2[column])
    ax.set(title=f"Smoker vs {column}")
    plt.savefig(f"{output_dir}/{column}_histogram.png")
    plt.show()
    
# # count smokers and non-smokers by AttractiveSmoker
# sns.countplot(x='AttractiveSmoker', hue='Smoke', data=dataset)
# plt.show()
# percentage of smokers and non-smokers by AttractiveSmoker in percentage
# x,y = 'AttractiveSmoker', 'Smoke'

# (dataset.groupby(x)[y]
# .value_counts(normalize=True)
# .mul(100)
# .rename('percent')
# .reset_index()
# .pipe((sns.catplot,'data'), x=x, y='percent',hue=y, kind='bar'))

