## Import all libraries

In [1]:
import os
import sys

# set the working directory
sys.path.append(os.getcwd() + os.sep + ".." + os.sep + "..")

import pandas as pd
import pycaret.classification as pc
import seaborn as sns
from imblearn.over_sampling import SMOTEN
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# import from ../script/data.py in scripts folder
from src.scripts.data import *

## Define helper functions

In [2]:
def read_dataframes(folder_path, skip=2):
    """Load the CSV files of a folder into a dict of DataFrames keyed by file name.

    The file names are sorted before the first ``skip`` entries are dropped.
    ``os.listdir`` returns names in arbitrary order, so without sorting both
    the set of skipped files and the order of the returned dict (which
    ``preprocess_dataframes`` turns into the ``State`` index) could change
    between machines or runs.

    Parameters
    ----------
    folder_path : str
        Directory that is scanned (non-recursively) for ``*.csv`` files.
    skip : int, default 2
        Number of leading (alphabetically sorted) CSV files to leave out.
        NOTE(review): the reason the first two files are excluded is not
        visible in this notebook -- confirm which files were meant.

    Returns
    -------
    dict[str, pandas.DataFrame]
        One DataFrame per remaining CSV file, in sorted file-name order.
    """
    csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))
    selected_names = csv_names[skip:]
    return {name: pd.read_csv(os.path.join(folder_path, name)) for name in selected_names}


def get_common_columns(dataframes):
    """Return the column labels that occur in every DataFrame of ``dataframes``.

    Parameters
    ----------
    dataframes : dict
        Mapping of name -> pandas.DataFrame; must contain at least one entry.

    Returns
    -------
    pandas.Index
        Intersection of the ``columns`` of all frames.
    """
    first_name = list(dataframes)[0]
    shared = dataframes[first_name].columns
    # Narrow the candidate set frame by frame (the first frame is visited again, as before)
    for frame in dataframes.values():
        shared = shared.intersection(frame.columns)
    return shared


def preprocess_dataframes(dataframes, common_columns):
    """Stack all frames on their shared columns and tag each row with its source.

    Every frame is reduced to ``common_columns`` and receives a leading
    ``State`` column holding the position of that frame in ``dataframes``
    (0 for the first entry, 1 for the second, ...).

    Parameters
    ----------
    dataframes : dict
        Mapping of name -> pandas.DataFrame; iteration order defines ``State``.
    common_columns : list-like
        Column labels present in every frame.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the tagged frames. The original row index of
        each frame is kept, so index values may repeat. An empty DataFrame is
        returned when ``dataframes`` is empty.
    """
    tagged_frames = []
    for state_id, frame in enumerate(dataframes.values()):
        # Work on an explicit copy so the caller's frame is never touched by insert()
        subset = frame[common_columns].copy()
        subset.insert(0, "State", state_id)
        tagged_frames.append(subset)

    if not tagged_frames:
        # pd.concat refuses an empty list; keep the previous "empty in, empty out" result
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame on every iteration
    return pd.concat(tagged_frames)

## Import and preprocess data

In [4]:

# Load the raw GYTS CSV files (one DataFrame per file)
dataframes = read_dataframes("../../data/raw/GYTS/")

# Only columns that exist in every file can be stacked
common_columns = get_common_columns(dataframes)
print(common_columns)

# Stack the files; "State" is the position of the source file
merged_df = preprocess_dataframes(dataframes, common_columns)

# Survey question codes -> readable feature names
merged_df = merged_df.rename(columns={
    "CR1": "Age",
    "CR2": "Gender",
    "CR8": "Smoke",
    "OR45": "SmokingParents",
    "OR46": "SmokingFriends",
    "OR1": "WorkingParents",
    "CR22": "SeenSmokerInSchool",
    "CR21": "SeenSmokerInPublicPlace",
    "CR20": "SeenSmokerInEnclosedPlace",
    "CR19": "SeenSmokerInHome",
    "CR5": "TriedCigarette",
    "CR6": "AgeFirstCigarette",
})

# Translate the answer codes with the lookup tables from src.scripts.data.
# SmokingParents is deliberately left as the raw OR45 code; it is decoded further below.
answer_maps = {
    "Age": CR1_dict,
    "Gender": CR2_dict,
    "Smoke": CR8_smoke_dict,
    "SmokingFriends": OR46_dict,
    "WorkingParents": OR1_dict,
    "SeenSmokerInSchool": {1: True, 2: False},
    "SeenSmokerInPublicPlace": CR21_dict,
    "SeenSmokerInEnclosedPlace": CR20_dict,
    "SeenSmokerInHome": CR19_dict,
}
for column_name, code_map in answer_maps.items():
    merged_df[column_name] = merged_df[column_name].map(code_map)

# Keep only the modelling columns (TriedCigarette / AgeFirstCigarette are renamed but not used)
selected_columns = [
    "State", "Gender", "Age", "Smoke", "SmokingParents", "SmokingFriends", "WorkingParents",
    "SeenSmokerInSchool", "SeenSmokerInPublicPlace", "SeenSmokerInEnclosedPlace",
    "SeenSmokerInHome",
]
merged_df = merged_df[selected_columns]

# Drop rows with missing values (this includes codes that had no entry in a lookup table)
merged_df = merged_df.dropna()

# Split the combined "parents" answers into one boolean flag per parent
father_answers = ['Both', 'Father only']
mother_answers = ['Both', 'Mother only']

# SmokingParents still holds the raw code, so it is looked up in OR45_dict first
merged_df['SmokingFather'] = merged_df['SmokingParents'].apply(lambda code: OR45_dict[code] in father_answers)
merged_df['SmokingMother'] = merged_df['SmokingParents'].apply(lambda code: OR45_dict[code] in mother_answers)

# WorkingParents was already translated to its label above
merged_df['WorkingFather'] = merged_df['WorkingParents'].isin(father_answers)
merged_df['WorkingMother'] = merged_df['WorkingParents'].isin(mother_answers)

merged_df = merged_df.drop(columns=['SmokingParents', 'WorkingParents'])

# Final dtypes: categorical features first, boolean flags second
merged_df['State'] = merged_df['State'].astype('int').astype('category')

categorical_columns = [
    "Gender", "Age", "SmokingFriends",
    "SeenSmokerInPublicPlace", "SeenSmokerInEnclosedPlace", "SeenSmokerInHome",
]
boolean_columns = [
    "Smoke", "SeenSmokerInSchool",
    'SmokingFather', 'SmokingMother', 'WorkingFather', 'WorkingMother',
]
column_dtypes = {name: 'category' for name in categorical_columns}
column_dtypes.update({name: 'bool' for name in boolean_columns})
merged_df = merged_df.astype(column_dtypes)

# Save the preprocessed dataframe to a CSV file
merged_df.to_csv("../../data/processed/merged_GYTS.csv", index=False)

merged_df

Index(['FinalWgt', 'Stratum', 'PSU', 'CR1', 'CR2', 'CR5', 'CR6', 'CR7', 'CR8',
       'CR9', 'CR10', 'CR11', 'CR12', 'CR13', 'CR15', 'CR16', 'CR17', 'CR18',
       'CR19', 'CR20', 'CR21', 'CR22', 'CR23', 'CR25', 'CR30', 'CR31', 'CR32',
       'CR33', 'OR57', 'CR35', 'CR36', 'CR41', 'CR42', 'CR43', 'OR1', 'OR45',
       'OR46', 'OR49', 'OR55'],
      dtype='object')


Unnamed: 0,State,Gender,Age,Smoke,SmokingFriends,SeenSmokerInSchool,SeenSmokerInPublicPlace,SeenSmokerInEnclosedPlace,SeenSmokerInHome,SmokingFather,SmokingMother,WorkingFather,WorkingMother
49,0,Female,15 years old,True,None of them,False,1 to 2 days,1 to 2 days,0 days,True,True,False,True
51,0,Female,15 years old,True,Some of them,False,1 to 2 days,1 to 2 days,0 days,False,False,True,True
55,0,Male,14 years old,False,None of them,False,3 to 4 days,1 to 2 days,0 days,True,True,False,True
111,0,Female,15 years old,True,Some of them,False,3 to 4 days,1 to 2 days,0 days,True,True,False,True
122,0,Male,14 years old,True,Some of them,False,3 to 4 days,1 to 2 days,1 to 2 days,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1674,3,Female,13 years old,False,Most of them,False,7 days,0 days,0 days,False,False,True,True
1675,3,Female,13 years old,False,Some of them,False,3 to 4 days,0 days,1 to 2 days,False,False,True,True
1677,3,Male,13 years old,False,All of them,True,3 to 4 days,1 to 2 days,1 to 2 days,False,False,True,True
1678,3,Male,13 years old,False,Some of them,True,0 days,0 days,1 to 2 days,True,False,True,True


## Train and test model

In [None]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible
train, test = train_test_split(merged_df, test_size=0.2, random_state=42)
test = test.reset_index(drop=True)

# Features and target of the training part
X = train.drop(columns=["Smoke"])
y = train["Smoke"]

# Encode the target as integer class labels
lab = preprocessing.LabelEncoder()
y = lab.fit_transform(y)

No Sampling

In [None]:
# Rebuild the training frame from X and y without resampling.
# Both parts get a fresh 0..n-1 index so that concat pairs them row by row.
X = X.reset_index(drop=True)
y = pd.DataFrame(y, columns=["Smoke"]).reset_index(drop=True)
train = pd.concat([X, y], axis=1)

Sampling

In [None]:
# Oversample the training data with SMOTEN.
# The sampler is seeded (same seed as the train/test split) so that the
# resampled training set is identical on every run.
smote = SMOTEN(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Give the resampled target a named column so it can be joined back to the features
y_resampled = pd.DataFrame(y_resampled, columns=['Smoke'])

# Align both parts on a fresh 0..n-1 index before joining them column-wise
X_resampled.reset_index(drop=True, inplace=True)

train = pd.concat([X_resampled, y_resampled], axis=1)
# df_resampled = pd.concat([X, y], axis=1)

Comparing models

In [12]:
# pycaret receives the full preprocessed frame with a clean 0..n-1 index
merged_df = merged_df.reset_index(drop=True)

# Ordered answer levels of the ordinal survey questions, taken from the lookup tables
ordinal_levels = {
    "SmokingFriends": OR46_dict.values(),
    "SeenSmokerInPublicPlace": CR21_dict.values(),
    "SeenSmokerInEnclosedPlace": CR20_dict.values(),
    "SeenSmokerInHome": CR19_dict.values(),
}

setup = pc.setup(
    data=merged_df,
    target='Smoke',
    session_id=123,              # fixed seed for the pycaret experiment
    normalize=True,
    imputation_type=None,        # rows with missing values were already dropped
    ordinal_features=ordinal_levels,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    max_encoding_ohe=0,
)

# Rank the candidate models by Matthews correlation coefficient
best = pc.compare_models(sort='MCC')

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Smoke
2,Target type,Binary
3,Original data shape,"(20368, 13)"
4,Transformed data shape,"(20368, 13)"
5,Transformed train set shape,"(14257, 13)"
6,Transformed test set shape,"(6111, 13)"
7,Ordinal features,4
8,Categorical features,7
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.8888,0.5204,0.3739,0.576,0.453,0.3943,0.4059,0.044
qda,Quadratic Discriminant Analysis,0.8744,0.6071,0.4597,0.4926,0.4749,0.4038,0.4045,0.046
nb,Naive Bayes,0.8451,0.6001,0.5705,0.4097,0.4764,0.3885,0.3961,0.05
gbc,Gradient Boosting Classifier,0.8923,0.599,0.3176,0.6249,0.4205,0.3682,0.3942,0.151
lightgbm,Light Gradient Boosting Machine,0.89,0.5715,0.3165,0.6031,0.4147,0.3606,0.3837,0.239
ada,Ada Boost Classifier,0.8908,0.5766,0.3085,0.6148,0.4102,0.3574,0.3836,0.096
lr,Logistic Regression,0.8899,0.5601,0.2852,0.6161,0.3895,0.3379,0.3685,0.049
xgboost,Extreme Gradient Boosting,0.8837,0.5261,0.3335,0.5474,0.4139,0.3537,0.3673,0.072
rf,Random Forest Classifier,0.8785,0.5492,0.3091,0.5126,0.3851,0.3224,0.3353,0.18
et,Extra Trees Classifier,0.8787,0.5223,0.3062,0.5135,0.3829,0.3206,0.3341,0.176


Choosing best model

In [14]:
def _predict_and_evaluate(estimator):
    """Run pycaret's predict_model and evaluate_model on ``estimator``."""
    pc.predict_model(estimator)
    pc.evaluate_model(estimator)


# Baseline: Linear Discriminant Analysis
model = pc.create_model('lda')
_predict_and_evaluate(model)

# Hyper-parameter search optimising MCC; choose_better=True is passed to tune_model
tuned_model = pc.tune_model(model, optimize='MCC', choose_better=True)
_predict_and_evaluate(tuned_model)



Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8878,0.5442,0.3693,0.5702,0.4483,0.389,0.4004
1,0.8773,0.5051,0.3352,0.5043,0.4027,0.3374,0.3462
2,0.8969,0.522,0.4091,0.6261,0.4948,0.4402,0.4526
3,0.8843,0.5176,0.358,0.5478,0.433,0.3717,0.3822
4,0.8808,0.5236,0.3295,0.5273,0.4056,0.3432,0.355
5,0.8948,0.5193,0.4261,0.6048,0.5,0.4432,0.4517
6,0.8955,0.5047,0.392,0.6216,0.4808,0.426,0.4401
7,0.8856,0.5264,0.392,0.552,0.4585,0.3966,0.4038
8,0.8842,0.5284,0.3068,0.5567,0.3956,0.3375,0.3558
9,0.9004,0.5125,0.4205,0.6491,0.5103,0.4577,0.4711


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.8918,0.8473,0.3475,0.6079,0.4422,0.3872,0.4058


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8801,0.5942,0.4261,0.5172,0.4673,0.4004,0.4028
1,0.8794,0.6284,0.4375,0.5133,0.4724,0.4048,0.4064
2,0.8906,0.613,0.5057,0.5633,0.5329,0.4712,0.4721
3,0.8745,0.5958,0.4318,0.4903,0.4592,0.3885,0.3895
4,0.8731,0.5992,0.4148,0.4834,0.4465,0.3753,0.3767
5,0.8899,0.5858,0.4886,0.5621,0.5228,0.4609,0.4624
6,0.8927,0.5973,0.4886,0.5772,0.5292,0.4692,0.4712
7,0.88,0.5894,0.4773,0.5153,0.4956,0.4276,0.428
8,0.8779,0.6331,0.3636,0.5079,0.4238,0.3576,0.3639
9,0.8947,0.6327,0.4716,0.5929,0.5253,0.467,0.4709


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.882,0.8466,0.4284,0.5269,0.4726,0.4069,0.4097


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
# Open pycaret's dashboard for `model`, the estimator created with pc.create_model('lda') in an earlier cell.
pc.dashboard(model)

In [None]:
# Count smokers and non-smokers within each age group.
# seaborn is imported in the import cell at the top of the notebook.
ax = sns.countplot(data=merged_df, x='Age', hue='Smoke')
# Label the figure so it can be read on its own; the trailing ';' hides the text repr
ax.set(title='Smokers and non-smokers by age', xlabel='Age', ylabel='Number of respondents');


In [None]:
# Rank the features by the magnitude of their weight in `model`.
# Tree ensembles expose `feature_importances_`, but the model created in an earlier cell
# is a Linear Discriminant Analysis, which only exposes `coef_` -- so fall
# back to the mean absolute coefficient per feature instead of raising AttributeError.
feature_scores = getattr(model, 'feature_importances_', None)
if feature_scores is None:
    # coef_ has one row per class (a single row for a binary target); collapse it to one value per feature
    feature_scores = abs(model.coef_).mean(axis=0)

# NOTE(review): the names come from the untransformed X_train; this assumes the
# preprocessing pipeline neither drops nor adds columns -- confirm.
pd.DataFrame({
    'Feature': pc.get_config('X_train').columns,
    'Value': abs(feature_scores),
}).sort_values(by='Value', ascending=False)

In [None]:
# Run pycaret's fairness check on `model`, passing the Age and Gender columns as the sensitive features.
pc.check_fairness(model, sensitive_features=['Age', "Gender"])