In [None]:
import os, sys
# add the directory two levels above the current working directory to sys.path
sys.path.append(os.getcwd() + os.sep + ".." + os.sep + "..")

import pandas as pd
import pycaret.classification as pc
from imblearn.over_sampling import SMOTE, SMOTEN, ADASYN
from sklearn import preprocessing
from sklearn.model_selection import  train_test_split

# import CR8_smoke_dict and OR45_dict from the src.scripts.data module (src/scripts/data.py)
from src.scripts.data import CR8_smoke_dict, OR45_dict 

In [None]:
# Per-country recoding tables for the education question.
# greece_dict / poland_dict / romania_dict are applied to column A04 (later renamed
# "Education") in preprocess_dataframes; england_dict is applied to the England
# file's "Highest_Qualification" column. Keys are the raw survey values, values
# are the shared integer education codes.
greece_dict = {
    1: 1, 2: 1,
    3: 2, 4: 2,
    5: 3, 6: 3,
    7: 4,
    8: 5,
    9: 6,
}

poland_dict = {
    1: 1, 2: 1,
    3: 2,
    4: 3,
    5: 4, 6: 4, 7: 4,
    8: 5,
    9: 6,
}

romania_dict = {
    1: 1,
    2: 2,
    3: 3,
    4: 4, 5: 4, 6: 4, 7: 4,
    8: 5, 9: 5,
    10: 6,
}

england_dict = {
    "No Qualification": 1,
    "GCSE/O Level": 2,
    "GCSE/CSE": 3,
    "A Levels": 4,
    "ONC/BTEC": 4,
    "Other/Sub Degree": 4,
    "Higher/Sub Degree": 5,
    "Degree": 5,
}

def read_dataframes(folder_path):
    """Load every ``.sas7bdat`` file found directly inside ``folder_path``.

    Parameters
    ----------
    folder_path : str
        Directory that is scanned (non-recursively) for SAS data files.

    Returns
    -------
    dict
        Maps each file name (not the full path) to the DataFrame returned by
        ``pd.read_sas``. Entries are inserted in sorted file-name order.

    Notes
    -----
    ``os.listdir`` returns names in arbitrary, platform-dependent order.
    The list is sorted so that anything derived from the position of a file
    in the returned dict (e.g. an enumerate-based code) is reproducible.
    The list of discovered files is printed as a quick sanity check.
    """
    file_list = sorted(f for f in os.listdir(folder_path) if f.endswith('.sas7bdat'))
    print(file_list)
    dataframes = {file: pd.read_sas(os.path.join(folder_path, file)) for file in file_list}
    return dataframes


def get_common_columns(dataframes):
    """Return the column names shared by every DataFrame in ``dataframes``.

    Side effect: the column labels of every frame are upper-cased *in place*,
    so that the comparison is case-insensitive and the returned labels can be
    used to index the same frames afterwards.

    Parameters
    ----------
    dataframes : dict
        Maps an arbitrary key to a DataFrame with string column labels.

    Returns
    -------
    pandas.Index
        Upper-cased labels present in all frames; an empty Index when
        ``dataframes`` is empty.
    """
    # Normalise all column names to upper case (mutates the caller's frames).
    for df in dataframes.values():
        df.columns = df.columns.str.upper()

    frames = list(dataframes.values())
    if not frames:
        # Nothing to intersect; avoid indexing into an empty list.
        return pd.Index([])

    common_columns = frames[0].columns
    for df in frames:
        common_columns = common_columns.intersection(df.columns)
    return common_columns


def preprocess_dataframes(dataframes, common_columns, england_path="../../data/raw/smoking_england.csv"):
    """Merge the per-country survey frames with the England CSV into one frame.

    Parameters
    ----------
    dataframes : dict
        Maps a file name to its DataFrame. The country is taken from the third
        space-separated token of the file name, up to the first dot
        (NOTE(review): a file name with fewer than three tokens raises
        IndexError - confirm the naming convention of the input files).
    common_columns : list-like
        Column labels kept from every frame. The renames below expect
        "AGE", "A01", "B01", "A04" and "A11" among them.
    england_path : str, optional
        Location of the England CSV. Defaults to the path the notebook has
        always used.

    Returns
    -------
    pandas.DataFrame
        Survey rows followed by the England rows. ``State`` is the position of
        the source file in ``dataframes`` (0-based); England gets the next free
        code, ``len(dataframes)``. The original row indices are kept, so the
        index may contain duplicates.
    """
    # Country-specific recoding of the education column A04.
    education_maps = {"Greece": greece_dict, "Poland": poland_dict, "Romania": romania_dict}

    frames = []
    for i, (file, df) in enumerate(dataframes.items()):
        # Explicit copy: the column selection must not be a view of the
        # caller's frame, otherwise the assignments below trigger
        # SettingWithCopyWarning and may or may not write through.
        df = df[common_columns].copy()

        state = file.split(" ")[2].split(".")[0]
        if state in education_maps:
            df["A04"] = df["A04"].map(education_maps[state])

        df.insert(0, "State", i)
        frames.append(df)

    if frames:
        # Concatenate once instead of growing a frame inside the loop.
        merged_df = pd.concat(frames).rename(columns={
            "AGE": "Age",
            "A01": "Gender",
            "B01": "Smoke",
            "A04": "Education",
            "A11": "MaritalStatus",
        })
        # Codes 1 and 2 become True, 3 becomes False; any other value becomes NaN.
        merged_df["Smoke"] = merged_df["Smoke"].map({1: True, 2: True, 3: False})
    else:
        merged_df = pd.DataFrame()

    england_df = pd.read_csv(england_path)
    # England takes the code right after the last survey file. Using
    # len(dataframes) avoids depending on the loop variable, which is
    # undefined when no survey file was loaded.
    england_df.insert(0, "State", len(dataframes))

    # Title-case the England column names so they line up with the names used above.
    england_df.columns = england_df.columns.str.title()

    england_df["Smoke"] = england_df["Smoke"].map({"Yes": True, "No": False})
    england_df["Education"] = england_df["Highest_Qualification"].map(england_dict)
    england_df["Gender"] = england_df["Gender"].map({"Male":1, "Female":2})
    england_df["MaritalStatus"] = england_df["Marital_Status"].map({"Single": 1, "Married": 2, "Separated":3, "Divorced": 4, "Widowed": 5})

    return pd.concat([merged_df, england_df])


Data preprocessing

In [None]:
# open sas7bdat file
# file = pd.read_sas('../../data/raw/GATS/GATS_Greece_National_2013_SAS/GREECE_PUBLIC_USE_11Mar2015.sas7bdat')

# Read every SAS file found in the GATS folder
dataframes = read_dataframes("../../data/raw/GATS/")

# Frames without an A11 column get one filled with None, so that A11 survives
# the common-column intersection below. Column names are only upper-cased
# later (inside get_common_columns), so the check is made case-insensitively:
# otherwise a lower-case "a11" would be missed here and end up duplicated
# once the names are upper-cased.
for df in dataframes.values():
    if "A11" not in df.columns.str.upper():
        df["A11"] = None

# Find common columns in all dataframes
common_columns = get_common_columns(dataframes)
print([c for c in common_columns])

# Preprocess dataframes
merged_df = preprocess_dataframes(dataframes, common_columns)

# count number of A11 values
# print(merged_df["MaritalStatus"].value_counts())

# Drop rows with missing values
# merged_df = merged_df.dropna()

In [None]:
# Keep only the modelling columns. .copy() yields an independent frame, so the
# dtype casts below do not write into a slice of the previous merged_df
# (which would raise SettingWithCopyWarning).
merged_df = merged_df[['State', 'Age', 'Gender', 'Smoke', 'Education', 'MaritalStatus']].copy()

# Rows with a missing Smoke value must be removed before the bool cast:
# astype('bool') converts NaN to True, which would silently label every
# respondent with an unmapped/missing answer as a smoker.
merged_df = merged_df.dropna(subset=["Smoke"])

# Missing values in the other columns are left in place; the alternatives
# below (drop all incomplete rows / impute here) remain disabled.
# merged_df = merged_df.dropna()

#use IterativeImputer to fill missing values of Education and MaritalStatus
#imp = IterativeImputer(max_iter=10, random_state=0)
#merged_df = pd.DataFrame(imp.fit_transform(merged_df), columns=merged_df.columns)

merged_df["State"] = merged_df["State"].astype('int').astype('category')
merged_df['Age'] = merged_df['Age'].astype('int')
merged_df['Gender'] = merged_df['Gender'].astype('int').astype('category')
merged_df["Education"] = merged_df["Education"].astype('category')
merged_df["MaritalStatus"] = merged_df["MaritalStatus"].astype('category')
merged_df["Smoke"] = merged_df["Smoke"].astype('bool')

# print the number of missing values in each column
print(merged_df.isnull().sum())

# Code scale (presumably the Education codes - confirm):
# 1 no formal education
# 2 primary education
# 3 secondary school
# 4 high school
# 5 university
# 6 postgraduate

merged_df.tail()

In [None]:
# Hold out 20 % of the rows as a test set (fixed seed for a repeatable split)
train, test = train_test_split(merged_df, random_state=42, test_size=0.2)
test = test.reset_index(drop=True)

# Separate the training rows into predictors (X) and the Smoke target (y)
X = train.drop("Smoke", axis=1)
y = train["Smoke"]

# Encode the target labels as integers
lab = preprocessing.LabelEncoder()
y = lab.fit_transform(y)

No Sampling

In [None]:
# Rebuild the training frame from X and the encoded target without resampling.
# Both parts get a fresh 0..n-1 index so the column-wise concat lines rows up
# by position.
y = pd.DataFrame(y, columns=["Smoke"]).reset_index(drop=True)
X = X.reset_index(drop=True)
train = pd.concat([X, y], axis=1)

+ Oversampling

In [None]:
# Oversample the training split with SMOTE. random_state is fixed so the
# synthetic rows (and everything trained on them) are reproducible between runs.
# NOTE(review): the pc.setup call further down also passes fix_imbalance=True
# with ADASYN, so a frame resampled here is rebalanced a second time - confirm
# that this is intended.
# NOTE(review): State, Gender, Education and MaritalStatus are categorical;
# plain SMOTE treats its inputs as numeric - confirm this is acceptable.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Wrap the resampled target in a one-column DataFrame named 'Smoke'
y_resampled = pd.DataFrame(y_resampled, columns=['Smoke'])

# Fresh 0..n-1 index on the features before the column-wise concat
X_resampled = X_resampled.reset_index(drop=True)

train = pd.concat([X_resampled, y_resampled], axis=1)

Comparing models

In [None]:
# PyCaret experiment configuration, gathered in one place so each option is easy to spot.
setup_options = dict(
    data=train,
    target='Smoke',
    session_id=123,
    # normalize=True,
    transformation=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    max_encoding_ohe=0,
    fix_imbalance=True,
    fix_imbalance_method=ADASYN(),
    imputation_type='iterative',
    categorical_features=['State', 'Gender', 'Education', 'MaritalStatus'],
    numeric_features=['Age'],
)
setup = pc.setup(**setup_options)

# Train and rank the candidate models under this configuration
pc.compare_models()

Choosing best model

In [None]:
# Train the model registered under the PyCaret id 'lightgbm'
model = pc.create_model('lightgbm')

# Diagnostic plots, in order: precision-recall, feature importance,
# full feature importance (enlarged via scale=3), confusion matrix
# pc.plot_model(model, plot='auc')
for plot_name, plot_options in (
    ('pr', {}),
    ('feature', {}),
    ('feature_all', {'scale': 3}),
    ('confusion_matrix', {}),
):
    pc.plot_model(model, plot=plot_name, **plot_options)

# Finalize the model; the name final_rf is kept as-is even though the
# estimator is the 'lightgbm' model created above
final_rf = pc.finalize_model(model)
pc.predict_model(final_rf)

# Score the held-out test split and show the first predictions
unseen_predictions = pc.predict_model(final_rf, data=test)
print(unseen_predictions.head())

In [None]:
# Rank the model's features by absolute importance.
# Feature names are taken from the *transformed* training frame: the setup
# above enables transformation, categorical encoding and multicollinearity
# removal, so the columns the model was fitted on may differ from the raw
# 'X_train' columns in number or order.
pd.DataFrame({
    'Feature': pc.get_config('X_train_transformed').columns,
    'Value': abs(model.feature_importances_),
}).sort_values(by='Value', ascending=False)

In [None]:
# Interpretation of the fitted model: 'summary' plot over all features
pc.interpret_model(model, plot='summary')

In [None]:
# Interpretation of the fitted model: 'correlation' plot for the Age feature
pc.interpret_model(model, plot='correlation', feature='Age')

In [None]:
# Interpretation of the fitted model: 'reason' plot
# NOTE(review): PyCaret documents `feature` for the 'correlation'/'pdp' plots and a
# separate `observation` argument for 'reason' - confirm feature='Age' has any effect here.
pc.interpret_model(model, plot='reason', feature='Age')


In [None]:
import shap
# Bar plot of the mean absolute SHAP value per feature.
# NOTE(review): `shap_values` is not assigned in any cell visible above, so this
# cell raises NameError on a fresh kernel (Restart & Run All) unless it is
# computed elsewhere - TODO add the explainer call that produces it.
shap.plots.bar(shap_values.abs.mean(0))