In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import export_text, plot_tree

In [2]:
# Get data type of each column
def analyzeDataTypes(df):
    """Split the column names of ``df`` into continuous and categorical groups.

    A column counts as categorical when its dtype compares equal to
    ``'object'``; every other column counts as continuous.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        Tuple ``(continuous_vars, categorical_vars)`` of column-name lists,
        each in the same order as ``df.columns``.
    """
    # Evaluate the dtype test once per column, then partition on the flag.
    flagged = [(name, df[name].dtype == 'object') for name in df.columns]
    continuous_vars = [name for name, is_object in flagged if not is_object]
    categorical_vars = [name for name, is_object in flagged if is_object]
    return continuous_vars, categorical_vars

In [33]:
# Preprocessing data with One-hot Encoding
# Convert categorical variable into dummy/indicator variables.

# - Reduce complexity of the model with many categories
# - Convert categorical variable into numerical variable
# - Careful with the dummy variable trap (the one-hot columns of a feature are perfectly collinear)
# - Careful with multicollinearity
def preprocessingOneHotEncoding(df, variables):
    """One-hot encode the given categorical columns of ``df``.

    The target column ``'NObeyesdad'`` is never encoded, even when it is
    listed in ``variables``. Neither ``df`` nor ``variables`` is modified.

    Args:
        df: Source DataFrame.
        variables: Names of the categorical columns to encode.

    Returns:
        A new DataFrame with the listed columns replaced by indicator
        columns, minus one redundant indicator for each binary feature
        named below.
    """
    # Build a filtered list instead of calling variables.remove(), so the
    # caller's list keeps its contents for later cells.
    feature_columns = [name for name in variables if name != 'NObeyesdad']
    df_train = pd.get_dummies(df, columns=feature_columns)

    # For a two-valued feature the two indicator columns are complementary,
    # so one of them is dropped to eliminate multicollinearity.
    redundant_columns = [
        'Gender_Female',
        'family_history_with_overweight_yes',
        'FAVC_no',
        'SMOKE_yes',
        'SCC_yes',
    ]
    # errors='ignore': a feature may have been removed upstream (the notebook
    # drops SMOKE before encoding), in which case its indicator never exists
    # and a strict drop would raise KeyError.
    return df_train.drop(columns=redundant_columns, errors='ignore')



In [3]:
# Preprocessing data with label encoder

# - Convert categorical variable into numerical variable
# - Avoid dummy variable trap
# - In this case, most of the categorical variables are ordinal
# - Check the ones that don't have a clear order (TODO)
def preprocessingWithLabelEncoder(df, variables):
    """Replace the given categorical columns with integer label codes.

    The target column ``'NObeyesdad'`` is never encoded, even when it is
    listed in ``variables``. Neither ``df`` nor ``variables`` is modified;
    the encoding is applied to a copy.

    Args:
        df: Source DataFrame.
        variables: Names of the categorical columns to encode.

    Returns:
        A new DataFrame in which every listed column (except the target)
        holds the output of ``LabelEncoder.fit_transform`` for that column.
    """
    # Filter into a new list rather than mutating the caller's list.
    feature_columns = [name for name in variables if name != 'NObeyesdad']
    # Encode a copy: overwriting the caller's frame would leave the raw data
    # unrecoverable and make earlier notebook cells fail when re-run.
    df_encoded = df.copy()
    for column in feature_columns:
        # fit_transform refits on each call, so a fresh encoder per column
        # yields the same codes as reusing a single instance.
        df_encoded[column] = LabelEncoder().fit_transform(df_encoded[column])
    return df_encoded



In [4]:
# Get data ready and separate X and Y
def prepareData(df):
    """Clean ``df`` and split it into features and target.

    Rows containing missing values are removed first, then duplicate rows.
    The input frame itself is not modified.

    Args:
        df: DataFrame that contains the target column ``'NObeyesdad'``.

    Returns:
        Tuple ``(X, Y)``: ``X`` is the cleaned frame without the target
        column, ``Y`` is the target column of the cleaned frame.
    """
    target = 'NObeyesdad'
    cleaned = df.dropna().drop_duplicates()
    return cleaned.drop(columns=[target]), cleaned[target]

In [7]:
# Training the model with Decision Tree Classifier

# - Ideal for categorical values
# - Easy to interpret
# - Dataset authors recommend using a Tree Classifier: 
#   De-La-Hoz-Correa, E., Mendoza Palechor, F., De-La-Hoz-Manotas, A., Morales Ortega, R., & Sánchez Hernández, A. B., 
#   "Obesity level estimation software based on decision trees," Universidad de la Costa, 2019.

def trainModelDT(X_train, y_train):
    """Grid-search a scaler + decision-tree pipeline and return the fitted search.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GridSearchCV`` object (5-fold cross-validation over the
        27 combinations of the grid below).
    """
    # Standardize the features, then fit the tree. The depth and split values
    # given here are starting values only; the grid below overrides them.
    scaler_step = ('scaler', StandardScaler())
    tree_step = ('clf', tree.DecisionTreeClassifier(max_depth=30, min_samples_split=5, random_state=42))
    dt_pipeline = Pipeline([scaler_step, tree_step])

    # The 'clf__' prefix routes each parameter to the pipeline step named 'clf'.
    search_space = {
        'clf__max_depth': [10, 20, 30],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4],
    }

    search = GridSearchCV(dt_pipeline, search_space, cv=5, n_jobs=-1, verbose=3)
    search.fit(X_train, y_train)
    return search
    


In [8]:
# Training the model with Random Forest

# - Due to the high number of categorical variables, Random Forest is a good choice
# - It's an expansion of Decision Trees, an upgrade to a recommended classifier
# - It's a good choice for high-dimensional data

def trainModelRF(X_train, y_train):
    """Grid-search a scaler + random-forest pipeline and return the fitted search.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GridSearchCV`` object (5-fold cross-validation over the
        108 combinations of the grid below).
    """
    # Standardize the features, then fit the forest with a fixed seed.
    rf_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(random_state=42)),
    ])

    # The 'clf__' prefix routes each parameter to the pipeline step named 'clf'.
    search_space = {
        'clf__n_estimators': [50, 100, 200],
        'clf__max_depth': [None, 10, 20, 30],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4],
    }

    search = GridSearchCV(rf_pipeline, search_space, cv=5, n_jobs=-1, verbose=2)
    search.fit(X_train, y_train)
    return search


In [28]:
# Training the model with Gradient Boosting
def trainModelBoosting(X_train, y_train):
    """Fit a scaler + gradient-boosting pipeline with default hyper-parameters.

    Unlike the decision-tree and random-forest trainers, no grid search is
    run here; the pipeline is fitted directly.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``Pipeline``.
    """
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        # Fixed seed, matching the other trainers, so repeated runs of the
        # notebook produce the same model.
        ('clf', GradientBoostingClassifier(random_state=42))
    ])

    # Candidate grid for a future GridSearchCV (cv=5), currently not used:
    #   clf__n_estimators:      [50, 100, 200]
    #   clf__learning_rate:     [0.1, 0.01, 0.001]
    #   clf__max_depth:         [3, 5, 7]
    #   clf__min_samples_split: [2, 5, 10]
    #   clf__min_samples_leaf:  [1, 2, 4]

    pipeline.fit(X_train, y_train)
    return pipeline

In [77]:
""" Main function """
# Load the training data and discard the columns that are not used as features
df = pd.read_csv('../data/train.csv').drop(columns=['id', 'SMOKE', 'MTRANS'])

# Partition the remaining columns by dtype
continuous_vars, categorical_vars = analyzeDataTypes(df)
print("Continuous variables: ", continuous_vars)
df.head()
# Notes on the continuous columns:
#   FCVC: Vegetables
#       Most of distribution is between 2 and 3
#   NCP:  Main meals
#   CH2O: Water consumption
#   FAF:  Physical activity frequency
#   TUE:  Time using technology devices

Continuous variables:  ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']


Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,CH2O,SCC,FAF,TUE,CALC,NObeyesdad
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,2.763573,no,0.0,0.976473,Sometimes,Overweight_Level_II
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,2.0,no,1.0,1.0,no,Normal_Weight
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,1.910378,no,0.866045,1.673584,no,Insufficient_Weight
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,1.674061,no,1.467863,0.780199,Sometimes,Obesity_Type_III
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,1.979848,no,1.967973,0.931721,Sometimes,Overweight_Level_II


In [78]:
# One-hot encoded variant: encode, clean, then hold out 20% of the rows for testing
df_ohe = preprocessingOneHotEncoding(df=df, variables=categorical_vars)
X_ohe, Y_ohe = prepareData(df=df_ohe)
(
    X_train_ohe,
    X_test_ohe,
    y_train_ohe,
    y_test_ohe,
) = train_test_split(X_ohe, Y_ohe, random_state=42, test_size=0.2)



In [80]:
# Label-encoded variant: encode, clean, then hold out 20% of the rows for testing.
# The helper is given copies of the shared frame and column list, so `df` and
# `categorical_vars` stay as loaded and the cells above can be re-run in any order.
df_encoder = preprocessingWithLabelEncoder(df.copy(), list(categorical_vars))
X_encoder, Y_encoder = prepareData(df_encoder)
X_train_encoder, X_test_encoder, y_train_encoder, y_test_encoder = train_test_split(
    X_encoder, Y_encoder, test_size=0.2, random_state=42
)

In [81]:
# Decision trees: grid-search one model per encoding and predict on its held-out split

# -- one-hot encoded features
dt_ohe = trainModelDT(X_train=X_train_ohe, y_train=y_train_ohe)
y_pred_dt_ohe = dt_ohe.predict(X_test_ohe)

# -- label-encoded features
dt_encoder = trainModelDT(X_train=X_train_encoder, y_train=y_train_encoder)
y_pred_dt_encoder = dt_encoder.predict(X_test_encoder)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [82]:
# Random forests: grid-search one model per encoding and predict on its held-out split

# -- one-hot encoded features
rf_ohe = trainModelRF(X_train=X_train_ohe, y_train=y_train_ohe)
y_pred_rf_ohe = rf_ohe.predict(X_test_ohe)

# -- label-encoded features
rf_encoder = trainModelRF(X_train=X_train_encoder, y_train=y_train_encoder)
y_pred_rf_encoder = rf_encoder.predict(X_test_encoder)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [83]:
# Gradient boosting: fit one pipeline per encoding and predict on its held-out split

# -- one-hot encoded features
gbt_ohe = trainModelBoosting(X_train=X_train_ohe, y_train=y_train_ohe)
y_pred_gbt_ohe = gbt_ohe.predict(X_test_ohe)

# -- label-encoded features
gbt_encoder = trainModelBoosting(X_train=X_train_encoder, y_train=y_train_encoder)
y_pred_gbt_encoder = gbt_encoder.predict(X_test_encoder)

In [87]:
import joblib

# Persist the fitted label-encoded gradient boosting pipeline to disk.
# Dump calls for the other fitted models are kept below, disabled:
#   joblib.dump(dt_ohe, 'DT_OHE_FI1_model.pkl')
#   joblib.dump(dt_encoder, 'DT_ENC_FI1.pkl')
#   joblib.dump(rf_ohe, 'RF_OHE_FI1_model.pkl')
#   joblib.dump(rf_encoder, 'RF_ENC_FI1_model.pkl')
#   joblib.dump(gbt_ohe, 'GBT_OHE_FI1_model.pkl')
joblib.dump(value=gbt_encoder, filename='GBT_ENC_model.pkl')

# To reload the saved model and check it (disabled):
#   loaded_clf = joblib.load('GBT_ENC_model.pkl')
#   print(f"Model loaded successfully: {loaded_clf}")

['GBT_ENC_model.pkl']

In [84]:
# Compare the results: one row per (model, encoding) pair, evaluated on that
# encoding's held-out test split.
evaluations = [
    ('DT - One-hot Encoding', y_test_ohe, y_pred_dt_ohe),
    ('DT - Label Encoder', y_test_encoder, y_pred_dt_encoder),
    ('RF - One-hot Encoding', y_test_ohe, y_pred_rf_ohe),
    ('RF - Label Encoder', y_test_encoder, y_pred_rf_encoder),
    ('GBT - One-hot Encoding', y_test_ohe, y_pred_gbt_ohe),
    ('GBT - Label Encoder', y_test_encoder, y_pred_gbt_encoder),
]

result_rows = []
for model_label, y_true, y_pred in evaluations:
    # Build the classification report once per model and read all three
    # support-weighted averages from it.
    weighted_avg = classification_report(y_true, y_pred, output_dict=True)['weighted avg']
    result_rows.append({
        'Model': model_label,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': weighted_avg['precision'],
        'Recall': weighted_avg['recall'],
        'F1-Score': weighted_avg['f1-score'],
    })

# Explicit column order keeps the table and the exported CSV layout stable.
results = pd.DataFrame(result_rows, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

print(results)

                    Model  Accuracy  Precision    Recall  F1-Score
0   DT - One-hot Encoding  0.872228   0.872552  0.872228  0.872369
1      DT - Label Encoder  0.871504   0.872275  0.871504  0.871831
2   RF - One-hot Encoding  0.892237   0.892371  0.892237  0.892008
3     RF - Label Enconder  0.897541   0.897425  0.897541  0.897274
4  GBT - One-hot Encoding  0.902363   0.902277  0.902363  0.902229
5     GBT - Label Encoder  0.904050   0.903950  0.904050  0.903924


In [85]:
# Print each model's test accuracy under its title (title line, then value line).
# For per-class detail, print classification_report(y_test_*, y_pred_*) for the same pair.
print("Decision Tree: One-hot Encoding", accuracy_score(y_test_ohe, y_pred_dt_ohe), sep="\n")
print("Decision Tree: Label Encoder", accuracy_score(y_test_encoder, y_pred_dt_encoder), sep="\n")

print("Random Forest: One-hot Encoding", accuracy_score(y_test_ohe, y_pred_rf_ohe), sep="\n")
print("Random Forest: Label Encoder", accuracy_score(y_test_encoder, y_pred_rf_encoder), sep="\n")

print("Gradient Boosting: One-hot Encoding", accuracy_score(y_test_ohe, y_pred_gbt_ohe), sep="\n")
print("Gradient Boosting: Label Encoder", accuracy_score(y_test_encoder, y_pred_gbt_encoder), sep="\n")

Decision Tree: One-hot Encoding
0.8722275795564127
Decision Tree: Label Encoder
0.8715043394406943
Random Forest: One-hot Encoding
0.8922372227579557
Random Forest: Label Encoder
0.8975409836065574
Gradient Boosting: One-hot Encoding
0.9023625843780135
Gradient Boosting: Label Encoder
0.9040501446480231


In [86]:
# Export the comparison table without the row index
results.to_csv(path_or_buf='results_wo_data.csv', index=False)