In [1]:
import numpy as np # Array, Linear algebra
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # Visualization
import seaborn as sns # Visualization
# import textwrap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, f1_score, recall_score, roc_auc_score, classification_report

from sklearn.svm import SVC

# So you won't need to use plt.show() anymore
%matplotlib inline

sns.set(color_codes=True) # Enabling Color Parameter in Seaborn

In [2]:
# Reading Dataset and storing into dataframe
path = 'train.csv'
trainDF = pd.read_csv(path) # Dataframe to hold training dataset
trainDF.tail(5) # To display the botton 5 rows
trainDF.head(5) # To display the top 5 rows

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


# Preprocessing


Renaming

In [3]:
#Rename FHWO and Obese Level
trainDF = trainDF.rename(columns={"family_history_with_overweight": "FHWO", "NObeyesdad" : "ObeseLevel"})

Cleaning
- Removes Duplicates


In [4]:
# Removing Duplicates
trainDF = trainDF.drop_duplicates()

- Removes Outliers


In [5]:
# Identifying Outliers
outlierDF = trainDF.drop(["Gender","FHWO", "FAVC", "CAEC", "SMOKE", "SCC", "CALC", "MTRANS", "ObeseLevel"], axis = 1) # Dropping Categorical Values
Q1 = outlierDF.quantile(float(0.25))
Q3 = outlierDF.quantile(float(0.75))
IQR = Q3 - Q1

outlierDF = outlierDF[~((outlierDF < (Q1 - 1.5 * IQR)) |(outlierDF > (Q3 + 1.5 * IQR))).any(axis=1)]

In [6]:
#Filtering from outlier values,then assign to trainDF_filtered
# Assuming trainDF is the original dataset and outlierDF is the subset dataset
# Both datasets have an 'id' column that uniquely identifies each row

# Extract unique IDs from the subset dataset
subset_ids = set(outlierDF['id'])

# Filter the original dataset to keep only the rows with IDs present in the subset dataset
trainDF_filtered = trainDF[trainDF['id'].isin(subset_ids)]

# Now trainDF_filtered contains only the rows from the original dataset that have matching IDs with the subset dataset

In [7]:
#Drop outlier values
trainDF_filtered.drop(labels='id', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trainDF_filtered.drop(labels='id', axis=1, inplace=True)


In [8]:
#Check number of row after drop outliers
trainDF_filtered.shape

(14040, 17)

In [9]:
x_train = trainDF_filtered.drop(columns='ObeseLevel')
y_train = trainDF_filtered['ObeseLevel']

Encoding
- One Hot Encoding


In [10]:
# Columns to be one-hot encoded
columns_to_encode = ['FHWO', 'FAVC', 'SMOKE', 'SCC', 'MTRANS', 'Gender']

# Initialize an empty DataFrame to store encoded features
train_encoded_features = pd.DataFrame(index=x_train.index)

# Iterate over each column to encode
for column in columns_to_encode:
    # One-hot encode the current column
    one_hot_encoder = OneHotEncoder(sparse_output=True, drop='first')  # drop='first' to drop the first level for each feature
    train_encoded_column = one_hot_encoder.fit_transform(x_train[[column]])

    # Convert the encoded features into DataFrame and concatenate with the existing encoded features
    encoded_df = pd.DataFrame(train_encoded_column.toarray(), columns=one_hot_encoder.get_feature_names_out([column]), index=x_train.index)
    train_encoded_features = pd.concat([train_encoded_features, encoded_df], axis=1)

# Reset Index is done to preserve the indices
train_encoded_features.reset_index(drop=True, inplace=True)

# Create a dataframe without the encoded features
train_df = x_train.drop(columns=columns_to_encode, inplace=False)
train_df.reset_index(drop=True, inplace=True)  # Reset Index is done to preserve the indices

# Concatenate the encoded features with the original DataFrame
x_train_encoded = pd.concat([train_encoded_features, train_df], axis=1)




In [11]:
# Display the encoded training data
print("Encoded Training Data:")
x_train_encoded

Encoded Training Data:


Unnamed: 0,FHWO_yes,FAVC_yes,SMOKE_yes,SCC_yes,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,Gender_Male,Age,Height,Weight,FCVC,NCP,CAEC,CH2O,FAF,TUE,CALC
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.000000,1.560000,57.000000,2.000000,3.0,Frequently,2.000000,1.000000,1.000000,no
1,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,20.952737,1.710730,131.274851,3.000000,3.0,Sometimes,1.674061,1.467863,0.780199,Sometimes
2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,18.128249,1.748524,51.552595,2.919751,3.0,Sometimes,2.137550,1.930033,1.000000,Sometimes
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,29.883021,1.754711,112.725005,1.991240,3.0,Sometimes,2.000000,0.000000,0.696948,Sometimes
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,29.891473,1.750150,118.206565,1.397468,3.0,Sometimes,2.000000,0.598655,0.000000,Sometimes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14035,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,25.783865,1.646390,104.835346,3.000000,3.0,Sometimes,1.530992,0.015860,0.445495,Sometimes
14036,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,21.030909,1.605495,133.466763,3.000000,3.0,Sometimes,2.839069,1.683497,0.143675,Sometimes
14037,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,25.137087,1.766626,114.187096,2.919584,3.0,Sometimes,2.151809,1.330519,0.196680,Sometimes
14038,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,20.101026,1.819557,105.580491,2.407817,3.0,Sometimes,2.000000,1.158040,1.198439,no


- Ordinal Encoding

In [12]:
# Define the encoding order
encoding_order = ['no', 'Sometimes', 'Frequently', 'Always']

# Instance of Ordinal Encoder
ordinal_encoder = OrdinalEncoder(categories=[encoding_order] * 2)

# Apply ordinal encoding to the selected columns
x_train_encoded[['CALC', 'CAEC']] = ordinal_encoder.fit_transform(x_train_encoded[['CALC', 'CAEC']])


- Label Encoding

In [13]:
# Instance of Ordinal Encoder
label_encoder = LabelEncoder()

# Fit our train data and tranform train and test data
y_train = label_encoder.fit_transform(y_train)



Normalization
- Min-Max Scaler

In [14]:
# Instance of Min-Max Scaler
minMax_scaler = MinMaxScaler() #Min-Max scaler
train_norm = minMax_scaler.fit_transform(x_train_encoded)
x_train_norm = pd.DataFrame(train_norm, columns=x_train_encoded.columns)


In [15]:
x_train_norm

Unnamed: 0,FHWO_yes,FAVC_yes,SMOKE_yes,SCC_yes,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,Gender_Male,Age,Height,Weight,FCVC,NCP,CAEC,CH2O,FAF,TUE,CALC
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190476,0.221147,0.142792,0.500000,0.0,0.666667,0.500000,0.333333,0.500000,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.331083,0.524179,0.732007,1.000000,0.0,0.333333,0.337031,0.489288,0.390099,0.5
2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.196583,0.600162,0.099579,0.959876,0.0,0.333333,0.568775,0.643344,0.500000,0.5
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.756334,0.612600,0.584853,0.495620,0.0,0.333333,0.500000,0.000000,0.348474,0.5
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.756737,0.603431,0.628338,0.198734,0.0,0.333333,0.500000,0.199552,0.000000,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14035,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.561136,0.394828,0.522265,1.000000,0.0,0.333333,0.265496,0.005287,0.222747,0.5
14036,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.334805,0.312612,0.749396,1.000000,0.0,0.333333,0.919534,0.561166,0.071837,0.5
14037,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.530337,0.636554,0.596452,0.959792,0.0,0.333333,0.575905,0.443506,0.098340,0.5
14038,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.290525,0.742969,0.528177,0.703909,0.0,0.333333,0.500000,0.386013,0.599220,0.0


# Chosen Model

In [16]:
x_train_data = x_train_norm.copy()

y_train_data = y_train.copy()


# LGBM(Light Gradient Boosting Machine)

In [17]:
from lightgbm import LGBMClassifier
# Ignoring warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

# Inisialisasi LGBMClassifier dengan beberapa parameter
lgbm_classifier = LGBMClassifier(
    n_estimators=1000,       # Jumlah pohon keputusan
    learning_rate=0.01,      # Kecepatan belajar
    max_depth=5
)

# Melatih model dengan data training
lgbm_classifier.fit(x_train_data, y_train_data)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1797
[LightGBM] [Info] Number of data points in the train set: 14040, number of used features: 18
[LightGBM] [Info] Start training from score -2.324184
[LightGBM] [Info] Start training from score -1.712901
[LightGBM] [Info] Start training from score -2.308299
[LightGBM] [Info] Start training from score -1.893329
[LightGBM] [Info] Start training from score -1.244676
[LightGBM] [Info] Start training from score -2.381086
[LightGBM] [Info] Start training from score -2.393489


# Saving Trained Model

In [18]:
import pickle

# Buat dictionary untuk menyimpan model dan encoder
model_dict = {
    'model': lgbm_classifier,
    'one_hot_encoder': one_hot_encoder,
    'ordinal_encoder': ordinal_encoder,
    'label_encoder': label_encoder,
    'min_max_scaler': minMax_scaler,
    'x_train':x_train
}



Write model to file

In [19]:
# Simpan dictionary ke file
with open('trained_model.sav', 'wb') as file:
    pickle.dump(model_dict, file)
