## Multi-Class Prediction of Obesity Risk

In [194]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder


In [186]:
# Read the training and test splits from the local `datasets` folder
# (paths are relative to the notebook's working directory).
df_train = pd.read_csv(filepath_or_buffer=r'datasets/train.csv')
df_test = pd.read_csv(filepath_or_buffer=r'datasets/test.csv')

In [187]:
# Preview the training frame; the rendered table is truncated to its first and last rows.
df_train

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.669950,yes,yes,2.000000,2.983297,Sometimes,no,2.763573,no,0.000000,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.000000,1.560000,57.000000,yes,yes,2.000000,3.000000,Frequently,no,2.000000,no,1.000000,1.000000,no,Automobile,Normal_Weight
2,2,Female,18.000000,1.711460,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.710730,131.274851,yes,yes,3.000000,3.000000,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,20753,Male,25.137087,1.766626,114.187096,yes,yes,2.919584,3.000000,Sometimes,no,2.151809,no,1.330519,0.196680,Sometimes,Public_Transportation,Obesity_Type_II
20754,20754,Male,18.000000,1.710000,50.000000,no,yes,3.000000,4.000000,Frequently,no,1.000000,no,2.000000,1.000000,Sometimes,Public_Transportation,Insufficient_Weight
20755,20755,Male,20.101026,1.819557,105.580491,yes,yes,2.407817,3.000000,Sometimes,no,2.000000,no,1.158040,1.198439,no,Public_Transportation,Obesity_Type_II
20756,20756,Male,33.852953,1.700000,83.520113,yes,yes,2.671238,1.971472,Sometimes,no,2.144838,no,0.000000,0.973834,no,Automobile,Overweight_Level_II


### 1. Data preprocessing

In [188]:
# Sanity check on the training frame:
#   miss  -> total number of NaN cells across all columns
#   dupli -> number of rows that are exact duplicates of an earlier row
miss = df_train.isna().to_numpy().sum()
dupli = df_train.duplicated().to_numpy().sum()
print(f'Missing values: {miss}\t Duplicated rows: {dupli}')

Missing values: 0	 Duplicated rows: 0


Biometrics

In [189]:
# Apply the same biometric clean-up, in place, to both splits.
for split_df in (df_train, df_test):
    # Gender as a binary flag: Female -> 0, Male -> 1
    split_df['Gender'] = split_df['Gender'].map({'Female': 0, 'Male': 1})

    # Age to the nearest whole number
    split_df['Age'] = split_df['Age'].round()

    # Height and weight to two decimals
    split_df[['Height', 'Weight']] = split_df[['Height', 'Weight']].round(2)

    # IMC (kg/m^2): weight divided by squared height, computed from the
    # already-rounded columns and itself rounded to two decimals
    split_df['IMC'] = split_df['Weight'].div(split_df['Height'].pow(2)).round(2)


Family history & eating habits

In [190]:
# Family history of overweight and FAVC (consumption of high caloric food):
# yes/no answers become 0/1 flags.
for col in ('family_history_with_overweight', 'FAVC'):
    df_train[col] = df_train[col].map({'no': 0, 'yes': 1})
    df_test[col] = df_test[col].map({'no': 0, 'yes': 1})

# FCVC (consumption of vegetables) and NCP (number of main meals): two decimals.
for col in ('FCVC', 'NCP'):
    df_train[col] = df_train[col].round(2)
    df_test[col] = df_test[col].round(2)

# Encode CAEC (food between meals) ordinally.
# The level order must be given explicitly: with the default categories='auto'
# OrdinalEncoder sorts the labels alphabetically
# (Always < Frequently < Sometimes < no), so the integer codes would not
# follow the actual frequency. Listed here from least to most frequent:
# no -> 0, Sometimes -> 1, Frequently -> 2, Always -> 3.
# NOTE(review): the label set is assumed from the dataset's answer options;
# confirm with df_train['CAEC'].unique().
frequency_levels = ['no', 'Sometimes', 'Frequently', 'Always']
ord_encoder = OrdinalEncoder(
    categories=[frequency_levels],
    handle_unknown='use_encoded_value',
    unknown_value=-1,  # any label outside `frequency_levels` is encoded as -1
)

# Fit on the training split only, then reuse the same mapping for the test split.
df_train['CAEC'] = ord_encoder.fit_transform(df_train[['CAEC']])
df_test['CAEC'] = ord_encoder.transform(df_test[['CAEC']])

# Fail loudly if a training label was not in the declared level list
# (it would otherwise be silently encoded as -1).
assert (df_train['CAEC'] >= 0).all(), 'Unexpected CAEC label in training data'

Daily-life habits

In [191]:
# SMOKE and SCC (monitoring of calories): yes/no answers become 0/1 flags.
for col in ('SMOKE', 'SCC'):
    df_train[col] = df_train[col].map({'no': 0, 'yes': 1})
    df_test[col] = df_test[col].map({'no': 0, 'yes': 1})

# CH2O (water daily), FAF (physical activity) and TUE (time using tech
# devices): two decimals.
for col in ('CH2O', 'FAF', 'TUE'):
    df_train[col] = df_train[col].round(2)
    df_test[col] = df_test[col].round(2)

# Encode CALC (consumption of alcohol) ordinally.
# A fresh encoder is created here so this cell does not depend on an encoder
# instance left over from an earlier cell. The level order is given
# explicitly: with the default categories='auto' OrdinalEncoder sorts the
# labels alphabetically, so the codes would not follow the actual frequency,
# and a level that appears only in the test split would be encoded as -1
# instead of its rank. Listed from least to most frequent:
# no -> 0, Sometimes -> 1, Frequently -> 2, Always -> 3.
# NOTE(review): the label set is assumed from the dataset's answer options;
# confirm with df_train['CALC'].unique() and df_test['CALC'].unique().
calc_levels = ['no', 'Sometimes', 'Frequently', 'Always']
ord_encoder = OrdinalEncoder(
    categories=[calc_levels],
    handle_unknown='use_encoded_value',
    unknown_value=-1,  # any label outside `calc_levels` is encoded as -1
)
df_train['CALC'] = ord_encoder.fit_transform(df_train[['CALC']])
df_test['CALC'] = ord_encoder.transform(df_test[['CALC']])

# Fail loudly if a training label was not in the declared level list
# (it would otherwise be silently encoded as -1).
assert (df_train['CALC'] >= 0).all(), 'Unexpected CALC label in training data'

# Encode MTRANS (transportation used) as one-hot indicator columns.
# drop='first' omits the column of the first category to avoid a redundant
# indicator; sparse_output=False returns dense arrays. The encoder is fitted
# on the training split and reused for the test split. The resulting arrays
# are kept in oh_train / oh_test and are not yet attached to the frames here.
oh_encoder = OneHotEncoder(drop='first', sparse_output=False)
oh_train = oh_encoder.fit_transform(df_train[['MTRANS']])
oh_test = oh_encoder.transform(df_test[['MTRANS']])



MTRANS
Public_Transportation    11111
Automobile                2405
Walking                    280
Bike                        25
Motorbike                   19
Name: count, dtype: int64