In [2]:
import pandas as pd
import seaborn as sns
import numpy as np

In [3]:
# Load the three competition files (training data, test data, sample
# submission) in one pass; the paths are relative to the notebook's directory.
df_train, df_test, df_ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Kaggle-Obesity/train.csv',
        '../Kaggle-Obesity/test.csv',
        '../Kaggle-Obesity/sample_submission.csv',
    )
)

In [3]:
# Peek at the first five rows of the test frame as a sanity check on the load.
df_test.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,20758,Male,26.899886,1.848294,120.644178,yes,yes,2.938616,3.0,Sometimes,no,2.825629,no,0.8554,0.0,Sometimes,Public_Transportation
1,20759,Female,21.0,1.6,66.0,yes,yes,2.0,1.0,Sometimes,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation
2,20760,Female,26.0,1.643355,111.600553,yes,yes,3.0,3.0,Sometimes,no,2.621877,no,0.0,0.250502,Sometimes,Public_Transportation
3,20761,Male,20.979254,1.553127,103.669116,yes,yes,2.0,2.977909,Sometimes,no,2.786417,no,0.094851,0.0,Sometimes,Public_Transportation
4,20762,Female,26.0,1.627396,104.835346,yes,yes,3.0,3.0,Sometimes,no,2.653531,no,0.0,0.741069,Sometimes,Public_Transportation


In [4]:
# Remove the 'id' column from both frames before modelling (presumably a plain
# row identifier: the preview shows consecutive integers).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on the already-missing column.
df_train = df_train.drop(columns=['id'], errors='ignore')
df_test = df_test.drop(columns=['id'], errors='ignore')

In [5]:
# The test set has a fourth CALC level, 'Always', while train only has three
# (inspect with df_test['CALC'].value_counts()). Fold it into the nearest
# level, 'Frequently', so both frames share the same set of categories.
df_test['CALC'] = df_test['CALC'].replace('Always', 'Frequently')

In [6]:
# df_train = df_train.rename(columns={'NObeyesdad':'target'})
# df_test = df_test.rename(columns={'NObeyesdad':'target'})

In [6]:
# List the distinct target labels that the label encoding has to cover.
df_train['NObeyesdad'].unique()

array(['Overweight_Level_II', 'Normal_Weight', 'Insufficient_Weight',
       'Obesity_Type_III', 'Obesity_Type_II', 'Overweight_Level_I',
       'Obesity_Type_I'], dtype=object)

In [8]:
# Inspect the column dtypes: the object columns are the categorical features
# (and the target) that still need to be encoded.
df_train.dtypes

Gender                             object
Age                               float64
Height                            float64
Weight                            float64
family_history_with_overweight     object
FAVC                               object
FCVC                              float64
NCP                               float64
CAEC                               object
SMOKE                              object
CH2O                              float64
SCC                                object
FAF                               float64
TUE                               float64
CALC                               object
MTRANS                             object
NObeyesdad                         object
dtype: object

### Categorical Variables - One Hot Encoding vs Label Encoding
#### Label Encoding: use when the categories have an ordinal relationship, i.e. a natural order
Ordinal - CAEC, CALC, NObeyesdad 

#### One Hot Encoding: use when the categories are nominal, i.e. have no natural order, so every category carries equal weight (yes/no and other binary categories also fall under this)
Nominal - Gender, family_history_with_overweight, FAVC, SCC, SMOKE, MTRANS

#### Guide to encoding
1. Binary - keep as a single 0/1 column
2. Nominal - One Hot Encoding
3. Ordinal - Label Encoding

In [9]:
# Label Encoding of Target
# NObeyesdad is the target variable (['Overweight_Level_II', 'Normal_Weight', 
#  'Insufficient_Weight', 'Obesity_Type_III', 'Obesity_Type_II', 'Overweight_Level_I',
#   'Obesity_Type_I'])

In [7]:
# Encode the target as integers ordered by increasing weight category, so the
# numeric codes keep the ordinal relationship between the classes.
target_codes = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6,
}
# One dict-based replace swaps every label in a single pass. The explicit cast
# pins an integer dtype instead of relying on replace() to downcast the object
# column implicitly (deprecated in pandas 2.2); a leftover non-numeric label
# makes the cast raise instead of leaving a silently half-encoded column.
df_train['NObeyesdad'] = df_train['NObeyesdad'].replace(target_codes).astype('int64')

In [None]:
#  Label Encoding of Other Ordinal Variables
# CAEC (['Sometimes', 'Frequently', 'no', 'Always'])
# CALC (['Sometimes', 'no', 'Frequently'])

In [8]:
# Ordinal codes for the two frequency-style features, ordered from least to
# most frequent. CALC has no 'Always' level in the training data.
caec_codes = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
calc_codes = {'no': 0, 'Sometimes': 1, 'Frequently': 2}

# The explicit cast pins an integer dtype rather than depending on replace()
# downcasting the object column (deprecated in pandas 2.2). If a column stayed
# object it would later be discarded by the dtype-based column selection.
df_train['CAEC'] = df_train['CAEC'].replace(caec_codes).astype('int64')
df_train['CALC'] = df_train['CALC'].replace(calc_codes).astype('int64')

In [10]:
# Apply the same ordinal codes to the test frame as were used for train.
# CALC lists no 'Always' level because that value was merged into 'Frequently'
# in an earlier cell.
ordinal_codes = {
    'CAEC': {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3},
    'CALC': {'no': 0, 'Sometimes': 1, 'Frequently': 2},
}
for column, codes in ordinal_codes.items():
    # Cast explicitly instead of relying on replace() to downcast the object
    # column (deprecated in pandas 2.2); an unmapped label makes the cast raise
    # rather than leaving an object column behind.
    df_test[column] = df_test[column].replace(codes).astype('int64')

In [13]:
# One hot encoding - Gender, family_history_with_overweight, FAVC, SCC, SMOKE
# MTRANS (['Public_Transportation', 'Automobile', 'Walking', 'Motorbike','Bike'])

In [9]:
# Nominal (unordered) categorical columns that get one-hot encoded.
nominal = [
    'Gender',
    'family_history_with_overweight',
    'FAVC',
    'SCC',
    'SMOKE',
    'MTRANS',
]

In [10]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
# Dense one-hot encoder: binary features collapse to a single 0/1 column
# (drop='if_binary') and categories unseen during fit encode as all zeros
# (handle_unknown='ignore').
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the
# old keyword, so try the new spelling first and fall back for older installs.
try:
    one_hot_enc = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_enc = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse=False)

In [12]:
# Fit the encoder on the training rows and wrap the encoded array in a
# DataFrame whose columns carry the generated "<feature>_<category>" names.
df_ohe = one_hot_enc.fit_transform(df_train[nominal])
encoded_names = one_hot_enc.get_feature_names_out()
X_cat = pd.DataFrame(data=df_ohe, columns=encoded_names)
X_cat.head()

Unnamed: 0,Gender_Male,family_history_with_overweight_yes,FAVC_yes,SCC_yes,SMOKE_yes,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
# Encode the test rows with the encoder that was fitted on the training data
# (the training-encoding cell must have run first). Calling fit_transform here
# would re-learn the categories from the test set, so the column layout could
# silently diverge from the matrix the models are trained on.
df_ohe1 = one_hot_enc.transform(df_test[nominal])
x_cat = pd.DataFrame(
    df_ohe1,
    columns=one_hot_enc.get_feature_names_out(),
    index=df_test.index,  # keep row labels aligned for the later axis=1 concat
)
x_cat.head()

Unnamed: 0,Gender_Male,family_history_with_overweight_yes,FAVC_yes,SCC_yes,SMOKE_yes,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# Keep only the non-object columns (numeric and label-encoded), then append the
# one-hot columns side by side; pd.concat aligns the two frames on row index.
numeric_train = df_train.select_dtypes(exclude=['object'])
df_train = pd.concat([numeric_train, X_cat], axis=1)
df_train.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CAEC,CH2O,FAF,TUE,CALC,...,Gender_Male,family_history_with_overweight_yes,FAVC_yes,SCC_yes,SMOKE_yes,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,24.443011,1.699998,81.66995,2.0,2.983297,1,2.763573,0.0,0.976473,1,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,18.0,1.56,57.0,2.0,3.0,2,2.0,1.0,1.0,0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,18.0,1.71146,50.165754,1.880534,1.411685,1,1.910378,0.866045,1.673584,0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,20.952737,1.71073,131.274851,3.0,3.0,1,1.674061,1.467863,0.780199,1,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,31.641081,1.914186,93.798055,2.679664,1.971472,1,1.979848,1.967973,0.931721,1,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [17]:
# Same assembly for the test frame: drop the raw object columns and attach the
# one-hot encoded block column-wise.
df_test = pd.concat(
    [df_test.select_dtypes(exclude=['object']), x_cat],
    axis='columns',
)
df_test.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CAEC,CH2O,FAF,TUE,CALC,Gender_Male,family_history_with_overweight_yes,FAVC_yes,SCC_yes,SMOKE_yes,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,26.899886,1.848294,120.644178,2.938616,3.0,1,2.825629,0.8554,0.0,1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,21.0,1.6,66.0,2.0,1.0,1,3.0,1.0,0.0,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,26.0,1.643355,111.600553,3.0,3.0,1,2.621877,0.0,0.250502,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,20.979254,1.553127,103.669116,2.0,2.977909,1,2.786417,0.094851,0.0,1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,26.0,1.627396,104.835346,3.0,3.0,1,2.653531,0.0,0.741069,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

In [16]:
# Standardize the continuous features (zero mean, unit variance).
numeric_cols = ['Age', 'Height', 'Weight']
numerical_data1 = df_train[numeric_cols]
numerical_data2 = df_test[numeric_cols]

scaler = StandardScaler()
# Learn the mean/std from the training frame only ...
scaled_numerical_data1 = scaler.fit_transform(numerical_data1)
# ... and reuse those statistics for the test frame. Refitting on the test set
# would put the two frames on different scales, so identical raw values would
# map to different model inputs.
scaled_numerical_data2 = scaler.transform(numerical_data2)

df_train[numeric_cols] = scaled_numerical_data1
df_test[numeric_cols] = scaled_numerical_data2

In [26]:
#df_test.head()

In [26]:
# Count the rows per encoded target class (0 = Insufficient_Weight ... 6 = Obesity_Type_III).
df_train['NObeyesdad'].value_counts()
# The classes are imbalanced: the largest (6) has 4046 rows, the smallest (2) has 2427.

6    4046
5    3248
1    3082
4    2910
0    2523
3    2522
2    2427
Name: NObeyesdad, dtype: int64

In [17]:
from sklearn.model_selection import train_test_split

# Separate the encoded target from the feature matrix.
target_column_name = 'NObeyesdad'
X = df_train.drop(columns=[target_column_name])
y = df_train[target_column_name]

# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible.
# NOTE(review): the StandardScaler was fitted on all of df_train before this
# split, so the held-out rows influenced the scaling statistics.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Predictive Modeling

In [19]:
## SVM, Decision Tree, Gradient Boosting
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt


In [20]:
# Linear-kernel SVM baseline: fit on the training split, predict the held-out split.
svm_model = svm.SVC(kernel='linear', C=1, random_state=42).fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# Overall accuracy plus per-class precision / recall / F1.
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# One print call; sep="\n" reproduces the line breaks of three separate prints.
print(
    f"Accuracy: {accuracy:.4f}",
    "\nClassification Report:",
    classification_report_str,
    sep="\n",
)



Accuracy: 0.8697

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91       524
           1       0.87      0.81      0.83       626
           2       0.70      0.75      0.73       484
           3       0.73      0.69      0.71       514
           4       0.84      0.84      0.84       543
           5       0.96      0.97      0.96       657
           6       1.00      1.00      1.00       804

    accuracy                           0.87      4152
   macro avg       0.85      0.86      0.86      4152
weighted avg       0.87      0.87      0.87      4152



In [36]:
# Decision tree with default hyperparameters (seeded for reproducibility),
# trained on the training split and scored on the held-out split.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Overall accuracy plus per-class precision / recall / F1.
accuracy_dt = accuracy_score(y_test, y_pred_dt)
classification_report_dt = classification_report(y_test, y_pred_dt)

# One print call; sep="\n" reproduces the line breaks of three separate prints.
print(
    f"Decision Tree Accuracy: {accuracy_dt:.4f}",
    "\nDecision Tree Classification Report:",
    classification_report_dt,
    sep="\n",
)

Decision Tree Accuracy: 0.8490

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       524
           1       0.80      0.78      0.79       626
           2       0.66      0.67      0.66       484
           3       0.71      0.74      0.72       514
           4       0.84      0.82      0.83       543
           5       0.96      0.95      0.95       657
           6       0.99      1.00      1.00       804

    accuracy                           0.85      4152
   macro avg       0.83      0.83      0.83      4152
weighted avg       0.85      0.85      0.85      4152



In [38]:
# Gradient boosting with default hyperparameters (seeded for reproducibility),
# trained on the training split and scored on the held-out split.
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Overall accuracy plus per-class precision / recall / F1.
accuracy_gb = accuracy_score(y_test, y_pred_gb)
classification_report_gb = classification_report(y_test, y_pred_gb)

# Assemble the report text first, then emit it with a single print.
gb_summary_lines = [
    f"Gradient Boosting Accuracy: {accuracy_gb:.4f}",
    "\nGradient Boosting Classification Report:",
    classification_report_gb,
]
print("\n".join(gb_summary_lines))

Gradient Boosting Accuracy: 0.9046

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       524
           1       0.88      0.89      0.88       626
           2       0.78      0.78      0.78       484
           3       0.80      0.80      0.80       514
           4       0.88      0.88      0.88       543
           5       0.98      0.97      0.97       657
           6       1.00      1.00      1.00       804

    accuracy                           0.90      4152
   macro avg       0.89      0.89      0.89      4152
weighted avg       0.90      0.90      0.90      4152

