## Multi-Class Prediction of Obesity Risk - Model 2

In [178]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

In [179]:
# Read the training and test CSVs from the local datasets folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (r'datasets/train.csv', r'datasets/test.csv')
)

In [180]:
# Display the training frame as loaded (pandas elides the middle rows of a long frame)
train

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.669950,yes,yes,2.000000,2.983297,Sometimes,no,2.763573,no,0.000000,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.000000,1.560000,57.000000,yes,yes,2.000000,3.000000,Frequently,no,2.000000,no,1.000000,1.000000,no,Automobile,Normal_Weight
2,2,Female,18.000000,1.711460,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.710730,131.274851,yes,yes,3.000000,3.000000,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,20753,Male,25.137087,1.766626,114.187096,yes,yes,2.919584,3.000000,Sometimes,no,2.151809,no,1.330519,0.196680,Sometimes,Public_Transportation,Obesity_Type_II
20754,20754,Male,18.000000,1.710000,50.000000,no,yes,3.000000,4.000000,Frequently,no,1.000000,no,2.000000,1.000000,Sometimes,Public_Transportation,Insufficient_Weight
20755,20755,Male,20.101026,1.819557,105.580491,yes,yes,2.407817,3.000000,Sometimes,no,2.000000,no,1.158040,1.198439,no,Public_Transportation,Obesity_Type_II
20756,20756,Male,33.852953,1.700000,83.520113,yes,yes,2.671238,1.971472,Sometimes,no,2.144838,no,0.000000,0.973834,no,Automobile,Overweight_Level_II


### 1. Preprocessing

Biometrics

In [181]:
# Apply the same biometric clean-up to both splits so their columns stay aligned.
# NOTE: this cell is not idempotent. Running it a second time maps the already
# encoded 0/1 Gender values to NaN, because Series.map with a dict yields NaN
# for keys the dict does not contain. Reload the CSVs before re-running.
gender_codes = {'Female': 0, 'Male': 1}

for frame in (train, test):
    # Gender to int
    frame['Gender'] = frame['Gender'].map(gender_codes)

    # Age rounded to zero decimals
    frame['Age'] = frame['Age'].round()

    # Height and weight rounded to three decimals
    frame[['Height', 'Weight']] = frame[['Height', 'Weight']].round(3)

    # IMC (kg/m^2), computed from the already rounded height and weight
    frame['IMC'] = (frame['Weight'] / frame['Height'] ** 2).round(3)

Family history and diet

In [182]:
# Ordinal code for a frequency label
def freq_encoder(val):
    """
    Encodes a frequency label as an integer, the rarer the habit the higher:
    'no' -> 3, 'Sometimes' -> 2, 'Frequently' -> 1.
    Any other value, missing entries included, falls back to 0
    """
    for label, code in (('no', 3), ('Sometimes', 2), ('Frequently', 1)):
        if val == label:
            return code
    return 0

In [183]:
# yes/no answers become 0/1 flags; the numeric diet answers are only rounded.
yes_no_codes = {'no': 0, 'yes': 1}

for frame in (train, test):
    # Family history and FAVC (high-caloric food) to int
    for flag_col in ('family_history_with_overweight', 'FAVC'):
        frame[flag_col] = frame[flag_col].map(yes_no_codes)

    # FCVC (vegetables), NCP (main meals) and CH2O (water) rounded to three decimals
    for numeric_col in ('FCVC', 'NCP', 'CH2O'):
        frame[numeric_col] = frame[numeric_col].round(3)

Daily life habits

In [184]:
for frame in (train, test):
    # CAEC and CALC (alcohol) hold frequency labels: encode them with freq_encoder
    # NOTE(review): CAEC presumably records eating between meals per the
    # dataset description; confirm before relying on the column's meaning
    for freq_col in ('CAEC', 'CALC'):
        frame[freq_col] = frame[freq_col].map(freq_encoder)

    # SMOKE and SCC (monitoring of calories) to binary
    for flag_col in ('SMOKE', 'SCC'):
        frame[flag_col] = frame[flag_col].map({'no': 0, 'yes': 1})

    # FAF (physical activity) and TUE (devices) rounded to three decimals
    for numeric_col in ('FAF', 'TUE'):
        frame[numeric_col] = frame[numeric_col].round(3)

In [185]:
# Count the rows per transport label in MTRANS, which is still unencoded at this point
train['MTRANS'].value_counts()

MTRANS
Public_Transportation    16687
Automobile                3534
Walking                    467
Motorbike                   38
Bike                        32
Name: count, dtype: int64

Transportation

In [186]:
def encode_transport(val):
    """
    Encodes each type of transport based on caloric waste,
    from 4 (highest) down to 0:
    Walking > Bike > Public_Transportation > Motorbike.
    Every other value (Automobile included) is encoded as 0
    """
    # Listed from the lowest non-zero score upwards, so position == score
    ranked_modes = ('Motorbike', 'Public_Transportation', 'Bike', 'Walking')
    for score, mode in enumerate(ranked_modes, start=1):
        if val == mode:
            return score
    return 0

# Replace the transport labels with their ordinal scores in both splits
for frame in (train, test):
    frame['MTRANS'] = frame['MTRANS'].map(encode_transport)

Target

In [187]:
# Target classes in the order that defines their integer codes (0 to 6)
obesity_levels = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III',
]
obesity_encoder = OrdinalEncoder(categories=[obesity_levels])

# The encoder works on a 2-D, single-column selection; the codes are stored as int
target_encoded = obesity_encoder.fit_transform(train[['NObeyesdad']])
train['NObeyesdad'] = target_encoded.astype(int)

### 2. Training and validation

In [188]:
# Min-max scale the continuous columns only; the binary flags and the
# ordinal-encoded columns are left untouched
to_scale = ['Age', 'Height', 'Weight', 'FCVC', 'CH2O', 'FAF', 'TUE', 'NCP', 'IMC']

# Learn the column ranges from the training frame, then apply them to both splits
# NOTE(review): the scaler is fitted on all of `train`, before the
# training/validation split, so validation rows contribute to the ranges
scaler = MinMaxScaler().fit(train[to_scale])
for frame in (train, test):
    frame[to_scale] = scaler.transform(frame[to_scale])

In [189]:
# Target column, and every remaining column except the row id as features
y = train['NObeyesdad']
X = train.drop(columns=['id', 'NObeyesdad'])

# 75% of the rows for training, the rest for validation (seeded, so repeatable)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.75, random_state=11
)

# Report the sizes of both partitions
size_report = [
    'Training:',
    f'Size of features: {X_train.shape}\t Size of target: {y_train.shape}',
    '',
    'Validation:',
    f'Size of features: {X_valid.shape}\t Size of target: {y_valid.shape}',
]
print(*size_report, sep='\n')

Training:
Size of features: (15568, 17)	 Size of target: (15568,)

Validation:
Size of features: (5190, 17)	 Size of target: (5190,)


### 3. Build the models and compare

In [190]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 100 trees, depth capped at 25, no cap on the number of leaves
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=25,
    max_leaf_nodes=None,
    random_state=10,
).fit(X_train, y_train)

# Accuracy on the validation rows
rfc_pred = rfc.predict(X_valid)
rfc_acc = accuracy_score(y_valid, rfc_pred)
print(f'Random Forest accuracy: {rfc_acc*100:.2f}%')

Random Forest accuracy: 89.79%


In [191]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting with 100 boosting iterations
hgbc = HistGradientBoostingClassifier(
    max_iter=100,
    random_state=11,
).fit(X_train, y_train)

# Accuracy on the validation rows
hgbc_pred = hgbc.predict(X_valid)
hgbc_acc = accuracy_score(y_valid, hgbc_pred)
print(f'HGradient Booster accuracy: {hgbc_acc*100:.2f}%')

HGradient Booster accuracy: 89.98%


In [192]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 100 estimators, using the SAMME algorithm
ada = AdaBoostClassifier(
    n_estimators=100,
    algorithm='SAMME',
    random_state=12,
).fit(X_train, y_train)

# Accuracy on the validation rows
ada_pred = ada.predict(X_valid)
ada_acc = accuracy_score(y_valid, ada_pred)
print(f'Ada Booster accuracy: {ada_acc*100:.2f}%')

Ada Booster accuracy: 68.77%


### 4. Predictions and submission

In [193]:
# Test features: every column except the row id
X_test = test.drop(columns=['id'])

# Predict the integer class codes with the gradient boosting model
preds = hgbc.predict(X_test)

# Turn the codes back into class labels; the encoder takes a 2-D column
preds_column = preds.reshape(-1, 1)
labels = obesity_encoder.inverse_transform(preds_column)

# Submission frame: one row per test id with its predicted label
submission_columns = {
    'id': test['id'],
    'NObeyesdad': labels.flatten(),  # back to 1D
}
submission = pd.DataFrame(submission_columns)

In [194]:
# Display the submission frame (test ids with their predicted labels)
submission

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Normal_Weight
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight


In [195]:
# Always (re)write the submission so the file on disk matches the predictions
# produced by this run. The former exists-guard skipped the write whenever an
# older file was present, which silently left a stale submission behind after
# the model or the preprocessing had changed.
submission_path = 'obesity2_sub.csv'
submission.to_csv(submission_path, index=False)  # index=False: only id and label columns
print(f'Submission written to {submission_path} ({len(submission)} rows)')

Submission already created
