# Multi-Class Prediction of Obesity Risk

## Data Pre-processing

In [4]:
!pip install sweetviz;

Collecting sweetviz
  Using cached sweetviz-2.3.1-py3-none-any.whl.metadata (24 kB)
Using cached sweetviz-2.3.1-py3-none-any.whl (15.1 MB)
Installing collected packages: sweetviz
Successfully installed sweetviz-2.3.1


In [5]:
import numpy as np 
import pandas as pd
import sweetviz as sv

import os
# List every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Read the competition tables: submission template, test data and training data.
sample_submission = pd.read_csv("/kaggle/input/playground-series-s4e2/sample_submission.csv")
df_test = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")
df_train = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv")
# Optional sweetviz profiling report, currently disabled:
#report = sv.analyze(df_train)
df_train

/kaggle/input/playground-series-s4e2/sample_submission.csv
/kaggle/input/playground-series-s4e2/train.csv
/kaggle/input/playground-series-s4e2/test.csv


Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.669950,yes,yes,2.000000,2.983297,Sometimes,no,2.763573,no,0.000000,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.000000,1.560000,57.000000,yes,yes,2.000000,3.000000,Frequently,no,2.000000,no,1.000000,1.000000,no,Automobile,Normal_Weight
2,2,Female,18.000000,1.711460,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.710730,131.274851,yes,yes,3.000000,3.000000,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,20753,Male,25.137087,1.766626,114.187096,yes,yes,2.919584,3.000000,Sometimes,no,2.151809,no,1.330519,0.196680,Sometimes,Public_Transportation,Obesity_Type_II
20754,20754,Male,18.000000,1.710000,50.000000,no,yes,3.000000,4.000000,Frequently,no,1.000000,no,2.000000,1.000000,Sometimes,Public_Transportation,Insufficient_Weight
20755,20755,Male,20.101026,1.819557,105.580491,yes,yes,2.407817,3.000000,Sometimes,no,2.000000,no,1.158040,1.198439,no,Public_Transportation,Obesity_Type_II
20756,20756,Male,33.852953,1.700000,83.520113,yes,yes,2.671238,1.971472,Sometimes,no,2.144838,no,0.000000,0.973834,no,Automobile,Overweight_Level_II


In [6]:
# Columns holding string categories; these are one-hot encoded further down.
categorical_features = [
    'Gender',
    'family_history_with_overweight',
    'FAVC',
    'CAEC',
    'SMOKE',
    'SCC',
    'CALC',
    'MTRANS',
]
# Every remaining test column, skipping the first one (`id` in the displayed data).
numerical_features = list(
    filter(lambda column: column not in categorical_features, df_test.columns[1:])
)

In [7]:
def _prepend_dummies(frame, features):
    """Return `frame` with one-hot columns for `features` placed in front and the raw columns dropped."""
    # The last feature's dummies end up left-most, hence the reversed iteration.
    encoded = [
        pd.get_dummies(frame[name], prefix=name, dtype=int)
        for name in reversed(features)
    ]
    return pd.concat(encoded + [frame], axis=1).drop(columns=features)


# Train and test are encoded independently, so a category present in only one of them
# yields a dummy column that the other frame lacks.
df_test = _prepend_dummies(df_test, categorical_features)
df_train = _prepend_dummies(df_train, categorical_features)

In [8]:
# The target is kept as a single label column (it is not one-hot encoded), so
# `target_columns` holds one column name despite the plural.
target_columns = 'NObeyesdad'
df_train[target_columns].unique()

array(['Overweight_Level_II', 'Normal_Weight', 'Insufficient_Weight',
       'Obesity_Type_III', 'Obesity_Type_II', 'Overweight_Level_I',
       'Obesity_Type_I'], dtype=object)

In [9]:
def encodeTarget(obesity):
    """Map an obesity label string to its integer code (0-6).

    The codes follow the order of the labels listed below. An unknown label raises KeyError.
    """
    ordered_levels = (
        'Insufficient_Weight',
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Obesity_Type_II',
        'Obesity_Type_III',
    )
    code_by_level = {level: code for code, level in enumerate(ordered_levels)}
    return code_by_level[obesity]

def unencodeTarget(key):
    """Map an integer code (0-6) back to its obesity label string.

    The codes follow the order of the labels listed below. An unknown code raises KeyError.
    """
    ordered_levels = (
        'Insufficient_Weight',
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Obesity_Type_II',
        'Obesity_Type_III',
    )
    level_by_code = dict(enumerate(ordered_levels))
    return level_by_code[key]

# Sanity check: encode one label and decode the matching integer code.
encodeTarget('Overweight_Level_II'), unencodeTarget(3)

(3, 'Overweight_Level_II')

In [10]:
# Replace the label strings in the target column with their integer codes.
# Not re-runnable as is: a second run would look up integers in the label mapping.
df_train[target_columns] = df_train[target_columns].map(encodeTarget)

In [11]:
# Give the test frame exactly the training feature columns (same names, same order):
# dummy columns whose category only occurs in the test set (e.g. `CALC_Always`) are dropped,
# and a dummy column seen only in training would be added as all zeros.
# Unlike a hard-coded drop, this can be re-run without raising a KeyError.
feature_columns = [column for column in df_train.columns if column != target_columns]
df_test = df_test.reindex(columns=feature_columns, fill_value=0)

## Multi-class Re-sampling

In [12]:
import imblearn
print(imblearn.__version__)

0.11.0


In [13]:
import plotly.graph_objs as go

# Histogram of the encoded target in `df_train`, i.e. the class balance before re-sampling.
fig = go.Figure(
    data=[go.Histogram(x=df_train[target_columns], name='Répartition de target_column')]
)

# Title and axis labels.
fig.update_layout(
    title="Répartition de la colonne cible",
    xaxis_title="Valeurs de la colonne cible",
    yaxis_title="Fréquence",
)

# Render the figure.
fig.show()

In [14]:
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic samples (and every model trained on them) are reproducible.
oversample = SMOTE(random_state=42)

# Features are all training columns except the target and the `id` column.
X, y = df_train.drop([target_columns, 'id'], axis = 1), df_train[target_columns]
# NOTE(review): plain SMOTE treats the one-hot columns as ordinary numeric features;
# SMOTENC may be a better fit for this mix of columns — confirm.
X, y = oversample.fit_resample(X, y)

# Re-assemble features and target into one frame for inspection.
df_train_resampled = pd.concat([X, y], axis = 1)

In [15]:
import plotly.graph_objs as go

# Histogram of the encoded target in `df_train_resampled`, i.e. the class balance after SMOTE.
fig = go.Figure(
    data=[go.Histogram(x=df_train_resampled[target_columns], name='Répartition de target_column')]
)

# Title and axis labels.
fig.update_layout(
    title="Répartition de la colonne cible",
    xaxis_title="Valeurs de la colonne cible",
    yaxis_title="Fréquence",
)

# Render the figure.
fig.show()

In [16]:
df_train_resampled

Unnamed: 0,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,CALC_Frequently,CALC_Sometimes,CALC_no,SCC_no,SCC_yes,...,Gender_Male,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,NObeyesdad
0,0,0,0,1,0,0,1,0,1,0,...,1,24.443011,1.699998,81.669950,2.000000,2.983297,2.763573,0.000000,0.976473,3
1,1,0,0,0,0,0,0,1,1,0,...,0,18.000000,1.560000,57.000000,2.000000,3.000000,2.000000,1.000000,1.000000,1
2,0,0,0,1,0,0,0,1,1,0,...,0,18.000000,1.711460,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584,0
3,0,0,0,1,0,0,1,0,1,0,...,0,20.952737,1.710730,131.274851,3.000000,3.000000,1.674061,1.467863,0.780199,6
4,0,0,0,1,0,0,1,0,1,0,...,1,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28317,0,0,0,1,0,0,1,0,1,0,...,1,24.007488,1.636231,100.490041,2.706213,1.738974,1.028426,0.986254,0.157833,5
28318,0,0,0,1,0,0,1,0,1,0,...,1,31.365102,1.810365,125.281542,2.497397,3.000000,1.503773,0.591149,0.620867,5
28319,0,0,0,1,0,0,1,0,1,0,...,1,26.190117,1.829708,119.576602,2.927489,3.000000,2.509158,0.219955,0.369875,5
28320,1,0,0,0,0,0,1,0,1,0,...,1,27.937613,1.786789,119.603491,2.943365,3.000000,2.850806,0.638644,0.305506,5


## Model training

In [None]:
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


# Objective function for the hyper-parameter search
def xgb_evaluate(max_depth, min_child_weight, gamma, subsample, colsample_bytree,
                 alpha, lambda_, learning_rate):
    """Return the mean 5-fold cross-validated accuracy of an XGBoost classifier.

    The arguments are the hyper-parameters being tuned. `max_depth` is truncated to an int,
    and `lambda_` is forwarded under XGBoost's parameter name `lambda` (a reserved word in
    Python, hence the trailing underscore).

    The score is computed on the notebook-level `X` and `y`.

    NOTE(review): at this point in the notebook `X`/`y` are the SMOTE-resampled data, so the
    validation folds contain synthetic samples and the score is probably optimistic — confirm.
    """
    # The target has seven classes, so the multi-class objective and metric are stated
    # explicitly instead of the binary ones.
    params = {'eval_metric': 'mlogloss',
              'max_depth': int(max_depth),
              'min_child_weight': min_child_weight,
              'subsample': subsample,
              'gamma': gamma,
              'colsample_bytree': colsample_bytree,
              'alpha': alpha,
              'lambda': lambda_,
              'learning_rate': learning_rate,
              'use_label_encoder': False,
              'objective': 'multi:softprob',
              'seed': 42}
    # Build the classifier from the sampled hyper-parameters
    clf = xgb.XGBClassifier(**params)
    # Accuracy of each of the 5 cross-validation folds
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    return scores.mean()

# Search space: (lower, upper) bounds for each hyper-parameter passed to `xgb_evaluate`.
param_bounds = dict(
    max_depth=(3, 12),
    min_child_weight=(0, 10),
    gamma=(0, 5),
    subsample=(0.5, 1.0),
    colsample_bytree=(0.3, 0.9),
    alpha=(0, 10),
    lambda_=(1, 10),
    learning_rate=(0.01, 0.3),
)

# Bayesian optimiser that maximises `xgb_evaluate` over that search space.
optimizer = BayesianOptimization(f=xgb_evaluate, pbounds=param_bounds, random_state=42)

# Run the search: init_points=10 initial evaluations, then n_iter=600 optimisation steps.
optimizer.maximize(init_points=10, n_iter=600)

# Show the best score and the parameters that produced it.
print(optimizer.max)

In [None]:
# Hard-coded hyper-parameters for the final model.
# NOTE(review): presumably copied from the best result of the Bayesian search — confirm.
# The target has seven classes, so the multi-class objective and metric are stated
# explicitly instead of the binary ones.
params_opti = {'eval_metric': 'mlogloss',
              'max_depth': 12,
              'min_child_weight': 0.4004,
              'subsample': 1.0,
              'gamma': 0,
              'colsample_bytree': 0.9,
              'alpha': 0.112,
              'lambda': 2.653,
              'learning_rate': 0.3,
              'use_label_encoder': False,
              'objective': 'multi:softprob',
              'seed': 42}

# Fit the final classifier on the (re-sampled) training features and target.
model = xgb.XGBClassifier(**params_opti)
model.fit(X, y)

In [17]:
# Predict the encoded class for every test row; `id` is not a model feature.
test_features = df_test.drop(columns='id')
y_pred = pd.DataFrame({'NObeyesdad': model.predict(test_features)})

# Put the ids next to the predictions (both frames share the same row index).
df_submission = pd.concat([df_test['id'], y_pred], axis=1)
df_submission

Unnamed: 0,id,NObeyesdad
0,20758,5
1,20759,2
2,20760,6
3,20761,4
4,20762,6
...,...,...
13835,34593,3
13836,34594,1
13837,34595,0
13838,34596,1


In [18]:
# Turn the predicted integer codes back into label strings.
# Not re-runnable as is: a second run would look up strings in the code mapping.
df_submission['NObeyesdad'] = df_submission['NObeyesdad'].map(unencodeTarget)

In [19]:
df_submission

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Normal_Weight
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight


In [21]:
# Write the submission file; index = False keeps the row index out of the CSV.
df_submission.to_csv('submissionXgb.csv', index = False)

In [37]:
# Rows predicted as anything other than class 1. By this point `NObeyesdad` holds label
# strings, so the comparison uses the decoded label for class 1; comparing against the
# integer 1 would never match a string and would select every row.
df_submission.loc[df_submission['NObeyesdad'] != unencodeTarget(1)]

Unnamed: 0,id,NObeyesdad
