In [390]:
#Load the required libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [391]:
#Read the obesity dataset from its CSV file and preview the first five rows
obesity = pd.read_csv(filepath_or_buffer='ObesityDataSet.csv')
obesity.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObesity
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [392]:
#Summary of the dataset: info() prints the number of rows plus, for every
#column, its non-null count and dtype (useful for spotting missing values
#and for telling the text columns apart from the numeric ones)
obesity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [393]:
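#Optional: uncomment the two lines below to drop the bottom 3 variables in feature importance (then also switch to the shorter get_dummies column list in the encoding cell)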
#columns_dropped = ['MTRANS', 'CALC', 'SCC']
#obesity = obesity.drop(columns = columns_dropped)

In [394]:
#Address categorical data features with one hot encoding
#Alternative column list - use it if the bottom 3 variables in feature importance are dropped:
#categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE']
categorical_columns = [
    'Gender',
    'family_history_with_overweight',
    'FAVC',
    'CAEC',
    'SMOKE',
    'SCC',
    'CALC',
    'MTRANS',
]
#drop_first=False keeps one indicator column for every category level
pre_obesity = pd.get_dummies(obesity, columns=categorical_columns, drop_first=False)
pre_obesity.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,NObesity,Gender_Female,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,21.0,1.62,64.0,2.0,3.0,2.0,0.0,1.0,Normal_Weight,1,...,0,0,0,0,1,0,0,0,1,0
1,21.0,1.52,56.0,3.0,3.0,3.0,3.0,0.0,Normal_Weight,1,...,1,0,0,1,0,0,0,0,1,0
2,23.0,1.8,77.0,2.0,3.0,2.0,2.0,1.0,Normal_Weight,0,...,0,0,1,0,0,0,0,0,1,0
3,27.0,1.8,87.0,3.0,3.0,2.0,2.0,0.0,Overweight_Level_I,0,...,0,0,1,0,0,0,0,0,0,1
4,22.0,1.78,89.8,2.0,1.0,2.0,0.0,0.0,Overweight_Level_II,0,...,0,0,0,1,0,0,0,0,1,0


In [395]:
#Separate the independent and dependent variables
#Alternative - use it if only the top 5 variables in feature importance are kept:
#X = pre_obesity[['Weight', 'Age', 'FAF', 'FCVC', 'NCP']]
y = pre_obesity['NObesity']
X = pre_obesity.drop(columns=['NObesity'])

In [396]:
#Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=43,
)

In [397]:
#Use the Categorical Naive Bayes algorithm to train the model
#By default CategoricalNB sizes each feature's probability table from the
#largest integer code present in the training rows, so predict() raises
#IndexError if a test row carries a larger code. Declaring the number of
#categories from the full feature matrix (feature values only, no labels)
#makes the fitted model safe for any train/test split.
#Requires scikit-learn >= 0.24 (min_categories parameter).
#NOTE(review): CategoricalNB expects integer category codes; the float columns
#(Age, Height, Weight, ...) appear to be cast to integers by the estimator -
#confirm that is intended, or bin the continuous features before fitting.
category_counts = X.astype(int).max(axis=0).to_numpy() + 1
model = CategoricalNB(min_categories=category_counts)

model.fit(X_train, y_train);

In [398]:
#Check the evaluation metrics of the model on the held-out test set
y_pred = model.predict(X_test)

#scikit-learn metric functions take (y_true, y_pred) in that order; all three
#calls now follow it. Precision and recall use average='macro'.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
#Scores are multiplied by 100 so they print as percentages
print("Accuracy:", accuracy*100)
print("Precision:", precision*100)
print("Recall:", recall*100)

Accuracy: 71.13564668769716
Precision: 70.19632548764163
Recall: 69.3709146596747


In [399]:
# Train a Categorical Naive Bayes model to inspect (fit() returns the estimator itself)
cnb = CategoricalNB().fit(X_train, y_train)

# Permutation importance on the test set: 30 shuffles per feature, seeded for reproducibility
perm_importance = permutation_importance(
    cnb,
    X_test,
    y_test,
    n_repeats=30,
    random_state=42,
)

# Tabulate the mean importance per feature, highest first
feature_importance_df = pd.DataFrame(
    {'Importance': perm_importance.importances_mean},
    index=X_train.columns,
).sort_values(by='Importance', ascending=False)

# Display the feature importance
print(feature_importance_df)


                                    Importance
Weight                                0.122713
Age                                   0.044374
FAF                                   0.016562
FCVC                                  0.012303
NCP                                   0.009201
CAEC_Frequently                       0.008833
FAVC_no                               0.008360
FAVC_yes                              0.008360
Gender_Female                         0.008360
Gender_Male                           0.008360
family_history_with_overweight_no     0.007834
family_history_with_overweight_yes    0.007834
CAEC_Sometimes                        0.004679
TUE                                   0.004259
MTRANS_Walking                        0.002471
CH2O                                  0.002261
MTRANS_Bike                           0.001577
CAEC_Always                           0.001420
SMOKE_yes                             0.000841
SMOKE_no                              0.000841
CAEC_no      