In [241]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [242]:
# Read Obesity_prediction.csv into the working DataFrame and preview the first rows
DATA_PATH = 'Obesity_prediction.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [243]:
# Peek at five randomly chosen rows (no random_state is set, so rows vary per run)
df.sample(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
1510,Female,18.945961,1.605469,82.039,yes,yes,2.76533,3.0,Sometimes,no,1.048584,no,0.192559,0.720411,no,Public_Transportation,Obesity_Type_I
436,Female,18.0,1.56,55.0,no,yes,2.0,3.0,Sometimes,no,1.0,no,0.0,0.0,Frequently,Automobile,Normal_Weight
2044,Female,25.561868,1.675185,110.621723,yes,yes,3.0,3.0,Sometimes,no,1.49583,no,0.109327,0.384129,Sometimes,Public_Transportation,Obesity_Type_III
515,Female,21.962426,1.57206,43.919835,no,no,3.0,1.600812,Frequently,no,2.651258,no,0.600817,0.0,no,Public_Transportation,Insufficient_Weight
1344,Male,18.0,1.844218,109.195529,yes,yes,2.0,1.548407,Sometimes,no,2.191401,no,1.0,1.676944,no,Public_Transportation,Obesity_Type_I


In [244]:
# Show the dtype pandas inferred for every column of df
df.dtypes

Gender             object
Age               float64
Height            float64
Weight            float64
family_history     object
FAVC               object
FCVC              float64
NCP               float64
CAEC               object
SMOKE              object
CH2O              float64
SCC                object
FAF               float64
TUE               float64
CALC               object
MTRANS             object
Obesity            object
dtype: object

In [245]:
# Count missing values per column
df.isna().sum()

Gender            0
Age               0
Height            0
Weight            0
family_history    0
FAVC              0
FCVC              0
NCP               0
CAEC              0
SMOKE             0
CH2O              0
SCC               0
FAF               0
TUE               0
CALC              0
MTRANS            0
Obesity           0
dtype: int64

In [246]:
# Frequency of each SCC label
df.loc[:, 'SCC'].value_counts()

SCC
no     2015
yes      96
Name: count, dtype: int64

In [247]:
# Remove the TUE feature from the working frame
df = df.drop('TUE', axis=1)

# Print the first rows to confirm the column is gone
print(df.head())

   Gender   Age  Height  Weight family_history FAVC  FCVC  NCP       CAEC  \
0  Female  21.0    1.62    64.0            yes   no   2.0  3.0  Sometimes   
1  Female  21.0    1.52    56.0            yes   no   3.0  3.0  Sometimes   
2    Male  23.0    1.80    77.0            yes   no   2.0  3.0  Sometimes   
3    Male  27.0    1.80    87.0             no   no   3.0  3.0  Sometimes   
4    Male  22.0    1.78    89.8             no   no   2.0  1.0  Sometimes   

  SMOKE  CH2O  SCC  FAF        CALC                 MTRANS  \
0    no   2.0   no  0.0          no  Public_Transportation   
1   yes   3.0  yes  3.0   Sometimes  Public_Transportation   
2    no   2.0   no  2.0  Frequently  Public_Transportation   
3    no   2.0   no  2.0  Frequently                Walking   
4    no   2.0   no  0.0   Sometimes  Public_Transportation   

               Obesity  
0        Normal_Weight  
1        Normal_Weight  
2        Normal_Weight  
3   Overweight_Level_I  
4  Overweight_Level_II  


In [248]:
# Expand each multi-category column into one indicator column per category
multi_category_cols = ['CAEC', 'CALC', 'MTRANS']
df = pd.get_dummies(df, columns=multi_category_cols)

df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,SMOKE,CH2O,...,CAEC_no,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,no,2.0,...,False,False,False,False,True,False,False,False,True,False
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,yes,3.0,...,False,False,False,True,False,False,False,False,True,False
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,no,2.0,...,False,False,True,False,False,False,False,False,True,False
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,no,2.0,...,False,False,True,False,False,False,False,False,False,True
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,no,2.0,...,False,False,False,True,False,False,False,False,True,False


In [249]:
# Round Age to whole numbers (the raw column contains fractional ages)
df['Age'] = df['Age'].round()

# Two-valued categorical columns and the label -> 1/0 code used for each
binary_encodings = {
    'Gender': {'Male': 1, 'Female': 0},
    'SMOKE': {'yes': 1, 'no': 0},
    'family_history': {'yes': 1, 'no': 0},
    'FAVC': {'yes': 1, 'no': 0},
    'SCC': {'yes': 1, 'no': 0},
}

for column, encoding in binary_encodings.items():
    # Series.map() turns any value missing from the dict into NaN, so mapping
    # an already-encoded (numeric) column would wipe it out. Skipping numeric
    # columns makes re-running this cell a no-op.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue

    encoded = df[column].map(encoding)

    # Fail loudly on labels the mapping does not know instead of silently
    # introducing NaN (values that were already missing are left alone).
    unmapped = encoded.isna() & df[column].notna()
    if unmapped.any():
        unknown_labels = sorted(df.loc[unmapped, column].astype(str).unique())
        raise ValueError(
            f"Unexpected values in column {column!r}: {unknown_labels}"
        )

    df[column] = encoded

df.sample(10)


Unnamed: 0,Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,SMOKE,CH2O,...,CAEC_no,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
144,1,22.0,1.61,67.0,1,0,2.0,4.0,0,3.0,...,False,False,False,True,False,False,False,False,True,False
2040,0,26.0,1.629225,104.838425,1,1,3.0,3.0,0,2.556068,...,False,False,False,True,False,False,False,False,True,False
322,0,23.0,1.75,56.0,0,0,3.0,3.0,0,2.0,...,False,False,False,True,False,False,False,False,True,False
470,0,20.0,1.56,51.5,0,1,2.0,3.0,0,2.0,...,False,False,False,True,False,False,False,False,True,False
201,0,41.0,1.54,80.0,1,1,2.0,3.0,0,1.0,...,False,False,False,True,False,True,False,False,False,False
954,0,23.0,1.589616,65.127324,0,1,2.286146,2.961113,0,2.657303,...,False,False,False,True,False,False,False,False,True,False
1550,1,23.0,1.754996,119.087557,1,1,1.631144,3.0,0,2.0,...,False,False,False,True,False,False,False,False,True,False
148,0,27.0,1.6,61.0,0,1,3.0,3.0,0,2.0,...,False,False,False,True,False,True,False,False,False,False
246,0,20.0,1.57,60.0,0,1,3.0,3.0,0,3.0,...,False,False,False,True,False,False,False,False,True,False
1254,1,21.0,1.780503,103.189532,1,1,2.0,3.0,0,1.608128,...,False,False,False,False,True,False,False,False,True,False


In [255]:
# Derive BMI as Weight divided by Height squared
height_squared = df['Height'].pow(2)
df['BMI'] = df['Weight'].div(height_squared)

print(df[['Height', 'Weight', 'BMI']].head())

   Height  Weight        BMI
0    1.62    64.0  24.386526
1    1.52    56.0  24.238227
2    1.80    77.0  23.765432
3    1.80    87.0  26.851852
4    1.78    89.8  28.342381


In [256]:
# BMI has been derived from Weight and Height, so drop the two source columns
bmi_source_cols = ['Weight', 'Height']
df = df.drop(bmi_source_cols, axis=1)

print(df.head())

   Gender   Age  family_history  FAVC  FCVC  NCP  SMOKE  CH2O  SCC  FAF  ...  \
0       0  21.0               1     0   2.0  3.0      0   2.0    0  0.0  ...   
1       0  21.0               1     0   3.0  3.0      1   3.0    1  3.0  ...   
2       1  23.0               1     0   2.0  3.0      0   2.0    0  2.0  ...   
3       1  27.0               0     0   3.0  3.0      0   2.0    0  2.0  ...   
4       1  22.0               0     0   2.0  1.0      0   2.0    0  0.0  ...   

  CALC_Always  CALC_Frequently  CALC_Sometimes  CALC_no  MTRANS_Automobile  \
0       False            False           False     True              False   
1       False            False            True    False              False   
2       False             True           False    False              False   
3       False             True           False    False              False   
4       False            False            True    False              False   

   MTRANS_Bike  MTRANS_Motorbike  MTRANS_Public_Tr

In [None]:
# Count how often each distinct full row (all columns) occurs
df.value_counts()

In [None]:
# Integer code assigned to each Obesity label
obesity_mapping = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}

# Series.map() turns values missing from the dict into NaN, so mapping an
# already-encoded (numeric) column would erase the target. Only encode while
# the column is still non-numeric, which makes re-running this cell safe.
if not pd.api.types.is_numeric_dtype(df['Obesity']):
    encoded_obesity = df['Obesity'].map(obesity_mapping)

    # Refuse to continue if a label is not covered by the mapping, rather
    # than silently putting NaN into the target column.
    unmapped = encoded_obesity.isna() & df['Obesity'].notna()
    if unmapped.any():
        unknown_labels = sorted(df.loc[unmapped, 'Obesity'].astype(str).unique())
        raise ValueError(f"Unexpected Obesity labels: {unknown_labels}")

    df['Obesity'] = encoded_obesity


df.sample(10)

In [229]:
# Separate the target column from the predictors
target_col = 'Obesity'
y = df[target_col]               # labels to predict
X = df.drop(target_col, axis=1)  # every remaining column is a feature


In [None]:
# Show the dtype of every feature column in X
X.dtypes

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hold out 30% of the rows for testing. The target has several classes, so the
# split is stratified on y to keep class proportions the same in both parts
# (an unstratified split can leave some classes under-represented in the test set).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Baseline Random Forest (100 trees, fixed seed) fitted on the training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Importance score of each feature according to the fitted Random Forest
importances = rf_model.feature_importances_

# Pair every feature name with its score and rank from most to least important
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

# Author's conclusion: SCC is judged useless (noted as "all 0") and is dropped
# in a later cell; every other feature is kept.

In [None]:
# Drop 'SCC' from df AND from the feature frames derived from it.
# X, X_train and X_test were built from df before this cell, so dropping the
# column from df alone would leave SCC in the data the later models are fit
# and evaluated on. Dropping the column from the existing splits keeps the
# same train/test rows. errors='ignore' makes re-running this cell a no-op.
df = df.drop(columns=['SCC'], errors='ignore')
X = X.drop(columns=['SCC'], errors='ignore')
X_train = X_train.drop(columns=['SCC'], errors='ignore')
X_test = X_test.drop(columns=['SCC'], errors='ignore')
print(df.head())

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values to search over
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search scored by accuracy with 5-fold cross-validation,
# using all available cores
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the best estimator found and predict on the held-out test set
best_forest_model = grid_search.best_estimator_
y_pred = best_forest_model.predict(X_test)

# Report the winning parameters, their CV score, and the test accuracy
for label, value in (
    ("Best Parameters:", grid_search.best_params_),
    ("Best Cross-Validation Score:", grid_search.best_score_),
    ("Test Accuracy:", accuracy_score(y_test, y_pred)),
):
    print(label, value)

In [None]:
# Confusion matrix of the true test labels vs. the current predictions in y_pred
cm = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Candidate hyper-parameter values for the decision tree
param_grid = dict(
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Exhaustive search scored by accuracy with 5-fold cross-validation,
# using all available cores
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the best tree found and predict on the held-out test set
# (this overwrites y_pred from the earlier Random Forest cell)
best_dt_model = grid_search.best_estimator_
y_pred = best_dt_model.predict(X_test)

# Report the winning parameters, their CV score, and the test accuracy
for label, value in (
    ("Best Parameters:", grid_search.best_params_),
    ("Best Cross-Validation Score:", grid_search.best_score_),
    ("Test Accuracy:", accuracy_score(y_test, y_pred)),
):
    print(label, value)

In [None]:
# Confusion matrix of the true test labels vs. the current predictions in y_pred
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(title='Confusion Matrix', xlabel='Predicted Labels', ylabel='True Labels')
plt.show()

In [None]:
import joblib

# Persist the tuned decision tree to disk
model_path = 'obesity_decision_tree_model.pkl'
joblib.dump(best_dt_model, model_path)

# Round-trip check: reload the file and score the reloaded model on the
# same test set so its accuracy can be compared with the in-memory model
loaded_model = joblib.load(model_path)
loaded_predictions = loaded_model.predict(X_test)
print("Loaded Model Accuracy:", accuracy_score(y_test, loaded_predictions))