In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import time

In [None]:
# Read the obesity dataset from the CSV that sits next to the notebook
# and show the resulting frame as the cell output.
DATA_PATH = 'ObesityDataSet_raw_and_data_sinthetic.csv'
obesity_df = pd.read_csv(DATA_PATH)
obesity_df

In [None]:
# Display basic information about the dataset.
# DataFrame.info() prints its summary (index, columns, non-null counts, dtypes,
# memory usage) directly and returns None, so the cell has no Out[] value.
obesity_df.info()

In [None]:
# List the distinct class labels found in the target column 'NObeyesdad'
unique_obesity_levels = obesity_df.loc[:, 'NObeyesdad'].unique()
unique_obesity_levels

In [None]:
# Tally how many rows fall into each class of the target column
# (value_counts sorts the classes from most to least frequent)
target_counts = obesity_df.loc[:, 'NObeyesdad'].value_counts()
target_counts

In [None]:
# Number of missing entries per column (isna is the same method as isnull)
obesity_df.isna().sum()

In [None]:
# Display basic statistics of the dataset.
# NOTE(review): at this point the categorical columns have not been encoded yet,
# so describe() with its default arguments presumably summarises only the
# numeric columns — confirm against the rendered output.
obesity_df.describe()

In [None]:
# Bar chart of how many samples belong to each obesity level (the target)
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=obesity_df, x='NObeyesdad', ax=ax)
ax.set_title('Distribution of Obesity Levels (Target Variable)')
plt.show()

In [None]:
# Heatmap of pairwise correlations between the numerical features.
# Only float64/int64 columns are kept, because corr() needs numeric input.
numerical_cols = obesity_df.select_dtypes(include=['float64', 'int64'])
correlations = numerical_cols.corr()

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(correlations, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
# Age against Weight, with each point coloured by its obesity level
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=obesity_df, x='Age', y='Weight', hue='NObeyesdad', ax=ax)
ax.set_title('Scatterplot of Age vs Weight by Obesity Level')
plt.show()

In [None]:
# Spread of Age within each obesity level
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=obesity_df, x='NObeyesdad', y='Age', ax=ax)
ax.set_title('Boxplot of Age by Obesity Level')
plt.show()

In [None]:
# Distribution of numeric features (Age, Height, Weight).
# DataFrame.hist() creates its own figure, so no plt.figure() call is made
# beforehand: a preceding plt.figure() would only leave a stray, empty figure
# in the cell output. layout=(1, 3) puts the three histograms on a single row,
# which matches the wide 15x5 figure size.
hist_axes = obesity_df[['Age', 'Height', 'Weight']].hist(
    bins=30, figsize=(15, 5), layout=(1, 3)
)
plt.suptitle('Distributions of Age, Height, and Weight')
plt.show()

In [None]:
# Encode categorical variables as integer codes.
# Each column gets its OWN LabelEncoder, stored in `label_encoders`, so the
# string <-> integer mapping of every column stays available afterwards
# (e.g. label_encoders['CAEC'].inverse_transform(...)). Re-fitting one shared
# encoder per column would discard every mapping except the last one.
# NOTE: this overwrites the columns of obesity_df in place; reload the CSV
# before re-running this cell, otherwise the encoders are fitted on the
# integer codes instead of the original strings.
categorical_cols = ['Gender', 'family_history_with_overweight', 'FAVC','CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']
label_encoders = {}
for col in categorical_cols:
    column_encoder = LabelEncoder()
    obesity_df[col] = column_encoder.fit_transform(obesity_df[col])
    label_encoders[col] = column_encoder

# Keep the historical name bound to the target's encoder: the previously shared
# encoder also ended up fitted on 'NObeyesdad', the last column in the list.
label_encoder = label_encoders['NObeyesdad']

In [None]:
# Split the frame into the class label (y) and the predictors (x = all other columns)
target_col = 'NObeyesdad'
y = obesity_df[target_col]
x = obesity_df.drop(columns=[target_col])

In [None]:
# Preview the feature matrix: every column of obesity_df except 'NObeyesdad'.
x

In [None]:
# Preview the target column 'NObeyesdad' (integer codes after label encoding).
y

In [None]:
# Standardise every feature column (the label-encoded categoricals included)
# to zero mean and unit variance. The scaler is fitted on the full feature matrix.
scaler = StandardScaler().fit(x)
X_scaled = scaler.transform(x)
X_scaled

In [None]:
# Train-test split (80/20, fixed seed).
# The RAW features are split first and the scaler is then fitted on the
# training rows only. Fitting the scaler on the whole dataset before splitting
# leaks the test set's mean/variance into training (data leakage) and makes
# the reported test metrics slightly optimistic.
# The row partition is unchanged: train_test_split picks rows from the sample
# count and random_state, not from the feature values. Metrics may therefore
# differ marginally from runs that scaled before splitting.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train_raw)  # learn mean/std from training data only
X_test = split_scaler.transform(X_test_raw)        # reuse the training statistics on the test data


In [None]:
# Build the four candidate classifiers, keyed by the display name used in the
# results. Seeds are fixed wherever the estimator accepts one.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": XGBClassifier(random_state=42),
}

# Individual handles to the same estimator objects held in `models`
logistic_model = models["Logistic Regression"]
random_forest_model = models["Random Forest"]
knn_model = models["K-Nearest Neighbors"]
xgboost_model = models["XGBoost"]


In [None]:
# Dictionary to store the results, keyed by model name
results = {}

# Class names in the order of their integer codes. LabelEncoder assigns codes
# by sorting the original strings, so the order is alphabetical: the three
# Obesity_* classes (codes 2-4) come BEFORE the two Overweight_* classes
# (codes 5-6). Listing them in "severity" order would attach the wrong name to
# five of the seven rows of the classification report.
target_names = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III',
    'Overweight_Level_I',
    'Overweight_Level_II',
]

# Train each model and evaluate it on the held-out test set
for model_name, model in models.items():
    # The timed section covers fitting, prediction and accuracy scoring
    start_time = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    end_time = time.time()

    # Store the results (accuracy, time taken and the per-class report)
    results[model_name] = {
        "Accuracy": accuracy,
        "Time Taken (s)": end_time - start_time,
        "Classification Report": classification_report(y_test, y_pred, target_names=target_names)
    }


In [None]:
# Print accuracy, elapsed time and the per-class report for every trained
# model, with a ruler line after each model's section.
separator = "=" * 60
for model_name, metrics in results.items():
    section_lines = [
        f"Model: {model_name}",
        f"Accuracy: {metrics['Accuracy']}",
        f"Time Taken: {metrics['Time Taken (s)']:.2f} seconds",
        f"Classification Report:\n{metrics['Classification Report']}",
        separator,
    ]
    print("\n".join(section_lines))

In [None]:
#Summary of Findings:

	#1.	Best Model: XGBoost is the best overall performer, with the highest accuracy (96.22%) and excellent precision and recall across all classes. It performs consistently well on both the larger and the smaller classes.
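	#2.	Random Forest: Also performs very well, with high accuracy (95.74%). It’s a good alternative to XGBoost when a simpler-to-tune ensemble is preferred; note that XGBoost is also an ensemble method, so the real difference is bagging (Random Forest) versus boosting (XGBoost).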
	#3.	Logistic Regression: Offers good accuracy (87.47%) with the fastest training time. It’s a good choice when speed is a priority and moderate accuracy is acceptable.
	#4.	KNN: Has the lowest accuracy (82.03%), indicating that it may not be the best choice for this dataset, especially given its struggles with certain classes.




