This cell imports the libraries used throughout the notebook (numpy, os, pandas, matplotlib, seaborn), sets a seed for reproducibility of random operations, and applies the seaborn plotting style.

In [None]:
# Standard library
import os

# Numerics and data handling
import numpy as np
import pandas as pd

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set a seed for reproducibility.
# SEED is kept as a named constant so later cells can reuse the same value;
# np.random.seed seeds NumPy's global random state.
SEED = 42
np.random.seed(SEED)

# Set visualization style
sns.set(style="whitegrid")

This cell mounts the Google Drive to access files stored there.

In [None]:
# Mount Google Drive at /content/drive (Colab-only module). The CSV paths
# defined later in this notebook live under /content/drive/MyDrive, so the
# data-loading cells need this mount to succeed first.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

This cell defines the file paths for the training, testing, and sample submission datasets and then loads them into pandas DataFrames.

In [None]:
# Locations of the preprocessed CSVs inside the mounted Drive folder
train_processed_path = '/content/drive/MyDrive/ml_project/data/train_preprocessed.csv'
test_processed_path = '/content/drive/MyDrive/ml_project/data/test_preprocessed.csv'
sample_submission_path = '/content/drive/MyDrive/ml_project/data/sample_submission.csv'

# Read the three files in order: train, test, sample submission
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_processed_path, test_processed_path, sample_submission_path)
)

# Report (rows, columns) for each frame as a quick sanity check
for frame_label, frame in (
    ("Train", train_df),
    ("Test", test_df),
    ("Sample submission", sample_submission_df),
):
    print(f"{frame_label} shape:", frame.shape)


In [None]:
# List the column labels of the training frame.
train_df.columns

In [None]:
# Count missing values per column. isnull is a method and must be called:
# `train_df.isnull.sum()` would fail with AttributeError because the bound
# method itself has no `.sum`.
train_df.isnull().sum()

NameError: name 'train_df' is not defined


 INSPECT TRAINING DATA FEATURES

 Features in Training Data (22 total):

 ID & Target:
 - id: Record identifier
 - WeightCategory: TARGET VARIABLE (7 classes to predict)

 Numerical Features:
 - Age, Height, Weight: Physical measurements
 - FCVC: Frequency of consumption of vegetables (0-3 scale)
 - NCP: Number of main meals per day (1-4 scale)
 - CAEC: Frequency of eating between meals (numerically encoded)
 - CH2O: Daily water consumption in liters (1-3 scale)
 - FAF: Frequency of physical activity per week (0-3 scale)
 - TUE: Time using technology in hours (0-2 scale)
 - CALC: Frequency of alcohol consumption (0-3 scale)
 - BMI: Body Mass Index (calculated metric)

 Binary Features (already one-hot encoded):
 - Gender_Male: 1 if male, 0 if female
 - family_history_with_overweight_yes: 1 if family history exists
 - FAVC_yes: 1 if frequently consumes high caloric food
 - SMOKE_yes: 1 if smoker
 - SCC_yes: 1 if monitors caloric intake
 - MTRANS_Bike: 1 if primary transport is bike
 - MTRANS_Motorbike: 1 if primary transport is motorbike
 - MTRANS_Public_Transportation: 1 if uses public transport
 - MTRANS_Walking: 1 if walks



In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors.
# NOTE(review): only 'WeightCategory' is dropped, so the 'id' record identifier
# stays in X and is treated as a feature by the model. Dropping it here would
# also require dropping it from test_df before every predict() call - confirm
# whether keeping it is intended.
X = train_df.drop('WeightCategory', axis=1)  # features
y = train_df['WeightCategory']               # target

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,       # 20% validation, 80% training
    random_state=SEED,   # reuse the notebook-wide seed instead of a second hard-coded 42
    stratify=y           # preserves class distribution in train & val
)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score

# -----------------------------
# 2️⃣ Base estimator and search space
# -----------------------------
gnb = GaussianNB()

# var_smoothing is the only hyperparameter searched here: the portion of the
# largest feature variance that is added to all variances for stability.
param_grid = dict(var_smoothing=[1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04])

# -----------------------------
# 3️⃣ Exhaustive search scored by 5-fold cross-validated accuracy
# -----------------------------
# 'f1_macro' is an alternative scorer for multi-class problems;
# n_jobs=-1 spreads the fits over all available cores.
grid = GridSearchCV(gnb, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

# Report the winning setting and its mean cross-validation score
print("Best hyperparameters:", grid.best_params_)
print("Best cross-validation accuracy:", grid.best_score_)

Best hyperparameters: {'var_smoothing': 1e-09}
Best cross-validation accuracy: 0.7793339603186294


In [None]:
# Recap of the grid-search outcome
print("Best hyperparameters:", grid.best_params_)
print("Best cross-validation accuracy:", grid.best_score_)

# -----------------------------
# 4️⃣ Score the selected model on the held-out validation split
# -----------------------------
best_gnb = grid.best_estimator_
y_val_pred = best_gnb.predict(X_val)

# Compute the metrics first, then print them
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("Validation Accuracy:", val_accuracy)
print("\nClassification Report:\n", val_report)

Best hyperparameters: {'var_smoothing': 1e-09}
Best cross-validation accuracy: 0.7793339603186294
Validation Accuracy: 0.7824267782426778

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.94      0.84       374
           1       0.80      0.64      0.71       469
           2       0.66      0.70      0.68       441
           3       0.84      0.93      0.88       481
           4       0.99      1.00      0.99       597
           5       0.67      0.54      0.60       369
           6       0.62      0.62      0.62       376

    accuracy                           0.78      3107
   macro avg       0.76      0.77      0.76      3107
weighted avg       0.78      0.78      0.78      3107



 Hyperparameter Tuning:
 Parameter: var_smoothing
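 Purpose: Adds a fraction of the largest feature variance to every variance estimate for numerical stability (variance smoothing; this is not Laplace smoothing)
 Range Tested: [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04]
 Smaller values = less smoothing (lower bias, higher variance); larger values = more smoothing (higher bias, lower variance)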

 Cross-Validation Strategy:
 - CV Folds: 5 (5-Fold Cross-Validation)
 - Scoring Metric: Accuracy
 - Parallel Jobs: -1 (use all CPU cores)

 Results:
 - Best Parameter: var_smoothing = 1e-09
 - Best CV Accuracy: 77.93%

In [None]:
# Retrain best model on full training data.
# A fresh GaussianNB is built with the var_smoothing value selected by the
# grid search and fitted on X / y, i.e. the training and validation rows together.
best_gnb_full = GaussianNB(var_smoothing=grid.best_params_['var_smoothing'])
best_gnb_full.fit(X, y)  # X and y are full train_df features and target

# Predict on test set.
# Columns are selected by name, in training order, so the model always receives
# exactly the features it was fitted on; a missing column fails loudly with a
# KeyError instead of relying on test_df happening to share X's layout.
y_test_pred_full = best_gnb_full.predict(test_df[X.columns])


 Per-Class Performance (7 Weight Categories):
 Class 0 (Insufficient_Weight):     Precision=0.76, Recall=0.94
   - Model over-predicts this class (high recall, moderate precision)
   - Good at identifying this class but some false positives

 Class 1 (Normal_Weight):            Precision=0.80, Recall=0.64
   - Moderate recall (misses 36% of instances)
   - Reasonably accurate when predicted

 Class 2 (Obesity_Type_I):           Precision=0.66, Recall=0.70
   - Lower precision (many false positives)
   - Balanced but lower accuracy

 Class 3 (Obesity_Type_II):          Precision=0.84, Recall=0.93
   - Strong performance (high recall and precision)

 Class 4 (Obesity_Type_III):         Precision=0.99, Recall=1.00
   - Excellent performance (nearly perfect)
   - Most easily identifiable class

 Class 5 (Overweight_Level_I):       Precision=0.67, Recall=0.54
   - Weakest performance (lowest recall and F1-score of all classes)
   - Often confused with other classes

 Class 6 (Overweight_Level_II):      Precision=0.62, Recall=0.62
   - Lowest precision of all classes
   - High confusion with adjacent classes

In [None]:
# -----------------------------
# 1️⃣ Define numeric-to-string mapping for WeightCategory
# -----------------------------
# NOTE(review): assumes the preprocessing step encoded the class names as
# 0..6 in this (alphabetical) order - confirm against the encoder that
# produced train_preprocessed.csv.
label_map = {
    0: 'Insufficient_Weight',
    1: 'Normal_Weight',
    2: 'Obesity_Type_I',
    3: 'Obesity_Type_II',
    4: 'Obesity_Type_III',
    5: 'Overweight_Level_I',
    6: 'Overweight_Level_II'
}

# -----------------------------
# 2️⃣ Predict on test set using the model trained on the full training data
# -----------------------------
# Select the test columns by name, in training order, so predict() receives
# exactly the features best_gnb_full was fitted on.
y_test_pred = best_gnb_full.predict(test_df[X.columns])

# -----------------------------
# 3️⃣ Convert numeric predictions to string labels
# -----------------------------
# A predicted code missing from label_map raises KeyError here rather than
# silently writing a bad row.
y_test_labels = [label_map[num] for num in y_test_pred]

# -----------------------------
# 4️⃣ Prepare Kaggle submission DataFrame
# -----------------------------
# The ids are taken from the test set itself, so every prediction stays
# attached to its own record.
submission = pd.DataFrame({
    'id': test_df['id'],
    'WeightCategory': y_test_labels
})

# Sanity check against the sample submission: the row count and ids should
# match. This only warns, so the file is still written for inspection.
expected_ids = sample_submission_df['id']
if len(submission) != len(expected_ids) or not submission['id'].isin(expected_ids).all():
    print(
        f"WARNING: submission has {len(submission)} rows "
        f"(ids {submission['id'].min()}-{submission['id'].max()}) but the sample submission has "
        f"{len(expected_ids)} rows (ids {expected_ids.min()}-{expected_ids.max()}); "
        "check that test_preprocessed.csv is the competition test set."
    )

# -----------------------------
# 5️⃣ Save submission to CSV
# -----------------------------
# index=False keeps the DataFrame index out of the file, leaving only the
# 'id' and 'WeightCategory' columns.
submission_path = '/content/drive/MyDrive/ml_project/data/nb_submission.csv'
submission.to_csv(submission_path, index=False)

print(f"Kaggle submission saved successfully to: {submission_path}")


Kaggle submission saved successfully to: /content/drive/MyDrive/ml_project/data/nb_submission.csv


In [None]:
# Read the saved submission back from disk and display it.
pd.read_csv(submission_path)

Unnamed: 0,id,WeightCategory
0,15533,Obesity_Type_III
1,15534,Overweight_Level_II
2,15535,Overweight_Level_II
3,15536,Obesity_Type_II
4,15537,Normal_Weight
...,...,...
5220,20753,Obesity_Type_II
5221,20754,Insufficient_Weight
5222,20755,Obesity_Type_II
5223,20756,Overweight_Level_II


In [None]:
# Display the sample submission frame loaded earlier, as a reference for the
# id / WeightCategory layout.
sample_submission_df

Unnamed: 0,id,WeightCategory
0,20758,Normal_Weight
1,20759,Normal_Weight
2,20760,Normal_Weight
3,20761,Normal_Weight
4,20762,Normal_Weight
...,...,...
13835,34593,Normal_Weight
13836,34594,Normal_Weight
13837,34595,Normal_Weight
13838,34596,Normal_Weight


In [None]:
# List the test-set column labels; test_df is what gets passed to predict().
test_df.columns

Index(['id', 'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CAEC', 'CH2O', 'FAF',
       'TUE', 'CALC', 'BMI', 'Gender_Male',
       'family_history_with_overweight_yes', 'FAVC_yes', 'SMOKE_yes',
       'SCC_yes', 'MTRANS_Bike', 'MTRANS_Motorbike',
       'MTRANS_Public_Transportation', 'MTRANS_Walking'],
      dtype='object')