This cell imports the core libraries (os, numpy, pandas, matplotlib, seaborn), sets a random seed so that random operations are reproducible, and configures the plotting style.

In [None]:
# Standard library
import os

# Numerical computing and tabular data
import numpy as np
import pandas as pd

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set a seed for reproducibility: SEED is the single seed value for the
# notebook, and seeding NumPy's global generator makes NumPy-based random
# operations repeatable across runs.
SEED = 42
np.random.seed(SEED)

# Set visualization style
sns.set(style="whitegrid")

This cell mounts Google Drive so that the project files stored there can be accessed from the notebook.

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


This cell defines the file paths for the training, testing, and sample submission datasets and then loads them into pandas DataFrames.

In [None]:
# Single source of truth for the project's data directory on the mounted
# Drive: relocating the project only requires changing this one line.
DATA_DIR = '/content/drive/MyDrive/ml_project/data'

# Define file paths based on the project structure
train_processed_path = os.path.join(DATA_DIR, 'train_preprocessed.csv')
test_processed_path = os.path.join(DATA_DIR, 'test_preprocessed.csv')
sample_submission_path = os.path.join(DATA_DIR, 'sample_submission.csv')

# Load the preprocessed datasets
train_df = pd.read_csv(train_processed_path)
test_df = pd.read_csv(test_processed_path)
sample_submission_df = pd.read_csv(sample_submission_path)

# Check data: print (rows, columns) of each frame as a quick sanity check.
# NOTE(review): if the sample submission has a different number of rows than
# the test set, the test file may not match the competition's expected
# submission format - confirm before submitting.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Sample submission shape:", sample_submission_df.shape)


Train shape: (15533, 22)
Test shape: (5225, 21)
Sample submission shape: (13840, 2)


In [None]:
# Import XGBoost, model-selection helpers and evaluation metrics
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Name of the label column in the training frame.
TARGET_COL = 'WeightCategory'

# Prepare features and target: every column except the label is a feature.
# NOTE(review): X presumably still contains the `id` column (the test frame
# has one and is later passed to predict() unchanged). A row identifier
# carries no real signal - consider excluding it from BOTH the training
# features and the test frame used for prediction.
X = train_df.drop(columns=TARGET_COL)
y = train_df[TARGET_COL]

# Hold out 20% of the rows for validation. `stratify=y` keeps the class
# proportions the same in both parts; the split is seeded with the
# notebook-wide SEED so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

In [None]:
# Base estimator whose hyperparameters are tuned by the grid search below.
# `use_label_encoder` is intentionally not passed: it is a deprecated flag
# that current XGBoost releases ignore (they only warn that it is "not used").
xgb_model = XGBClassifier(
    objective='multi:softmax',   # multiclass objective
    num_class=7,                 # seven weight categories
    eval_metric='mlogloss',      # multiclass log loss
    random_state=SEED            # notebook-wide seed for reproducibility
)

# Candidate values per hyperparameter: 2 * 3 * 2 * 2 * 2 = 48 combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Exhaustive search over the grid, scored by accuracy with 3-fold
# cross-validation; n_jobs=-1 uses all available cores.
grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)


XGBOOST HYPERPARAMETER GRID SEARCH

 Model Configuration:
 - objective: 'multi:softmax' (multiclass classification with softmax)
 - num_class: 7 (seven weight categories)
 - eval_metric: 'mlogloss' (multiclass log loss)
 - use_label_encoder: False (deprecated legacy flag; recent XGBoost releases ignore it and warn that it is "not used", so it can be omitted)
 - random_state: 42 (reproducibility)

 Hyperparameter Grid:

 n_estimators: [100, 200]
   Number of boosting rounds (trees to build)
   Higher values = more complex model, risk of overfitting

 max_depth: [3, 5, 7]
   Maximum tree depth (controls tree complexity)
   3 = shallow trees (underfitting risk)
   7 = deeper trees (overfitting risk)

 learning_rate: [0.05, 0.1]
   Shrinkage parameter controlling step size
   Lower values = slower learning but potentially better generalization
   Higher values = faster learning but risk of overshooting

 subsample: [0.8, 1.0]
   Fraction of samples used for building trees (row sampling)
   0.8 = 80% of data per tree (reduces overfitting)
   1.0 = use all data (risk of overfitting)

 colsample_bytree: [0.8, 1.0]
   Fraction of features used for building trees (column sampling)
   0.8 = 80% of features per tree (reduces overfitting)
   1.0 = use all features

 Total Combinations: 2 × 3 × 2 × 2 × 2 = 48 configurations
 CV Strategy: 3-Fold Cross-Validation on training set
 Total Model Fits: 48 configurations × 3 folds = 144 fits

In [None]:
# Run the cross-validated grid search on the training split only; the
# validation split stays untouched for the later hold-out evaluation.
grid.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated accuracy.
for label, value in (
    ("Best hyperparameters:", grid.best_params_),
    ("Best CV accuracy:", grid.best_score_),
):
    print(label, value)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [None]:
# Final model: the best hyperparameters found by the grid search, refit on
# ALL labelled data (training + validation rows) for the test-set predictions.
# Because it has seen the validation rows, it must not be scored on X_val.
# `use_label_encoder` is intentionally not passed: it is deprecated and
# ignored by current XGBoost releases, which only emit a
# 'Parameters: { "use_label_encoder" } are not used.' warning for it.
best_xgb_full = XGBClassifier(
    **grid.best_params_,
    objective='multi:softmax',
    num_class=7,
    eval_metric='mlogloss',
    random_state=SEED
)
best_xgb_full.fit(X, y)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Evaluate on the hold-out split with the estimator kept by the grid search
# (with GridSearchCV's default refit behaviour this is the best configuration
# refit on the data passed to grid.fit, i.e. the training split only).
best_xgb = grid.best_estimator_
y_val_pred = best_xgb.predict(X_val)

# Compute the metrics first, then print them.
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("Validation Accuracy:", val_accuracy)
print("\nClassification Report:\n", val_report)

Validation Accuracy: 0.9063405214032829

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94       374
           1       0.89      0.90      0.89       469
           2       0.89      0.87      0.88       441
           3       0.96      0.98      0.97       481
           4       0.99      1.00      0.99       597
           5       0.81      0.75      0.78       369
           6       0.81      0.84      0.82       376

    accuracy                           0.91      3107
   macro avg       0.90      0.90      0.90      3107
weighted avg       0.91      0.91      0.91      3107




 VALIDATION SET PERFORMANCE - XGBOOST

 Overall Accuracy: 90.63%

 ( Comparison to Naive Bayes:
 NB Accuracy: 78.24%
 XGB Accuracy: 90.63% )

 **IMPROVEMENT: +12.39 percentage points**

 Per-Class Performance (7 Weight Categories):

 Class 0 (Insufficient_Weight):      Precision=0.93, Recall=0.95
   - Excellent performance (vs NB: 0.76, 0.94)
   - Better precision, maintained recall

 Class 1 (Normal_Weight):             Precision=0.89, Recall=0.90
   - Strong performance (vs NB: 0.80, 0.64)
   - Major improvement in recall (+26 percentage points)

 Class 2 (Obesity_Type_I):            Precision=0.89, Recall=0.87
   - Much improved (vs NB: 0.66, 0.70)
   - Better balanced performance

 Class 3 (Obesity_Type_II):           Precision=0.96, Recall=0.98
   - Exceptional performance (vs NB: 0.84, 0.93)

 Class 4 (Obesity_Type_III):          Precision=0.99, Recall=1.00
   - Nearly perfect (same as NB)
   - Easiest class to predict

 Class 5 (Overweight_Level_I):        Precision=0.81, Recall=0.75
   - Significant improvement (vs NB: 0.67, 0.54)
   - +21 percentage-point recall improvement (0.54 → 0.75) and +14 points in precision (0.67 → 0.81)

 Class 6 (Overweight_Level_II):       Precision=0.81, Recall=0.84
   - Major improvement (vs NB: 0.62, 0.62)
   - +22 percentage-point recall improvement

 Macro Average: 0.90 (NB: 0.76)
 Weighted Average: 0.91 (NB: 0.78)

In [None]:
# Predict the encoded class for every test row with the model that was
# refit on the full training data.
y_test_pred = best_xgb_full.predict(test_df)


# Map numeric predictions to string labels.
# The position of each name in this list is its encoded class id (0-6).
# NOTE(review): this ordering presumably mirrors the label encoding applied
# during preprocessing - verify against the preprocessing notebook.
class_names = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III',
    'Overweight_Level_I',
    'Overweight_Level_II',
]
label_map = dict(enumerate(class_names))
y_test_labels = [label_map[class_id] for class_id in y_test_pred]

In [None]:
# Prepare Kaggle submission: one row per test sample, holding the 'id' taken
# from test_df and the predicted label.
submission = test_df[['id']].copy()
submission['WeightCategory'] = y_test_labels

# Save submission; index=False keeps the pandas row index out of the CSV.
submission_path = '/content/drive/MyDrive/ml_project/data/xgb_submission.csv'
submission.to_csv(submission_path, index=False)
print(f"XGBoost submission saved successfully to: {submission_path}")

XGBoost submission saved successfully to: /content/drive/MyDrive/ml_project/data/xgb_submission.csv


In [None]:
# Read the saved CSV back from disk and display it, to confirm the submission
# file was written with the expected 'id' and 'WeightCategory' columns.
pd.read_csv(submission_path)

Unnamed: 0,id,WeightCategory
0,15533,Obesity_Type_III
1,15534,Overweight_Level_I
2,15535,Overweight_Level_II
3,15536,Obesity_Type_II
4,15537,Normal_Weight
...,...,...
5220,20753,Obesity_Type_II
5221,20754,Insufficient_Weight
5222,20755,Obesity_Type_I
5223,20756,Overweight_Level_II
