In [None]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

# train_test_split is used to split the data into training and validation sets
# RandomizedSearchCV is used for hyperparameter tuning of the model

from sklearn.model_selection import train_test_split, RandomizedSearchCV

# accuracy_score and classification_report are used to evaluate the model's performance
from sklearn.metrics import accuracy_score, classification_report

# drive is used to mount Google Drive to access files stored there
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the data files referenced below
# (all under /content/drive/MyDrive/...) can be read and written by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Single root folder for every project data file on the mounted Drive.
# Change it here if the project folder moves; the three paths below follow.
DATA_DIR = '/content/drive/MyDrive/ml_project/data'

# Preprocessed training data (features + target column).
train_processed_path = f'{DATA_DIR}/train_preprocessed.csv'
# Preprocessed test data (features only).
test_processed_path = f'{DATA_DIR}/test_preprocessed.csv'
# Example submission file that ships with the data.
sample_submission_path = f'{DATA_DIR}/sample_submission.csv'

In [None]:
# Read the three CSV files in the same order as the path definitions:
# training data, test data, sample submission.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_processed_path, test_processed_path, sample_submission_path)
)

# Print (rows, columns) for each frame as a quick sanity check on the load.
for caption, frame in (
    ("Train shape:", train_df),
    ("Test shape:", test_df),
    ("Sample submission shape:", sample_submission_df),
):
    print(caption, frame.shape)

Train shape: (15533, 22)
Test shape: (5225, 21)
Sample submission shape: (13840, 2)


In [None]:
# Separate the label column from the predictors.
# NOTE(review): every remaining column of train_df becomes a feature. If the
# frame carries a row-identifier column (test_df exposes an 'id' column in the
# submission cell), it is being used as a predictor too — confirm this is intended.
y = train_df['WeightCategory']
X = train_df.drop(columns='WeightCategory')

# 80/20 hold-out split. Stratifying on y keeps the class proportions the same
# in the training and validation parts; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Base estimator handed to the hyperparameter search below; all settings other
# than the fixed random_state are left at their defaults and overridden per
# candidate by the search.
rf_model = RandomForestClassifier(random_state=42)

# Candidate values for each tuned Random Forest hyperparameter.
# 4 * 4 * 3 * 3 * 3 = 432 possible combinations in total.
param_grid = dict(
    # number of trees in the forest
    n_estimators=[100, 200, 300, 400],
    # maximum depth of each tree; None lets trees grow fully
    max_depth=[None, 5, 7, 10],
    # minimum number of samples required to split an internal node
    min_samples_split=[2, 5, 10],
    # minimum number of samples required at a leaf node
    min_samples_leaf=[1, 2, 4],
    # number of features considered at each split; None means all features
    max_features=['sqrt', 'log2', None],
)

 Use RandomizedSearchCV to find best hyperparameters
 - Tests 50 random combinations (faster than testing all 432)
 - Uses 3-fold cross-validation for reliable estimates
 - Optimizes for accuracy
 - Uses all CPU cores for speed (n_jobs=-1)

In [None]:
# Randomized hyperparameter search over param_grid for the base Random Forest.
rand_search_rf = RandomizedSearchCV(
    rf_model,             # estimator whose hyperparameters are tuned
    param_grid,           # candidate values to sample from
    # --- search budget ---
    n_iter=50,            # sample 50 of the possible combinations
    cv=3,                 # 3-fold cross-validation per candidate
    # --- selection criterion ---
    scoring='accuracy',
    # --- execution ---
    random_state=42,      # makes the sampled candidates repeatable
    n_jobs=-1,            # use all available CPU cores
    verbose=1,            # print a progress summary while fitting
)

 Random Forest: ensemble of multiple decision trees

 Why Random Forest?
 - Multiple independent trees reduce overfitting
 - Captures non-linear patterns and feature interactions
 - Fast inference, gives feature importance scores
 - Robust and generally performs well

 Hyperparameters to tune:
 - n_estimators: how many trees (100-400)
   More trees = better but slower
 - max_depth: max depth per tree (None=full, or limit to 5/7/10)
   Shallower = more regularization (less overfitting)
 - min_samples_split: min samples needed to split a node (2, 5, 10)
   Higher = less splitting = simpler trees
 - min_samples_leaf: min samples at leaf node (1, 2, 4)
   Prevents trees from memorizing individual samples
 - max_features: features considered per split ('sqrt', 'log2', None = all features)
   Smaller subsets ('sqrt', 'log2') reduce correlation between trees, which helps the ensemble

In [None]:
# Run the randomized search; only the training split is used for the
# cross-validated fits, so the validation split stays untouched.
rand_search_rf.fit(X_train, y_train)
print("Best hyperparameters:", rand_search_rf.best_params_)
print("Best CV accuracy:", rand_search_rf.best_score_)

# best_estimator_ is the winning configuration as refit by the search
# (refit is left at its default).
best_rf = rand_search_rf.best_estimator_
y_val_pred = best_rf.predict(X_val)

# Score the held-out validation predictions: overall accuracy plus
# per-class precision / recall / F1.
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
print("\nClassification Report:\n", val_report)


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best hyperparameters: {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None}
Best CV accuracy: 0.896587799774666
Validation Accuracy: 0.899581589958159

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.94       374
           1       0.87      0.90      0.88       469
           2       0.88      0.85      0.87       441
           3       0.96      0.98      0.97       481
           4       0.99      1.00      1.00       597
           5       0.80      0.73      0.76       369
           6       0.80      0.82      0.81       376

    accuracy                           0.90      3107
   macro avg       0.89      0.89      0.89      3107
weighted avg       0.90      0.90      0.90      3107



 Results: Best CV accuracy ~89.66%
 Best parameters found will be used for final model

Evaluate best model on validation set (held-out data)

 Result: ~89.96% validation accuracy
 Strong on extreme classes (Type III obesity, Insufficient weight)
 Weaker on middle categories (Overweight levels)
 This is normal - intermediate classes are harder to distinguish

In [None]:
# Final model: same seed as during the search, with the best hyperparameters
# found by rand_search_rf, trained on ALL labelled rows (training + validation
# parts together) before predicting on the test set.
best_rf_full = RandomForestClassifier(
    random_state=42,
    **rand_search_rf.best_params_,
)
best_rf_full.fit(X, y)

In [None]:
# Predicted class code for every row of the test frame.
y_test_pred = best_rf_full.predict(test_df)

# Class code -> category name, codes 0..6 in list order.
# NOTE(review): presumably mirrors the label encoding applied during
# preprocessing — verify against the preprocessing notebook.
label_map = dict(enumerate([
    'Insufficient_Weight',
    'Normal_Weight',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III',
    'Overweight_Level_I',
    'Overweight_Level_II',
]))

# Translate each predicted code into its category name, keeping row order.
y_test_labels = list(map(label_map.__getitem__, y_test_pred))

In [None]:

# label_map and y_test_labels are already built in the prediction cell directly
# above (right after y_test_pred). The verbatim copy that used to sit here was
# removed so the code -> name mapping is defined in exactly one place.

# Two-column submission frame: the row identifier taken from the test set plus
# the predicted category name for that row.
submission = pd.DataFrame({
    'id': test_df['id'],
    'WeightCategory': y_test_labels
})

submission_path = '/content/drive/MyDrive/ml_project/data/rf_submission.csv'

# index=False keeps the pandas row index out of the CSV.
submission.to_csv(submission_path, index=False)

# The model in this notebook is a Random Forest; the message previously named
# AdaBoost, which mislabelled the saved file in the log.
print(f"Random Forest submission saved successfully to: {submission_path}")

Random Forest submission saved successfully to: /content/rf_submission.csv
