In [10]:
import pandas as pd
import numpy as np
import sys, os
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

sys.path.append(os.path.abspath('../src'))
from preprocessing import preprocess

import warnings
warnings.filterwarnings("ignore")


# Load & Preprocessing Data

In [22]:
train_df = pd.read_csv('../data/train.csv')
train_df = preprocess(train_df)

X = train_df.drop('Item_Outlet_Sales', axis=1)
y = train_df['Item_Outlet_Sales']

kf = KFold(n_splits=5, shuffle=True, random_state=42)

print("Dataset Shape:", X.shape)


Dataset Shape: (8523, 13)


# Random Forest

In [23]:
# One-hot encode categorical features
X_rf = pd.get_dummies(X, drop_first=True)

rf_param_dist = {
    'n_estimators': [300, 400, 500, 600],
    'max_depth': [None, 8, 10, 12, 15],
    'min_samples_split': [2, 5, 8],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2']
}

rf_random = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
    param_distributions=rf_param_dist,
    n_iter=20,
    cv=5,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

rf_random.fit(X_rf, y)

rf_best_rmse = -rf_random.best_score_

print("Best RF Params:", rf_random.best_params_)
print("Best RF CV RMSE:", rf_best_rmse)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best RF Params: {'n_estimators': 300, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 12}
Best RF CV RMSE: 1095.8763680617271


# Xgboost

In [24]:
xgb_param_dist = {
    'n_estimators': [800, 1000, 1200],
    'learning_rate': [0.02, 0.03, 0.05],
    'max_depth': [4, 5, 6],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1, 1.5]
}

xgb_random = RandomizedSearchCV(
    estimator=XGBRegressor(random_state=42, n_jobs=-1),
    param_distributions=xgb_param_dist,
    n_iter=20,
    cv=5,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

xgb_random.fit(X_rf, y)

xgb_best_rmse = -xgb_random.best_score_

print("Best XGB Params:", xgb_random.best_params_)
print("Best XGB CV RMSE:", xgb_best_rmse)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best XGB Params: {'subsample': 0.9, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.02, 'colsample_bytree': 0.8}
Best XGB CV RMSE: 1107.8516511636878


# CatBoost

In [25]:
cat_features = X.select_dtypes(include=['object']).columns.tolist()

kf = KFold(n_splits=5, shuffle=True, random_state=42)

cb_param_space = {
    "depth": [5, 6, 7, 8],
    "learning_rate": [0.02, 0.03, 0.05],
    "l2_leaf_reg": [1, 3, 5, 7]
}

import random

def sample_params(param_space, n_iter=8):
    param_list = []
    for _ in range(n_iter):
        params = {
            "depth": random.choice(param_space["depth"]),
            "learning_rate": random.choice(param_space["learning_rate"]),
            "l2_leaf_reg": random.choice(param_space["l2_leaf_reg"])
        }
        param_list.append(params)
    return param_list

random_params = sample_params(cb_param_space, n_iter=8)

best_cb_rmse = float("inf")
best_cb_params = None

for params in random_params:
    
    fold_scores = []
    
    for train_idx, val_idx in kf.split(X):
        
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model = CatBoostRegressor(
            iterations=5000,
            depth=params["depth"],
            learning_rate=params["learning_rate"],
            l2_leaf_reg=params["l2_leaf_reg"],
            loss_function='RMSE',
            random_state=42,
            verbose=False,
            thread_count=-1
        )
        
        model.fit(
            X_tr, y_tr,
            eval_set=(X_val, y_val),
            cat_features=cat_features,
            early_stopping_rounds=200
        )
        
        val_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        fold_scores.append(rmse)
    
    mean_rmse = np.mean(fold_scores)
    print("Params:", params, "CV RMSE:", mean_rmse)
    
    if mean_rmse < best_cb_rmse:
        best_cb_rmse = mean_rmse
        best_cb_params = params


print("\nBest CatBoost Params:", best_cb_params)
print("Best CatBoost CV RMSE:", best_cb_rmse)



Params: {'depth': 8, 'learning_rate': 0.03, 'l2_leaf_reg': 5} CV RMSE: 1075.7209085432
Params: {'depth': 5, 'learning_rate': 0.02, 'l2_leaf_reg': 1} CV RMSE: 1076.078013701927
Params: {'depth': 5, 'learning_rate': 0.05, 'l2_leaf_reg': 1} CV RMSE: 1075.8471662723587
Params: {'depth': 6, 'learning_rate': 0.02, 'l2_leaf_reg': 7} CV RMSE: 1075.8802333718024
Params: {'depth': 5, 'learning_rate': 0.02, 'l2_leaf_reg': 5} CV RMSE: 1075.8419695856714
Params: {'depth': 5, 'learning_rate': 0.03, 'l2_leaf_reg': 1} CV RMSE: 1075.990512010836
Params: {'depth': 6, 'learning_rate': 0.05, 'l2_leaf_reg': 3} CV RMSE: 1075.8339973694633
Params: {'depth': 8, 'learning_rate': 0.05, 'l2_leaf_reg': 1} CV RMSE: 1076.2302822450563

Best CatBoost Params: {'depth': 8, 'learning_rate': 0.03, 'l2_leaf_reg': 5}
Best CatBoost CV RMSE: 1075.7209085432


# Model Comparison Summary

In [28]:
results = pd.DataFrame({
    "Model": ["Random Forest", "XGBoost", "CatBoost"],
    "Best_CV_RMSE": [
        rf_best_rmse,
        xgb_best_rmse,
        best_cb_rmse
    ]
})

print("\nFinal Model Comparison:")
print(results.sort_values("Best_CV_RMSE"))


Final Model Comparison:
           Model  Best_CV_RMSE
2       CatBoost   1075.720909
0  Random Forest   1095.876368
1        XGBoost   1107.851651


# Conclusion

CatBoost consistently achieved the lowest cross validation RMSE (1075.72), outperforming both Random Forest and XGBoost under the same evaluation setup. CatBoost was selected as the final model for submission.