### 01. Import Dependecies

In [18]:
import os
import joblib
import warnings
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error,root_mean_squared_error,mean_squared_error
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (
                                    KFold, 
                                    GridSearchCV
                                    )
warnings.filterwarnings('ignore')

### 02. Load the data

In [19]:
X_train = np.load('../../EDA/artifacts/linear/X_train_reg.npz')['arr_0']
X_test = np.load('../../EDA/artifacts/linear/X_test_reg.npz')['arr_0']
Y_train = np.load('../../EDA/artifacts/linear/Y_train_reg.npz')['arr_0']
Y_test = np.load('../../EDA/artifacts/linear/Y_test_reg.npz')['arr_0']

In [20]:
X_train.shape[1]

32

### 03. Define Paramters

In [21]:
# Linear Regression
lr_param_grid = {
     'fit_intercept': [True, False],
     'n_jobs': [1,5,10,15,None], 
      'positive': [True,False]
}

# Random Forest Regressor
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [8, 12, 16],
    'min_samples_leaf': [1, 2, 5]
}

# XGBoost Regressor
xgb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [6, 8, 10],
    'subsample': [0.8, 1]
}

# Combine into a single dictionary
param_grids = {
    'Linear Regression': lr_param_grid,
    'Random Forest': rf_param_grid,
    'XGBoost': xgb_param_grid
}


### 04. Define Multi Models

In [22]:
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}


### 05. Configure K-Fold CV

In [23]:
cv = KFold(
    n_splits=6,
    random_state=42,
    shuffle=True
    )

In [24]:
grid_search_results={}
model_dir = './trained_models'

for model_name, model in models.items():

    print(f"\n--- Tuning {model_name} ---")

    param_grid = param_grids[model_name]

    grid_search = GridSearchCV(
                estimator=model,
                param_grid=param_grid,
                cv=cv,
                scoring='r2',  # <--- Use a regression metric
                verbose=1,
                return_train_score=False
        )

    
    print(f"Fitting gridSearchCV for {model_name}")

    grid_search.fit(X_train, Y_train)

    grid_search_results[model_name] = grid_search
    
    print(f"{model_name} gridSearchCV completed ...")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_}")

     # Save the best trained model to joblib
    model_path = os.path.join(model_dir, f"{model_name.replace(' ', '_')}_best_model.joblib")
    joblib.dump(grid_search.best_estimator_, model_path)
    print(f"Saved best model for {model_name} at: {model_path}")


--- Tuning Linear Regression ---
Fitting gridSearchCV for Linear Regression
Fitting 6 folds for each of 20 candidates, totalling 120 fits
Linear Regression gridSearchCV completed ...
Best parameters: {'fit_intercept': True, 'n_jobs': 1, 'positive': True}
Best CV score: -0.013613487170770303
Saved best model for Linear Regression at: ./trained_models\Linear_Regression_best_model.joblib

--- Tuning Random Forest ---
Fitting gridSearchCV for Random Forest
Fitting 6 folds for each of 18 candidates, totalling 108 fits
Random Forest gridSearchCV completed ...
Best parameters: {'max_depth': 8, 'min_samples_leaf': 5, 'n_estimators': 200}
Best CV score: -0.015634133289016233
Saved best model for Random Forest at: ./trained_models\Random_Forest_best_model.joblib

--- Tuning XGBoost ---
Fitting gridSearchCV for XGBoost
Fitting 6 folds for each of 36 candidates, totalling 216 fits
XGBoost gridSearchCV completed ...
Best parameters: {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 100, 'sub

### 06. Model Evaluation

#### 6.1 Loading models

In [25]:
lr_model = joblib.load('./trained_models/Linear_Regression_best_model.joblib')
rf_model = joblib.load('./trained_models/Random_Forest_best_model.joblib')
xgb_model = joblib.load('./trained_models/XGBoost_best_model.joblib')

# Example predictions
lr_preds = lr_model.predict(X_test)
rf_preds = rf_model.predict(X_test)
xgb_preds = xgb_model.predict(X_test)

In [26]:
print("-----------------R2 Score------------------\n")
print("Linear Regression R²:", r2_score(Y_test, lr_preds))
print("Random Forest R²:", r2_score(Y_test, rf_preds))
print("XGBoost R²:", r2_score(Y_test, xgb_preds))

print("\n-----------------MAE Score------------------\n")
print("Linear Regression MAE:", mean_absolute_error(Y_test, lr_preds))
print("Random Forest MAE:", mean_absolute_error(Y_test, rf_preds))
print("XGBoost MAE:", mean_absolute_error(Y_test, xgb_preds))

print("\n-----------------RMSE Score------------------\n")
print("Linear Regression RMSE:", root_mean_squared_error(Y_test, lr_preds))
print("Random Forest RMSE:", root_mean_squared_error(Y_test, rf_preds))
print("XGBoost RMSE:", root_mean_squared_error(Y_test, xgb_preds))

-----------------R2 Score------------------

Linear Regression R²: -0.005828540865275622
Random Forest R²: -0.004247784075397609
XGBoost R²: -0.05165750433233862

-----------------MAE Score------------------

Linear Regression MAE: 0.2511910539730476
Random Forest MAE: 0.2516645850661472
XGBoost MAE: 0.25632327148528106

-----------------RMSE Score------------------

Linear Regression RMSE: 0.28986014248688
Random Forest RMSE: 0.28963228130721724
XGBoost RMSE: 0.2963900954163004
