# 04 – Model Training

In [4]:
# Import all required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import joblib

# Load processed data
df = pd.read_csv("../data/processed/cleaned_data.csv")
print(f"Dataset shape: {df.shape}")

Dataset shape: (19051, 108)


In [5]:
# Prepare features and target
X = df.drop("price_lkr", axis=1)
y = df["price_lkr"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

Training set: (15240, 107), Test set: (3811, 107)


In [6]:
# Scale features for models that need it
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save scaler for later use
joblib.dump(scaler, "../models/scaler.pkl")

['../models/scaler.pkl']

In [7]:
# Define all models to train
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'SVR': SVR(kernel='rbf'),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42, verbosity=0)
}

print("Models to evaluate:", list(models.keys()))

Models to evaluate: ['Linear Regression', 'Ridge', 'Lasso', 'SVR', 'Random Forest', 'Gradient Boosting', 'XGBoost']


In [8]:
# Model evaluation with cross-validation
print("Training and evaluating models...")

results = {}
for name, model in models.items():
    try:
        if name in ['Linear Regression', 'Ridge', 'Lasso', 'SVR']:
            scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
        else:
            scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')

        results[name] = scores.mean()
        print(f"{name}: R² = {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
    except Exception as e:
        print(f"Error training {name}: {e}")
        results[name] = -float('inf')

# Find best model (excluding XGBoost due to compatibility issues)
best_model_name = max(results.items(), key=lambda x: x[1] if x[0] != 'XGBoost' else -float('inf'))[0]
best_score = results[best_model_name]

print(f"\nBest model: {best_model_name} with R² = {best_score:.4f}")

# Train the best model on full training data
if best_model_name in ['Linear Regression', 'Ridge', 'Lasso', 'SVR']:
    best_model = models[best_model_name]
    best_model.fit(X_train_scaled, y_train)
else:
    best_model = models[best_model_name]
    best_model.fit(X_train, y_train)

Training and evaluating models...


Linear Regression: R² = 0.9177 (+/- 0.0031)
Ridge: R² = 0.9177 (+/- 0.0031)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso: R² = 0.9177 (+/- 0.0031)
SVR: R² = -0.0707 (+/- 0.0277)
Random Forest: R² = 0.9458 (+/- 0.0033)
Gradient Boosting: R² = 0.9431 (+/- 0.0033)
XGBoost: R² = 0.9518 (+/- 0.0023)

Best model: Random Forest with R² = 0.9458


In [9]:
if best_model_name == 'Random Forest':
    print("Performing hyperparameter tuning for Random Forest...")
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10]
    }

    grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_:.4f}")

elif best_model_name == 'Gradient Boosting':
    print("Performing hyperparameter tuning for Gradient Boosting...")
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 6],
        'learning_rate': [0.1, 0.2]
    }

    grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_:.4f}")

else:
    print(f"Using default {best_model_name} model (no hyperparameter tuning needed)")
    best_model = models[best_model_name]
    if best_model_name in ['Linear Regression', 'Ridge', 'Lasso', 'SVR']:
        best_model.fit(X_train_scaled, y_train)
    else:
        best_model.fit(X_train, y_train)

Performing hyperparameter tuning for Random Forest...
Best parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 300}
Best CV score: 0.9463


In [10]:
# Save the best model
model_filename = f'best_{best_model_name.lower().replace(" ", "_")}_model.pkl'
joblib.dump(best_model, f'../models/{model_filename}')
print(f"Best model ({best_model_name}) saved as {model_filename}")

# Also save the scaler for preprocessing
joblib.dump(scaler, '../models/scaler.pkl')
print("Scaler saved as scaler.pkl")

# Save feature names for the web app
feature_names = X.columns.tolist()
joblib.dump(feature_names, '../models/feature_names.pkl')
print("Feature names saved as feature_names.pkl")

print("\nModel training completed successfully!")
print(f"Best performing model: {best_model_name}")
print(f"Model saved to: models/{model_filename}")

Best model (Random Forest) saved as best_random_forest_model.pkl
Scaler saved as scaler.pkl
Feature names saved as feature_names.pkl

Model training completed successfully!
Best performing model: Random Forest
Model saved to: models/best_random_forest_model.pkl
