# 🤖 SECTION 4: ML MODELING

This notebook trains and evaluates multiple machine learning models.

In [9]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np

import sys
from pathlib import Path

# Make the project root importable so that the `src.*` packages resolve.
# This runs unconditionally: previously the whole setup was nested under a
# check that the working directory is named 'notebooks', so starting the
# kernel anywhere else silently skipped every import below.
notebook_dir = Path.cwd()  # Current working directory (usually notebook's directory)


def find_project_root():
    """Locate the project root, i.e. a directory that contains ``src``.

    Candidates are tried in this order:
      1. the parent of the working directory, when the working directory is
         named ``notebooks``;
      2. the working directory itself;
      3. the parent of the working directory;
      4. the path leading up to the first ``notebooks`` component of the
         working directory.

    Returns:
        Path: the first candidate that contains a ``src`` entry.

    Raises:
        FileNotFoundError: if none of the candidates contains ``src``.
    """
    cwd = Path.cwd()

    candidates = []
    if cwd.name == 'notebooks':
        candidates.append(cwd.parent)
    candidates.extend([cwd, cwd.parent])
    if 'notebooks' in cwd.parts:
        idx = cwd.parts.index('notebooks')
        candidates.append(Path(*cwd.parts[:idx]))

    for candidate in candidates:
        if (candidate / 'src').exists():
            return candidate
    raise FileNotFoundError(f"Cannot find project root. Current dir: {cwd}")


project_root = find_project_root().resolve()
# Re-running this cell must not pile up duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Project-local imports; these only resolve once project_root is on sys.path.
from src.data_processing.preprocessing import load_processed_data
from src.models.training import (
    train_linear_regression, train_random_forest, train_xgboost,
    time_based_split, evaluate_model
)
from src.models.time_series import train_sarima, train_prophet, prepare_time_series, time_series_split

print("âœ… Setup complete!")

âœ… Setup complete!


## Load Preprocessed Data

In [10]:
# Run the preprocessing pipeline for the combined ('ALL') sector with the
# state identifier encoded. It yields the modelling frame, the feature
# matrix, the target, the feature column names and the state encoder.
from src.data_processing.preprocessing import preprocess_pipeline

df_model, X, y, feature_cols, le_state = preprocess_pipeline(
    sector='ALL',
    encode_stateid=True,
)

print(f"âœ… Data loaded: {len(df_model):,} rows")

ðŸ“‚ Loading from: eia_retail_sales_raw_20251204_100134.csv
ðŸ“Š Before filtering: 110,484 rows
   After state filter: 92,664 rows
   After sector filter (ALL): 15,444 rows
âœ… Preprocessing complete: 15,444 rows
   Features: 8
   Date range: 2001-01-01 00:00:00 to 2025-09-01 00:00:00
   States: 52
âœ… Data loaded: 15,444 rows


## Time-Based Train/Test Split

In [11]:
# Split features and target into train/test partitions using the project's
# time-based splitter, then report the size of each partition.
X_train, X_test, y_train, y_test = time_based_split(
    df_model,
    X,
    y,
)

print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")

Training set: 12,355 samples
Test set: 3,089 samples


## Train Models

In [12]:
# Fit the linear-regression baseline on the training split; the helper
# returns the fitted model plus train and test metric dictionaries.
lr_model, lr_train_metrics, lr_test_metrics = train_linear_regression(
    X_train, y_train,
    X_test, y_test,
)
print(f"âœ… Linear Regression - Test RMSE: {lr_test_metrics['rmse']:.2f}")

âœ… Linear Regression - Test RMSE: 14143.04


In [13]:
# Fit the random-forest model on the same split; the helper returns the
# fitted model plus train and test metric dictionaries.
rf_model, rf_train_metrics, rf_test_metrics = train_random_forest(
    X_train, y_train,
    X_test, y_test,
)
print(f"âœ… Random Forest - Test RMSE: {rf_test_metrics['rmse']:.2f}")

âœ… Random Forest - Test RMSE: 4065.74


In [14]:
# Fit the XGBoost model on the same split; the helper returns the fitted
# model plus train and test metric dictionaries.
xgb_model, xgb_train_metrics, xgb_test_metrics = train_xgboost(
    X_train, y_train,
    X_test, y_test,
)
print(f"âœ… XGBoost - Test RMSE: {xgb_test_metrics['rmse']:.2f}")

âœ… XGBoost - Test RMSE: 41271.05


In [7]:
# SARIMA: build the series from the 'sales' column, split it into train and
# test segments, then fit.
ts = prepare_time_series(df_model, value_col='sales')
ts_train, ts_test = time_series_split(ts)

sarima_model, sarima_train_metrics, sarima_test_metrics = train_sarima(
    ts_train,
    ts_test,
)

# Only report a score when a fitted model came back.
if sarima_model:
    print(f"âœ… SARIMA - Test RMSE: {sarima_test_metrics.get('rmse', 0):.2f}")

   Trying common SARIMA configurations for monthly data...


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


   âœ… Best model: (1, 1, 1) x (1, 1, 1, 12) (AIC: 5015.91)
âœ… SARIMA - Test RMSE: 90536.05


In [15]:
# Train Prophet
# Select the date and target columns and rename them to 'ds' / 'y'
# (presumably the names train_prophet / Prophet expect - confirm in
# src.models.time_series).
#
# The train/test split below is positional, so it is only a chronological
# hold-out if the rows are ordered by date. Sort explicitly instead of
# relying on the incoming order of df_model; the stable sort keeps the
# existing row order within each date, so an already-sorted frame is
# unaffected.
prophet_df = (
    df_model[['period', 'sales']]
    .rename(columns={'period': 'ds', 'sales': 'y'})
    .sort_values('ds', kind='stable')
)

# First 80% of rows (earliest dates) for training, remaining 20% for testing.
prophet_split_idx = int(len(prophet_df) * 0.8)
prophet_train = prophet_df.iloc[:prophet_split_idx]
prophet_test = prophet_df.iloc[prophet_split_idx:]

prophet_model, prophet_train_metrics, prophet_test_metrics = train_prophet(prophet_train, prophet_test)
# Only report a score when a fitted model came back.
if prophet_model:
    print(f"âœ… Prophet - Test RMSE: {prophet_test_metrics.get('rmse', 0):.2f}")

10:02:30 - cmdstanpy - INFO - Chain [1] start processing
10:02:30 - cmdstanpy - INFO - Chain [1] done processing


âœ… Prophet - Test RMSE: 44899.61
