In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.4.0


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import mlflow
import mlflow.pyfunc
import optuna
import os

from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

# LOAD PROCESSED DATA

In [None]:
from src.preprocessing import advanced_feature_engineering

# Location of the pre-processed training set.
PROCESSED_DIR = 'data/processed'
TRAIN_PATH = os.path.join(PROCESSED_DIR, 'train_processed_final.csv')

try:
    train_df = pd.read_csv(TRAIN_PATH)
except FileNotFoundError:
    print(f"ERROR: Processed data not found at '{TRAIN_PATH}'.")
    print("Please run the '00_initial_data_exploration.ipynb' notebook first to generate it.")
    # Re-raise instead of calling exit(): in a notebook exit() tears down the
    # kernel, and in a script it terminates with status 0 (success). Propagating
    # the error halts "Run All" at this cell with a clear traceback.
    raise
else:
    print("Successfully loaded processed training data.")

# MLFLOW SETUP AND MODEL PREPARATION

In [None]:
# Route every run created below to a single named MLflow experiment.
EXPERIMENT_NAME = "LightGBM_Training"
mlflow.set_experiment(EXPERIMENT_NAME)
print(f"MLflow experiment set to: '{EXPERIMENT_NAME}'")

# Separate the label from the model inputs. 'Date' is excluded from the
# inputs together with the target column itself.
TARGET = 'Weekly_Sales'
features_to_drop = [TARGET, 'Date']
all_columns = train_df.columns
features = all_columns[~all_columns.isin(features_to_drop)].tolist()

X = train_df.loc[:, features]
y = train_df.loc[:, TARGET]

# Define WMAE Evaluation Metric
def wmae(y_true, y_pred, is_holiday, holiday_weight=5):
    """Weighted mean absolute error with extra weight on holiday rows.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, in the same row order as ``y_true``.
        is_holiday: Array-like of truthy/falsy flags, one per row; truthy rows
            receive ``holiday_weight``, all other rows receive weight 1.
        holiday_weight: Weight applied to holiday rows (defaults to 5).

    Returns:
        ``sum(w * |y_true - y_pred|) / sum(w)`` as a NumPy float. For empty
        input the result is NaN because the weight total is zero.
    """
    # Compare rows by position, not by label: subtracting two pandas Series
    # aligns on their indexes, which silently yields NaNs (or a shape error)
    # when the prediction carries a different index than the target.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    weights = np.where(np.asarray(is_holiday), holiday_weight, 1)
    return np.sum(weights * np.abs(actual - predicted)) / np.sum(weights)

# MLFLOW EXPERIMENT RUNS

In [None]:


# --- SECTION 3: MLflow experiment runs ---
print("\n--- SECTION 3: MLFLOW EXPERIMENT RUNS ---")
# Every run below works on the 'X' and 'y' objects built in the previous cell.

# == Run 1: Baseline Model ==
# Default-parameter LightGBM scored with a 3-fold expanding-window split;
# the mean holiday-weighted MAE across folds is logged to MLflow.
# NOTE(review): TimeSeriesSplit splits by row position, so this presumably
# relies on train_df being ordered chronologically - confirm upstream.
with mlflow.start_run(run_name="LGBM_Baseline"):
    print("\n--- Starting Run: LGBM_Baseline ---")
    tscv = TimeSeriesSplit(n_splits=3)
    model = lgb.LGBMRegressor(random_state=42)
    wmae_scores = []
    for fit_rows, holdout_rows in tscv.split(X):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]
        # fit() returns the estimator itself, so the refit and the scoring
        # of the held-out fold can be chained.
        holdout_preds = model.fit(X_fit, y_fit).predict(X_holdout)
        holiday_mask = X_holdout['IsHoliday'].astype(bool)
        wmae_scores.append(wmae(y_holdout, holdout_preds, holiday_mask))
    avg_wmae = np.mean(wmae_scores)
    print(f"Baseline Average WMAE: {avg_wmae:.2f}")
    mlflow.log_metric("avg_wmae_cv", avg_wmae)

# == Run 2: Hyperparameter Tuning (Optional, can be slow) ==
# The Optuna search is not included in this notebook yet; until it is, the
# final model is trained with the hand-picked placeholder values below.
best_params = dict(
    learning_rate=0.05,
    num_leaves=80,
    feature_fraction=0.8,
)

# == Run 3: Final Model & Registration ==
class WalmartSalesPipeline(mlflow.pyfunc.PythonModel):
    """Pyfunc wrapper that applies feature engineering before predicting.

    Args:
        model: Fitted estimator exposing ``predict``.
        feature_engineering_fn: Callable applied to the incoming frame before
            prediction; it must return an object supporting ``reindex``.
        training_columns: Column names, in order, that the model was fitted on.
    """

    def __init__(self, model, feature_engineering_fn, training_columns):
        self.model = model
        self._feature_engineering_fn = feature_engineering_fn
        self._training_columns = training_columns

    def predict(self, context, model_input):
        """Engineer features, align columns with training, and predict.

        Columns missing after feature engineering are created and filled
        with 0; columns the model was not trained on are dropped.
        """
        processed_input = self._feature_engineering_fn(model_input)
        processed_input = processed_input.reindex(columns=self._training_columns, fill_value=0)
        return self.model.predict(processed_input)


with mlflow.start_run(run_name="LGBM_Final_Pipeline"):
    print("\n--- Starting Run: LGBM_Final_Pipeline ---")
    # Build a new dict rather than aliasing best_params, so the tuned
    # parameters are not mutated as a side effect of this cell.
    final_params = {**best_params, 'n_estimators': 2000, 'random_state': 42}
    mlflow.log_params(final_params)

    final_model = lgb.LGBMRegressor(**final_params)
    print("Training final model on all data...")
    final_model.fit(X, y)
    print("Training complete.")

    # Log the custom pipeline that includes the imported preprocessing function
    print("Logging and registering the final model pipeline...")
    mlflow.pyfunc.log_model(
        artifact_path="lightgbm-full-pipeline",
        python_model=WalmartSalesPipeline(final_model, advanced_feature_engineering, features),
        # The feature-engineering function is pickled by reference to the
        # module it was imported from (src.preprocessing). Shipping the 'src'
        # directory itself keeps that dotted path importable when the model is
        # loaded elsewhere; shipping only preprocessing.py would expose it as a
        # top-level 'preprocessing' module instead. 'code_paths' is the
        # current name of the legacy 'code_path' argument.
        code_paths=["src"],
        registered_model_name="LightGBM-Walmart-Sales-Pipeline",
        # NOTE(review): X holds already-processed features, while predict()
        # runs the feature-engineering function on its input - confirm whether
        # the example should be raw, un-engineered rows instead.
        input_example=X.head(5)
    )
    print("Model Pipeline successfully logged and registered!")
