In [None]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading databricks_sdk-0.57.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading opentele

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.4.0


In [None]:

import pandas as pd
import numpy as np
import lightgbm as lgb
import mlflow
import mlflow.sklearn
import zipfile
import os
import optuna # For hyperparameter tuning

from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin


In [None]:
# 2.1: MLflow Setup
EXPERIMENT_NAME = "LightGBM_Training"
mlflow.set_experiment(EXPERIMENT_NAME)
print(f"MLflow experiment set to: '{EXPERIMENT_NAME}'")

# 2.2: Load Raw Merged Data
# The pipeline handles all preprocessing, so we start from the raw merged data.


def _read_zipped_csv(zip_path, member):
    """Read the CSV file `member` stored inside the zip archive at `zip_path`."""
    with zipfile.ZipFile(zip_path, 'r') as archive:
        # Open the member in its own context so the handle is closed explicitly.
        with archive.open(member) as handle:
            return pd.read_csv(handle)


# The merged frame is rebuilt here so this notebook is self-contained.
# In a real project, this might be a function imported from a script.
print("Loading raw data files...")
try:
    train_df = _read_zipped_csv('data/raw/train.csv.zip', 'train.csv')
    features_df = _read_zipped_csv('data/raw/features.csv.zip', 'features.csv')
    stores_df = pd.read_csv('data/raw/stores.csv')
except FileNotFoundError as err:
    # Raise instead of print + exit(): an exception stops "Run All" at this cell,
    # so later cells do not fail with a confusing NameError on the missing frames.
    raise FileNotFoundError(
        "Raw data files not found in 'data/raw'. Please run the master notebook first."
    ) from err

raw_train_data = train_df.merge(features_df, on=['Store', 'Date', 'IsHoliday'], how='left')
raw_train_data = raw_train_data.merge(stores_df, on='Store', how='left')
print("Raw training data loaded and merged.")



MLflow experiment set to: 'LightGBM_Training'
Loading raw data files...
ERROR: Raw data files not found in 'data/raw'. Please run the master notebook first.


In [None]:
# 2.3: Define Custom Evaluation Metric
# Weighted Mean Absolute Error (WMAE) as per Kaggle competition
def wmae(y_true, y_pred, is_holiday, holiday_weight=5):
    """
    Weighted Mean Absolute Error.

    Each absolute error is weighted by `holiday_weight` where `is_holiday` is
    truthy and by 1 elsewhere; the weighted errors are divided by the sum of
    the weights.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values, matched by position.
    is_holiday : array-like
        Truthy for entries that should receive the holiday weight.
    holiday_weight : int or float, default 5
        Weight applied to holiday entries.

    Returns
    -------
    float
        The weighted mean absolute error (NaN for empty input, because the
        weights then sum to zero).
    """
    # Convert to plain arrays first: pandas objects would otherwise be aligned
    # by index label, which yields NaNs or a shape error when the two inputs
    # carry different indexes (e.g. a fold slice vs. freshly built predictions).
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    holiday_mask = np.asarray(is_holiday, dtype=bool)

    weights = np.where(holiday_mask, holiday_weight, 1)
    return np.sum(weights * np.abs(actual - predicted)) / np.sum(weights)


# --- SECTION 3: PREPROCESSING PIPELINE DEFINITION ---
print("\n--- SECTION 3: PREPROCESSING PIPELINE DEFINITION ---")
# This is the core of our reproducible model: the feature-engineering function defined
# below is wrapped in a FunctionTransformer and used as the pipeline's preprocessing step.

def feature_engineering_transformer(df):
    """
    Build the model feature matrix from the raw merged data.

    Meant to be wrapped in a scikit-learn FunctionTransformer. The input frame
    is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Store', 'Dept', 'Date', 'Type' and 'MarkDown1'..'MarkDown5'.
        'Weekly_Sales' is optional; when present it is clipped at 0 and used to
        build the lag features, otherwise the lag columns are filled with 0.

    Returns
    -------
    pandas.DataFrame
        The final feature columns, one row per input row, in the SAME row order
        and with the same index as `df`, so the result stays aligned with a
        target vector that is passed alongside it.
    """
    df_copy = df.copy()

    # Label rows by position while we work. The lag computation needs the rows
    # sorted by Store/Dept/Date, and the positional labels let us restore the
    # caller's order afterwards even if the original index has duplicates.
    original_index = df_copy.index
    df_copy.index = pd.RangeIndex(len(df_copy))

    df_copy['Date'] = pd.to_datetime(df_copy['Date'])
    has_sales = 'Weekly_Sales' in df_copy.columns
    if has_sales:
        df_copy['Weekly_Sales'] = df_copy['Weekly_Sales'].clip(lower=0)

    # Holiday Features
    thanksgiving_dates = pd.to_datetime(["2010-11-26", "2011-11-25"])
    super_bowl_dates = pd.to_datetime(["2010-02-12", "2011-02-11", "2012-02-10"])
    df_copy['IsBlackFridayWeek'] = df_copy['Date'].isin(thanksgiving_dates).astype(int)
    df_copy['IsSuperBowlWeek'] = df_copy['Date'].isin(super_bowl_dates).astype(int)

    # Time-Based Features
    df_copy['Year'] = df_copy['Date'].dt.year
    df_copy['Month'] = df_copy['Date'].dt.month
    df_copy['WeekOfYear'] = df_copy['Date'].dt.isocalendar().week.astype(int)
    df_copy['DayOfWeek'] = df_copy['Date'].dt.dayofweek

    # Lag Features (requires Weekly_Sales); shift() is positional within each
    # Store/Dept group, so the rows must be in date order inside every group.
    df_copy = df_copy.sort_values(by=['Store', 'Dept', 'Date'])
    if has_sales:
        sales_by_series = df_copy.groupby(['Store', 'Dept'])['Weekly_Sales']
        lag_1 = sales_by_series.shift(1)
        lag_52 = sales_by_series.shift(52)
        df_copy['Sales_Lag_1'] = lag_1
        df_copy['Sales_Lag_52'] = lag_52

    # Missing Value Handling
    markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
    df_copy[markdown_cols] = df_copy[markdown_cols].fillna(0)
    # ffill()/bfill() replace the deprecated fillna(method=...). They run over
    # the whole sorted frame, so a gap at the start of one Store/Dept series is
    # filled from the neighbouring row of the adjacent series.
    df_copy = df_copy.ffill().bfill().fillna(0)

    # One-Hot Encoding for 'Type'
    df_copy = pd.get_dummies(df_copy, columns=['Type'], prefix='Type')

    # Define and select final features
    final_features = [
        'Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Size',
        'IsBlackFridayWeek', 'IsSuperBowlWeek', 'Year', 'Month', 'WeekOfYear', 'DayOfWeek',
        'Sales_Lag_1', 'Sales_Lag_52', 'Type_A', 'Type_B', 'Type_C'
    ]
    # Ensure all feature columns exist, fill with 0 if not (can happen in test set)
    for col in final_features:
        if col not in df_copy.columns:
            df_copy[col] = 0

    # Undo the Store/Dept/Date sort so row i of the output is row i of the input.
    features = df_copy[final_features].sort_index()
    features.index = original_index
    return features

# Expose the feature-engineering function as a scikit-learn transformer step
preprocessor = FunctionTransformer(func=feature_engineering_transformer)

# --- SECTION 4: MLFLOW EXPERIMENT RUNS ---
print("\n--- SECTION 4: MLFLOW EXPERIMENT RUNS ---")

# Separate the target from the remaining columns of the merged frame.
# NOTE(review): the TimeSeriesSplit used below splits by row position, so the folds
# are only chronological if raw_train_data is ordered by Date — TODO confirm row order.
y = raw_train_data['Weekly_Sales']
X = raw_train_data.drop(columns=['Weekly_Sales'])

# == Run 1: Baseline Model ==
# Cross-validates a LightGBM regressor with default hyperparameters and logs the
# mean WMAE across folds to MLflow as "avg_wmae_cv".
with mlflow.start_run(run_name="LGBM_Baseline"):
    print("\n--- Starting Run: LGBM_Baseline ---")
    mlflow.log_param("model_type", "LightGBM")
    mlflow.log_param("tuning", "default_hyperparameters")

    # Create the full pipeline: feature engineering followed by the regressor
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', lgb.LGBMRegressor(random_state=42))
    ])

    # Time series cross-validation (tscv is reused by the tuning run further down)
    tscv = TimeSeriesSplit(n_splits=3)
    wmae_scores = []

    for train_index, val_index in tscv.split(X):
        X_t, X_v = X.iloc[train_index], X.iloc[val_index]
        y_t, y_v = y.iloc[train_index], y.iloc[val_index]

        # Fit the pipeline on the training fold.
        # Weekly_Sales is re-attached to the features because the preprocessor
        # derives Sales_Lag_1 / Sales_Lag_52 from that column.
        pipeline.fit(X_t.assign(Weekly_Sales=y_t), y_t)

        # Predict on the validation fold
        # NOTE(review): the validation targets are attached here too, so the lag
        # features of this fold are built from y_v — confirm this is intended.
        preds = pipeline.predict(X_v.assign(Weekly_Sales=y_v))

        # We need the 'IsHoliday' column from the original validation set for WMAE
        is_holiday_val = X_v['IsHoliday'].astype(bool)
        score = wmae(y_v, preds, is_holiday_val)
        wmae_scores.append(score)

    # Average the per-fold scores into the single metric that is logged
    avg_wmae = np.mean(wmae_scores)
    print(f"Baseline Average WMAE: {avg_wmae:.2f}")
    mlflow.log_metric("avg_wmae_cv", avg_wmae)

# == Run 2: Hyperparameter Tuning with Optuna ==

# LightGBM settings that are NOT searched over. They are shared by every tuning
# trial and by the final model, so the final model is trained with the same
# objective the hyperparameters were tuned for (study.best_params only contains
# the values suggested by Optuna, not these fixed ones).
fixed_lgbm_params = {
    'objective': 'regression_l1',
    'metric': 'mae',
    # bagging_fraction only takes effect when bagging_freq is non-zero
    'bagging_freq': 1,
    'verbose': -1,
    'n_jobs': -1,
    'random_state': 42,
}

with mlflow.start_run(run_name="LGBM_Hyperparameter_Tuning"):
    print("\n--- Starting Run: LGBM_Hyperparameter_Tuning ---")

    # Use a single, fixed split (the last one) for faster tuning
    train_idx, val_idx = list(tscv.split(X))[-1]
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Preprocess the data once before the tuning loop; Weekly_Sales is attached
    # because the preprocessor builds the lag features from it.
    X_train_processed = preprocessor.fit_transform(X_train.assign(Weekly_Sales=y_train))
    X_val_processed = preprocessor.transform(X_val.assign(Weekly_Sales=y_val))
    is_holiday_val = X_val['IsHoliday'].astype(bool)

    def objective(trial):
        """Train one LightGBM candidate and return its WMAE on the validation split."""
        params = {
            **fixed_lgbm_params,
            'n_estimators': 1000,
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        }

        model = lgb.LGBMRegressor(**params)
        # n_estimators is an upper bound: training stops once the validation MAE
        # has not improved for 50 rounds.
        model.fit(X_train_processed, y_train,
                  eval_set=[(X_val_processed, y_val)],
                  eval_metric='mae',
                  callbacks=[lgb.early_stopping(50, verbose=False)])

        preds = model.predict(X_val_processed)
        return wmae(y_val, preds, is_holiday_val)

    # Seed the sampler so the sequence of suggested hyperparameters is reproducible
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=20) # Increase n_trials for better results

    print(f"Best WMAE from tuning: {study.best_value:.2f}")
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_wmae_tuned", study.best_value)

# == Run 3: Final Model Pipeline & Registration ==
with mlflow.start_run(run_name="LGBM_Final_Pipeline"):
    print("\n--- Starting Run: LGBM_Final_Pipeline ---")

    # Combine the fixed settings with the best tuned values so the final model
    # uses the same objective, bagging setup and seed as the tuning trials.
    best_params = {**fixed_lgbm_params, **study.best_params}
    best_params['n_estimators'] = 2000 # Train on full data with more estimators
    mlflow.log_params(best_params)

    # Create the final pipeline with the best hyperparameters
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', lgb.LGBMRegressor(**best_params))
    ])

    # Train the final pipeline on the ENTIRE dataset
    print("Training final pipeline on all data...")
    final_pipeline.fit(X.assign(Weekly_Sales=y), y)
    print("Final pipeline training complete.")

    # Log the final pipeline to MLflow and register it
    # NOTE(review): the input example has no Weekly_Sales column, so at inference
    # the preprocessor fills both lag features with 0 — confirm this is acceptable.
    print("Logging and registering the final model...")
    mlflow.sklearn.log_model(
        sk_model=final_pipeline,
        artifact_path="lightgbm-pipeline",
        registered_model_name="LightGBM-Walmart-Sales",
        input_example=X.head(5) # Provide an input example for schema inference
    )
    print("Model successfully logged and registered!")


In [None]:
# Preview the first rows with the notebook's rich display instead of printing
# the whole frame as plain text.
train_df.head()

NameError: name 'train_df' is not defined