In [0]:
!pip install -q mlflow lightgbm
!pip install optuna-integration[mlflow]

In [0]:
# Restart the Python process so the packages installed in the previous cell
# are the ones picked up by the imports that follow.
dbutils.library.restartPython()

In [0]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
%matplotlib inline

## Configuration

In [0]:
# Name under which the trained model is registered.
mdl = "mdl_store_sales"
# MLflow experiment name; the experiment path is this name prefixed with "/".
ex = "store_sales_experiment"

mlflow.set_experiment(f"/{ex}")

## Load Pre-processed Data

Load the `silver_training` and `silver_test` tables, which already contain all necessary features.

In [0]:
# Make cscie103_catalog.final_project the current schema so the tables below
# can be referenced by their short names.
spark.sql("USE cscie103_catalog.final_project")

# Pull both silver tables into pandas DataFrames.
tr = spark.table("silver_training").toPandas()
ts = spark.table("silver_test").toPandas()

# Disabled: dropping the id / transactions columns at load time.
# tr.drop(columns=['id', 'transactions'], inplace=True)
# ts.drop(columns=['id', 'transactions'], inplace=True)

# Parse the `date` column of each frame into pandas datetimes.
for frame in (tr, ts):
    frame["date"] = pd.to_datetime(frame["date"])

# Quick sanity report on what was loaded.
print(f"Training data shape: {tr.shape}")
print(f"Test data shape: {ts.shape}")
print(f"\nTraining columns: {list(tr.columns)}")

In [0]:
# Preview the first two rows of the training frame.
tr.head(2)

In [0]:
# Preview the first two rows of the test frame.
ts.head(2)

In [0]:
# Columns present in only one of the two frames, shown as (train-only, test-only).
tr_cols, ts_cols = set(tr.columns), set(ts.columns)
tr_cols - ts_cols, ts_cols - tr_cols

## Prepare Features

Select feature columns (excluding oil since it's not in the pre-processed tables).

In [0]:
# Model inputs, listed in the column order handed to the model.
fc = [
    "strIndxer_family",
    "store_nbr",
    "strIndxer_city",
    "strIndxer_state",
    "strIndxer_type",
    "cluster",
    "is_holiday",
    "is_salary_day",
    # "transactions",  (disabled)
    "onpromotion",
    "day_of_week",
    "day_of_month",
    "month",
]

# Target: sales cast to float.
y = tr["sales"].astype(float)
# Same feature columns taken from the training frame (X) and the test frame (Xt).
X, Xt = tr[fc], ts[fc]

print(f"Feature columns: {fc}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

## Create Train/Validation Split

Split the training data using the last 28 days as validation.

In [0]:
# Rows dated after `cut` (28 days before the latest training date) form the
# validation set; rows on or before it form the training set.
cut = tr["date"].max() - pd.Timedelta(days=28)
m1 = tr["date"] <= cut
m2 = tr["date"] > cut

Xtr, ytr = X[m1], y[m1]
Xv, yv = X[m2], y[m2]

# The model is fit on log1p(sales); predictions are mapped back with expm1.
ytrlog, yvlog = np.log1p(ytr), np.log1p(yv)

print(f"Training set: {Xtr.shape[0]} samples")
print(f"Validation set: {Xv.shape[0]} samples")
print(f"Validation split date: {cut}")

## Train Model with MLflow

Tune LightGBM hyperparameters with a short Optuna search, then train the final model with the best parameters, all under MLflow tracking.

In [0]:
# optuna is only importable after the install + restart cells at the top have run.
import optuna

# Number of Optuna trials; kept small so the whole notebook re-runs quickly.
N_TRIALS = 5
# Boosting rounds without validation improvement before training stops.
EARLY_STOPPING_ROUNDS = 30
# Settings shared by every trial and by the final model.
FIXED_PARAMS = {
    # LightGBM ignores bagging_fraction unless bagging_freq is non-zero.
    'bagging_freq': 1,
    'random_state': 42,
    'verbose': -1,
}

# Autologging records each fit on the currently active MLflow run.
# Pass log_models=False to skip the automatically logged model artifacts.
mlflow.lightgbm.autolog()


def objective(trial):
    """Train one LightGBM candidate and return its validation RMSLE.

    Each trial runs inside its own nested MLflow run so that the parameters
    and metrics autologged for one candidate do not collide with those of
    another candidate (or of the final model) on the parent run.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        RMSLE on the validation set, computed on the original sales scale.
    """
    # Simplified parameter search space
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [500, 1000]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.03, 0.05]),
        'num_leaves': trial.suggest_categorical('num_leaves', [31, 64]),
        'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9]),
        'feature_fraction': trial.suggest_categorical('feature_fraction', [0.7, 0.8, 0.9]),
        'bagging_fraction': trial.suggest_categorical('bagging_fraction', [0.7, 0.8]),
        **FIXED_PARAMS,
    }

    with mlflow.start_run(run_name=f"trial_{trial.number}", nested=True):
        model = lgb.LGBMRegressor(**params)
        model.fit(
            Xtr, ytrlog,
            eval_set=[(Xv, yvlog)],
            eval_metric='rmse',
            callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)]
        )

        # Predictions are in log1p space; map back and floor at zero before scoring.
        vp = model.predict(Xv)
        vp = np.expm1(vp).clip(0, None)
        rmsle = np.sqrt(mean_squared_log_error(yv, vp))
        mlflow.log_metric("validation_rmsle", rmsle)

    return rmsle


with mlflow.start_run(run_name="lightgbm_optuna_quick") as parent_run:

    # Seeded sampler so repeated runs explore the same candidates.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=42)
    )

    study.optimize(
        objective,
        n_trials=N_TRIALS,
        show_progress_bar=True
    )

    # Log best parameters and score on the parent run
    print(f"\nBest trial:")
    print(f"  RMSLE: {study.best_trial.value:.6f}")
    print(f"  Params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")
        mlflow.log_param(f"best_{key}", value)

    mlflow.log_metric("best_rmsle", study.best_trial.value)
    mlflow.log_metric("n_trials", len(study.trials))

    # Train the final model with the best sampled parameters plus the same
    # fixed settings the trials used.
    best_params = {**study.best_trial.params, **FIXED_PARAMS}

    final_model = lgb.LGBMRegressor(**best_params)
    final_model.fit(
        Xtr, ytrlog,
        eval_set=[(Xtr, ytrlog), (Xv, yvlog)],
        eval_metric='rmse',
        # Same stopping rule as the trials, so the final score is comparable
        # with best_rmsle. The early-stopping callback ignores the training
        # set, so only the validation set drives the stop.
        callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)]
    )

    # Final validation predictions (kept in `vp` for the reload check later on)
    vp = final_model.predict(Xv)
    vp = np.expm1(vp).clip(0, None)
    sc = np.sqrt(mean_squared_log_error(yv, vp))

    mlflow.log_metric("final_validation_rmsle", sc)
    mlflow.lightgbm.log_model(final_model, "model")

    print(f"\nFinal Validation RMSLE: {sc:.6f}")

    # URI of the model artifact logged on the parent run.
    rid = parent_run.info.run_id
    muri = f"runs:/{rid}/model"

## Register Model

In [0]:
# Try to register the logged model; `rv` stays None when registration fails.
rv = None
try:
    x = mlflow.register_model(muri, mdl)
    rv = x.version
    print(f"Model registered: version {rv}")
except Exception as e:
    # Best effort: report the failure and carry on with the run-scoped URI.
    print(f"No registry available: {e}")

# Load from the registry when a version was created, otherwise from the run.
loadu = f"models:/{mdl}/{rv}" if rv else muri

## Load and Verify Model

In [0]:
# Reload the model through the generic pyfunc interface and score the
# validation features again, mapping predictions back from log1p space.
m2load = mlflow.pyfunc.load_model(loadu)
vp2 = np.expm1(m2load.predict(Xv)).clip(0, None)

# Compare against the in-memory final model's predictions (`vp`).
mad = np.abs(vp - vp2).mean()
print(f"Mean absolute difference: {mad:.6f}")