In [0]:
!pip install -q mlflow lightgbm

In [0]:
# Restart the notebook's Python process so the packages installed in the
# previous cell are the ones picked up by the imports below.
dbutils.library.restartPython()

In [0]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
import yaml
import logging
%matplotlib inline

## Configuration

In [0]:
# Configure the root logger to emit INFO and above, and create a logger
# named after this module for use in the notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_config(config_path="../config.yaml"):
    """Load the notebook configuration from a YAML file.

    Args:
        config_path: Path to the YAML file. Relative paths are resolved
            against the current working directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document: a ``dict`` for
        a mapping at the top level, ``None`` for an empty file.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, so non-ASCII values are read the same way on every machine.
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

# Load the config from a path relative to the current working directory
config = load_config("../config.yaml")
# Show the parsed config as the cell output.
# NOTE(review): this echoes every config value into the saved notebook output;
# confirm the file holds no credentials or other sensitive values.
config

In [0]:
# Names for the MLflow experiment and for the model registered further down
ex = "store_sales_experiment"
mdl = "mdl_store_sales"

# The experiment is addressed by the name prefixed with "/"
mlflow.set_experiment(f"/{ex}")

## Load Pre-processed Data

Load the `silver_training` and `silver_testing` tables, which already contain all necessary features.

In [0]:

# Catalog and schema come from the `databricks` section of the loaded config
CATALOG_NAME = config['databricks']['catalog']
SCHEMA_NAME = config['databricks']['schema']

# Make that schema the session default so the tables can be read by bare name
spark.sql(f"USE {CATALOG_NAME}.{SCHEMA_NAME}")

# Pull both silver tables into pandas DataFrames (training first, then testing)
tr, ts = (
    spark.table(table_name).toPandas()
    for table_name in ("silver_training", "silver_testing")
)

# tr.drop(columns=['id', 'transactions'], inplace=True)
# ts.drop(columns=['id', 'transactions'], inplace=True)

# Convert date columns
for frame in (tr, ts):
    frame["date"] = pd.to_datetime(frame["date"])

print(f"Training data shape: {tr.shape}")
print(f"Test data shape: {ts.shape}")
print(f"\nTraining columns: {list(tr.columns)}")

In [0]:
# Preview the first two rows of the training frame
tr.head(2)

In [0]:
# Preview the first two rows of the test frame
ts.head(2)

In [0]:
# Columns that appear in only one frame, shown as (training-only, test-only)
tr_cols, ts_cols = set(tr.columns), set(ts.columns)
(tr_cols - ts_cols, ts_cols - tr_cols)

## Prepare Features

Select feature columns (excluding oil since it's not in the pre-processed tables).

In [0]:
# Define feature columns
fc = [
    "strIndxer_family",
    "store_nbr",
    "strIndxer_city",
    "strIndxer_state",
    "strIndxer_type",
    "cluster",
    "is_holiday",
    "is_salary_day",
    # "transactions",
    "onpromotion",
    "day_of_week",
    "day_of_month",
    "month",
]

# Feature matrices for training and test use the same column list;
# the target is the `sales` column cast to float
X, Xt = tr[fc], ts[fc]
y = tr["sales"].astype(float)

print(f"Feature columns: {fc}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

## Create Train/Validation Split

Split the training data using the last 28 days as validation.

In [0]:
# Rows dated after `cut` (latest date minus 28 days) are held out for validation
cut = tr["date"].max() - pd.Timedelta(days=28)
m1 = tr["date"].le(cut)
m2 = tr["date"].gt(cut)

Xtr, ytr = X[m1], y[m1]
Xv, yv = X[m2], y[m2]

# Apply log transformation
ytrlog, yvlog = np.log1p(ytr), np.log1p(yv)

print(f"Training set: {len(Xtr)} samples")
print(f"Validation set: {len(Xv)} samples")
print(f"Validation split date: {cut}")

## Train Model with MLflow

Train a LightGBM model with MLflow tracking.

In [0]:
mlflow.lightgbm.autolog()

# Model parameters
p = dict(
    n_estimators=1000,
    learning_rate=0.03,
    num_leaves=64,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=3,
    random_state=42,
)

with mlflow.start_run(run_name="lightgbm_baseline") as rr:
    # Fit on the log1p-transformed target, evaluating RMSE on both splits
    mdl1 = lgb.LGBMRegressor(**p)
    mdl1.fit(
        Xtr,
        ytrlog,
        eval_set=[(Xtr, ytrlog), (Xv, yvlog)],
        eval_metric="rmse",
    )

    # Validation predictions, mapped back with expm1 and floored at zero
    vp = np.clip(np.expm1(mdl1.predict(Xv)), 0, None)

    # RMSLE against the untransformed validation target
    sc = np.sqrt(mean_squared_log_error(yv, vp))
    mlflow.log_metric("rmsle", sc)
    print(f"RMSLE: {sc:.6f}")

    # Keep the run id and the run-relative model URI for the cells below
    rid = rr.info.run_id
    muri = f"runs:/{rid}/model"

## Register Model

In [0]:
# Best-effort registration: any failure is reported and leaves `rv` as None
rv = None
try:
    x = mlflow.register_model(muri, mdl)
    rv = x.version
    print(f"Model registered: version {rv}")
except Exception as err:
    print(f"No registry available: {err}")

# Load from the registry when a version was created, otherwise from the run URI
loadu = f"models:/{mdl}/{rv}" if rv else muri

## Load and Verify Model

In [0]:
# Reload the model through the pyfunc interface and score the validation rows,
# applying the same expm1 + clip post-processing as in the training cell
m2load = mlflow.pyfunc.load_model(loadu)
vp2 = np.expm1(m2load.predict(Xv)).clip(0, None)

# Compare against the predictions made by the in-memory model
mean_abs_diff = np.abs(vp - vp2).mean()
print(f"Mean absolute difference: {mean_abs_diff:.6f}")