## Bakehouse Sales Forecasting
Time series forecasting model using XGBoost to predict next-day product sales by store location.

In [0]:
%pip install xgboost databricks-feature-engineering --quiet
# Restart the notebook's Python process after the install above.
# NOTE(review): presumably needed so later cells import the freshly installed packages — confirm.
dbutils.library.restartPython()

### Data Loading

In [0]:
# Read the two bakehouse sample tables that the rest of the notebook builds on.
transactions = spark.read.table("samples.bakehouse.sales_transactions")
franchises = spark.read.table("samples.bakehouse.sales_franchises")

# Preview both tables, transactions first.
for preview_df in (transactions, franchises):
    display(preview_df)

### Feature Engineering

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Step 1: collapse raw transactions into one row per (franchise, product, day)
# holding the total quantity sold on that day.
dated_transactions = transactions.withColumn("transaction_date", F.to_date("dateTime"))
product_daily_sales = dated_transactions.groupBy(
    "franchiseID", "product", "transaction_date"
).agg(F.sum("quantity").alias("units_sold"))

# Step 2: lag features. Rows are ordered by date inside each franchise/product
# series, and each lag looks N *rows* back in that series (a day with no recorded
# sales has no row, so a lag is not necessarily N calendar days).
window_spec = Window.partitionBy("franchiseID", "product").orderBy("transaction_date")

lag_offsets = (1, 2, 3, 7)
lag_columns = [
    F.lag("units_sold", offset).over(window_spec).alias(f"units_sold_d{offset}")
    for offset in lag_offsets
]
product_daily_sales = product_daily_sales.select("*", *lag_columns)

# Step 3: rolling mean over the current row and the six rows before it.
trailing_seven_rows = window_spec.rowsBetween(-6, 0)
product_daily_sales = product_daily_sales.withColumn(
    "mean_7d_units_sold", F.avg("units_sold").over(trailing_seven_rows)
)

# Step 4: calendar context, day of week encoded 1=Sunday ... 7=Saturday.
day_of_week = F.dayofweek("transaction_date")
product_daily_sales = product_daily_sales.withColumn("dow", day_of_week)

display(product_daily_sales)

In [0]:
# Fill nulls in the lag/rolling features with the mean of the same column taken
# over the row's franchise/product group. The earliest rows of each series have
# no earlier rows to lag from, which is where the nulls come from.

# Unordered window: an aggregate over it sees every row of the franchise/product pair.
group_window = Window.partitionBy("franchiseID", "product")

# Feature columns that may need filling.
lag_cols = ["units_sold_d1", "units_sold_d2", "units_sold_d3", "units_sold_d7", "mean_7d_units_sold"]

# NOTE(review): the group mean covers the pair's entire history, including dates
# later than the row being filled — confirm this is acceptable for model training.
for lag_col in lag_cols:
    # coalesce keeps the existing value when it is present and otherwise falls back
    # to the group mean. This replaces the earlier "add <col>_mean, when/otherwise,
    # drop <col>_mean" sequence with a single expression and no temporary column.
    # A group whose column is entirely null still yields null here.
    product_daily_sales = product_daily_sales.withColumn(
        lag_col,
        F.coalesce(F.col(lag_col), F.avg(lag_col).over(group_window)),
    )

display(product_daily_sales)

In [0]:
# Build the label for next-step forecasting: each row's target is the units sold
# on the following row of the same franchise/product series.
next_row_units = F.lead("units_sold", 1).over(window_spec)
product_daily_sales = product_daily_sales.withColumn("target_units_sold", next_row_units)

# Keep only rows where every model input and the label are populated
# (the last row of each series has no following row, hence no label).
required_columns = [
    "units_sold_d1",
    "units_sold_d2",
    "units_sold_d3",
    "units_sold_d7",
    "mean_7d_units_sold",
    "dow",
    "target_units_sold",
]
feature_table_df = product_daily_sales.dropna(subset=required_columns)

display(feature_table_df)

In [0]:
# Publish the engineered features as a feature table.
from databricks.feature_engineering import FeatureEngineeringClient

# Start from a clean slate: drop the schema (and everything in it), then recreate it.
for statement in (
    "DROP DATABASE IF EXISTS workspace.bakehouse_features CASCADE;",
    "CREATE DATABASE IF NOT EXISTS workspace.bakehouse_features;",
):
    spark.sql(statement)

fe_client = FeatureEngineeringClient()
feature_table_name = "workspace.bakehouse_features.store_product_daily_features"

# Create the table from the DataFrame's schema, keyed by franchise, product and day ...
table_definition = {
    "name": feature_table_name,
    "primary_keys": ["franchiseID", "product", "transaction_date"],
    "schema": feature_table_df.schema,
    "description": "Store-product daily sales with lags and rolling features for time series forecasting",
}
fe_client.create_table(**table_definition)

# ... then load the feature rows into it.
fe_client.write_table(name=feature_table_name, df=feature_table_df, mode="merge")


### Model Training & Evaluation

In [0]:
# Train an XGBoost regressor on the engineered features and evaluate it on a
# chronological hold-out set.
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

# Convert to pandas for model training, ordered oldest-to-newest so that the
# unshuffled split below is a split in time.
pandas_df = (
    feature_table_df.toPandas()
    .sort_values("transaction_date")
    .reset_index(drop=True)
)

# Define feature columns and target
feature_cols = ["units_sold_d1", "units_sold_d2", "units_sold_d3",
                "units_sold_d7", "mean_7d_units_sold", "dow"]
X = pandas_df[feature_cols]
y = pandas_df["target_units_sold"]

# Hold out the most recent 20% of rows as the test set. A shuffled split would mix
# rows from the same period into both sets; because the lag/rolling features of one
# row overlap with the targets of its neighbours, that leaks test-period values into
# training and makes the reported error optimistic for a forecasting model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Initialize and train the XGBoost regressor
xgb_model = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1)
xgb_model.fit(X_train, y_train)

# Make predictions and evaluate on the held-out (most recent) rows
y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on test set: {mse}")

In [0]:
# Record the trained model and its test-set MSE in a new MLflow run.
import mlflow
import mlflow.xgboost
from mlflow.models.signature import infer_signature

# Model signature inferred from the training inputs and labels.
signature = infer_signature(X_train, y_train)
# A few held-out rows stored alongside the model as an input example.
sample_input = X_test.head()

with mlflow.start_run() as run:
    mlflow.xgboost.log_model(
        xgb_model,
        artifact_path="xgboost_model",
        signature=signature,
        input_example=sample_input,
    )
    mlflow.log_metric("mse", mse)
    # Keep the run id so the registration cell can build the model URI.
    run_id = run.info.run_id

print(f"Model logged in MLflow. Run ID: {run_id}")

### Model Deployment

In [0]:
# Register the logged model under a Unity Catalog name.
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.bakehouse_ai;")

# The model name below is a three-level (catalog.schema.model) Unity Catalog name,
# so the MLflow registry must point at Unity Catalog *before* register_model runs.
# Previously this was only set in the later endpoint-creation cell.
mlflow.set_registry_uri("databricks-uc")

# Register the model in the Model Registry, using the artifact logged in the run above
model_uri = f"runs:/{run_id}/xgboost_model"
result = mlflow.register_model(
    model_uri=model_uri,
    name="workspace.bakehouse_ai.xgboost_model",
)
# The version number is needed to pin the serving endpoint to this exact model version.
version = result.version
print(f"Model {result.name} registered. Model version: {version}")

In [0]:
# Stand up a model serving endpoint backed by the registered model version.
from mlflow.deployments import get_deploy_client

mlflow.set_registry_uri("databricks-uc")
client = get_deploy_client("databricks")

# One served entity: the registered model at the version produced by the
# registration cell, on a small CPU workload that may scale down to zero.
served_entity = {
    "entity_name": "workspace.bakehouse_ai.xgboost_model",
    "entity_version": version,
    "workload_type": "CPU",
    "workload_size": "Small",
    "scale_to_zero_enabled": True,
}

endpoint = client.create_endpoint(
    name="xgboost-model-01",
    config={"served_entities": [served_entity]},
)
print(f"Serving endpoint created: {endpoint.name}")

In [0]:
# Query the serving endpoint with the held-out feature rows.
# Format your inputs: dataframe_split is the preferred input style
# (column names once, then one list of values per row).
inputs = {
    "dataframe_split": {
        "columns": X_test.columns.tolist(),
        "data": X_test.values.tolist()
    }
}

# The endpoint name must match the one used at creation time ("xgboost-model-01");
# the previous value "xgboost_model" is the model's artifact name, not an endpoint.
response = client.predict(endpoint="xgboost-model-01", inputs=inputs)

# The prediction results will be in response['predictions'] or response['outputs']
print(response)

In [0]:
# Cleanup: tear down the resources this notebook created.
# Every step is best-effort: a failure is printed and the remaining steps still run.

# --- Cleanup: Remove Feature Table from Feature Store ---
from databricks.feature_engineering import FeatureEngineeringClient
fe_client = FeatureEngineeringClient()
feature_table_name = "workspace.bakehouse_features.store_product_daily_features"
try:
    fe_client.drop_table(name=feature_table_name)
    print(f"Feature table '{feature_table_name}' dropped.")
except Exception as e:
    print(f"Could not drop feature table '{feature_table_name}': {e}")

# --- Cleanup: Drop Feature Database ---
try:
    spark.sql("DROP DATABASE IF EXISTS workspace.bakehouse_features CASCADE;")
    print("Dropped database workspace.bakehouse_features.")
except Exception as e:
    print(f"Could not drop database: {e}")

# --- Cleanup: Delete Model Serving Endpoint ---
# Done before the model is deleted so that no endpoint still references the model
# version when it is removed. The name must match the one used at creation time.
from mlflow.deployments import get_deploy_client
endpoint_name = "xgboost-model-01"
try:
    deploy_client = get_deploy_client("databricks")
    deploy_client.delete_endpoint(endpoint=endpoint_name)
    print(f"Deleted serving endpoint '{endpoint_name}'.")
except Exception as e:
    print(f"Could not delete serving endpoint: {e}")

# --- Cleanup: Delete Registered Model (from Model Registry) ---
import mlflow
try:
    # The model lives in Unity Catalog, so target that registry explicitly.
    registry_client = mlflow.tracking.MlflowClient(registry_uri="databricks-uc")
    model_name = "workspace.bakehouse_ai.xgboost_model"
    # Unity Catalog models have no stages, so get_latest_versions(stages=...) does
    # not apply; enumerate the versions by name and delete each one first.
    for model_version in registry_client.search_model_versions(f"name='{model_name}'"):
        registry_client.delete_model_version(name=model_name, version=model_version.version)
        print(f"Deleted model version {model_version.version} for '{model_name}'")
    # Then delete the registered model itself
    registry_client.delete_registered_model(model_name)
    print(f"Deleted registered model: {model_name}")
except Exception as e:
    print(f"Could not delete registered model: {e}")

# --- Cleanup: Remove Model File from DBFS ---
# dbutils.fs takes DBFS paths; the "/dbfs/..." form is the local-file mount prefix
# and would be resolved as dbfs:/dbfs/... here.
# NOTE(review): nothing in this notebook writes this file — confirm it is still needed.
model_file = "dbfs:/models/bakery_xgboost_model.joblib"
try:
    # Only claim success when rm reports that something was actually removed.
    if dbutils.fs.rm(model_file, True):
        print(f"Deleted model file {model_file} from DBFS.")
    else:
        print(f"Model file {model_file} not found; nothing to delete.")
except Exception as e:
    print(f"Could not delete model file: {e}")