### Canonical Data

This notebook bootstraps Caspers canonical data into the provided catalog and schema.

Unlike the old generator, this notebook does not synthesize events on the fly: it replays pre-generated data from the canonical dataset at a configurable speed.

In [None]:
%pip install --upgrade databricks-sdk

In [None]:
# Restart the Python process; presumably needed so that the databricks-sdk
# upgrade installed in the previous cell is the version imported below.
dbutils.library.restartPython()

In [None]:
# Required parameters: read straight from the notebook widgets.
CATALOG = dbutils.widgets.get("CATALOG")
EVENTS_VOLUME = dbutils.widgets.get("EVENTS_VOLUME")
SIMULATOR_SCHEMA = dbutils.widgets.get("SIMULATOR_SCHEMA")

# Optional parameters: an empty widget value falls back to the default.
# Values stay strings here; they are converted where they are consumed.
START_DAY = dbutils.widgets.get("START_DAY") or "20"
SPEED_MULTIPLIER = dbutils.widgets.get("SPEED_MULTIPLIER") or "1.0"
SCHEDULE_MINUTES = dbutils.widgets.get("SCHEDULE_MINUTES") or "5"

##### Create main catalog, simulator related schemas and volumes

In [None]:
%sql
-- Create the catalog, the simulator schema, and two volumes if they do not already exist.
-- The ${...} placeholders are substituted before execution, presumably from the
-- notebook widgets of the same name (CATALOG, SIMULATOR_SCHEMA, EVENTS_VOLUME) -- confirm.
CREATE CATALOG IF NOT EXISTS ${CATALOG};
CREATE SCHEMA IF NOT EXISTS ${CATALOG}.${SIMULATOR_SCHEMA};
-- Events volume: the blocking cell at the end of this notebook polls it for arriving files.
CREATE VOLUME IF NOT EXISTS ${CATALOG}.${SIMULATOR_SCHEMA}.${EVENTS_VOLUME};
-- Additional volume named "misc"; its consumer is not shown in this notebook.
CREATE VOLUME IF NOT EXISTS ${CATALOG}.${SIMULATOR_SCHEMA}.misc;

##### Create tables from canonical dataset parquet files

Load dimensional data from the canonical dataset (not from ./data/dimensional)

In [None]:
import pandas as pd

# Directory holding the pre-generated canonical parquet files, relative to this notebook.
CANONICAL_DATASET_DIR = "../data/canonical/canonical_dataset"

# Dimension tables to materialise. Each name is both the parquet file stem and
# the target table name inside {CATALOG}.{SIMULATOR_SCHEMA}.
DIMENSION_TABLES = [
    "brands",
    "locations",
    "menus",
    "categories",
    "items",
    "brand_locations",
]

# Load dimension tables from canonical dataset
for table_name in DIMENSION_TABLES:
    # pandas reads the parquet file from the relative path; the result is then
    # handed to Spark. "overwrite" replaces any existing table, so this cell can
    # be re-run without manual cleanup.
    dimension_pdf = pd.read_parquet(f"{CANONICAL_DATASET_DIR}/{table_name}.parquet")
    (
        spark.createDataFrame(dimension_pdf)
        .write.mode("overwrite")
        .saveAsTable(f"{CATALOG}.{SIMULATOR_SCHEMA}.{table_name}")
    )

print("✅ Dimensional tables created from canonical dataset")

##### Start canonical data replay

Create a scheduled job that runs the canonical generator notebook at the specified interval

In [None]:
import os
import sys

import databricks.sdk.service.jobs as j
from databricks.sdk import WorkspaceClient

# uc_state lives in ../utils; add that directory to the import path once
# (guarded so re-running the cell does not keep appending duplicates).
if '../utils' not in sys.path:
    sys.path.append('../utils')
from uc_state import add

w = WorkspaceClient()

# Resolve the generator notebook relative to this notebook, then rewrite the
# workspace root (DATABRICKS_WORKSPACE_ROOT, default "/Workspace") to "/Workspace"
# to obtain the path handed to the Jobs API.
notebook_abs_path = os.path.abspath("../data/canonical/canonical_generator_simple")
notebook_dbx_path = notebook_abs_path.replace(
    os.environ.get("DATABRICKS_WORKSPACE_ROOT", "/Workspace"),
    "/Workspace"
)

job_name = f"Canonical Data Replay ({CATALOG})"

# The interval is placed in the minutes field of a Quartz cron expression, which
# only accepts increments within a single hour. Fail fast with a clear message
# instead of submitting an invalid or misleading schedule to the Jobs API.
schedule_minutes = int(SCHEDULE_MINUTES)
if not 1 <= schedule_minutes <= 59:
    raise ValueError(
        f"SCHEDULE_MINUTES must be an integer between 1 and 59, got {SCHEDULE_MINUTES!r}"
    )

# Quartz fields: seconds minutes hours day-of-month month day-of-week.
# "0/N" in the minutes field fires at minute 0 and every N minutes after it
# within each hour, so when N does not divide 60 the last gap of the hour is shorter.
cron_expression = f"0 0/{schedule_minutes} * * * ?"

# Single notebook task; the widget values are forwarded as notebook parameters
# under the names used in base_parameters below.
task_def = [
    j.Task(
        task_key="canonical_data_replay",
        notebook_task=j.NotebookTask(
            notebook_path=notebook_dbx_path,
            base_parameters={
                "CATALOG": CATALOG,
                "VOLUME": EVENTS_VOLUME,
                "SCHEMA": SIMULATOR_SCHEMA,
                "START_DAY": START_DAY,
                "SPEED_MULTIPLIER": SPEED_MULTIPLIER,
            },
        )
    )
]
schedule_def = j.CronSchedule(
    quartz_cron_expression=cron_expression,
    timezone_id="UTC",
    pause_status=j.PauseStatus.UNPAUSED,
)

# Re-filter on the exact name in case the server-side name filter returns
# anything other than an exact match.
existing = [jb for jb in w.jobs.list(name=job_name) if jb.settings.name == job_name]
if existing:
    # Reuse the first matching job and overwrite its settings so that
    # re-running this notebook does not create duplicate jobs.
    job_id = existing[0].job_id
    w.jobs.reset(job_id=job_id, new_settings=j.JobSettings(
        name=job_name, tasks=task_def, schedule=schedule_def,
    ))
    print(f"♻️ Updated existing job_id={job_id} for {job_name}")
else:
    job = w.jobs.create(name=job_name, tasks=task_def, schedule=schedule_def)
    job_id = job.job_id
    # Record the newly created job via uc_state.add; presumably this is what
    # lets the resource be tracked/cleaned up later (verify in utils/uc_state).
    add(CATALOG, "jobs", job)
    print(f"✅ Created scheduled job_id={job_id} for {job_name}")

print(f"   Schedule: Every {schedule_minutes} minutes")

# Trigger a run immediately instead of waiting for the first cron tick.
w.jobs.run_now(job_id=job_id)
print(f"🚀 Started initial run of job {job_id}")

##### Blocking cell to wait for some data to arrive at the volume.

The Lakeflow Declarative Pipeline that runs next infers its schema from data that already exists in the volume, so at least one file must be present before it starts.

Lakeflow Jobs does not (yet?) offer a file-arrival trigger at the task level, so this cell polls the volume instead.

In [None]:
import time

# Path of the events volume (built from the catalog, schema and volume names
# read from the widgets) where the JSON files are expected to arrive.
volume_path = f"/Volumes/{CATALOG}/{SIMULATOR_SCHEMA}/{EVENTS_VOLUME}"

def wait_for_data(path, timeout=300, poll_interval=5):
    """
    Block until at least one non-empty file appears directly under ``path``.

    The directory is listed with ``dbutils.fs.ls`` once per poll. Entries whose
    path ends with ``/`` (presumably directories) and zero-byte files are
    ignored; the listing is not recursive.

    Args:
        path (str): The directory to watch.
        timeout (int): Maximum seconds to wait.
        poll_interval (int): Seconds between checks.
    Raises:
        TimeoutError: If no non-empty file appears within the timeout.
    """
    # Use the monotonic clock: an elapsed-time budget must not be stretched or
    # cut short by wall-clock adjustments that happen while we are waiting.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        entries = dbutils.fs.ls(path)
        if any(entry.size > 0 for entry in entries if not entry.path.endswith('/')):
            print("✅ Data arrived. Safe to proceed.")
            return
        time.sleep(poll_interval)
    raise TimeoutError(f"No data found in {path} after {timeout} seconds.")

# Block here with the default settings (300 s timeout, 5 s poll interval);
# a TimeoutError is raised if no data has arrived in the volume by then.
wait_for_data(volume_path)