
### Raw Data

This notebook assumes `raw_data` has already run. It creates a declarative pipeline with a medallion architecture that normalizes the event stream and builds summary tables.

In [0]:
%pip install --upgrade databricks-sdk

In [None]:
# Notebook parameters, read from the Databricks widgets of the same names.
CATALOG = dbutils.widgets.get("CATALOG")
EVENTS_VOLUME = dbutils.widgets.get("EVENTS_VOLUME")
SIMULATOR_SCHEMA = dbutils.widgets.get("SIMULATOR_SCHEMA")
PIPELINE_SCHEDULE_MINUTES = int(dbutils.widgets.get("PIPELINE_SCHEDULE_MINUTES"))

# Fail fast on a negative interval: it would otherwise be treated as
# "triggered" mode and only surface later as an invalid cron expression,
# after resources have already been created.
if PIPELINE_SCHEDULE_MINUTES < 0:
    raise ValueError(
        "PIPELINE_SCHEDULE_MINUTES must be 0 (continuous) or a positive number "
        f"of minutes, got {PIPELINE_SCHEDULE_MINUTES}"
    )

# 0 = continuous mode, N > 0 = triggered mode with schedule every N minutes
continuous_mode = (PIPELINE_SCHEDULE_MINUTES == 0)

if continuous_mode:
    pipeline_mode = "Continuous"
else:
    pipeline_mode = f"Triggered (every {PIPELINE_SCHEDULE_MINUTES} minutes)"

print(f"Pipeline mode: {pipeline_mode}")

In [None]:
import os

from databricks.sdk import WorkspaceClient
from databricks.sdk.service import pipelines

# SDK client used for every workspace API call in this notebook.
w = WorkspaceClient()

# Resolve the pipeline source folder against the current working directory,
# then rewrite the workspace-root prefix (DATABRICKS_WORKSPACE_ROOT, falling
# back to "/Workspace") to the "/Workspace" form passed to the pipeline API.
workspace_root = os.environ.get("DATABRICKS_WORKSPACE_ROOT", "/Workspace")
root_abs_path = os.path.abspath("../pipelines/order_items")
root_dbx_path = root_abs_path.replace(workspace_root, "/Workspace")

# Key/value settings handed to the pipeline; they point at the raw event data.
pipeline_configuration = {
    "RAW_DATA_CATALOG": CATALOG,
    "RAW_DATA_SCHEMA": SIMULATOR_SCHEMA,
    "RAW_DATA_VOLUME": EVENTS_VOLUME,
}

# Include every file under the pipeline root as pipeline source.
pipeline_sources = pipelines.PipelineLibrary(
    glob=pipelines.PathPattern(include=f"{root_dbx_path}/**")
)

created = w.pipelines.create(
    name="Order Items Medallion Declarative Pipeline",
    catalog=CATALOG,
    schema='lakeflow',
    continuous=continuous_mode,
    serverless=True,
    configuration=pipeline_configuration,
    root_path=root_dbx_path,
    libraries=[pipeline_sources],
    allow_duplicate_names=True,
)

print(f"Created pipeline_id={created.pipeline_id} (continuous={continuous_mode})")

def _quartz_cron_every(minutes):
    """Build a Quartz cron expression that fires every `minutes` minutes.

    Quartz only accepts minute increments up to 59, so longer intervals are
    expressed in the hours field instead.

    Args:
        minutes: Interval in minutes. Supported values are 1-59, whole hours
            from 60 to 1380 (1-23 hours), and 1440 (once a day).

    Returns:
        A six-field Quartz expression (seconds, minutes, hours, day-of-month,
        month, day-of-week).

    Raises:
        ValueError: If the interval cannot be expressed as a single cron
            expression.
    """
    if minutes < 1:
        raise ValueError(f"Schedule interval must be at least 1 minute, got {minutes}")
    if minutes < 60:
        # Increments restart at the top of every hour, so an interval that does
        # not divide 60 has a shorter final gap in each hour.
        return f"0 0/{minutes} * * * ?"
    hours, leftover_minutes = divmod(minutes, 60)
    if leftover_minutes == 0 and hours < 24:
        return f"0 0 0/{hours} * * ?"
    if minutes == 24 * 60:
        return "0 0 0 * * ?"
    raise ValueError(
        f"Cannot schedule every {minutes} minutes with a single cron expression; "
        "use 1-59 minutes, a whole number of hours up to 23, or 1440 (daily)"
    )


# If triggered mode, create a scheduled job to run pipeline updates
if not continuous_mode:
    import sys

    import databricks.sdk.service.jobs as j

    cron_expression = _quartz_cron_every(PIPELINE_SCHEDULE_MINUTES)

    pipeline_job = w.jobs.create(
        name=f"Pipeline Update Scheduler (every {PIPELINE_SCHEDULE_MINUTES} min)",
        tasks=[
            j.Task(
                task_key="update_pipeline",
                pipeline_task=j.PipelineTask(
                    pipeline_id=created.pipeline_id
                )
            )
        ],
        schedule=j.CronSchedule(
            quartz_cron_expression=cron_expression,
            timezone_id="UTC",
            pause_status=j.PauseStatus.UNPAUSED
        )
    )

    print(f"Created scheduled job_id={pipeline_job.job_id} to run pipeline every {PIPELINE_SCHEDULE_MINUTES} minutes")

    # Register the job with uc_state. The path guard keeps sys.path from
    # growing each time this cell is re-run.
    utils_dir = '../utils'
    if utils_dir not in sys.path:
        sys.path.append(utils_dir)
    from uc_state import add
    add(CATALOG, "jobs", pipeline_job)

    # Run immediately once instead of waiting for the first scheduled tick
    w.jobs.run_now(job_id=pipeline_job.job_id)
    print("Started initial pipeline run")

In [0]:
# Wait for the tables to be created.
# Future stages may require their existence before being able to be run.

import time

# Upper bound on the wait, so a pipeline that never produces the table
# surfaces as an error instead of hanging the notebook forever.
TABLE_WAIT_TIMEOUT_SECONDS = 60 * 60
TABLE_POLL_INTERVAL_SECONDS = 5

all_events_table = f"{CATALOG}.lakeflow.all_events"
wait_deadline = time.monotonic() + TABLE_WAIT_TIMEOUT_SECONDS
last_error = None

while True:
    try:
        if spark.catalog.tableExists(all_events_table):
            break
    except Exception as exc:
        # Lookup errors are tolerated while polling (presumably the schema may
        # not exist yet - TODO confirm); keep the latest one for diagnostics.
        last_error = exc
    if time.monotonic() >= wait_deadline:
        raise TimeoutError(
            f"Table {all_events_table} was not created within "
            f"{TABLE_WAIT_TIMEOUT_SECONDS} seconds"
        ) from last_error
    time.sleep(TABLE_POLL_INTERVAL_SECONDS)

In [None]:
# Also add the pipeline to UC-state
import sys

# Make the sibling utils folder importable. The guard avoids adding the same
# entry again when this cell is re-run or another cell has already added it.
utils_dir = '../utils'
if utils_dir not in sys.path:
    sys.path.append(utils_dir)
from uc_state import add

add(CATALOG, "pipelines", created)