
### Lakeflow Pipeline

This notebook assumes `raw_data` has already run. It creates a declarative pipeline with a medallion architecture that normalizes the event stream and builds summary tables.

In [0]:
%pip install --upgrade databricks-sdk
# Upgrades the databricks-sdk package; later cells import WorkspaceClient and the
# pipelines/jobs service modules from it.

In [None]:
# Notebook parameters, read from the Databricks widgets of the same name.
CATALOG = dbutils.widgets.get("CATALOG")
EVENTS_VOLUME = dbutils.widgets.get("EVENTS_VOLUME")
SIMULATOR_SCHEMA = dbutils.widgets.get("SIMULATOR_SCHEMA")
PIPELINE_SCHEDULE_MINUTES = int(dbutils.widgets.get("PIPELINE_SCHEDULE_MINUTES"))

# Fail fast on a negative interval: it is neither continuous mode (0) nor a
# usable schedule, and would otherwise only surface later as an invalid cron
# expression after the pipeline has already been created.
if PIPELINE_SCHEDULE_MINUTES < 0:
    raise ValueError(
        "PIPELINE_SCHEDULE_MINUTES must be 0 (continuous) or a positive number of "
        f"minutes, got {PIPELINE_SCHEDULE_MINUTES}"
    )

# 0 = continuous mode, N > 0 = triggered mode with schedule every N minutes
continuous_mode = (PIPELINE_SCHEDULE_MINUTES == 0)

pipeline_mode = (
    "Continuous"
    if continuous_mode
    else f"Triggered (every {PIPELINE_SCHEDULE_MINUTES} minutes)"
)
print(f"Pipeline mode: {pipeline_mode}")

In [None]:
import os

from databricks.sdk import WorkspaceClient
from databricks.sdk.service import pipelines as p

# Workspace client shared by the pipeline and job API calls below.
w = WorkspaceClient()

# Resolve the pipeline source folder relative to the working directory, then
# rewrite the configured workspace root (default "/Workspace") to "/Workspace".
root_abs_path = os.path.abspath("../pipelines/order_items")
workspace_root = os.environ.get("DATABRICKS_WORKSPACE_ROOT", "/Workspace")
root_dbx_path = root_abs_path.replace(workspace_root, "/Workspace")

PIPELINE_NAME = f"Order Items Medallion Pipeline ({CATALOG})"

# Every file under the pipeline root is registered as pipeline source.
source_glob = p.PathPattern(include=f"{root_dbx_path}/**")

# Keyword arguments passed unchanged to pipelines.create() / pipelines.update().
pipeline_config = {
    "catalog": CATALOG,
    "schema": 'lakeflow',
    "continuous": continuous_mode,
    "name": PIPELINE_NAME,
    "serverless": True,
    "configuration": {
        "RAW_DATA_CATALOG": CATALOG,
        "RAW_DATA_SCHEMA": SIMULATOR_SCHEMA,
        "RAW_DATA_VOLUME": EVENTS_VOLUME,
    },
    "root_path": root_dbx_path,
    "libraries": [p.PipelineLibrary(glob=source_glob)],
}

# Collect pipelines whose name is exactly PIPELINE_NAME. The server-side filter
# is a LIKE pattern, so every hit is re-checked for an exact name match.
existing_pipelines = []
for candidate in w.pipelines.list_pipelines(filter=f"name LIKE '{PIPELINE_NAME}'"):
    if candidate.name == PIPELINE_NAME:
        existing_pipelines.append(candidate)

if not existing_pipelines:
    # No pipeline with this name yet: create it, then hand it to uc_state.add
    # (helper defined in ../utils/uc_state; its semantics are not visible here).
    created = w.pipelines.create(**pipeline_config)
    pipeline_id = created.pipeline_id
    import sys
    sys.path.append('../utils')
    from uc_state import add
    add(CATALOG, "pipelines", created)
    print(f"‚úÖ Created pipeline: {pipeline_id}")
else:
    # Reuse the first exact match and overwrite its settings in place.
    pipeline_id = existing_pipelines[0].pipeline_id
    w.pipelines.update(pipeline_id=pipeline_id, **pipeline_config)
    print(f"‚ôªÔ∏è Updated existing pipeline: {pipeline_id}")

def _quartz_cron_every(minutes):
    """Return a Quartz cron expression that fires about every ``minutes`` minutes.

    Quartz only accepts a minute-field increment up to 59, so intervals of an
    hour or more are expressed in the hour field instead.

    Args:
        minutes: Interval in minutes. Supported values are 1-59, whole hours
            from 60 to 1380 (1-23 hours), and 1440 (once a day).

    Returns:
        A six-field Quartz cron expression string.

    Raises:
        ValueError: If the interval cannot be expressed as a cron schedule.

    Note:
        Cron increments restart at each hour (or day) boundary, so an interval
        that does not divide 60 minutes (or 24 hours) evenly fires with a
        shorter gap at the boundary.
    """
    if minutes < 1:
        raise ValueError(
            f"Schedule interval must be at least 1 minute, got {minutes}"
        )
    if minutes < 60:
        return f"0 0/{minutes} * * * ?"
    hours, leftover_minutes = divmod(minutes, 60)
    if leftover_minutes == 0 and hours < 24:
        return f"0 0 0/{hours} * * ?"
    if minutes == 24 * 60:
        return "0 0 0 * * ?"
    raise ValueError(
        f"Cannot express a {minutes}-minute interval as a cron schedule; "
        "use 1-59 minutes, a whole number of hours up to 23, or 1440 (daily)"
    )


if not continuous_mode:
    import databricks.sdk.service.jobs as j

    job_name = f"Pipeline Update Scheduler ({CATALOG})"
    cron_expression = _quartz_cron_every(PIPELINE_SCHEDULE_MINUTES)

    # Single-task job whose only task triggers an update of the pipeline.
    task_def = [j.Task(
        task_key="update_pipeline",
        pipeline_task=j.PipelineTask(pipeline_id=pipeline_id),
    )]
    schedule_def = j.CronSchedule(
        quartz_cron_expression=cron_expression,
        timezone_id="UTC",
        pause_status=j.PauseStatus.UNPAUSED,
    )

    # Re-check the name on every hit and skip entries without settings, so a
    # job returned without settings cannot raise AttributeError here.
    existing_jobs = [
        jb for jb in w.jobs.list(name=job_name)
        if jb.settings is not None and jb.settings.name == job_name
    ]
    if existing_jobs:
        # Reuse the first matching job and replace its settings wholesale.
        job_id = existing_jobs[0].job_id
        w.jobs.reset(job_id=job_id, new_settings=j.JobSettings(
            name=job_name, tasks=task_def, schedule=schedule_def,
        ))
        print(f"‚ôªÔ∏è Updated existing scheduler job: {job_id}")
    else:
        pipeline_job = w.jobs.create(name=job_name, tasks=task_def, schedule=schedule_def)
        job_id = pipeline_job.job_id
        import sys
        # Avoid stacking duplicate entries when this cell is re-run.
        if '../utils' not in sys.path:
            sys.path.append('../utils')
        from uc_state import add
        # Hand the new job to uc_state.add (helper defined in ../utils/uc_state).
        add(CATALOG, "jobs", pipeline_job)
        print(f"‚úÖ Created scheduler job: {job_id}")

    # Kick off a run immediately instead of waiting for the first cron tick.
    w.jobs.run_now(job_id=job_id)
    print(f"üöÄ Started initial pipeline run")

In [0]:
# Wait for the tables to be created.
# Future stages may require their existence before being able to run.

import time

# Upper bound on the wait, so a pipeline that fails to produce the table
# cannot hang this notebook forever.
TABLE_WAIT_TIMEOUT_SECONDS = 60 * 60
TABLE_POLL_INTERVAL_SECONDS = 5

all_events_table = f"{CATALOG}.lakeflow.all_events"
wait_deadline = time.monotonic() + TABLE_WAIT_TIMEOUT_SECONDS
last_lookup_error = None

while True:
    try:
        if spark.catalog.tableExists(all_events_table):
            break
    except Exception as exc:
        # The lookup itself may raise (presumably while the schema does not
        # exist yet -- assumption). Keep polling, but remember the error so it
        # can be reported if the wait times out.
        last_lookup_error = exc
    if time.monotonic() >= wait_deadline:
        raise TimeoutError(
            f"Table {all_events_table} was not created within "
            f"{TABLE_WAIT_TIMEOUT_SECONDS} seconds"
        ) from last_lookup_error
    time.sleep(TABLE_POLL_INTERVAL_SECONDS)

In [None]:
# Final status line; pipeline_id was set by the create-or-update cell above.
completion_message = f"‚úÖ Lakeflow stage complete (pipeline_id={pipeline_id})"
print(completion_message)