# 01 — Seed reference data (Unity Catalog + Delta)

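Creates small reference tables used by subsequent notebooks. They are written to the reference schema (`credentialing_ref` by default, configurable via the `schema_ref` widget), abbreviated as `ref` below.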


In [None]:
%pip install faker==25.2.0


In [None]:
# Configuration (Databricks widgets)
# These widgets make the demo portable across workspaces/accounts.
# If you're running this outside a Databricks notebook, it will fall back to defaults.
#
# Each widget is resolved independently: a widget that cannot be created/read,
# is blank, or (for numeric settings) does not parse as an integer falls back
# to its own default without discarding the values of the other widgets.

DEFAULT_CATALOG = "rtpa_catalog"
DEFAULT_SCHEMA_REF = "credentialing_ref"
DEFAULT_SCHEMA_BRONZE = "credentialing_bronze"
DEFAULT_SCHEMA_SILVER = "credentialing_silver"
DEFAULT_SCHEMA_GOLD = "credentialing_gold"

DEFAULT_N_PROVIDERS = 200
DEFAULT_DAYS_SCHEDULE = 14
DEFAULT_SEED = 42


def _widget_text(name, default, label):
    """Register a text widget and return its value as a string.

    Args:
        name: Widget identifier passed to ``dbutils.widgets``.
        default: Default value; shown in the widget and returned as a string
            when the widget is unavailable or left blank.
        label: Human-readable widget label.

    Returns:
        The widget's current text, or ``str(default)`` as a fallback.
    """
    try:
        dbutils.widgets.text(name, str(default), label)
        value = dbutils.widgets.get(name)
    except Exception:
        # Deliberately broad best-effort: when `dbutils` is not defined this is
        # a NameError; other runtimes may fail differently (not verified here).
        return str(default)
    return value or str(default)


def _widget_int(name, default, label):
    """Register a text widget and return its value parsed as an ``int``.

    A value that is not a valid integer is reported with a warning and replaced
    by ``default`` instead of silently resetting every other setting.

    Args:
        name: Widget identifier passed to ``dbutils.widgets``.
        default: Integer default, used when the widget is unavailable, blank,
            or not parseable.
        label: Human-readable widget label.

    Returns:
        The parsed integer, or ``default``.
    """
    raw = _widget_text(name, default, label)
    try:
        return int(raw)
    except (TypeError, ValueError):
        print(f"WARNING: widget '{name}' value {raw!r} is not an integer; using default {default}.")
        return default


# Widgets are created in the same order as before: names first, then sizing knobs.
catalog = _widget_text("catalog", DEFAULT_CATALOG, "Catalog")
schema_ref = _widget_text("schema_ref", DEFAULT_SCHEMA_REF, "Schema (ref)")
schema_bronze = _widget_text("schema_bronze", DEFAULT_SCHEMA_BRONZE, "Schema (bronze)")
schema_silver = _widget_text("schema_silver", DEFAULT_SCHEMA_SILVER, "Schema (silver)")
schema_gold = _widget_text("schema_gold", DEFAULT_SCHEMA_GOLD, "Schema (gold)")

N_PROVIDERS = _widget_int("n_providers", DEFAULT_N_PROVIDERS, "N providers")
DAYS_SCHEDULE = _widget_int("days_schedule", DEFAULT_DAYS_SCHEDULE, "Days schedule")
SEED = _widget_int("seed", DEFAULT_SEED, "Random seed")


# Derived helpers
def fq(sch, tbl):
    """Return the fully qualified table name ``<catalog>.<sch>.<tbl>``.

    The catalog is read from the module-level ``catalog`` at call time.
    """
    return f"{catalog}.{sch}.{tbl}"


In [None]:
# Unity Catalog bootstrap (you may need permissions to create catalogs/schemas).
# Statements are collected first and then executed in order: create the
# catalog, switch to it, then create each of the four schemas if missing.
bootstrap_statements = [
    f"CREATE CATALOG IF NOT EXISTS {catalog}",
    f"USE CATALOG {catalog}",
]
bootstrap_statements.extend(
    f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema_name}"
    for schema_name in (schema_ref, schema_bronze, schema_silver, schema_gold)
)
for statement in bootstrap_statements:
    spark.sql(statement)


## Create reference tables
Tables created: `ref.facility`, `ref.department`, `ref.procedure`, `ref.credential_type`, `ref.payer`, `ref.unit`, `ref.unit_certification`.


In [None]:
from pyspark.sql.types import *

# ref.facility -- one row per facility: id, display name, region.
# All three columns are non-nullable strings.
facility_schema = (
    StructType()
    .add("facility_id", StringType(), False)
    .add("facility_name", StringType(), False)
    .add("region", StringType(), False)
)
facility_rows = [
    ("FAC-001", "Manhattan General", "NYC"),
    ("FAC-002", "Brooklyn Community", "NYC"),
    ("FAC-003", "Queens Regional", "NYC"),
    ("FAC-004", "Bronx Medical Center", "NYC"),
]
facility_df = spark.createDataFrame(data=facility_rows, schema=facility_schema)

# ref.department -- departments, each tied to a facility through facility_id.
# All three columns are non-nullable strings.
department_schema = (
    StructType()
    .add("dept_id", StringType(), False)
    .add("facility_id", StringType(), False)
    .add("dept_name", StringType(), False)
)
department_rows = [
    ("D-001", "FAC-001", "Emergency"),
    ("D-002", "FAC-001", "Surgery"),
    ("D-003", "FAC-002", "Emergency"),
    ("D-004", "FAC-002", "Anesthesiology"),
    ("D-005", "FAC-003", "ICU"),
    ("D-006", "FAC-004", "Cardiology"),
]
department_df = spark.createDataFrame(data=department_rows, schema=department_schema)

# ref.procedure -- procedures, each tied to a department through dept_id.
# The two trailing boolean columns are requires_privilege and requires_acls.
procedure_schema = (
    StructType()
    .add("procedure_code", StringType(), False)
    .add("procedure_name", StringType(), False)
    .add("dept_id", StringType(), False)
    .add("requires_privilege", BooleanType(), False)
    .add("requires_acls", BooleanType(), False)
)
procedure_rows = [
    # (procedure_code, procedure_name, dept_id, requires_privilege, requires_acls)
    ("PROC-ER-INT", "ER Initial Triage", "D-001", False, True),
    ("PROC-ER-SEDS", "Moderate Sedation", "D-001", True, True),
    ("PROC-SURG-APP", "Appendectomy", "D-002", True, False),
    ("PROC-ANES-GEN", "General Anesthesia", "D-004", True, True),
    ("PROC-ICU-VNT", "Ventilator Management", "D-005", True, True),
    ("PROC-CARD-ECG", "ECG Interpretation", "D-006", False, False),
]
procedure_df = spark.createDataFrame(data=procedure_rows, schema=procedure_schema)

# ref.credential_type -- credential kinds with a description and the renewal
# cycle length in days (written as 365 * years for readability).
credential_type_schema = (
    StructType()
    .add("cred_type", StringType(), False)
    .add("cred_desc", StringType(), False)
    .add("renewal_cycle_days", IntegerType(), False)
)
credential_type_rows = [
    ("STATE_MED_LICENSE", "State medical license (required for practice)", 365 * 2),
    ("ACLS", "Advanced Cardiovascular Life Support", 365 * 2),
    ("DEA", "Controlled substance registration (DEA)", 365 * 3),
]
credential_type_df = spark.createDataFrame(
    data=credential_type_rows,
    schema=credential_type_schema,
)

# ref.payer -- payer id and display name, both non-nullable strings.
payer_schema = (
    StructType()
    .add("payer_id", StringType(), False)
    .add("payer_name", StringType(), False)
)
payer_rows = [
    ("PAY-001", "Medicare"),
    ("PAY-002", "Medicaid"),
    ("PAY-003", "Aetna"),
    ("PAY-004", "UnitedHealthcare"),
    ("PAY-005", "Blue Cross Blue Shield"),
]
payer_df = spark.createDataFrame(data=payer_rows, schema=payer_schema)

# ref.unit (hospital units for nurse staffing)
# Columns:
#   unit_id, facility_id, unit_name -- non-nullable strings
#   unit_type    -- ICU, STEP_DOWN, MED_SURG, TELEMETRY, ED, OR, NICU, etc.
#   bed_count    -- integer
#   target_ratio -- target staffing ratio, expressed as patients per nurse
unit_schema = (
    StructType()
    .add("unit_id", StringType(), False)
    .add("facility_id", StringType(), False)
    .add("unit_name", StringType(), False)
    .add("unit_type", StringType(), False)
    .add("bed_count", IntegerType(), False)
    .add("target_ratio", FloatType(), False)
)
unit_rows = [
    # (unit_id, facility_id, unit_name, unit_type, bed_count, target_ratio)
    ("UNIT-ICU-001", "FAC-001", "ICU Tower A", "ICU", 20, 2.0),
    ("UNIT-ICU-002", "FAC-002", "ICU West", "ICU", 16, 2.0),
    ("UNIT-MEDSURG-001", "FAC-001", "Med-Surg 3rd Floor", "MED_SURG", 40, 5.0),
    ("UNIT-MEDSURG-002", "FAC-002", "Med-Surg East", "MED_SURG", 32, 5.0),
    ("UNIT-TELE-001", "FAC-001", "Telemetry Unit", "TELEMETRY", 24, 4.0),
    ("UNIT-ED-001", "FAC-001", "Emergency Department", "ED", 30, 4.0),
    ("UNIT-ED-002", "FAC-002", "Emergency Room", "ED", 24, 4.0),
    ("UNIT-STEPDOWN-001", "FAC-001", "Step-Down Unit", "STEP_DOWN", 18, 3.0),
    ("UNIT-OR-001", "FAC-001", "Operating Rooms", "OR", 12, 1.0),
    ("UNIT-NICU-001", "FAC-003", "Neonatal ICU", "NICU", 24, 2.0),
]
unit_df = spark.createDataFrame(data=unit_rows, schema=unit_schema)

# ref.unit_certification (required certifications by unit type)
# The mapping below lists, per unit type, the certifications in the order the
# rows are emitted; every generated row carries is_required=True.
required_certs_by_unit_type = {
    "ICU": ["ACLS", "BLS", "Critical Care Certification"],
    "STEP_DOWN": ["ACLS", "BLS"],
    "MED_SURG": ["BLS"],
    "TELEMETRY": ["ACLS", "BLS"],
    "ED": ["ACLS", "BLS", "TNCC", "PALS"],
    "OR": ["BLS", "ACLS"],
    "NICU": ["BLS", "NRP"],
}
unit_cert_rows = [
    (unit_kind, cert_name, True)
    for unit_kind, cert_names in required_certs_by_unit_type.items()
    for cert_name in cert_names
]
unit_cert_schema = (
    StructType()
    .add("unit_type", StringType(), False)
    .add("cred_type", StringType(), False)
    .add("is_required", BooleanType(), False)
)
unit_cert_df = spark.createDataFrame(data=unit_cert_rows, schema=unit_cert_schema)

# Write as Delta tables in Unity Catalog.
# Each DataFrame is paired with its table name under the reference schema and
# written with mode "overwrite" and overwriteSchema enabled, in the order
# listed here.
reference_tables = [
    (facility_df, "facility"),
    (department_df, "department"),
    (procedure_df, "procedure"),
    (credential_type_df, "credential_type"),
    (payer_df, "payer"),
    (unit_df, "unit"),
    (unit_cert_df, "unit_certification"),
]
for ref_df, table_name in reference_tables:
    (
        ref_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(fq(schema_ref, table_name))
    )


## Validate
Count the rows in each reference table, then display the `unit` table as a spot check.


In [None]:
# Print the row count of every reference table, then show the unit table
# ordered by unit_id.
tables = [
    "facility",
    "department",
    "procedure",
    "credential_type",
    "payer",
    "unit",
    "unit_certification",
]
for table_name in tables:
    qualified_name = fq(schema_ref, table_name)
    row_count = spark.read.table(qualified_name).count()
    print(f"{qualified_name}: {row_count:,}")

unit_table = spark.read.table(fq(schema_ref, "unit"))
display(unit_table.orderBy("unit_id"))
