# 02 — Generate providers + credentials (bronze)

Writes synthetic provider master data, append-only credential events, privileges, and payer enrollment into `bronze` tables.


In [None]:
%pip install faker==25.2.0


In [None]:
# Configuration (Databricks widgets)
# These widgets make the demo portable across workspaces/accounts.
# If you're running this outside a Databricks notebook, it will fall back to defaults.

DEFAULT_CATALOG = "staffing_catalog"
DEFAULT_SCHEMA_REF = "credentialing_ref"
DEFAULT_SCHEMA_BRONZE = "credentialing_bronze"
DEFAULT_SCHEMA_SILVER = "credentialing_silver"
DEFAULT_SCHEMA_GOLD = "credentialing_gold"

DEFAULT_N_PROVIDERS = 200
DEFAULT_DAYS_SCHEDULE = 14
DEFAULT_SEED = 42

# (widget name, default value, widget label), in the order the widgets are created.
_WIDGET_SPECS = [
    ("catalog", DEFAULT_CATALOG, "Catalog"),
    ("schema_ref", DEFAULT_SCHEMA_REF, "Schema (ref)"),
    ("schema_bronze", DEFAULT_SCHEMA_BRONZE, "Schema (bronze)"),
    ("schema_silver", DEFAULT_SCHEMA_SILVER, "Schema (silver)"),
    ("schema_gold", DEFAULT_SCHEMA_GOLD, "Schema (gold)"),
    ("n_providers", DEFAULT_N_PROVIDERS, "N providers"),
    ("days_schedule", DEFAULT_DAYS_SCHEDULE, "Days schedule"),
    ("seed", DEFAULT_SEED, "Random seed"),
]

try:
    for _name, _default, _label in _WIDGET_SPECS:
        dbutils.widgets.text(_name, str(_default), _label)
    # An empty widget value counts as "not set" and falls back to its default.
    _settings = {
        _name: dbutils.widgets.get(_name) or _default
        for _name, _default, _label in _WIDGET_SPECS
    }
except Exception:
    # Widgets are unavailable (e.g. `dbutils` is not defined outside a
    # Databricks notebook): use the defaults for every setting.
    _settings = {_name: _default for _name, _default, _label in _WIDGET_SPECS}


def _int_setting(setting_name):
    """Return the named setting as an int.

    Parsing happens outside the widget fallback above on purpose: a mistyped
    number must fail loudly instead of silently resetting every setting
    (including the target catalog) to its default.

    Raises:
        ValueError: if the configured value is not a valid integer.
    """
    raw_value = _settings[setting_name]
    try:
        return int(raw_value)
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"Widget '{setting_name}' must be an integer, got {raw_value!r}"
        ) from err


catalog = _settings["catalog"]
schema_ref = _settings["schema_ref"]
schema_bronze = _settings["schema_bronze"]
schema_silver = _settings["schema_silver"]
schema_gold = _settings["schema_gold"]

N_PROVIDERS = _int_setting("n_providers")
DAYS_SCHEDULE = _int_setting("days_schedule")
SEED = _int_setting("seed")


# Derived helpers
def fq(sch, tbl):
    """Return the fully qualified table name `<catalog>.<sch>.<tbl>`."""
    return f"{catalog}.{sch}.{tbl}"


In [None]:
# Unity Catalog bootstrap (you may need permissions to create catalogs/schemas).
# Statements run in order: create the catalog, switch to it, then create each schema.
bootstrap_statements = [
    f"CREATE CATALOG IF NOT EXISTS {catalog}",
    f"USE CATALOG {catalog}",
]
bootstrap_statements.extend(
    f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema_name}"
    for schema_name in (schema_ref, schema_bronze, schema_silver, schema_gold)
)

for statement in bootstrap_statements:
    spark.sql(statement)


## Load reference data
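Requires that Notebook 01 has already run: it creates the reference tables read below (`facility`, `credential_type`, `payer`, `procedure`, `unit`) in the ref schema.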


In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
import random
import uuid
from datetime import datetime, timedelta
from faker import Faker

# Seed both random sources so a given SEED drives the same sequence of draws.
random.seed(SEED)
fake = Faker()
fake.seed_instance(SEED)


def _ordered_ref_values(table_name, column_name):
    """Return every value of one reference-table column, ordered by that column.

    The explicit ``orderBy`` is what keeps the seeded generation reproducible:
    these lists are consumed by ``random.choice``/``random.sample`` and by
    loops, so a row order that differs between reads would change the
    generated data even with the same seed.
    """
    column_df = spark.read.table(fq(schema_ref, table_name)).select(column_name)
    return [row[column_name] for row in column_df.orderBy(column_name).collect()]


facility_ids = _ordered_ref_values("facility", "facility_id")
cred_types = _ordered_ref_values("credential_type", "cred_type")
payer_ids = _ordered_ref_values("payer", "payer_id")
procedures_df = spark.read.table(fq(schema_ref, "procedure")).select("procedure_code", "requires_privilege")

SPECIALTIES = ["Emergency Medicine", "Surgery", "Anesthesiology", "Critical Care", "Cardiology"]
# "ACTIVE" is listed three times so a uniform random.choice picks it 3 times out of 5.
PROVIDER_STATUSES = ["ACTIVE", "ACTIVE", "ACTIVE", "INACTIVE", "ON_LEAVE"]

# Fixed base timestamp keeps demo reproducible (even across reruns)
base_ts = datetime(2026, 1, 1, 8, 0, 0)


## bronze.provider_raw
Columns: (provider_id, provider_name, specialty, home_facility_id, hired_at, provider_status, created_at, employment_type, hourly_rate, primary_unit_id)


In [None]:
# Load unit IDs for nurse staffing assignment.
# Ordered explicitly: the list feeds a seeded random.choice below, so an
# unordered collect() could assign different units on each run.
unit_ids = [
    r["unit_id"]
    for r in spark.read.table(fq(schema_ref, "unit")).select("unit_id").orderBy("unit_id").collect()
]

# Employment types and hourly rates for nurse staffing cost analysis
EMPLOYMENT_TYPES = ["INTERNAL", "INTERNAL", "INTERNAL", "CONTRACT", "AGENCY"]  # Weighted toward internal
HOURLY_RATES = {"INTERNAL": 50.0, "CONTRACT": 75.0, "AGENCY": 95.0}

# One tuple per provider, in provider_schema field order. Later cells read
# these tuples by position (0 = provider_id, 4 = hired_at, 5 = provider_status),
# so the field order must not change.
provider_rows = []
for i in range(N_PROVIDERS):
    # uuid5 of a seed/index-based name gives the same ID for the same SEED and index.
    provider_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"provider-{SEED}-{i}"))
    provider_name = fake.name()
    specialty = random.choice(SPECIALTIES)
    home_facility_id = random.choice(facility_ids)
    hired_at = base_ts.date() - timedelta(days=random.randint(30, 3650))
    provider_status = random.choice(PROVIDER_STATUSES)
    created_at = base_ts + timedelta(minutes=i)
    # Nurse staffing fields
    employment_type = random.choice(EMPLOYMENT_TYPES)
    hourly_rate = HOURLY_RATES[employment_type] + random.uniform(-5, 10)  # Add some variance
    primary_unit_id = random.choice(unit_ids) if random.random() < 0.7 else None  # 70% have a primary unit
    provider_rows.append(
        (
            provider_id,
            provider_name,
            specialty,
            home_facility_id,
            hired_at,
            provider_status,
            created_at,
            employment_type,
            hourly_rate,
            primary_unit_id,
        )
    )

provider_schema = StructType([
    StructField("provider_id", StringType(), False),
    StructField("provider_name", StringType(), False),
    StructField("specialty", StringType(), False),
    StructField("home_facility_id", StringType(), False),
    StructField("hired_at", DateType(), False),
    StructField("provider_status", StringType(), False),
    StructField("created_at", TimestampType(), False),
    StructField("employment_type", StringType(), False),  # INTERNAL, CONTRACT, AGENCY
    StructField("hourly_rate", FloatType(), False),       # For labor cost calculations
    StructField("primary_unit_id", StringType(), True),   # Home unit assignment (nullable)
])

# Full refresh: the table (and its schema) is replaced on every run.
provider_df = spark.createDataFrame(provider_rows, provider_schema)
provider_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(fq(schema_bronze, "provider_raw"))


## bronze.credential_event_raw
Append-only style events: (event_id, provider_id, cred_type, issued_at, expires_at, verified_at, source_system, cred_status, ingested_at)


In [None]:
# Credential types on the 730-day cycle; they may also get up to two extra events.
SHORT_CYCLE_CRED_TYPES = ("STATE_MED_LICENSE", "ACLS")

event_rows = []
event_i = 0  # running event counter: seeds both event_id and ingested_at

midnight = datetime.min.time()
as_of_date = base_ts.date()

# NOTE(review): issued_at is derived from hired_at plus up to 365 * k days, so
# later events can be dated after base_ts while ingested_at stays near base_ts.
# Confirm this is intended for the demo.
# The order of random draws below is significant: changing it changes the data
# produced for a given seed.
for provider in provider_rows:
    provider_id, hired_at = provider[0], provider[4]
    for ct in cred_types:
        on_short_cycle = ct in SHORT_CYCLE_CRED_TYPES
        if on_short_cycle:
            extra_events = random.randint(0, 2)
            cycle_days = 730
        else:
            extra_events = random.randint(0, 1)
            cycle_days = 1095
        n_events = 1 + extra_events

        # First issuance falls within 60 days of the hire date.
        issued0 = datetime.combine(hired_at, midnight) + timedelta(days=random.randint(0, 60))

        for k in range(n_events):
            event_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"cred-event-{SEED}-{event_i}"))
            issued_at = issued0 + timedelta(days=365 * k + random.randint(0, 30))

            # Drift around the nominal cycle creates some expired credentials for risk analytics.
            drift = random.randint(-120, 240)
            expires_at = issued_at + timedelta(days=cycle_days + drift)

            # About 85% of events carry a verification timestamp, 1-14 days after issuance.
            if random.random() < 0.85:
                verified_at = issued_at + timedelta(days=random.randint(1, 14))
            else:
                verified_at = None

            source_system = random.choice(["CRED_SYS_A", "CRED_SYS_B"])

            if expires_at.date() < as_of_date:
                cred_status = "EXPIRED"
            else:
                cred_status = random.choice(["ACTIVE", "ACTIVE", "PENDING_REVIEW"])

            ingested_at = base_ts + timedelta(seconds=event_i)
            event_rows.append(
                (event_id, provider_id, ct, issued_at, expires_at, verified_at, source_system, cred_status, ingested_at)
            )
            event_i += 1

# (column name, Spark type, nullable), in the same order as the row tuples above.
cred_columns = [
    ("event_id", StringType(), False),
    ("provider_id", StringType(), False),
    ("cred_type", StringType(), False),
    ("issued_at", TimestampType(), False),
    ("expires_at", TimestampType(), False),
    ("verified_at", TimestampType(), True),
    ("source_system", StringType(), False),
    ("cred_status", StringType(), False),
    ("ingested_at", TimestampType(), False),
]
cred_schema = StructType([StructField(col_name, col_type, nullable) for col_name, col_type, nullable in cred_columns])

cred_event_df = spark.createDataFrame(event_rows, cred_schema)
(
    cred_event_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(fq(schema_bronze, "credential_event_raw"))
)


## bronze.privilege_raw
Privileges by provider/facility/procedure: (provider_id, facility_id, procedure_code, granted_at, privilege_status, source_system, ingested_at)


In [None]:
# Procedure codes that require a privilege.
# Ordered explicitly: the list feeds a seeded random.sample below, so an
# unordered collect() could pick different procedures on each run.
proc_priv = [
    r["procedure_code"]
    for r in (
        procedures_df
        .filter(F.col("requires_privilege") == True)
        .select("procedure_code")
        .orderBy("procedure_code")
        .collect()
    )
]

priv_rows = []
priv_i = 0  # running row counter, used to space out ingested_at

for p in provider_rows:
    provider_id = p[0]
    provider_status = p[5]
    # ACTIVE providers can hold up to 4 privileges, all others up to 2.
    max_priv = 4 if provider_status == "ACTIVE" else 2
    n_priv = random.randint(0, max_priv)
    # random.sample yields distinct procedures; the cap avoids asking for more than exist.
    for proc in random.sample(proc_priv, k=min(n_priv, len(proc_priv))):
        facility_id = random.choice(facility_ids)
        granted_at = base_ts - timedelta(days=random.randint(0, 1200))
        privilege_status = random.choice(["ACTIVE", "ACTIVE", "SUSPENDED", "REVOKED"])
        source_system = random.choice(["PRIV_SYS_A", "PRIV_SYS_B"])
        # The 100000-second offset is added on top of the per-row counter.
        ingested_at = base_ts + timedelta(seconds=100000 + priv_i)
        priv_rows.append((provider_id, facility_id, proc, granted_at, privilege_status, source_system, ingested_at))
        priv_i += 1

priv_schema = StructType([
    StructField("provider_id", StringType(), False),
    StructField("facility_id", StringType(), False),
    StructField("procedure_code", StringType(), False),
    StructField("granted_at", TimestampType(), False),
    StructField("privilege_status", StringType(), False),
    StructField("source_system", StringType(), False),
    StructField("ingested_at", TimestampType(), False)
])

# Full refresh: the table (and its schema) is replaced on every run.
priv_df = spark.createDataFrame(priv_rows, priv_schema)
priv_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(fq(schema_bronze, "privilege_raw"))


## bronze.payer_enrollment_raw
Enrollment rows: (provider_id, payer_id, enrollment_status, effective_at, source_system, ingested_at)


In [None]:
enr_rows = []
enr_i = 0  # running row counter, used to space out ingested_at

as_of_date = base_ts.date()

# The order of random draws below is significant: changing it changes the data
# produced for a given seed.
for provider in provider_rows:
    provider_id, provider_status = provider[0], provider[5]
    is_active = provider_status == "ACTIVE"

    # ACTIVE providers enroll with 1-3 payers, all others with 0-2.
    if is_active:
        n = random.randint(1, 3)
    else:
        n = random.randint(0, 2)

    # n is never negative, so only the upper cap (number of payers) is needed.
    chosen = random.sample(payer_ids, k=min(n, len(payer_ids)))

    for payer_id in chosen:
        effective_at = as_of_date - timedelta(days=random.randint(0, 900))

        # Non-active providers get an INACTIVE enrollment about 60% of the time;
        # the random draw only happens for non-active providers (short-circuit).
        if not is_active and random.random() < 0.6:
            enrollment_status = "INACTIVE"
        else:
            enrollment_status = random.choice(["ACTIVE", "ACTIVE", "PENDING"])

        source_system = random.choice(["PAYER_SYS_A", "PAYER_SYS_B"])
        ingested_at = base_ts + timedelta(seconds=200000 + enr_i)
        enr_rows.append((provider_id, payer_id, enrollment_status, effective_at, source_system, ingested_at))
        enr_i += 1

# (column name, Spark type, nullable), in the same order as the row tuples above.
enr_columns = [
    ("provider_id", StringType(), False),
    ("payer_id", StringType(), False),
    ("enrollment_status", StringType(), False),
    ("effective_at", DateType(), False),
    ("source_system", StringType(), False),
    ("ingested_at", TimestampType(), False),
]
enr_schema = StructType([StructField(col_name, col_type, nullable) for col_name, col_type, nullable in enr_columns])

enr_df = spark.createDataFrame(enr_rows, enr_schema)
(
    enr_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(fq(schema_bronze, "payer_enrollment_raw"))
)


## Validate
Counts + simple groupBy (providers by specialty).


In [None]:
# Row counts for every bronze table written by this notebook.
bronze_tables = ("provider_raw", "credential_event_raw", "privilege_raw", "payer_enrollment_raw")

print("Counts:")
for table in bronze_tables:
    qualified_name = fq(schema_bronze, table)
    row_count = spark.read.table(qualified_name).count()
    print(f"{qualified_name}: {row_count:,}")

# Providers per specialty, largest group first.
providers_by_specialty = (
    spark.read.table(fq(schema_bronze, "provider_raw"))
    .groupBy("specialty")
    .count()
    .orderBy(F.col("count").desc())
)
display(providers_by_specialty)
