### Inspection Data

This stage uploads pre-generated food safety inspection report PDFs to a Unity
Catalog volume and loads the structured metadata as dimension tables. Each PDF
contains an inspection report for one of the 4 ghost kitchen locations.

In [None]:
%pip install --upgrade databricks-sdk

In [None]:
# Restart the Python process right after the %pip install in the previous cell
# (presumably so the upgraded databricks-sdk is the version that gets imported).
dbutils.library.restartPython()

In [None]:
# Target catalog name, read from the notebook widget named "CATALOG".
# Every catalog/schema/volume/table name below is built from this value.
CATALOG = dbutils.widgets.get("CATALOG")

##### Create catalog, schema, and volume for food safety documents

In [None]:
# Provision the catalog, then the food_safety schema, then the reports volume.
# Order matters (each object lives inside the previous one); IF NOT EXISTS
# lets the cell be re-run without failing on objects that are already there.
for ddl_statement in (
    f"CREATE CATALOG IF NOT EXISTS {CATALOG}",
    f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.food_safety",
    f"CREATE VOLUME IF NOT EXISTS {CATALOG}.food_safety.reports",
):
    spark.sql(ddl_statement)

print(f"\u2705 Created schema {CATALOG}.food_safety and volume reports")

##### Copy PDF files from the repo into the Unity Catalog volume

In [None]:
import os
import glob
import shutil

# Source PDFs are resolved relative to the notebook's working directory;
# the destination is the Unity Catalog volume created in the previous cell.
pdf_source_dir = os.path.abspath("../data/inspections/pdfs")
volume_path = f"/Volumes/{CATALOG}/food_safety/reports"

# glob returns matches in arbitrary order; sort so the upload order and the
# log output are the same on every run.
pdf_files = sorted(glob.glob(os.path.join(pdf_source_dir, "*.pdf")))

# Fail fast instead of reporting a successful upload of zero files, which
# would otherwise hide a wrong working directory or a missing data folder.
if not pdf_files:
    raise FileNotFoundError(f"No PDF files found in {pdf_source_dir}")
print(f"Found {len(pdf_files)} PDF files to upload")

for pdf_file in pdf_files:
    filename = os.path.basename(pdf_file)
    # Stream the copy in chunks rather than loading each PDF fully into memory.
    with open(pdf_file, "rb") as src, open(f"{volume_path}/{filename}", "wb") as dst:
        shutil.copyfileobj(src, dst)
    print(f"  Uploaded: {filename}")

print(f"\u2705 Uploaded {len(pdf_files)} PDFs to {volume_path}")

##### Load structured metadata as dimension tables

The `inspection_metadata.json` file contains the source data used to generate the PDFs,
including scores, violations, and corrective actions for each inspection.

In [None]:
import json

# Structured source data behind the PDFs, resolved relative to the notebook's
# working directory (`os` is imported in the PDF upload cell above).
metadata_path = os.path.abspath("../data/inspections/inspection_metadata.json")

# Read explicitly as UTF-8: without an encoding argument open() falls back to
# the locale's default, which can mis-decode non-ASCII text in the JSON.
with open(metadata_path, encoding="utf-8") as f:
    metadata = json.load(f)

print(f"Loaded metadata for {len(metadata['inspections'])} inspections across {len(metadata['locations'])} locations")

In [None]:
from datetime import date as dt_date
from pyspark.sql.types import (
    DateType, IntegerType, StringType, StructField, StructType
)

# (column name, Spark type) pairs for the inspections table, in column order.
# The same list drives both the schema and the per-row dict construction.
inspection_columns = [
    ("inspection_id", StringType()),
    ("location_id", IntegerType()),
    ("location_name", StringType()),
    ("address", StringType()),
    ("jurisdiction", StringType()),
    ("inspection_date", DateType()),
    ("inspector_name", StringType()),
    ("score", IntegerType()),
    ("grade", StringType()),
    ("violation_count", IntegerType()),
    ("critical_count", IntegerType()),
    ("major_count", IntegerType()),
    ("minor_count", IntegerType()),
    ("follow_up_status", StringType()),
]

inspection_schema = StructType(
    [StructField(col_name, col_type) for col_name, col_type in inspection_columns]
)

# One row per inspection. Every value is copied through unchanged except
# inspection_date, whose ISO string is parsed into a date for the DateType column.
inspection_rows = [
    {
        col_name: (
            dt_date.fromisoformat(insp[col_name])
            if col_name == "inspection_date"
            else insp[col_name]
        )
        for col_name, _ in inspection_columns
    }
    for insp in metadata["inspections"]
]

df_inspections = spark.createDataFrame(inspection_rows, schema=inspection_schema)
inspections_table = f"{CATALOG}.food_safety.inspections"
# Overwrite so that re-running the stage replaces the table instead of appending.
df_inspections.write.mode("overwrite").saveAsTable(inspections_table)
print(f"\u2705 Created inspections table with {df_inspections.count()} rows")

In [None]:
# Explode each inspection's "violations" list into one row per violation.
# The parent inspection's id, location and date are repeated on every row.
violation_rows = [
    {
        "inspection_id": insp["inspection_id"],
        "location_id": insp["location_id"],
        "location_name": insp["location_name"],
        "inspection_date": dt_date.fromisoformat(insp["inspection_date"]),
        "code": violation["code"],
        "severity": violation["severity"],
        "category": violation["category"],
        "description": violation["description"],
        "corrective_action": violation["corrective_action"],
        "deadline_days": violation["deadline_days"],
    }
    for insp in metadata["inspections"]
    for violation in insp["violations"]
]

# (column name, Spark type) pairs for the violations table, in column order.
violation_columns = [
    ("inspection_id", StringType()),
    ("location_id", IntegerType()),
    ("location_name", StringType()),
    ("inspection_date", DateType()),
    ("code", StringType()),
    ("severity", StringType()),
    ("category", StringType()),
    ("description", StringType()),
    ("corrective_action", StringType()),
    ("deadline_days", IntegerType()),
]
violation_schema = StructType(
    [StructField(col_name, col_type) for col_name, col_type in violation_columns]
)

df_violations = spark.createDataFrame(violation_rows, schema=violation_schema)
violations_table = f"{CATALOG}.food_safety.violations"
# Overwrite so that re-running the stage replaces the table instead of appending.
df_violations.write.mode("overwrite").saveAsTable(violations_table)
print(f"\u2705 Created violations table with {df_violations.count()} rows")

##### Register resources with uc_state for cleanup

In [None]:
# Extend the module search path with ../utils (relative to the working
# directory) before importing uc_state — presumably where that module lives.
import sys
sys.path.append('../utils')
from uc_state import add

# NOTE(review): `add` is imported but never called in this cell, so nothing is
# actually registered with uc_state here. TODO confirm whether the schema,
# volume and tables created by this notebook should be registered for cleanup.
print("\u2705 Inspection data stage complete")