### Menu Data

This stage uploads pre-generated restaurant menu PDFs to a Unity Catalog volume
and loads the structured metadata as a dimension table. These PDFs contain
nutritional information and allergen data for each brand's menu items.

In [None]:
%pip install --upgrade databricks-sdk

In [None]:
# Restart the Python process after the %pip cell above.
# NOTE(review): presumably needed so the upgraded databricks-sdk is the
# version that gets imported afterwards — confirm.
dbutils.library.restartPython()

In [None]:
# Name of the target Unity Catalog catalog, read from the notebook widget
# called "CATALOG". Every schema, volume, and table below is created under it.
CATALOG = dbutils.widgets.get("CATALOG")

##### Create catalog, schema, and volume for menu documents

In [None]:
# Create the containers from the outside in: the schema lives in the catalog
# and the volume lives in the schema. Each statement uses IF NOT EXISTS.
for ddl in (
    f"CREATE CATALOG IF NOT EXISTS {CATALOG}",
    f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.menu_documents",
    f"CREATE VOLUME IF NOT EXISTS {CATALOG}.menu_documents.menus",
):
    spark.sql(ddl)
print(f"\u2705 Created schema {CATALOG}.menu_documents and volume menus")

##### Copy PDF files from the repo into the Unity Catalog volume

In [None]:
import glob
import os
import shutil

# Source PDFs ship with the repo; the relative path is resolved against the
# notebook's current working directory.
pdf_source_dir = os.path.abspath("../data/menus/pdfs")
volume_path = f"/Volumes/{CATALOG}/menu_documents/menus"

# Sort the matches so the upload order (and the log output) is the same on
# every run; glob returns them in arbitrary filesystem order.
pdf_files = sorted(glob.glob(os.path.join(pdf_source_dir, "*.pdf")))
print(f"Found {len(pdf_files)} PDF files to upload")

# Fail loudly instead of reporting a "successful" upload of zero files, which
# would otherwise only surface later as an empty volume.
if not pdf_files:
    raise FileNotFoundError(f"No PDF files found in {pdf_source_dir}")

for pdf_file in pdf_files:
    filename = os.path.basename(pdf_file)
    with open(pdf_file, "rb") as src, open(f"{volume_path}/{filename}", "wb") as dst:
        # Stream in chunks rather than loading each PDF fully into memory.
        shutil.copyfileobj(src, dst)
    print(f"  Uploaded: {filename}")

print(f"\u2705 Uploaded {len(pdf_files)} PDFs to {volume_path}")

##### Load structured metadata as a dimension table

The `menu_metadata.json` file contains the source data used to generate the PDFs,
including each brand's items with their nutritional info and allergens.

In [None]:
import json
import os  # imported here too so this cell does not depend on an earlier cell

# The relative path is resolved against the notebook's current working directory.
metadata_path = os.path.abspath("../data/menus/menu_metadata.json")

# JSON is UTF-8 by specification; pass the encoding explicitly so any
# non-ASCII text is decoded the same way regardless of the runtime's locale.
with open(metadata_path, encoding="utf-8") as f:
    metadata = json.load(f)

print(f"Loaded metadata for {len(metadata['brands'])} brands")

In [None]:
from pyspark.sql.types import (
    ArrayType,
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# One output row per menu item, each carrying its parent brand's attributes.
# Numeric fields are coerced up front so they match the explicit schema below.
rows = [
    {
        "brand_name": brand["brand_name"],
        "cuisine": brand["cuisine"],
        "pdf_filename": brand["pdf_filename"],
        "item_name": item["name"],
        "description": item["description"],
        "category": item["category"],
        "price": float(item["price"]),
        "calories": int(item["calories"]),
        "protein_g": int(item["protein_g"]),
        "fat_g": int(item["fat_g"]),
        "carbs_g": int(item["carbs_g"]),
        "allergens": item["allergens"],
    }
    for brand in metadata["brands"]
    for item in brand["items"]
]

# Column name -> Spark type, in the order the table columns should appear.
column_types = [
    ("brand_name", StringType()),
    ("cuisine", StringType()),
    ("pdf_filename", StringType()),
    ("item_name", StringType()),
    ("description", StringType()),
    ("category", StringType()),
    ("price", DoubleType()),
    ("calories", IntegerType()),
    ("protein_g", IntegerType()),
    ("fat_g", IntegerType()),
    ("carbs_g", IntegerType()),
    ("allergens", ArrayType(StringType())),
]
schema = StructType([StructField(column, dtype) for column, dtype in column_types])

# Build the DataFrame with the explicit schema and replace the table contents.
df = spark.createDataFrame(rows, schema=schema)
target_table = f"{CATALOG}.menu_documents.brands_metadata"
df.write.mode("overwrite").saveAsTable(target_table)
print(f"\u2705 Created brands_metadata table with {df.count()} items")

##### Register resources with uc_state for cleanup

In [None]:
import sys

# Make the repo's shared helpers importable. Guard the append so re-running
# this cell does not keep adding duplicate entries to sys.path.
if '../utils' not in sys.path:
    sys.path.append('../utils')
from uc_state import add

# NOTE(review): `add` is imported but not called in this cell, so this stage
# registers nothing itself. The schema and volume live inside the catalog and
# will be cleaned up when the catalog is dropped; presumably another stage has
# already registered the catalog — confirm, or call `add` here if not.
print("\u2705 Menu data stage complete")