# Casper's Kitchens - Canonical Data Replay

This notebook replays pre-generated ghost kitchen events from the canonical dataset using a custom PySpark streaming data source.

**Features:**
- Reads from a single self-contained `events.parquet` file (34 MB)
- Uses checkpoint to track progress
- Supports variable speed multiplier (1x, 60x, 3600x, etc.)
- Works with `availableNow` trigger for scheduled runs

**Dataset:**
- 75,780 orders across 4 cities (SF, SV, Bellevue, Chicago)
- 1,014,290 events (order lifecycle + driver tracking)
- Real OpenStreetMap routes

## Configuration

In [None]:
import os

# Notebook parameters arrive as widget strings; the numeric ones are converted here.
CATALOG, SCHEMA, VOLUME = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("CATALOG", "SCHEMA", "VOLUME")
)
START_DAY = int(dbutils.widgets.get("START_DAY"))
SPEED_MULTIPLIER = float(dbutils.widgets.get("SPEED_MULTIPLIER"))

# Locations: the dataset is resolved relative to the current working directory,
# while output and checkpoints live under /Volumes/<catalog>/<schema>/...
DATASET_PATH = os.path.abspath(
    os.path.join("..", "data", "canonical", "canonical_dataset")
)
VOLUME_PATH = "/".join(("/Volumes", CATALOG, SCHEMA, VOLUME))
CHECKPOINT_PATH = "/".join(("/Volumes", CATALOG, SCHEMA, "misc", "checkpoints"))

# Echo the effective configuration so it is visible in the run output.
print("\n".join((
    "ðŸ“‹ Configuration:",
    f"   Catalog: {CATALOG}",
    f"   Schema: {SCHEMA}",
    f"   Volume: {VOLUME}",
    f"   Start Day: {START_DAY}",
    f"   Speed Multiplier: {SPEED_MULTIPLIER}x",
    f"   Dataset Path: {DATASET_PATH}",
    f"   Output Path: {VOLUME_PATH}",
    f"   Checkpoint Path: {CHECKPOINT_PATH}",
)))

## Imports

In [None]:
from pyspark.sql.datasource import DataSource, DataSourceStreamReader
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import Row
from datetime import datetime, timedelta
import pandas as pd
import json
import uuid

## Custom Data Source Definition

In [None]:
class CaspersDataSource(DataSource):
    """
    Streaming data source that replays the pre-generated Casper's Kitchens events.

    Recognised options (all passed through to the stream reader):
    - datasetPath: directory that contains the canonical dataset
    - simulationStartDay: day of the simulation at which replay starts (0-89)
    - speedMultiplier: replay speed (1.0=realtime, 60.0=60x speed, 3600.0=1hr per second)
    """

    @classmethod
    def name(cls):
        """Short name under which the source is registered with Spark."""
        return "caspers"

    def schema(self):
        """Return the schema of emitted events; every column is non-nullable."""
        column_layout = (
            ("event_id", StringType),
            ("event_type", StringType),
            ("ts", StringType),
            ("location_id", IntegerType),
            ("order_id", StringType),
            ("sequence", IntegerType),
            ("body", StringType),
        )
        return StructType(
            [StructField(column, spark_type(), False) for column, spark_type in column_layout]
        )

    def simpleStreamReader(self, schema: StructType):
        """Build the (unpartitioned) stream reader from the schema and the source options."""
        reader = CaspersStreamReader(schema, self.options)
        return reader


class CaspersStreamReader(DataSourceStreamReader):
    """
    Stream reader that replays the canonical dataset using simulation time as offset.

    Offsets are JSON strings of the form
    {"simulation_seconds": <Unix timestamp>, "offset_timestamp": <ISO string>};
    the initial offset additionally carries "is_initial": true.

    Each non-initial read advances the simulation clock by
    (current_real_time - offset_timestamp) * speed_multiplier.

    NOTE(review): PySpark documents DataSource.simpleStreamReader() as returning a
    SimpleDataSourceStreamReader with dict offsets; this class subclasses
    DataSourceStreamReader and exchanges JSON strings - confirm against the
    target runtime before changing either side.
    """

    # The dataset covers 90 days starting at 2024-01-01 00:00:00.
    # NOTE(review): this naive datetime is converted to a Unix timestamp using the
    # machine's local timezone, while "now" is taken in UTC. The two agree only
    # on a UTC-configured cluster - confirm how ts_seconds was generated.
    DATASET_START = datetime(2024, 1, 1)
    DATASET_DAYS = 90
    SECONDS_PER_DAY = 86400

    # Maps the compact event_type_id stored in events.parquet to the event name.
    _EVENT_TYPES = {
        1: "order_created",
        2: "gk_started",
        3: "gk_finished",
        4: "gk_ready",
        5: "driver_arrived",
        6: "driver_picked_up",
        7: "driver_ping",
        8: "delivered",
    }

    def __init__(self, schema, options):
        """
        Parse the reader options and load the whole dataset into memory.

        Args:
            schema: Output schema handed over by the data source.
            options: Mapping with optional keys datasetPath, simulationStartDay
                and speedMultiplier (string values).
        """
        self.schema = schema
        self.options = options

        # Parse options
        self.dataset_path = options.get("datasetPath", "./canonical_dataset")
        self.sim_start_day = int(options.get("simulationStartDay", "20"))
        self.speed_multiplier = float(options.get("speedMultiplier", "1.0"))

        # Load canonical dataset using pandas (NOT spark.read - avoids circular dependency!)
        print(f"ðŸ“¦ Loading dataset from {self.dataset_path}...")
        self.events_df = pd.read_parquet(f"{self.dataset_path}/events.parquet")

        unique_orders = self.events_df['order_id'].nunique()
        print(f"âœ… Caspers DataSource initialized")
        print(f"   Orders: {unique_orders:,}")
        print(f"   Events: {len(self.events_df):,}")
        print(f"   Start Day: {self.sim_start_day}")
        print(f"   Speed: {self.speed_multiplier}x")

    def _dataset_epoch(self):
        """Return the Unix timestamp (float) of the first instant of the dataset."""
        return self.DATASET_START.timestamp()

    def _dataset_end(self):
        """Return the Unix timestamp (int) just past the last day of the dataset."""
        return int(self._dataset_epoch() + (self.DATASET_DAYS * self.SECONDS_PER_DAY))

    def initialOffset(self):
        """
        Return the starting offset for the stream as a JSON string.

        The offset points at the dataset start (day 0) and is flagged with
        "is_initial" so that the first read() performs the historical catchup
        (day 0 up to START_DAY + current time of day) without the speed multiplier.
        """
        now = datetime.utcnow()

        # Start from beginning of dataset (day 0)
        initial_unix_ts = int(self._dataset_epoch())

        initial = {
            "simulation_seconds": initial_unix_ts,
            "offset_timestamp": now.isoformat(),
            "is_initial": True  # Flag to indicate this is the first run
        }

        print(f"ðŸŽ¬ Initial offset: day 0 (dataset start)")
        print(f"   First run will output all data from day 0 â†’ day {self.sim_start_day} @ {now.strftime('%H:%M:%S')}")
        print(f"   Speed multiplier ({self.speed_multiplier}x) will be used for subsequent runs")
        return json.dumps(initial)

    def latestOffset(self):
        """
        Return the furthest offset the stream can ever reach, as a JSON string.

        This is always the end of the dataset (start + 90 days) stamped with the
        current UTC time; the elapsed-time calculation itself happens in read().
        """
        now = datetime.utcnow()

        latest = {
            "simulation_seconds": self._dataset_end(),  # End of dataset as Unix timestamp
            "offset_timestamp": now.isoformat()
        }
        return json.dumps(latest)

    def read(self, start_offset):
        """
        Read the events between start_offset and the current simulation time.

        First run (offset flagged "is_initial"): emits everything from day 0 up
        to START_DAY + current UTC time of day; the speed multiplier is ignored.
        Subsequent runs: the simulation clock advances by the real time elapsed
        since the offset's timestamp, scaled by the speed multiplier.

        Args:
            start_offset: JSON offset string; a falsy value is treated as
                simulation second 0 stamped with the current time.

        Returns:
            Tuple (iterator of Row, end offset as JSON string).
        """
        # Parse start offset
        if start_offset:
            start = json.loads(start_offset)
        else:
            start = {"simulation_seconds": 0, "offset_timestamp": datetime.utcnow().isoformat()}
        start_sim_seconds = start["simulation_seconds"]
        start_real_time = datetime.fromisoformat(start["offset_timestamp"])
        is_initial = start.get("is_initial", False)

        now = datetime.utcnow()
        dataset_epoch = self._dataset_epoch()
        max_sim_seconds = self._dataset_end()

        # Real-time anchor stored in the end offset; refined below for normal runs.
        end_real_time = now

        if is_initial:
            # First run: Just output all historical data up to START_DAY + current time
            current_time_of_day = (now.hour * 3600) + (now.minute * 60) + now.second
            end_sim_seconds = int(
                dataset_epoch + (self.sim_start_day * self.SECONDS_PER_DAY) + current_time_of_day
            )
            print(f"ðŸ“– Reading events (FIRST RUN - historical catchup):")
            print(f"   Start: day 0 00:00:00")
            print(f"   End:   day {self.sim_start_day} @ {now.strftime('%H:%M:%S')}")
            print(f"   Outputting all historical data (speed multiplier not used)")
        else:
            # Subsequent runs: Use speed multiplier
            elapsed_real_seconds = (now - start_real_time).total_seconds()
            # Never step backwards: clock skew (or a negative multiplier) would
            # otherwise rewind the offset and replay already-emitted events.
            elapsed_sim_seconds = max(0, int(elapsed_real_seconds * self.speed_multiplier))
            end_sim_seconds = start_sim_seconds + elapsed_sim_seconds

            # int() drops the fractional simulated second. Advance the real-time
            # anchor only by the real time that the whole simulated seconds
            # account for, so the remainder carries over to the next read
            # instead of being lost (which stalls the clock on frequent polls).
            if self.speed_multiplier > 0 and end_sim_seconds < max_sim_seconds:
                consumed_real = timedelta(seconds=elapsed_sim_seconds / self.speed_multiplier)
                end_real_time = min(now, start_real_time + consumed_real)

            start_day = int((start_sim_seconds - dataset_epoch) / self.SECONDS_PER_DAY)
            end_day = int((end_sim_seconds - dataset_epoch) / self.SECONDS_PER_DAY)

            print(f"ðŸ“– Reading events:")
            print(f"   Start: {start_sim_seconds} Unix timestamp (day {start_day})")
            print(f"   End:   {end_sim_seconds} Unix timestamp (day {end_day})")
            print(f"   Real time elapsed: {elapsed_real_seconds:.1f}s â†’ Sim time: {elapsed_sim_seconds}s ({self.speed_multiplier}x)")

        # Cap at end of dataset (2024-01-01 + 90 days), but never move backwards
        end_sim_seconds = min(end_sim_seconds, max_sim_seconds)
        end_sim_seconds = max(end_sim_seconds, start_sim_seconds)

        # Filter events to time window (ts_seconds is already absolute Unix timestamp)
        event_times = self.events_df["ts_seconds"]
        windowed_events = self.events_df[
            (event_times >= start_sim_seconds) & (event_times < end_sim_seconds)
        ]

        print(f"   Found {len(windowed_events)} events in window")

        # Expand to full JSON format
        expanded_rows = self._expand_to_json(windowed_events)

        print(f"   âœ… Returning {len(expanded_rows)} events")

        # Convert to Spark Row objects
        rows = [Row(**row) for row in expanded_rows]

        # The end offset deliberately omits "is_initial", so every subsequent
        # read takes the speed-multiplier branch.
        end_offset_dict = {
            "simulation_seconds": end_sim_seconds,
            "offset_timestamp": end_real_time.isoformat()
        }

        # Return iterator and end offset (format for simpleStreamReader)
        return (iter(rows), json.dumps(end_offset_dict))

    def commit(self, end_offset):
        """
        Log the committed offset.

        Nothing is persisted here (this method only prints); offset durability
        is left to the streaming query's checkpoint.
        """
        end = json.loads(end_offset)
        dataset_epoch = self._dataset_epoch()
        day_num = int((end['simulation_seconds'] - dataset_epoch) / self.SECONDS_PER_DAY)
        print(f"âœ“ Committed up to {end['simulation_seconds']} Unix timestamp (day {day_num})")

    def _expand_to_json(self, windowed_events_df):
        """
        Expand the compact parquet rows into full event records.

        Args:
            windowed_events_df: pandas DataFrame slice of the loaded events.

        Returns:
            List of dicts with keys event_id, event_type, ts, location_id,
            order_id, sequence and body (a JSON string), sorted by (ts, sequence).

        Raises:
            KeyError: if a row carries an event_type_id outside 1-8.
        """
        rows = []

        # to_dict("records") yields plain dicts; iterrows() would build a
        # pandas Series per row, which is far slower on large windows.
        for row in windowed_events_df.to_dict("records"):
            event_type = self._EVENT_TYPES[row["event_type_id"]]

            # Build body based on event type (data already embedded in parquet)
            body = {}
            if event_type == "order_created":
                body = {
                    "customer_lat": float(row["customer_lat"]),
                    "customer_lon": float(row["customer_lon"]),
                    "customer_addr": row["customer_addr"],
                    "items": json.loads(row["items_json"])
                }
            elif event_type == "driver_picked_up":
                if pd.notna(row["route_json"]):
                    body = {
                        "route_points": json.loads(row["route_json"])
                    }
            elif event_type == "driver_ping":
                if pd.notna(row["ping_lat"]):
                    body = {
                        "progress_pct": float(row["ping_progress"]),
                        "loc_lat": float(row["ping_lat"]),
                        "loc_lon": float(row["ping_lon"])
                    }
            elif event_type == "delivered":
                if pd.notna(row["customer_lat"]):
                    body = {
                        "delivered_lat": float(row["customer_lat"]),
                        "delivered_lon": float(row["customer_lon"])
                    }

            # Build event record; event_id is a fresh random UUID on every call,
            # and ts is rendered with millisecond precision (local time of the machine).
            event_record = {
                "event_id": str(uuid.uuid4()),
                "event_type": event_type,
                "ts": datetime.fromtimestamp(row["ts_seconds"]).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3],
                "location_id": int(row["location_id"]),
                "order_id": str(row["order_id"]),
                "sequence": int(row["sequence"]),
                "body": json.dumps(body)
            }

            rows.append(event_record)

        # Sort by simulation time and sequence (the fixed-width ts string sorts chronologically)
        rows.sort(key=lambda x: (x["ts"], x["sequence"]))

        return rows


# Confirmation message emitted once the class definitions in this cell have executed
print("âœ… CaspersDataSource class defined")

## Register Data Source

In [None]:
# Register the custom data source with Spark so that it can be
# referenced by its short name in spark.readStream.format(...)
spark.dataSource.register(CaspersDataSource)
print("âœ… Caspers data source registered")

## Create Streaming DataFrame

In [None]:
# Build the streaming DataFrame on top of the registered "caspers" source.
# Option values are passed as strings; the reader parses them back.
caspers_stream = (
    spark.readStream
    .format("caspers")
    .option("datasetPath", DATASET_PATH)
    .option("simulationStartDay", str(START_DAY))
    .option("speedMultiplier", str(SPEED_MULTIPLIER))
    .load()
)

print("âœ… Streaming DataFrame created")
caspers_stream.printSchema()

## Write Stream to Volume

Uses `availableNow` trigger for scheduled runs - processes all available data since last checkpoint and stops.

In [None]:
# Launch the streaming query with the availableNow trigger: it handles
# everything available since the last checkpoint and then stops on its own,
# which suits scheduled notebook runs.
query = (
    caspers_stream.writeStream
    .format("json")
    .option("path", VOLUME_PATH)
    .option("checkpointLocation", CHECKPOINT_PATH)
    .trigger(availableNow=True)
    .start()
)

print("ðŸš€ Streaming query started with availableNow trigger")
print("   This will process all available events and stop")

# Block until the query has finished
query.awaitTermination()

print("âœ… Streaming query complete!")

## Verify Output

In [None]:
# List the output volume and keep only the JSON files
files = dbutils.fs.ls(VOLUME_PATH)
json_files = list(filter(lambda entry: entry.name.endswith('.json'), files))

print(f"ðŸ“Š Total JSON files in volume: {len(json_files)}")
print(f"   Volume path: {VOLUME_PATH}")

# Show up to five file names as a sample (nothing is printed for an empty volume)
if len(json_files) > 0:
    print("\nðŸ“„ Sample files:")
    for f in json_files[0:5]:
        print(f"   {f.name}")