In [0]:
# ============================================
# CELL 1: CONFIGURATION
# ============================================

# Select the working catalog first, then the schema inside it.
for use_statement in ("USE CATALOG smart_claims_dev", "USE SCHEMA bronze"):
    spark.sql(use_statement)

# Echo the session's active catalog/database so a wrong target is obvious.
active_catalog = spark.catalog.currentCatalog()
print(f"Current Catalog: {active_catalog}")
active_database = spark.catalog.currentDatabase()
print(f"Current Database: {active_database}")

# AWS settings are read from the project-local `config` module.
# TODO: Move to Databricks Secrets for production.
from config import AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION

# NOTE: This notebook demonstrates Kinesis streaming ingestion.
# ⚠️ REQUIRES: Databricks paid tier (Standard/Premium/Enterprise)
# Free Edition limitation: streaming is not supported on shared clusters.
# The current implementation uses Auto Loader instead (see notebook 03).

# Name of the Kinesis stream the telemetry events are read from.
KINESIS_STREAM_NAME = "insurance-telemetry-stream"

# Summarise the non-secret settings only; credentials are never printed.
print(
    "\n✅ Configuration loaded",
    f"   Region: {AWS_REGION}",
    f"   Stream: {KINESIS_STREAM_NAME}",
    sep="\n",
)


In [0]:
# ============================================
# CELL 2: DEFINE TELEMETRY SCHEMA
# ============================================

from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# One row per JSON attribute of a telemetry event:
# (field name, Spark data type, nullable?)
_TELEMETRY_FIELDS = [
    ("vehicle_id", StringType(), False),         # Required: vehicle identifier
    ("speed_mph", DoubleType(), True),           # Optional: speed in miles per hour
    ("latitude", DoubleType(), True),            # Optional: GPS latitude
    ("longitude", DoubleType(), True),           # Optional: GPS longitude
    ("acceleration_mps2", DoubleType(), True),   # Optional: acceleration
    ("timestamp", TimestampType(), True),        # Optional: event timestamp
]

# Schema applied when parsing the incoming JSON telemetry events.
telemetry_schema = StructType(
    [
        StructField(field_name, field_type, is_nullable)
        for field_name, field_type, is_nullable in _TELEMETRY_FIELDS
    ]
)

print("✅ Telemetry schema defined:")
print(telemetry_schema)


In [0]:
# ============================================
# CELL 3: READ FROM KINESIS STREAM
# ============================================

from pyspark.sql.functions import from_json, col

# Define a streaming source over the Kinesis stream configured in Cell 1.
# The credential names must match the ones Cell 1 imports from `config`
# (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY).
# `initialPosition` is the option name used by the Databricks Kinesis
# source; "latest" means only records arriving after the query starts
# are read.
# NOTE: .load() only builds the streaming DataFrame; no records are pulled
# until a streaming query is started on it (e.g. via writeStream).
kinesis_stream = (
    spark.readStream
    .format("kinesis")
    .option("streamName", KINESIS_STREAM_NAME)
    .option("region", AWS_REGION)
    .option("awsAccessKey", AWS_ACCESS_KEY_ID)
    .option("awsSecretKey", AWS_SECRET_ACCESS_KEY)
    .option("initialPosition", "latest")
    .load()
)

print("✅ Connected to Kinesis stream")
print("\nRaw stream schema (before parsing):")
kinesis_stream.printSchema()

# Parse the JSON payload carried by each Kinesis record.
# The source exposes the payload as a binary `data` column next to record
# metadata (partitionKey, sequenceNumber, approximateArrivalTimestamp, ...),
# so it is cast to a string, parsed with the Cell 2 schema, and the
# resulting struct is flattened into top-level columns.
# Payloads that from_json cannot parse yield a null struct, i.e. a row
# whose telemetry columns are all null.
telemetry_df = (
    kinesis_stream
    .selectExpr("CAST(data AS STRING) as json_data")
    .select(from_json(col("json_data"), telemetry_schema).alias("telemetry"))
    .select("telemetry.*")
)

print("\n✅ JSON parsed successfully")
print("\nParsed telemetry schema:")
telemetry_df.printSchema()
