In [0]:
# ============================================
# CELL 1: CONFIGURATION (CORRECTED PATHS)
# ============================================
#
# Selects the catalog/schema for this session and defines the three values the
# later cells rely on: where Auto Loader reads from (source_path), where the
# stream keeps its checkpoint (checkpoint_path), and the target table name.

# Set catalog and schema. Unqualified table names used later resolve against
# this catalog and schema.
spark.sql("USE CATALOG smart_claims_dev")
spark.sql("USE SCHEMA bronze")

# Echo the active namespace so a wrong catalog/schema is visible immediately.
print(f"Current Catalog: {spark.catalog.currentCatalog()}")
print(f"Current Database: {spark.catalog.currentDatabase()}")

# File paths - data and checkpoints use the SAME volume (raw_files), but sit in
# sibling directories, so checkpoint files are not under the ingested path.
source_path = "/Volumes/smart_claims_dev/landing/raw_files/kinesis_landing/"
checkpoint_path = "/Volumes/smart_claims_dev/landing/raw_files/_checkpoints/telemetry"  # inside the raw_files volume
table_name = "telematics_raw"

# Status glyph restored: it had been stored as mis-decoded UTF-8 (mojibake).
print("\n✅ Configuration loaded")
print(f"   Source path: {source_path}")
print(f"   Checkpoint: {checkpoint_path}")
print(f"   Target table: {table_name}")


In [0]:
# ============================================
# CELL 1B: CREATE CHECKPOINT DIRECTORY
# ============================================
#
# Ensures the parent directory for streaming checkpoints exists inside the
# raw_files volume, then lists the volume root as a visual confirmation.

# Create checkpoint directory inside raw_files volume
checkpoint_dir = "/Volumes/smart_claims_dev/landing/raw_files/_checkpoints"

try:
    # Listing doubles as an existence probe: reaching the print below means
    # the call succeeded.
    dbutils.fs.ls(checkpoint_dir)
    print(f"✅ Checkpoint directory already exists: {checkpoint_dir}")
except Exception:
    # NOTE(review): every listing failure lands here, not only "path missing".
    # A permissions or connectivity problem is therefore reported as
    # "Creating ..." and only surfaces if mkdirs below fails as well.
    print(f"Creating checkpoint directory: {checkpoint_dir}")
    dbutils.fs.mkdirs(checkpoint_dir)
    print("✅ Directory created successfully")

# Verify: show the contents of the volume root, where _checkpoints should appear.
print("\nDirectory created at:")
display(dbutils.fs.ls("/Volumes/smart_claims_dev/landing/raw_files/"))


In [0]:
# ============================================
# CELL 2: DEFINE TELEMETRY SCHEMA
# ============================================

from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# Schema matching generator output.
# Explicit schema handed to the reader in the next cell. Every column is
# declared nullable (third StructField argument is True).
telemetry_schema = StructType([
    StructField("vehicle_id", StringType(), True),
    StructField("speed_mph", DoubleType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("acceleration_mps2", DoubleType(), True),
    StructField("timestamp", TimestampType(), True)
])

# Print one "name: type" line per column as a quick visual check.
# Status glyph restored: it had been stored as mis-decoded UTF-8 (mojibake).
print("✅ Schema defined:")
for field in telemetry_schema.fields:
    print(f"   {field.name}: {field.dataType}")


In [0]:
# ============================================
# CELL 3: READ FILES WITH AUTO LOADER
# ============================================
#
# Builds (but does not start) a streaming DataFrame over the CSV files in
# source_path using Auto Loader ("cloudFiles"). Nothing is read until a
# writeStream is started on telemetry_stream.

# Read telemetry files using Auto Loader.
# The schema is supplied explicitly via .schema(), so no column-type inference
# takes place; "cloudFiles.inferColumnTypes" only applies to inferred schemas
# and is therefore not set here.
telemetry_stream = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    # Schema tracking location, kept under the stream's checkpoint path.
    .option("cloudFiles.schemaLocation", f"{checkpoint_path}/schema")
    # First line of each CSV file is a header row, not data.
    .option("header", "true")
    .schema(telemetry_schema)
    .load(source_path)
)

# Status glyph restored: it had been stored as mis-decoded UTF-8 (mojibake).
print("✅ Auto Loader configured")
print("\nStream schema:")
telemetry_stream.printSchema()


In [0]:
# ============================================
# CELL 4: WRITE STREAM TO DELTA TABLE
# ============================================
#
# Starts the streaming query that appends telemetry_stream to the Delta table
# named by table_name, then blocks until the query finishes.

# Write to Bronze Delta table.
# toTable() is the documented PySpark DataStreamWriter call that starts the
# query and writes into a table; it is used instead of .table(), which is not
# part of the documented PySpark API.
query = (
    telemetry_stream.writeStream
    .format("delta")
    .outputMode("append")
    # Progress is tracked here, so a re-run does not re-ingest files that
    # were already processed.
    .option("checkpointLocation", checkpoint_path)
    # Process everything currently available, then stop.
    .trigger(availableNow=True)
    .toTable(table_name)
)

# Status glyphs restored: they had been stored as mis-decoded UTF-8 (mojibake).
print("✅ Auto Loader processing started!")
print(f"\nQuery ID: {query.id}")
print("\n🔄 Processing files...")
# NOTE(review): the file count in this message is hard-coded, not measured.
print("   (This will process all 10 CSV files)")

# Wait for completion (availableNow mode stops automatically)
query.awaitTermination()

print("\n✅ Processing complete!")


In [0]:
# ============================================
# CELL 5: VERIFY INGESTION
# ============================================
#
# Sanity-checks the target table after the stream has finished: total row
# count, a five-row sample, and the ten vehicle_ids with the most events.

# One handle to the target table, reused for the count and the sample.
telematics_table = spark.table(table_name)

# Count rows
total_rows = telematics_table.count()

# Glyphs restored: they had been stored as mis-decoded UTF-8 (mojibake).
print("="*60)
print("🎉 AUTO LOADER INGESTION RESULTS")
print("="*60)
print(f"Total rows ingested: {total_rows:,}")
# NOTE(review): the expected figure is a hard-coded reminder and is not
# compared against total_rows; the count covers everything in the table.
print("Expected rows: 5,000 (10 batches × 500 events)")
print("="*60)

# Show sample data
print("\nSample telemetry data:")
telematics_table.show(5, truncate=False)

# Show vehicle distribution
print("\nVehicle distribution (top 10):")
spark.sql(f"""
    SELECT vehicle_id, COUNT(*) as event_count
    FROM {table_name}
    GROUP BY vehicle_id
    ORDER BY event_count DESC
    LIMIT 10
""").show()
