In [0]:
# Cell 1: Load the Bronze telematics table, then report its size and schema.
# `bronze_telematics` is the DataFrame the later cells in this notebook read from.
bronze_telematics = spark.table("smart_claims_dev.bronze.telematics_raw")

# Run the count as its own step so the Spark action is visible on its own line.
bronze_row_count = bronze_telematics.count()
print(f"üìä Bronze Telematics - Row Count: {bronze_row_count:,}")

# Blank line plus an 80-character rule separates the count from the schema dump.
print("\n" + "=" * 80)
print("üîç Schema:")
bronze_telematics.printSchema()

In [0]:
# Cell 2: Duplicate check and basic stats for telematics
#
# An event is identified here by its (vehicle_id, timestamp) pair. The number
# of duplicates is the total row count minus the number of distinct pairs.
# NOTE(review): these names are no longer used in this cell; the import is kept
# unchanged in case other cells rely on it being in the kernel namespace.
from pyspark.sql.functions import count, countDistinct

event_key_columns = ["vehicle_id", "timestamp"]

total_rows = bronze_telematics.count()

# Use select(...).distinct().count() rather than countDistinct(col_a, col_b):
# a multi-column COUNT(DISTINCT ...) skips every row in which any of the
# columns is NULL, so each NULL-keyed row would be reported as a "duplicate".
# distinct() treats NULL as an ordinary value, so such rows are only counted
# as duplicates when they really repeat.
unique_events = bronze_telematics.select(*event_key_columns).distinct().count()
duplicate_count = total_rows - unique_events

print("üìä BRONZE TELEMATICS ANALYSIS")
print("=" * 80)
print(f"Total Rows:              {total_rows:,}")
print(f"Unique (vehicle_id, timestamp) pairs: {unique_events:,}")
print(f"Duplicate Records:       {duplicate_count:,}")
print("=" * 80)
bronze_telematics.show(5, truncate=False)

In [0]:
# Cell 3: NULL and business logic checks for Telematics
#
# Reports, for the Bronze telematics table:
#   * the number of NULLs in each checked column,
#   * the number of values outside the accepted range for each numeric column,
#   * the number of rows failing at least one of those checks, and how many
#     rows would remain if all of them were dropped.
# All figures come from a single aggregation pass over the table.
import operator
from functools import reduce

from pyspark.sql import functions as F
from pyspark.sql.functions import col

print("üîç NULL AND RANGE CHECKS")
print("=" * 80)

# Columns that must not be NULL.
null_checked_columns = [
    "vehicle_id",
    "timestamp",
    "speed_mph",
    "latitude",
    "longitude",
    "acceleration_mps2",
]

# Out-of-range predicates, keyed by the column they inspect.
out_of_range = {
    "latitude": (col("latitude") < -90) | (col("latitude") > 90),
    "longitude": (col("longitude") < -180) | (col("longitude") > 180),
    "speed_mph": (col("speed_mph") < 0) | (col("speed_mph") > 200),
    "acceleration_mps2": (col("acceleration_mps2") < -50) | (col("acceleration_mps2") > 50),
}

null_conditions = {name: col(name).isNull() for name in null_checked_columns}

# True when a row fails ANY check. A range predicate evaluates to NULL when its
# column is NULL, but the matching isNull() predicate is then TRUE, so the
# OR-combination is always a definite TRUE/FALSE.
row_is_invalid = reduce(
    operator.or_,
    list(null_conditions.values()) + list(out_of_range.values()),
)


def _count_if(condition):
    """Return an aggregate that counts the rows for which `condition` is true."""
    # when() without otherwise() yields NULL for non-matching rows, and count()
    # ignores NULLs, so this matches filter(condition).count().
    return F.count(F.when(condition, 1))


stats = bronze_telematics.agg(
    F.count(F.lit(1)).alias("row_count"),
    *[_count_if(cond).alias(f"null_{name}") for name, cond in null_conditions.items()],
    *[_count_if(cond).alias(f"range_{name}") for name, cond in out_of_range.items()],
    _count_if(row_is_invalid).alias("invalid_rows"),
).first()

checked_rows = stats["row_count"]

bads = {name: stats[f"null_{name}"] for name in null_checked_columns}
print(f"Null vehicle_id:      {bads['vehicle_id']:,}")
print(f"Null timestamp:       {bads['timestamp']:,}")
print(f"Null speed_mph:       {bads['speed_mph']:,}")
print(f"Null latitude:        {bads['latitude']:,}")
print(f"Null longitude:       {bads['longitude']:,}")
print(f"Null acceleration_mps2: {bads['acceleration_mps2']:,}")

out_lat = stats["range_latitude"]
out_long = stats["range_longitude"]
out_speed = stats["range_speed_mph"]
out_accel = stats["range_acceleration_mps2"]

print(f"Bad latitudes:        {out_lat:,}")
print(f"Bad longitudes:       {out_long:,}")
print(f"Bad speed_mph:        {out_speed:,}")
print(f"Bad acceleration:     {out_accel:,}")

# Count each failing row once. Summing the per-check counts would count a row
# several times if it fails several checks, understating the valid remainder.
total_invalid = stats["invalid_rows"]
valid_rows = checked_rows - total_invalid
# Use the table's real row count as the denominator; guard the empty-table case.
valid_pct = (valid_rows / checked_rows) * 100 if checked_rows else 0.0

print("=" * 80)
print(f"üìä TOTAL INVALID ROWS: {total_invalid:,}")
print(f"üìä VALID ROWS IF ALL REMOVED: {valid_rows:,} ({valid_pct:.2f}%)")

In [0]:
# Cell 4: Transform and write Telematics to Silver
#
# Adds a `processed_at` audit timestamp to the Bronze rows, overwrites the
# Silver Delta table with the result, then re-reads the table to confirm that
# the number of rows written equals the number of rows read from Bronze.
# NOTE(review): no filtering or de-duplication is applied here even though the
# target is named `telematics_clean` - confirm that is intentional.
from pyspark.sql.functions import current_timestamp

silver_table_name = "smart_claims_dev.silver.telematics_clean"

print("üîß TRANSFORMING TO SILVER...")
print("=" * 80)

# Take the expected count from the source before writing, rather than
# assuming a fixed table size.
expected_count = bronze_telematics.count()

# Add audit column
telematics_silver = bronze_telematics.withColumn("processed_at", current_timestamp())
print("‚úÖ Added audit column: processed_at")

# Write to Silver Delta table. overwriteSchema lets the overwrite replace the
# table schema as well as its data.
(
    telematics_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(silver_table_name)
)

print(f"‚úÖ Successfully written to: {silver_table_name}")
print("=" * 80)

# Verify by reading the table back instead of trusting the in-memory DataFrame.
silver_table = spark.table(silver_table_name)
silver_count = silver_table.count()

print("üîç VERIFICATION:")
print(f"   Rows written:  {silver_count:>10,}")
print(f"   Expected:      {expected_count:>10,}")
print(f"   Match:         {'‚úÖ YES' if silver_count == expected_count else '‚ùå NO'}")
print("=" * 80)

print("\nüìä SAMPLE SILVER DATA (First 5 rows):")
display(silver_table.limit(5))