In [0]:
# Cell 1: Load the Bronze customers table, report its size, and print its schema
bronze_customers = spark.table("smart_claims_dev.bronze.customers_raw")

# count() is a Spark action, so this line runs a job against the Bronze table.
bronze_row_count = bronze_customers.count()
print(f"üìä Bronze Customers - Row Count: {bronze_row_count:,}")

# Blank line followed by an 80-character rule, then the schema section.
print("", "=" * 80, sep="\n")
print("üîç Schema:")
bronze_customers.printSchema()


In [0]:
# Cell 2: Check for duplicates and basic statistics
from pyspark.sql.functions import count, countDistinct

# Gather every metric in ONE Spark job instead of one scan per metric:
#   count("*")            -> all rows
#   count("customer_id")  -> rows whose customer_id is NOT NULL
#   countDistinct(...)    -> distinct non-NULL customer_id values
# A global aggregation always returns exactly one row, even on an empty table.
id_stats = bronze_customers.agg(
    count("*").alias("total_rows"),
    count("customer_id").alias("non_null_ids"),
    countDistinct("customer_id").alias("unique_ids"),
).first()

# total_rows is reused by later cells as the denominator for percentages.
total_rows = id_stats["total_rows"]
unique_customers = id_stats["unique_ids"]
null_customer_ids = total_rows - id_stats["non_null_ids"]

# countDistinct skips NULLs, so `total_rows - unique_customers` would report
# every NULL customer_id as a "duplicate". Compare against the non-NULL count
# instead and report NULL ids on their own line.
duplicate_count = id_stats["non_null_ids"] - unique_customers

# Display results
print(f"üìä BRONZE CUSTOMERS ANALYSIS")
print("=" * 80)
print(f"Total Rows:              {total_rows:,}")
print(f"Unique customer_id:      {unique_customers:,}")
print(f"Duplicate Records:       {duplicate_count:,}")
print(f"Null customer_id:        {null_customer_ids:,}")
print("=" * 80)

# Show sample data
print("\nüîç SAMPLE DATA (First 5 rows):")
bronze_customers.show(5, truncate=False)

In [0]:
# Cell 3: Check for NULL values in critical columns
from pyspark.sql.functions import col, count, when

# Define critical columns that should NOT be null
critical_columns = [
    'customer_id',
    'name',
    'date_of_birth'
]

print("üîç NULL VALUE ANALYSIS - Critical Columns")
print("=" * 80)
print(f"{'Column':<20} | {'Null Count':>12} | {'Null %':>10}")
print("-" * 80)

# Count NULLs for every critical column in a single Spark job rather than
# launching one filter+count job per column. count() only counts non-NULL
# inputs, and when() without otherwise() yields NULL when the condition is
# false, so each expression counts exactly the rows where the column is NULL.
null_counts_row = bronze_customers.agg(
    *[
        count(when(col(name).isNull(), 1)).alias(f"{name}_nulls")
        for name in critical_columns
    ]
).first()

# The aggregated Row holds one value per column, in critical_columns order.
for column, null_count in zip(critical_columns, null_counts_row):
    # total_rows comes from the Cell 2 analysis; guard against an empty
    # Bronze table so the report does not die with ZeroDivisionError.
    null_percentage = (null_count / total_rows) * 100 if total_rows else 0.0
    print(f"{column:<20} | {null_count:>12,} | {null_percentage:>9.2f}%")

print("=" * 80)

In [0]:
# Cell 4: Date validation and age calculation (FINAL - with try_to_date)
from pyspark.sql.functions import (
    try_to_date, current_date, floor, datediff, months_between,
    when, col, lit, coalesce, count,
)

print("üîç DATE & AGE VALIDATION")
print("=" * 80)

# Try multiple date formats using try_to_date (returns NULL on failure, no error).
# The literal string "null" is treated the same as a real NULL.
# NOTE: coalesce keeps the FIRST format that parses, so an ambiguous value such
# as "03-04-1990" is always read as MM-dd-yyyy (March 4th), never as dd-MM-yyyy.
customers_with_dates = bronze_customers.withColumn(
    "date_of_birth_parsed",
    when(col("date_of_birth").isNull() | (col("date_of_birth") == "null"), lit(None))
    .otherwise(
        coalesce(
            try_to_date(col("date_of_birth"), "MM-dd-yyyy"),  # Try US format
            try_to_date(col("date_of_birth"), "dd-MM-yyyy"),  # Try European format
            try_to_date(col("date_of_birth"), "yyyy-MM-dd")   # Try ISO format
        )
    )
)

# Calculate age in completed years (only for valid dates).
# months_between / 12 is calendar-aware; dividing a day count by 365.25 can
# report one year too few on the birthday itself (e.g. an exact 18-year span
# containing only four leap days is 6574 days -> 17.998 -> 17).
customers_with_age = customers_with_dates.withColumn(
    "age",
    when(
        col("date_of_birth_parsed").isNotNull(),
        floor(months_between(current_date(), col("date_of_birth_parsed")) / 12),
    ).otherwise(lit(None))
)

# Rule predicates, defined once so the per-rule counts and the overall total
# are guaranteed to use the same definitions.
dob_parsed = col("date_of_birth_parsed")
is_unparseable = dob_parsed.isNull()
is_invalid_age = col("age").isNotNull() & ((col("age") < 18) | (col("age") > 120))
is_future_dob = dob_parsed.isNotNull() & (dob_parsed > current_date())

# Evaluate all checks in a single Spark job. count(when(cond, 1)) counts only
# the rows where cond is true. "total_invalid" counts each failing row ONCE:
# the individual checks overlap (a future date of birth produces a negative
# age, so it also fails the age check), which means summing the three
# per-rule counts would double-count those rows.
validation = customers_with_age.agg(
    count(when(is_unparseable, 1)).alias("invalid_dates"),
    count(when(is_invalid_age, 1)).alias("invalid_age"),
    count(when(is_future_dob, 1)).alias("future_dob"),
    count(when(is_unparseable | is_invalid_age | is_future_dob, 1)).alias("total_invalid"),
).first()

# Check 1: Invalid date formats (couldn't parse with any format, or missing)
invalid_dates = validation["invalid_dates"]
print(f"‚ùå Invalid date formats:              {invalid_dates:>10,}")

# Check 2: Ages outside valid range (18-120)
invalid_age = validation["invalid_age"]
print(f"‚ùå Invalid ages (<18 or >120):        {invalid_age:>10,}")

# Check 3: Future dates of birth (these rows are also included in Check 2)
future_dob = validation["future_dob"]
print(f"‚ùå Future dates of birth:             {future_dob:>10,}")

print("=" * 80)

# Summary: distinct rows failing at least one check.
total_invalid = validation["total_invalid"]
valid_records = total_rows - total_invalid
# total_rows comes from the Cell 2 analysis; avoid ZeroDivisionError on an
# empty Bronze table.
valid_pct = (valid_records / total_rows) * 100 if total_rows else 0.0
print(f"\nüìä TOTAL INVALID RECORDS: {total_invalid:,}")
print(f"üìä VALID RECORDS: {valid_records:,} ({valid_pct:.2f}%)")

# Show sample with age
print("\nüîç SAMPLE DATA WITH AGE (First 5 rows):")
customers_with_age.select("customer_id", "name", "date_of_birth", "date_of_birth_parsed", "age").show(5, truncate=False)


In [0]:
# Cell 5: Transform Bronze to Silver - Apply all quality rules
from pyspark.sql.functions import current_timestamp

print("üîß APPLYING TRANSFORMATIONS...")
print("=" * 80)

# Start with parsed dates and ages. Every step below is a lazy transformation;
# nothing executes until the count() near the end of this cell.
customers_silver = customers_with_age

# Filter 1: Remove customers with invalid dates (couldn't parse)
customers_silver = customers_silver.filter(col("date_of_birth_parsed").isNotNull())
print(f"‚úÖ Filter 1: Remove invalid date formats")

# Filter 2: Remove future dates of birth
customers_silver = customers_silver.filter(col("date_of_birth_parsed") <= current_date())
print(f"‚úÖ Filter 2: Remove future dates of birth")

# Filter 3: Remove invalid ages (< 18 or > 120)
customers_silver = customers_silver.filter((col("age") >= 18) & (col("age") <= 120))
print(f"‚úÖ Filter 3: Remove invalid ages (<18 or >120)")

# Filter 4: Remove customers without names or IDs
customers_silver = customers_silver.filter(
    col("customer_id").isNotNull() &
    col("name").isNotNull()
)
print(f"‚úÖ Filter 4: Remove customers without ID or name")

# NOTE(review): no de-duplication on customer_id is applied in this cell, so
# any duplicate ids found by the Cell 2 analysis are carried into Silver.
# Confirm whether that is intended and, if not, which record should win.

# Add audit column: when was this record processed to Silver
customers_silver = customers_silver.withColumn("processed_at", current_timestamp())
print(f"‚úÖ Added audit column: processed_at")

# Drop temporary parsing column, keep age
customers_silver = customers_silver.drop("date_of_birth_parsed")
print(f"‚úÖ Dropped temporary column: date_of_birth_parsed")

# Show results (count() is the action that actually runs the pipeline above)
final_count = customers_silver.count()
removed_count = total_rows - final_count
# total_rows comes from the Cell 2 analysis; guard the percentage so an empty
# Bronze table reports 0.00% instead of raising ZeroDivisionError.
quality_pct = (final_count / total_rows) * 100 if total_rows else 0.0

print("=" * 80)
print(f"üìä TRANSFORMATION RESULTS:")
print(f"   Original Bronze rows:  {total_rows:>10,}")
print(f"   Removed invalid rows:  {removed_count:>10,}")
print(f"   Final Silver rows:     {final_count:>10,}")
print(f"   Data quality:          {quality_pct:>9.2f}%")
print("=" * 80)

In [0]:
# Cell 6: Persist the cleaned customers as a Silver Delta table, then verify it
silver_table_name = "smart_claims_dev.silver.customers_clean"
rule = "=" * 80

print("üíæ WRITING TO SILVER LAYER...")
print(rule)

# Replace the table contents (and its schema, if it changed) on every run.
(
    customers_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(silver_table_name)
)

print(f"‚úÖ Successfully written to: {silver_table_name}")
print(rule)

# Read the table back and compare its row count with the pre-write count.
silver_table = spark.table(silver_table_name)
silver_count = silver_table.count()

if silver_count == final_count:
    match_label = '‚úÖ YES'
else:
    match_label = '‚ùå NO'

print(f"üîç VERIFICATION:")
print(f"   Rows written:  {silver_count:>10,}")
print(f"   Expected:      {final_count:>10,}")
print(f"   Match:         {match_label}")
print(rule)

# Preview a few rows of the persisted Silver table.
print("\nüìä SAMPLE SILVER DATA (First 5 rows):")
silver_table.show(5, truncate=False)


In [0]:
# Cell 7: Final summary - Bronze vs Silver comparison for customers
# total_rows is the Bronze row count from Cell 2; silver_count is the row count
# read back from the Silver table in Cell 6.
# Guard the pass-rate so an empty Bronze table prints 0.00% instead of raising
# ZeroDivisionError.
pass_rate_pct = (silver_count / total_rows) * 100 if total_rows else 0.0

print("=" * 80)
print("üéØ SILVER LAYER CUSTOMERS: FINAL SUMMARY")
print("=" * 80)
print(f"Original Bronze Rows:         {total_rows:,}")
print(f"Silver Rows (Cleaned):        {silver_count:,}")
print(f"Data Quality Percent Pass:    {pass_rate_pct:.2f}%")
print("=" * 80)
silver_table.printSchema()
print("=" * 80)