In [0]:
# Point this session at the dev catalog and its bronze schema, which is where
# unqualified table names used further down will resolve.
# NOTE: the next cell repeats these two statements and also prints the result.
target_catalog = "smart_claims_dev"
target_schema = "bronze"
spark.sql(f"USE CATALOG {target_catalog}")
spark.sql(f"USE SCHEMA {target_schema}")


In [0]:
# Select the catalog and schema for this session, then echo what the session
# reports back as a sanity check.
for use_statement in ("USE CATALOG smart_claims_dev", "USE SCHEMA bronze"):
    spark.sql(use_statement)

session_catalog = spark.catalog
print(f"Current Catalog: {session_catalog.currentCatalog()}")
print(f"Current Database: {session_catalog.currentDatabase()}")

In [0]:
# CSV extracts to ingest: target table name -> file path in the landing volume.
# All three files live in the same directory, so the paths are built from one
# shared prefix.
_landing_dir = "/Volumes/smart_claims_dev/landing/raw_files/sql_server"
csv_sources = {
    f"{entity}_raw": f"{_landing_dir}/{entity}.csv"
    for entity in ("claims", "customers", "policies")
}

# Echo the configuration so the run log shows exactly what will be read.
print("CSV sources configured:")
print("\n".join(f"  {name}: {location}" for name, location in csv_sources.items()))


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType, BooleanType

# Explicit schema for the claims CSV, declared up front so column types are not
# left to inference. Columns are listed in file order as (name, type) pairs and
# every field is nullable.
_claims_columns = [
    ("claim_no", StringType()),
    ("policy_no", StringType()),
    ("claim_date", DateType()),
    ("months_as_customer", IntegerType()),
    ("injury", IntegerType()),
    ("property", IntegerType()),
    ("vehicle", IntegerType()),
    ("total", IntegerType()),
    ("collision_type", StringType()),
    ("number_of_vehicles_involved", IntegerType()),
    ("age", DoubleType()),
    ("insured_relationship", StringType()),
    ("license_issue_date", DateType()),
    ("date", DateType()),
    ("hour", IntegerType()),
    ("type", StringType()),
    ("severity", StringType()),
    ("number_of_witnesses", IntegerType()),
    ("suspicious_activity", BooleanType()),
]
claims_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _claims_columns]
)

# Ingest every configured CSV into a Delta table of the same name.
#
# Tables listed in `explicit_schemas` are read with a hand-written schema; all
# others fall back to Spark's schema inference. To give another source an
# explicit schema, add it to this mapping instead of adding a new branch.
explicit_schemas = {"claims_raw": claims_schema}
banner = "=" * 60  # loop-invariant separator line for the run log

for table_name, file_path in csv_sources.items():

    print(f"\n{banner}")
    print(f"Processing: {table_name}")
    print(f"Source: {file_path}")
    print(f"{banner}")

    # Drop first so the write below always creates a brand-new table rather
    # than overwriting into an existing table definition.
    spark.sql(f"DROP TABLE IF EXISTS {table_name}")
    print(f"Dropped existing table (if any): {table_name}")

    # Every source file has a header row; only the schema strategy differs.
    reader = spark.read.option("header", True)
    declared_schema = explicit_schemas.get(table_name)
    if declared_schema is not None:
        # NOTE(review): no dateFormat/mode is set, so DateType columns are
        # parsed with Spark's defaults; values that fail to parse presumably
        # become NULL instead of raising -- confirm the CSV's date format.
        df = reader.schema(declared_schema).csv(file_path)
    else:
        # Infer schema for other tables (simpler for now)
        df = reader.option("inferSchema", True).csv(file_path)

    # Show schema
    print("\nSchema:")
    df.printSchema()

    # Preview data
    print("\nSample data (first 3 rows):")
    df.show(3, truncate=False)

    # Write to Delta table (no mergeSchema needed - the table was just dropped)
    df.write.format("delta").mode("overwrite").saveAsTable(table_name)

    # Count from the saved table (not from `df`) so the number reflects what
    # was actually written.
    row_count = spark.table(table_name).count()

    # The two emoji below were previously stored as mojibake (UTF-8 bytes
    # decoded as cp1252); they are restored to the intended characters.
    print(f"✅ Created table: {table_name}")
    print(f"   Rows: {row_count:,}")

print("\n" + banner)
print("🎉 CSV INGESTION COMPLETE!")
print(banner)


In [0]:
# Verification: list what is registered in the bronze schema and report the
# row count of each ingested table.
bronze_schema = "smart_claims_dev.bronze"

# List tables
print("Bronze tables created:")
tables = spark.sql(f"SHOW TABLES IN {bronze_schema}")
tables.show(truncate=False)

# Row counts. Table names are fully qualified with the same schema used for
# the listing above, so both halves of this cell inspect the same place
# regardless of the session's current catalog/schema.
print("\nRow counts:")
for table in ["claims_raw", "customers_raw", "policies_raw"]:
    count = spark.table(f"{bronze_schema}.{table}").count()
    print(f"  {table}: {count:,} rows")
