In [0]:
# Databricks Notebook: Insurance Claims ETL + Analytics Project

In [0]:
# 1. Load CSVs
# All three extracts are read from DBFS with identical reader options: the
# first row supplies the column names and Spark infers the column types.
csv_options = {"header": True, "inferSchema": True}

claims_df = spark.read.csv("dbfs:/FileStore/tables/Claims_v6.csv", **csv_options)
policy_df = spark.read.csv("dbfs:/FileStore/tables/Policy_Holders_v6.csv", **csv_options)
adjusters_df = spark.read.csv("dbfs:/FileStore/tables/Adjusters_v6.csv", **csv_options)

In [0]:
# 2. Inspect schemas
# Print the inferred schema of each frame, in the order policy holders,
# claims, adjusters, to confirm the join keys (PolicyID, AdjusterID) and
# their types before joining.
for frame in (policy_df, claims_df, adjusters_df):
    frame.printSchema()

root
 |-- PolicyID: integer (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- PolicyType: string (nullable = true)
 |-- StartDate: date (nullable = true)
 |-- PremiumAmount: integer (nullable = true)

root
 |-- ClaimID: integer (nullable = true)
 |-- PolicyID: integer (nullable = true)
 |-- ClaimType: string (nullable = true)
 |-- ClaimAmount: integer (nullable = true)
 |-- ClaimStatus: string (nullable = true)
 |-- ClaimDate: date (nullable = true)
 |-- AdjusterID: integer (nullable = true)

root
 |-- AdjusterID: integer (nullable = true)
 |-- AdjusterName: string (nullable = true)
 |-- Region: string (nullable = true)



In [0]:
# 3. Join datasets using correct keys, resolving ambiguous references
# `col` is imported explicitly here (its first use) so the notebook does not
# depend on whatever names the hosting environment happens to preload.
from pyspark.sql.functions import col

# Claims drive the result: both joins are left joins, so every claim row is
# kept even when its PolicyID or AdjusterID has no match on the right side.
# The aliases c / p / a qualify PolicyID and AdjusterID, which exist on both
# sides of their respective joins and would otherwise be ambiguous.
joined_df = (
    claims_df.alias("c")
    .join(policy_df.alias("p"), col("c.PolicyID") == col("p.PolicyID"), "left")
    .join(adjusters_df.alias("a"), col("c.AdjusterID") == col("a.AdjusterID"), "left")
)

In [0]:

# 4. Select relevant columns
# Columns are referenced through the join aliases (c = claims, p = policy
# holders, a = adjusters). PolicyID and AdjusterID are taken from the claims
# side, so the output has exactly one copy of each key.
selected_columns = [
    # claim identity
    "c.ClaimID",
    "c.PolicyID",
    # policy holder attributes
    "p.CustomerName",
    "p.PolicyType",
    "p.PremiumAmount",
    "p.StartDate",
    # claim details
    "c.ClaimType",
    "c.ClaimAmount",
    "c.ClaimStatus",
    "c.ClaimDate",
    # adjuster attributes
    "c.AdjusterID",
    "a.AdjusterName",
    "a.Region",
]

final_df = joined_df.select(*selected_columns)

In [0]:
# 5. Manually export the dataframe
# The joined result is rendered with display() so it can be exported by hand
# from the cell output; per the original author's note, linking or uploading
# to a table was not possible in Community Edition.
display(final_df)


ClaimID,PolicyID,CustomerName,PolicyType,PremiumAmount,StartDate,ClaimType,ClaimAmount,ClaimStatus,ClaimDate,AdjusterID,AdjusterName,Region
5001,1866,Randy Bush,Auto,819,2024-11-13,Theft,13426,Open,2023-11-22,17,Jennifer Williams,West
5002,1512,Steven Johnson,Health,427,2024-01-12,Flood,4832,In Review,2023-11-07,10,Brenda Farley,South
5003,1740,Karen Smith,Life,1272,2025-04-21,Fire,1497,Paid,2023-07-18,17,Jennifer Williams,West
5004,1578,John Martin,Life,1367,2022-07-13,Theft,4576,In Review,2023-08-22,17,Jennifer Williams,West
5005,1335,John Simmons,Health,695,2025-02-28,Accident,14890,Closed,2024-09-08,4,Ryan Pierce,East
5006,1418,Daniel Hood,Health,1342,2024-04-25,Accident,6599,Open,2023-09-03,6,Karen Greer,West
5007,1932,John Crawford,Health,1186,2024-09-27,Other,6055,Open,2023-09-15,12,Melissa Hughes,East
5008,1774,Dr. Martha Moran,Health,725,2023-09-21,Flood,11423,Closed,2024-09-03,20,Adam Pennington,East
5009,1295,Sarah Aguirre,Life,456,2022-06-14,Other,14944,Paid,2024-10-07,17,Jennifer Williams,West
5010,1938,David Williamson,Auto,1396,2024-12-25,Other,12629,Closed,2024-07-20,17,Jennifer Williams,West
