In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year

In [0]:
# Obtain the Spark session for this notebook. getOrCreate() hands back the
# already-running session when one exists (as on Databricks) and only builds
# a new one, named "InsuranceClaimsAnalysis", otherwise.
spark = (
    SparkSession.builder
    .appName("InsuranceClaimsAnalysis")
    .getOrCreate()
)

In [0]:
# Read the three source CSV extracts from DBFS. All of them are read the same
# way: the first row supplies the column names and Spark infers column types.
csv_read_options = {"header": True, "inferSchema": True}

policy_df = spark.read.csv("dbfs:/FileStore/tables/Policy_Holders_v3.csv", **csv_read_options)
claims_df = spark.read.csv("dbfs:/FileStore/tables/Claims_v3.csv", **csv_read_options)
adjusters_df = spark.read.csv("dbfs:/FileStore/tables/Adjusters_v3.csv", **csv_read_options)

In [0]:
# Combine the three datasets into a single claim-level DataFrame.
# Inner join on PolicyID: only claims with a matching policy row are kept.
claims_with_policy = claims_df.join(policy_df, "PolicyID", "inner")
# Left join on AdjusterID: every claim/policy row is kept; adjuster columns
# are null where no adjuster row matches.
full_data = claims_with_policy.join(adjusters_df, "AdjusterID", "left")

In [0]:
# Example analysis: total claim amount per policy type and calendar year.
# Step 1: derive the calendar year of each claim from its ClaimDate.
claims_with_year = full_data.withColumn("Year", year(col("ClaimDate")))

# Step 2: sum ClaimAmount within each (PolicyType, Year) group, give the
# aggregate a readable name, and sort chronologically, then by policy type.
claims_by_type_year = (
    claims_with_year.groupBy("PolicyType", "Year")
    .sum("ClaimAmount")  # Spark names the result column "sum(ClaimAmount)"
    .withColumnRenamed("sum(ClaimAmount)", "TotalClaimAmount")
    .orderBy("Year", "PolicyType")
)

In [0]:
# Preview the aggregated totals (prints at most the first 10 rows)
claims_by_type_year.show(n=10)

+----------+----+----------------+
|PolicyType|Year|TotalClaimAmount|
+----------+----+----------------+
|      Auto|2021|          155804|
|      Home|2021|          174185|
|      Life|2021|          207133|
|      Auto|2022|          159784|
|      Home|2022|          177559|
|      Life|2022|          224096|
|      Auto|2023|          176114|
|      Home|2023|           95552|
|      Life|2023|          235902|
+----------+----+----------------+



In [0]:
# Export the joined dataset as CSV (with a header row) for Power BI import,
# replacing any output that already exists at the target path.
full_data.write.csv("/mnt/data/Claims_Joined_Data", mode="overwrite", header=True)