In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=7ad6b04a0aad07604ba64be9acb72ead54bec8f740a374597e53bf0c516679a4
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [45]:
from google.colab import files
from google.colab import drive
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, when, floor, datediff, current_date, avg, count
from pyspark.sql.functions import year
from pyspark.sql.functions import col
from pyspark.sql.functions import count, desc, row_number
from pyspark.sql.window import Window

In [36]:
#initializing Spark session

spark = SparkSession.builder.appName("Data exploration - State Farm").config("spark.sql.legacy.timeParserPolicy", "LEGACY").getOrCreate()

In [37]:
combined_info_df = spark.read.parquet("part-*.snappy.parquet")

In [38]:
combined_info_df = combined_info_df.withColumn("Customer Age", floor(datediff(current_date(), col("Date of Birth")) / 365))

# how many customers are in each group?

In [39]:
# which age group has the most claims?
age_groups_df_og = combined_info_df.withColumn(
    "AgeGroup",
    when(col("Customer Age") < 18, "Under 18")
    .when((col("Customer Age") >= 18) & (col("Customer Age") < 25), "18-24")
    .when((col("Customer Age") >= 25) & (col("Customer Age") < 35), "25-34")
    .when((col("Customer Age") >= 35) & (col("Customer Age") < 45), "35-44")
    .when((col("Customer Age") >= 45) & (col("Customer Age") < 55), "45-54")
    .when((col("Customer Age") >= 55) & (col("Customer Age") < 65), "55-64")
    .otherwise("65 and Over")
)

claims_by_age_group = age_groups_df.groupBy("AgeGroup").count()

# Sort by claim count in descending order
claims_by_age_group = claims_by_age_group.orderBy("count", ascending=False)

In [24]:
claims_by_age_group.show()

+-----------+------+
|   AgeGroup| count|
+-----------+------+
|65 and Over|272098|
|      25-34|134384|
|      35-44|131985|
|      18-24|127949|
|      55-64| 73903|
|      45-54| 59373|
|   Under 18| 46113|
+-----------+------+



# which age group has the most claims?

In [32]:
filtered_claims_df = combined_info_df.filter(col("Claim Payout") != 0)

age_groups_df = filtered_claims_df.withColumn(
    "AgeGroup",
    when(col("Customer Age") < 18, "Under 18")
    .when((col("Customer Age") >= 18) & (col("Customer Age") < 25), "18-24")
    .when((col("Customer Age") >= 25) & (col("Customer Age") < 35), "25-34")
    .when((col("Customer Age") >= 35) & (col("Customer Age") < 45), "35-44")
    .when((col("Customer Age") >= 45) & (col("Customer Age") < 55), "45-54")
    .when((col("Customer Age") >= 55) & (col("Customer Age") < 65), "55-64")
    .otherwise("65 and Over")
)

# Group by age group and count claims
num_claims_by_age_group = age_groups_df.groupBy("AgeGroup").count()
num_claims_by_age_group = claims_by_age_group.orderBy("count", ascending=False)

In [33]:
num_claims_by_age_group.show()

+-----------+-----+
|   AgeGroup|count|
+-----------+-----+
|65 and Over| 2028|
|      35-44| 1040|
|      25-34|  998|
|      18-24|  922|
|      55-64|  572|
|      45-54|  469|
|   Under 18|  338|
+-----------+-----+



In [34]:
# How many people are in each age group?

num_age_group = age_groups_df_og.groupBy("AgeGroup").count()

# Sort by claim count in descending order
claims_by_age_group = claims_by_age_group.orderBy("count", ascending=False)

In [35]:
claims_by_age_group.show()

+-----------+-----+
|   AgeGroup|count|
+-----------+-----+
|65 and Over| 2028|
|      35-44| 1040|
|      25-34|  998|
|      18-24|  922|
|      55-64|  572|
|      45-54|  469|
|   Under 18|  338|
+-----------+-----+



# What marital status has the highest average claims?

In [41]:
divorced_df = combined_info_df.filter(combined_info_df["Marital Status"] == "D")
married_df = combined_info_df.filter(combined_info_df["Marital Status"] == "M")
single_df = combined_info_df.filter(combined_info_df["Marital Status"] == "S")
widow_df = combined_info_df.filter(combined_info_df["Marital Status"] == "W")

# Calculate the average claim payout for each group
avg_claim_payout_divorced = divorced_df.agg(avg("Claim Payout")).collect()[0][0]
avg_claim_payout_married = married_df.agg(avg("Claim Payout")).collect()[0][0]
avg_claim_payout_single = single_df.agg(avg("Claim Payout")).collect()[0][0]
avg_claim_payout_widow = widow_df.agg(avg("Claim Payout")).collect()[0][0]

# Display the average claim payout for each group
print("Average Claim Payout for Divorced:", avg_claim_payout_divorced)
print("Average Claim Payout for Married:", avg_claim_payout_married)
print("Average Claim Payout for Single:", avg_claim_payout_single)
print("Average Claim Payout for Widow:", avg_claim_payout_widow)

Average Claim Payout for Divorced: 1232.0013591265347
Average Claim Payout for Married: 1226.8023464071528
Average Claim Payout for Single: 1200.8047799723413
Average Claim Payout for Widow: 1031.8701934418496


#finding total number of claims per merital status

In [None]:
#finding total number of claims per merital status

divorced_df = combined_info_df.filter(combined_info_df["Marital Status"] == "D")
married_df = combined_info_df.filter(combined_info_df["Marital Status"] == "M")
single_df = combined_info_df.filter(combined_info_df["Marital Status"] == "S")
widow_df = combined_info_df.filter(combined_info_df["Marital Status"] == "W")


In [46]:
customers_per_marital_status = combined_info_df.groupBy("Marital Status").agg(count("CUST_ID").alias("Total Customers"))

In [47]:
customers_per_marital_status.show()

+--------------+---------------+
|Marital Status|Total Customers|
+--------------+---------------+
|             M|         281963|
|             D|         264876|
|             W|          16956|
|             S|         282010|
+--------------+---------------+



In [48]:
# most popular car per age group

from pyspark.sql.functions import count, desc, row_number
from pyspark.sql.window import Window

# Group the DataFrame by marital status and car make, and count the number of occurrences
marital_status_car_count = combined_info_df.groupBy("Marital Status", "Make").agg(count("*").alias("Count"))

# Define a window specification to partition by marital status and order by count in descending order
window_spec = Window.partitionBy("Marital Status").orderBy(desc("Count"))

# Assign row numbers to each row within each partition (marital status group)
marital_status_car_count = marital_status_car_count.withColumn("rn", row_number().over(window_spec))

# Filter to keep only the rows with the highest count for each marital status group
most_popular_cars_per_marital_status = marital_status_car_count.filter(col("rn") == 1)

# Show the most popular car make for each marital status
most_popular_cars_per_marital_status.show()


+--------------+-------------+-----+---+
|Marital Status|         Make|Count| rn|
+--------------+-------------+-----+---+
|             D|Manufacturer1|53194|  1|
|             M|Manufacturer2|56506|  1|
|             S|Manufacturer2|56662|  1|
|             W|Manufacturer2| 3453|  1|
+--------------+-------------+-----+---+

