# Importing dependencies/libraries needed + initial setup

In [1]:
#installing pyspark in this environment

!pip install pyspark



In [222]:
#importing the package that I'm going to use do my data analysis

from google.colab import files
from google.colab import drive
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, avg, round, datediff, current_date, count, stddev, floor, expr, min, max, when, desc
from datetime import datetime

In [3]:
#creating a SparkSession
spark = SparkSession.builder.appName("Data Analysis - State Farm").getOrCreate()

In [166]:
#reading in the parquet I created in part 1

combined_info_df = spark.read.parquet("part-*.snappy.parquet")

In [167]:
# displaying the dataframe to make sure everything looks correct

combined_info_df.show(5)

+------+---------+---------+---------+-------------+--------------+-----+--------+-------+---------------+-------------+------+--------------+--------------------+------+--------------------+---------+----+-------------+----------+-------------+-------------------+------------+---------------+----+-----+----------------------+-----------------------+------------+----------------------+
|CAR_ID|  CUST_ID|    HH_ID|Active HH|HH Start Date|  Phone Number|  ZIP|HH State|Country|Referral Source|Date of Birth|Gender|Marital Status|     Employment Type|Income|              Status|Car State|Year|         Make|Body Style|Vehicle Value|Annual Miles Driven|Business Use|Antique Vehicle|Lien|Lease|Driver Safety Discount|Vehicle Safety Discount|Claim Payout|6 Month Premium Amount|
+------+---------+---------+---------+-------------+--------------+-----+--------+-------+---------------+-------------+------+--------------+--------------------+------+--------------------+---------+----+-------------+--

# Question 1:
What is the average number of cars per household? ~3.8<br />
To calculate this I'm going to calculate the average number of cars per unique HH_ID

In [168]:
#grouping by HH_ID and counting the number of distinct CAR_IDs per household

cars_per_household_df = combined_info_df.groupBy("HH_ID").agg(F.countDistinct("CAR_ID").alias("num_cars"))

In [169]:
#calculating the average of the cars per household found above and rounding for readability

average_cars_per_household = cars_per_household_df.agg(F.avg("num_cars")).first()[0]
# rounded_avg = round(average_cars_per_household, 3)

print(f"Average number of cars per household: {average_cars_per_household}")

Average number of cars per household: 3.8065579511541507


# Question 2:
How many cars are there by age? I'm going to break this into multiple steps. <br />
Step 1 will be determining the cars age (I'm going to subtract the year of the car from the current year). <br />
Step 2 will be grouping all the cars together by their calculated age.<br />
Step 3 will be counting the number of cars in each group - Example: There are 123 cars that are 20 years old.

In [170]:
#calculating the current year (I could also hardcode this if needed)

current_year = datetime.now().year
print(current_year)

2024


In [171]:
#calculating the age of each car and creating a new column to hold our vehicle age

combined_info_df = combined_info_df.withColumn("Car Age", F.lit(current_year) - F.col("Year"))

In [172]:
combined_info_df.show(5)

+------+---------+---------+---------+-------------+--------------+-----+--------+-------+---------------+-------------+------+--------------+--------------------+------+--------------------+---------+----+-------------+----------+-------------+-------------------+------------+---------------+----+-----+----------------------+-----------------------+------------+----------------------+-------+
|CAR_ID|  CUST_ID|    HH_ID|Active HH|HH Start Date|  Phone Number|  ZIP|HH State|Country|Referral Source|Date of Birth|Gender|Marital Status|     Employment Type|Income|              Status|Car State|Year|         Make|Body Style|Vehicle Value|Annual Miles Driven|Business Use|Antique Vehicle|Lien|Lease|Driver Safety Discount|Vehicle Safety Discount|Claim Payout|6 Month Premium Amount|Car Age|
+------+---------+---------+---------+-------------+--------------+-----+--------+-------+---------------+-------------+------+--------------+--------------------+------+--------------------+---------+----+

In [173]:
#grouping the results by car age and displaying the first 5 records. Here we see there are 27,000 "new" cars - aka have a car age of zero.

cars_by_age = combined_info_df.groupBy("Car Age").count().orderBy("Car Age")
cars_by_age.show(5)


+-------+-----+
|Car Age|count|
+-------+-----+
|      0|27700|
|      1|27492|
|      2|27515|
|      3|27721|
|      4|27233|
+-------+-----+
only showing top 5 rows



# Question 3:
How many cars are there by make? <br />
Will approach this similar to question 2. <br />
I'm going to group the data by the "Make" column and then count the number of cars for each make

In [174]:
# using groupby + agg count to show the number of cars per make. The alias isn't needed, but I like it for readability

cars_by_make_df = combined_info_df.groupBy("Make").agg(count("CAR_ID").alias("Number_of_Cars"))

In [175]:
#displaying the cars by make

cars_by_make_df.show(10)

+-------------+--------------+
|         Make|Number_of_Cars|
+-------------+--------------+
|Manufacturer5|         58704|
|Manufacturer7|        110503|
|Manufacturer1|        168954|
|Manufacturer3|        168834|
|Manufacturer2|        169311|
|Manufacturer4|         58928|
|Manufacturer6|        110571|
+-------------+--------------+



# Question 4:
Which cars are the safest? Manufacturer4<br />
For this I'm going to use the columns only related to the car itself - ie I'm not going to consider things like driver safety discount.<br />
I'm going to create a score based on these values: Vehicle Safety Discount, Year, Claim Payout, and Antique Vehicle.<br />

I used these factors to create a safety score - I'll go more into how/why when I answer question #6

In [176]:
#calculating the average claim payout per make. My reasoning is higher claim payout = less safe care (more passenger injuries)

average_claim_payout_per_make = combined_info_df.groupBy("Make").agg(round(avg("Claim Payout"), 2).alias("avg claim payout"))

#joining the average claim payout with the original dataframe

claims_avg_df = combined_info_df.join(average_claim_payout_per_make, on="Make")

In [177]:
claims_avg_df.show(5)

+-------------+------+---------+---------+---------+-------------+--------------+-----+--------+-------+---------------+-------------+------+--------------+--------------------+------+--------------------+---------+----+----------+-------------+-------------------+------------+---------------+----+-----+----------------------+-----------------------+------------+----------------------+-------+----------------+
|         Make|CAR_ID|  CUST_ID|    HH_ID|Active HH|HH Start Date|  Phone Number|  ZIP|HH State|Country|Referral Source|Date of Birth|Gender|Marital Status|     Employment Type|Income|              Status|Car State|Year|Body Style|Vehicle Value|Annual Miles Driven|Business Use|Antique Vehicle|Lien|Lease|Driver Safety Discount|Vehicle Safety Discount|Claim Payout|6 Month Premium Amount|Car Age|avg claim payout|
+-------------+------+---------+---------+---------+-------------+--------------+-----+--------+-------+---------------+-------------+------+--------------+----------------

In [178]:
#calculating a safety score based on the available columns
safety_score_df = claims_avg_df.withColumn(
    "safety_score",
    (col("Vehicle Safety Discount") * 100) + 1900 - col("avg claim payout")  - col("Antique Vehicle") - ((2024 - col("Year")) * 5))

# standarding it
safety_score_df = safety_score_df.withColumn("safety_score", col("safety_score") / 1000)

ranked_cars_df = safety_score_df.orderBy(col("safety_score").desc())

# creating a new df to display the safest in decending order

safest_score_df = ranked_cars_df.groupBy("Make").agg(F.avg("safety_score").alias("avg_safety_score"))
safest_score_df = safest_score_df.withColumn("avg_safety_score", round("avg_safety_score", 2))
safest_score_df = safest_score_df.orderBy(col("avg_safety_score").desc())

In [179]:
safest_score_df.show(10)

+-------------+----------------+
|         Make|avg_safety_score|
+-------------+----------------+
|Manufacturer4|            0.92|
|Manufacturer5|            0.77|
|Manufacturer2|            0.68|
|Manufacturer3|            0.64|
|Manufacturer6|             0.6|
|Manufacturer1|            0.53|
|Manufacturer7|            0.46|
+-------------+----------------+



# Question 5:
Which cars are the most dangerous? Manufacturer7<br />
For this I'm going to use the columns only related to the car itself - ie I'm not going to consider things like driver safety discount.<br />
I'm going to create a score based on these values: Vehicle Safety Discount, Year, Claim Payout, and Antique Vehicle.<br />

I used these factors to create a safety score in question 5. - I'll go more into how/why when I answer question #6

In [180]:
# since i already created the safest in the last problem, I just sorted the dataframe by ascending to get the most dangerous

most_dangerous_df = safest_score_df.orderBy(col("avg_safety_score").asc())
most_dangerous_df.show()

+-------------+----------------+
|         Make|avg_safety_score|
+-------------+----------------+
|Manufacturer7|            0.46|
|Manufacturer1|            0.53|
|Manufacturer6|             0.6|
|Manufacturer3|            0.64|
|Manufacturer2|            0.68|
|Manufacturer5|            0.77|
|Manufacturer4|            0.92|
+-------------+----------------+



# Question 6:
How did you define “safe” versus “dangerous”? <br />

When thinging of safe/dangerous I looked at columns relating to the care itself - like year, antique, and safety discount. I also decided to incorperate claims payout into the equation. My reasoning is that if a car is unsafe then vehicle accidents will result in more injuries- resulting in higher claims payouts.
I created a sort of weighted formula to calculate the safest/most dangerous car make. Older vehicles, antiques, and whether or not the vehicle had a safety discount all attributed to its final safety score

# Question 7:
Which states have the largest households? MI, AR, CA, GA, WA<br />
To do this I'm going to group the values by HH_state and calculate the average number of persons per household



In [181]:
# grouping and counting the hh_id to determine household size

household_size = combined_info_df.groupBy("HH_ID").count()


#joining the household size with the original dataframe to get state information for each household - renaming the column for clarity
df_with_size = combined_info_df.join(household_size, combined_info_df["HH_ID"] == household_size["HH_ID"], how="inner")
df_with_size = df_with_size.withColumnRenamed("count", "hh_size")

In [182]:
df_with_size.show(5)

+------+---------+---------+---------+-------------+--------------+-----+--------+-------+---------------+-------------+------+--------------+--------------------+------+--------------------+---------+----+-------------+----------+-------------+-------------------+------------+---------------+----+-----+----------------------+-----------------------+------------+----------------------+-------+---------+-------+
|CAR_ID|  CUST_ID|    HH_ID|Active HH|HH Start Date|  Phone Number|  ZIP|HH State|Country|Referral Source|Date of Birth|Gender|Marital Status|     Employment Type|Income|              Status|Car State|Year|         Make|Body Style|Vehicle Value|Annual Miles Driven|Business Use|Antique Vehicle|Lien|Lease|Driver Safety Discount|Vehicle Safety Discount|Claim Payout|6 Month Premium Amount|Car Age|    HH_ID|hh_size|
+------+---------+---------+---------+-------------+--------------+-----+--------+-------+---------------+-------------+------+--------------+--------------------+------+

In [183]:
#calculating the average household size per state by grouping by HH State and averaging the count of hh_size (which we calculated above)

avg_household_size_by_state = df_with_size.groupBy("HH State").avg("hh_size")

#ordering by largest to smallest average household size - displaying top 5

avg_household_size_by_state = avg_household_size_by_state.orderBy(avg_household_size_by_state["avg(hh_size)"].desc())
avg_household_size_by_state.show(5)

+--------+-----------------+
|HH State|     avg(hh_size)|
+--------+-----------------+
|      MI|7.916047972587093|
|      AR|7.912389839294971|
|      CA|7.878361223312176|
|      GA|7.877387981938173|
|      WA|7.874041621029573|
+--------+-----------------+
only showing top 5 rows



# Question 8:
What is the average age of customers? <br />
To do this I created a new column called "age" - much like I did for the car age. I then calculated the age by grouping the dataframe by itself - since I needed the overall average.

In [184]:
#finding the age of each customer

age_df = combined_info_df.withColumn("Age", datediff(current_date(), "Date of Birth") / 365)

#calculating the average age

average_age_result = age_df.agg(round(avg("Age"), 2)).first()[0]
print(f"Average age of customers is {average_age_result}")

Average age of customers is 50.56


# Question 9:
How much does age vary by region? KY, PA, and WV have the most variation.<br />
Since 'region' is abiguous - I'm going to do by state. Will do more of an aggregated region approach time permitting. <br />
I found this age variation by using standard deviation of a newly created customer age column.

In [192]:
#creating a new column customer age - using floor to round down (since most people don't round their age up)

combined_info_df = combined_info_df.withColumn("Customer Age", floor(datediff(current_date(), col("Date of Birth")) / 365))

In [195]:
#calculating the standard deviation of the customer age column and grouping it by hh state - ordering by descending
age_stddev_df = combined_info_df.groupBy("HH State").agg(stddev("Customer Age").alias("AgeStdDev"))
age_stddev_df = age_stddev_df.orderBy("AgeStdDev", ascending=False)
age_stddev_df.show(5)

+--------+------------------+
|HH State|         AgeStdDev|
+--------+------------------+
|      KY| 25.78204507065621|
|      PA|25.773153433990718|
|      WV|25.742397541504932|
|      WA|25.725187332850236|
|      CA|25.706508939877047|
+--------+------------------+
only showing top 5 rows



# Question 10:
Which age group has the most expensive claims? 55-64 age group <br />
I'm going to answer this by creating a bin of ages and calculate the average claim amount of each binned group.


In [203]:
#placing customer in bins based on their age.

age_groups_df = combined_info_df.withColumn(
    "AgeGroup",
    expr(
        "CASE WHEN `Customer Age` < 18 THEN 'Under 18' \
              WHEN `Customer Age` >= 18 AND `Customer Age` < 25 THEN '18-24' \
              WHEN `Customer Age` >= 25 AND `Customer Age` < 35 THEN '25-34' \
              WHEN `Customer Age` >= 35 AND `Customer Age` < 45 THEN '35-44' \
              WHEN `Customer Age` >= 45 AND `Customer Age` < 55 THEN '45-54' \
              WHEN `Customer Age` >= 55 AND `Customer Age` < 65 THEN '55-64' \
              ELSE '65 and Over' END"))


In [205]:
age_groups_df.show(5)

+------+---------+---------+---------+-------------+--------------+-----+--------+-------+---------------+-------------+------+--------------+--------------------+------+--------------------+---------+----+-------------+----------+-------------+-------------------+------------+---------------+----+-----+----------------------+-----------------------+------------+----------------------+-------+------------+-----------+
|CAR_ID|  CUST_ID|    HH_ID|Active HH|HH Start Date|  Phone Number|  ZIP|HH State|Country|Referral Source|Date of Birth|Gender|Marital Status|     Employment Type|Income|              Status|Car State|Year|         Make|Body Style|Vehicle Value|Annual Miles Driven|Business Use|Antique Vehicle|Lien|Lease|Driver Safety Discount|Vehicle Safety Discount|Claim Payout|6 Month Premium Amount|Car Age|Customer Age|   AgeGroup|
+------+---------+---------+---------+-------------+--------------+-----+--------+-------+---------------+-------------+------+--------------+--------------

In [214]:
#grouping the age groups together and determining their average claim payout

avg_claim_payout_by_age_group = age_groups_df.groupBy("AgeGroup").agg(avg("Claim Payout").alias("AvgClaimPayout"))
max_claim_payout_age_group = avg_claim_payout_by_age_group.orderBy("AvgClaimPayout", ascending=False).first()

In [215]:
#here we see that accourding to our data the 55-64 age group has the most expensive claims on average

print(max_claim_payout_age_group)

Row(AgeGroup='55-64', AvgClaimPayout=1281.055329282979)


# Revisiting Question 9:
I'm going to approach this with a more 'regional' mindset - creating bins for regions of the US and placing certain states into them

In [218]:
#creating region bins based on regions I found on google
regions = {
    "Northeast": ["CT", "ME", "MA", "NH", "RI", "VT", "NJ", "NY", "PA"],
    "Midwest": ["IL", "IN", "MI", "OH", "WI", "IA", "KS", "MN", "MO", "NE", "ND", "SD"],
    "South": ["DE", "FL", "GA", "MD", "NC", "SC", "VA", "DC", "WV", "AL", "KY", "MS", "TN", "AR", "LA", "OK", "TX"],
    "West": ["AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY", "CA", "OR", "WA"],
    "Alaska & Hawaii": ["AK", "HI"]}

In [219]:
#creating a new columns 'region' based on the specified above regions. I'm not sure hot to handle DC or Us protectorates so I used "other"
combined_info_df = combined_info_df.withColumn(
    "Region",
    when(col("HH State").isin(regions["Northeast"]), "Northeast")
    .when(col("HH State").isin(regions["Midwest"]), "Midwest")
    .when(col("HH State").isin(regions["South"]), "South")
    .when(col("HH State").isin(regions["West"]), "West")
    .when(col("HH State").isin(regions["Alaska & Hawaii"]), "Alaska & Hawaii")
    .otherwise("Other")
)

In [225]:
#grouping the data by region and calculating the standard deviation of ages within each region - similar to what I originally did, just on a regional scale instead of state.

age_variation_by_region = combined_info_df.groupBy("Region").agg(stddev("Customer Age").alias("AgeVariation"))
age_variation_by_region_desc = age_variation_by_region.orderBy(desc("AgeVariation"))



In [227]:
#when grouped by region, Alaska and Hawaii have the most variation of age by region.

age_variation_by_region_desc.show()

+---------------+------------------+
|         Region|      AgeVariation|
+---------------+------------------+
|Alaska & Hawaii| 25.60378172319714|
|      Northeast| 25.56367930743783|
|           West|25.554609130779536|
|          South|25.550928584371444|
|        Midwest|25.525246300120145|
+---------------+------------------+

