# Research Question:
### What are the most influential variables on the severity of accidents?

Useful Paper:
    https://www.sciencedirect.com/science/article/pii/S2590198223000611

In [1]:
#Display Spark Output in scrollable format within jupyter notebook
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [14]:
#Supress Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.stat import Correlation
import seaborn as sns
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import holidays
from datetime import datetime, timezone
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

# Load Data

In [3]:
#Instantiate Spark Session
spark = (SparkSession
  .builder
  .appName("US_Accidents")
  .getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/01 14:46:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Read in Dataset
df = spark.read.parquet("final_dataset.parquet")
df.show(5)

+--------+--------------+-----------+------------+--------------+---------------+-----------------+-------+---------+-------+----+----+-------------+--------------------------+--------------------+---------------------------------+-----------------+-----------------+-------------------+------------+--------------------------+-----------+--------------------+--------------------+-----------+
|Severity|Temperature(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Speed(mph)|Precipitation(in)|Weekday|Rush Hour|Holiday|Rain|Snow|    SeasonVec|Astronomical_TwilightIndex|Interstate Indicator|Sex ratio (males per 100 females)|Percent_Age_15-19|Percent_Age_20-24|Percent_Age_65_over|MedianIncome|MedianIncome_MarginOfError|Urban_Ratio|Traffic_Interference|Traffic_Intersection|Destination|
+--------+--------------+-----------+------------+--------------+---------------+-----------------+-------+---------+-------+----+----+-------------+--------------------------+--------------------+---------------

In [5]:
# Remove MedianIncome_MarginOfError
df = df.drop('MedianIncome_MarginOfError')
df.show(5)

+--------+--------------+-----------+------------+--------------+---------------+-----------------+-------+---------+-------+----+----+-------------+--------------------------+--------------------+---------------------------------+-----------------+-----------------+-------------------+------------+-----------+--------------------+--------------------+-----------+
|Severity|Temperature(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Speed(mph)|Precipitation(in)|Weekday|Rush Hour|Holiday|Rain|Snow|    SeasonVec|Astronomical_TwilightIndex|Interstate Indicator|Sex ratio (males per 100 females)|Percent_Age_15-19|Percent_Age_20-24|Percent_Age_65_over|MedianIncome|Urban_Ratio|Traffic_Interference|Traffic_Intersection|Destination|
+--------+--------------+-----------+------------+--------------+---------------+-----------------+-------+---------+-------+----+----+-------------+--------------------------+--------------------+---------------------------------+-----------------+-----------------

In [6]:
# Get row count
rows = df.count()
print(f"DataFrame Rows count : {rows}")

# Get columns count
cols = len(df.columns)
print(f"DataFrame Columns count : {cols}")

DataFrame Rows count : 7026806
DataFrame Columns count : 24


# Sampling

In [7]:
# Check Class Imbalance
cts = df.groupBy("Severity").count().withColumn('percent', (F.col('count') / rows)*100)
cts.show()

+--------+-------+------------------+
|Severity|  count|           percent|
+--------+-------+------------------+
|       1|  65142|0.9270499285165977|
|       3|1123799|15.993027272988611|
|       4| 178821|2.5448404296347444|
|       2|5659044| 80.53508236886005|
+--------+-------+------------------+



## Random Split

In [None]:
# Train/Test Split
train_data, test_data = df.randomSplit([0.8, 0.2])

#Model just guessed severity = 2 for everything

## Undersampling

In [22]:
# Undersample each class by 80% of the smallest class
sample = (cts.select("count").rdd.min()[0])*0.8

class1 = sample/(cts.select("count").where(cts.Severity == '1').rdd.min()[0])
class2 = sample/(cts.select("count").where(cts.Severity == '2').rdd.min()[0])
class3 = sample/(cts.select("count").where(cts.Severity == '3').rdd.min()[0])
class4 = sample/(cts.select("count").where(cts.Severity == '4').rdd.min()[0])

# Split Data by Class - Downsampling

# Create a temporary view to use SQL
df.createOrReplaceTempView("data_view")

# Calculate fractions for each class
#fractions = df.groupBy("Severity").count().rdd.map(lambda row: (row[0], 0.8)).collectAsMap() #samples 80% of each class
fractions = {1: class1, 2: class2, 3: class3, 4: class4} #downsample each class to 80% of the smallest class

# Use stratified sampling to maintain class distribution
train_data = df.sampleBy("Severity", fractions, seed=42)
test_data = df.subtract(train_data)

#Model performed ~32% accuracy - great precision though

## Binary Classification

In [26]:
# Binary Classification (1 or 2 vs 3 or 4)
df = df.withColumn('Severity_Binary', when((col("Severity")==1) | (col("Severity")==2), 0).otherwise(1))
df.show(5)

+--------+--------------+-----------+------------+--------------+---------------+-----------------+-------+---------+-------+----+----+-------------+--------------------------+--------------------+---------------------------------+-----------------+-----------------+-------------------+------------+-----------+--------------------+--------------------+-----------+---------------+
|Severity|Temperature(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Speed(mph)|Precipitation(in)|Weekday|Rush Hour|Holiday|Rain|Snow|    SeasonVec|Astronomical_TwilightIndex|Interstate Indicator|Sex ratio (males per 100 females)|Percent_Age_15-19|Percent_Age_20-24|Percent_Age_65_over|MedianIncome|Urban_Ratio|Traffic_Interference|Traffic_Intersection|Destination|Severity_Binary|
+--------+--------------+-----------+------------+--------------+---------------+-----------------+-------+---------+-------+----+----+-------------+--------------------------+--------------------+---------------------------------+---

In [27]:
# Check Class Imbalance
cts = df.groupBy("Severity_Binary").count().withColumn('percent', (F.col('count') / rows)*100)
cts.show()

+---------------+-------+------------------+
|Severity_Binary|  count|           percent|
+---------------+-------+------------------+
|              1|1302620|18.537867702623352|
|              0|5724186| 81.46213229737664|
+---------------+-------+------------------+



In [29]:
# Undersample each class by 80% of the smallest class
sample = (cts.select("count").rdd.min()[0])*0.8

class0 = sample/(cts.select("count").where(cts.Severity_Binary == '0').rdd.min()[0])
class1 = sample/(cts.select("count").where(cts.Severity_Binary == '1').rdd.min()[0])

# Split Data by Class - Downsampling

# Create a temporary view to use SQL
df.createOrReplaceTempView("data_view")

# Calculate fractions for each class
#fractions = df.groupBy("Severity").count().rdd.map(lambda row: (row[0], 0.8)).collectAsMap() #samples 80% of each class
fractions = {0: class0, 1: class1} #downsample each class to 80% of the smallest class

# Use stratified sampling to maintain class distribution
train_data = df.sampleBy("Severity_Binary", fractions, seed=42)
test_data = df.subtract(train_data)

#Model performed ~77% accuracy with base rf model

# Modeling

In [41]:
# Select all features except target variable
features = df.select([col for col in df.columns if col != "Severity" and col != "Severity_Binary"]).columns

# Vectorize Features
assembler = VectorAssembler(inputCols=features, outputCol='features')

# Model
model = RandomForestClassifier(featuresCol = 'features', labelCol = 'Severity_Binary', numTrees=3, maxDepth=2)
  
# Creating the pipeline 
pipeline = Pipeline(stages=[assembler, model])

fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)

24/11/01 15:37:10 WARN MemoryStore: Not enough space to cache rdd_814_0 in memory! (computed 11.7 MiB so far)
24/11/01 15:37:10 WARN BlockManager: Persisting block rdd_814_0 to disk instead.
24/11/01 15:37:10 WARN MemoryStore: Not enough space to cache rdd_814_4 in memory! (computed 18.0 MiB so far)
24/11/01 15:37:10 WARN BlockManager: Persisting block rdd_814_4 to disk instead.
24/11/01 15:37:11 WARN MemoryStore: Not enough space to cache rdd_814_5 in memory! (computed 27.1 MiB so far)
24/11/01 15:37:11 WARN BlockManager: Persisting block rdd_814_5 to disk instead.
24/11/01 15:37:11 WARN MemoryStore: Not enough space to cache rdd_814_3 in memory! (computed 27.1 MiB so far)
24/11/01 15:37:11 WARN BlockManager: Persisting block rdd_814_3 to disk instead.
24/11/01 15:37:11 WARN MemoryStore: Not enough space to cache rdd_814_2 in memory! (computed 18.0 MiB so far)
24/11/01 15:37:11 WARN BlockManager: Persisting block rdd_814_2 to disk instead.
24/11/01 15:37:11 WARN MemoryStore: Not enoug

# Evaluation

In [42]:
#Metrics (Confusion Matrix, Accuracy, Weighted Precision, Recall, and F1 Score)

from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabels = results.select("prediction", "Severity_Binary")
metrics = MulticlassMetrics(predictionAndLabels.rdd.map(lambda x: tuple(map(float, x))))

# Get confusion matrix
print(metrics.confusionMatrix().toArray())

# Get precision, recall, and F1-score for each class
print(f'Weighted Precision: {metrics.weightedPrecision}')
print(f'Weighted Recall: {metrics.weightedRecall}')
print(f'Weighted F1 Score: {metrics.weightedFMeasure()}')
print(f'Accuracy: {metrics.accuracy}')



[[1145667.       0.]
 [ 260410.       0.]]
Weighted Precision: 0.6638937787536361
Weighted Recall: 0.8147967714428157
Weighted F1 Score: 0.7316453161042615
Accuracy: 0.8147967714428157


                                                                                

In [48]:
# Print Overall % Sampled from DF
print(train_data.count()/df.count()*100)

# Print % Sampled for each class within Train Data
train_data.groupBy("Severity").count().withColumn('percent', (F.col('count') / train_data.count())*100).show()

2.970026495679545
+--------+-----+------------------+
|Severity|count|           percent|
+--------+-----+------------------+
|       1|52012|24.922136292633372|
|       3|51869| 24.85361623015074|
|       4|52207|25.015572741473324|
|       2|52610| 25.20867473574256|
+--------+-----+------------------+



In [49]:
# Print Overall % Sampled from DF
print(test_data.count()/df.count()*100)

# Print % Sampled for each class within Train Data
test_data.groupBy("Severity").count().withColumn('percent', (F.col('count') / test_data.count())*100).show()

                                                                                

79.87516661197137


24/11/01 14:26:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/01 14:26:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/01 14:26:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/01 14:26:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/01 14:26:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/01 14:26:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/01 14:26:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/01 14:26:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/01 14:26:34 WARN RowBasedKeyValueBatch: Calling spill() on

+--------+-------+------------------+
|Severity|  count|           percent|
+--------+-------+------------------+
|       1|  10966|0.1953792782868341|
|       3|1029576|18.343773100624247|
|       4|  95412| 1.699938692312914|
|       2|4476719|   79.760908928776|
+--------+-------+------------------+



                                                                                

In [None]:
# Import Pipeline
from pyspark.ml import Pipeline 
  
model = ########################### Model Here ###########################
  
# Creating the pipeline 
pipe = Pipeline(stages=[model]) 

In [None]:
# Fitting the model on training data 
fit_model = pipe.fit(scaledData_train) 
  
# Storing the results on test data 
results = fit_model.transform(scaledData_test) 

In [None]:
# ROC Curve - he asked for ROC curve but usually only for binary classification - could do 4 one-vs-all roc curves?