In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.stat import Correlation
#import seaborn as sns
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
#import holidays
from datetime import datetime, timezone
from pyspark.ml.classification import RandomForestClassifier, BinaryLogisticRegressionSummary
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
from pyspark.ml.pipeline import PipelineModel

In [2]:
# Start (or attach to an already running) Spark session for this notebook
spark = SparkSession.builder.appName("US_Accidents").getOrCreate()

# Only surface ERROR-level messages so Spark warnings do not flood the cell output
spark.sparkContext.setLogLevel("ERROR")

/opt/conda/lib/python3.7/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/11/26 14:03:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/26 14:03:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/11/26 14:03:15 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/11/26 14:03:15 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/11/26 14:03:15 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
24/11/26 14:03:15 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
24/11/26 14:03:15 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
24/11/26 14:03:15 WARN Utils: Service 'SparkUI' could not bind on port 4046. Attempting port 4047.
24/11/26 14:03:15 WARN Utils: Service 'SparkUI' could not bind on port 4047. Attempting port 4048.
24/11/26 14:03:15 WARN Utils: Service 'SparkUI' could not bind on

In [3]:
# Load the prepared dataset from its Parquet file into a Spark DataFrame
df = (
    spark.read
    .parquet("updated_dataset.parquet")
)

In [4]:
# Encode the 'Severity' column as the numeric label column 'SeverityIndex'.
# NOTE: with its default settings StringIndexer assigns indices by descending label
# frequency, so SeverityIndex 0.0 is the most frequent Severity value and not
# necessarily the lowest one.
indexer = StringIndexer(inputCol="Severity", outputCol="SeverityIndex")

# StringIndexer refuses to fit/transform a frame that already contains its output
# column, which made this cell fail when it was run a second time. Dropping a stale
# 'SeverityIndex' first (a no-op on the first run) makes the cell safely re-runnable.
severity_input = df.drop("SeverityIndex")

# Keep the fitted model: its `labels` list is the only record of which original
# Severity value each index stands for.
severity_indexer_model = indexer.fit(severity_input)
df = severity_indexer_model.transform(severity_input)

# Show the index -> original Severity mapping so later per-class results can be interpreted
print("SeverityIndex -> Severity:",
      {float(index): label for index, label in enumerate(severity_indexer_model.labels)})

                                                                                

In [5]:
# Create list of features: every column of df except the ones listed below.
# 'Severity' is the target and 'SeverityIndex' is its numeric encoding.
# NOTE(review): 'Severity_Binary' is presumably another encoding of the target, and the
# reason for leaving out 'Sex_ratio' is not stated in this notebook - confirm both.
excluded_columns = {'Severity', 'SeverityIndex', 'Sex_ratio', 'Severity_Binary'}

# A comprehension is used on purpose: its loop variable does not leak into the notebook
# namespace, whereas the former `for col in df.columns` loop overwrote the `col`
# function made available by `from pyspark.sql.functions import *`.
feature_list = [column_name for column_name in df.columns
                if column_name not in excluded_columns]

In [6]:
# Randomly partition the data 80% / 20% into training and test sets (seed fixed at 314)
splits = df.randomSplit(weights=[0.8, 0.2], seed=314)
train, test = splits

In [7]:
# Undersampling: shrink every class of the training set to roughly the size of the
# rarest class so that the model is trained on balanced data.
# (`F` is pyspark.sql.functions, imported in the first cell.)

# Fixed seed so the sampled training set - and everything fitted on it - is the same
# on every run; without it each execution drew a different sample.
SAMPLE_SEED = 314

# Step 1: Group by 'SeverityIndex' and count occurrences (computed once, then reused)
class_counts = train.groupBy("SeverityIndex").count()
class_sizes = {row["SeverityIndex"]: row["count"] for row in class_counts.collect()}
if not class_sizes:
    raise ValueError("The training set is empty; there is nothing to undersample.")

# Step 2: The smallest class defines the target size for all classes
min_class_size = min(class_sizes.values())

# Step 3: Sample each larger class down to the target size
undersampled_train_list = []

for class_label, class_size in class_sizes.items():
    class_data_undersampled = train.filter(F.col("SeverityIndex") == class_label)

    if class_size > min_class_size:
        # sample() keeps each row with the given probability, so the resulting class
        # size is close to, but not exactly, min_class_size
        class_data_undersampled = class_data_undersampled.sample(
            withReplacement=False,
            fraction=min_class_size / class_size,
            seed=SAMPLE_SEED,
        )
    # Classes that are already at the minimum size keep all of their rows

    undersampled_train_list.append(class_data_undersampled)

# Combine all the undersampled DataFrames.
# The loop variable is deliberately NOT called `df`: that name would overwrite the
# notebook-level `df` holding the full dataset.
undersampled_train = undersampled_train_list[0]  # start with the first one
for class_frame in undersampled_train_list[1:]:
    undersampled_train = undersampled_train.union(class_frame)

# Show the result
undersampled_train.show()

# Step 4: Group by 'SeverityIndex' and count the occurrences in the undersampled DataFrame
undersampled_class_counts = undersampled_train.groupBy("SeverityIndex").count()

# Show the result
undersampled_class_counts.show()

                                                                                

+--------+-----------+--------+--------+----------+----------+-------------+-------+---------+----+----+-------------+--------------------------+--------------------+---------+-------------------+------------+-----------+--------------------+--------------------+-----------+-----------------+---------------+-------------+
|Severity|Temperature|Humidity|Pressure|Visibility|Wind_Speed|Precipitation|Weekday|Rush_Hour|Rain|Snow|    SeasonVec|Astronomical_TwilightIndex|Interstate_Indicator|Sex_ratio|Percent_Age_65_over|MedianIncome|Urban_Ratio|Traffic_Interference|Traffic_Intersection|Destination|Percent_Age_15-24|Severity_Binary|SeverityIndex|
+--------+-----------+--------+--------+----------+----------+-------------+-------+---------+----+----+-------------+--------------------------+--------------------+---------+-------------------+------------+-----------+--------------------+--------------------+-----------+-----------------+---------------+-------------+
|       2|        -11|      



+-------------+-----+
|SeverityIndex|count|
+-------------+-----+
|          0.0|52289|
|          1.0|52134|
|          3.0|52151|
|          2.0|52482|
+-------------+-----+



                                                                                

In [8]:
# Assemble the predictor columns into a single vector column ("features") for the
# logistic regression model
assembler = VectorAssembler(inputCols=feature_list,
                            outputCol="features")

# VectorAssembler raises an error when its output column already exists, so the former
# `x = assembler.transform(x)` failed when this cell was run a second time. Dropping a
# stale "features" column first (a no-op on the first run) keeps the cell re-runnable.
undersampled_train = assembler.transform(undersampled_train.drop("features"))
test = assembler.transform(test.drop("features"))

In [9]:
# Scale the assembled predictors. The scaler is fitted on the (undersampled) training
# data only and the same fitted scaler is then applied to both training and test sets.
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)
scalerModel = scaler.fit(undersampled_train)
scaledTrainData, scaledTestData = (
    scalerModel.transform(dataset) for dataset in (undersampled_train, test)
)

                                                                                

In [10]:
# Fit a multinomial logistic regression (with intercept) on the scaled training data
from pyspark.ml.classification import LogisticRegression

# Configure the estimator; maxIter, regParam and elasticNetParam are not set here,
# so the library defaults apply
lr = LogisticRegression(
    family="multinomial",
    featuresCol='scaledFeatures',
    labelCol='SeverityIndex',
)

# Train the model and report how many classes it found in the label column
lrModel = lr.fit(scaledTrainData)
print(f"Detected number of classes: {lrModel.numClasses}")

# Report the fitted parameters: one row of coefficients and one intercept per class
print(f"Coefficients: {lrModel.coefficientMatrix}")
print(f"Intercept: {lrModel.interceptVector}")

                                                                                

Detected number of classes: 4
Coefficients: DenseMatrix([[-0.01387877, -0.07113134,  0.03576145, -0.04274648, -0.0105225 ,
              -0.00251069, -0.02823771, -0.04445998, -0.01114432,  0.03680349,
               0.53169128,  0.33241802,  0.02827636, -0.00294697, -0.09985955,
               0.0322089 ,  0.00443762,  0.04605693, -0.01250226, -0.03271464,
               0.05256482,  0.00666857],
             [-0.14761398, -0.08772155,  0.23559898,  0.0051682 ,  0.124713  ,
               0.01023625, -0.06507424, -0.01018898,  0.04253267,  0.03902944,
               0.21543081,  0.1542089 , -0.12180086, -0.11957644,  0.47560523,
              -0.13505205,  0.00413156,  0.27508152,  0.015609  , -0.25787706,
              -0.08930309, -0.02836291],
             [-0.24107399, -0.04505362,  0.00675056,  0.03022322,  0.06621366,
              -0.00517722, -0.13907572, -0.17494515,  0.01736376,  0.03586381,
               0.2197966 ,  0.13656248, -0.08186694,  0.15942275, -0.0122839 ,
     

In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the test set; transform() appends the model output, including the "prediction" column
lrPred = lrModel.transform(scaledTestData)

# List the distinct classes the model actually predicts
(
    lrPred
    .select("prediction")
    .distinct()
    .show()
)

# Overall accuracy on the test set (the evaluator is created with metricName='accuracy',
# so no per-call metric override is needed)
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    predictionCol='prediction',
    labelCol='SeverityIndex',
)
accuracy = evaluator.evaluate(lrPred)
print(f'Accuracy: {accuracy}')

                                                                                

+----------+
|prediction|
+----------+
|       0.0|
|       1.0|
|       3.0|
|       2.0|
+----------+





Accuracy: 0.3881040384467644


                                                                                

In [12]:
# Support-weighted averages of the per-class metrics on the test set.
# These are NOT the precision/recall of any single class: each class's metric is weighted
# by that class's share of the true labels. As a consequence the weighted recall is
# numerically identical to the overall accuracy, and on an imbalanced test set all three
# figures are dominated by the most frequent class. The printed labels say "Weighted"
# explicitly so the numbers are not mistaken for plain (or macro-averaged) metrics.
precision = evaluator.evaluate(lrPred, {evaluator.metricName: 'weightedPrecision'})
recall = evaluator.evaluate(lrPred, {evaluator.metricName: 'weightedRecall'})
f1_score = evaluator.evaluate(lrPred, {evaluator.metricName: 'f1'})  # 'f1' is the weighted F1 measure

print(f'Weighted Precision: {precision}')
print(f'Weighted Recall: {recall}')
print(f'Weighted F1 Score: {f1_score}')



Precision: 0.7675445743994155
Recall: 0.3881040384467644
F1 Score: 0.4791333968225278


                                                                                

In [13]:
# Mark every test row with 1 when the predicted class equals the true class, otherwise 0
predictions = lrPred.withColumn(
    'is_correct',
    F.when(F.col('SeverityIndex') == F.col('prediction'), 1).otherwise(0),
)

# For each true class: correctly predicted rows divided by all rows of that class.
# (The denominator is the number of rows whose TRUE label is the class, so this
# "accuracy" is the per-class recall.)
accuracy_by_class = (
    predictions
    .groupBy('SeverityIndex')
    .agg((F.sum('is_correct') / F.count('SeverityIndex')).alias('accuracy'))
)

# Display the per-class figures
accuracy_by_class.show()



+-------------+-------------------+
|SeverityIndex|           accuracy|
+-------------+-------------------+
|          0.0|0.36098152417799073|
|          1.0|0.49578504623138764|
|          3.0|  0.777230390270187|
|          2.0|  0.426246440734744|
+-------------+-------------------+



                                                                                

In [14]:
# Calculate precision, recall and F1 score for every class.
# All counts come from ONE aggregation over the predictions (the confusion matrix)
# instead of four separate count() jobs per class.
def _class_metrics(confusion_counts):
    """Derive per-class precision, recall and F1 from confusion-matrix cell counts.

    Args:
        confusion_counts: iterable of (actual_label, predicted_label, row_count)
            triples, one per non-empty cell of the confusion matrix.

    Returns:
        dict mapping every label that occurs as an actual label (in ascending order) to
        {'precision': float, 'recall': float, 'f1_score': float}. A metric whose
        denominator would be zero is reported as 0.0.
    """
    true_positives, actual_totals, predicted_totals = {}, {}, {}
    for actual, predicted, row_count in confusion_counts:
        # Rows whose true label is `actual`: true positives + false negatives
        actual_totals[actual] = actual_totals.get(actual, 0) + row_count
        # Rows predicted as `predicted`: true positives + false positives
        predicted_totals[predicted] = predicted_totals.get(predicted, 0) + row_count
        if actual == predicted:
            true_positives[actual] = true_positives.get(actual, 0) + row_count

    per_class = {}
    for class_label in sorted(actual_totals):
        hits = true_positives.get(class_label, 0)
        predicted_total = predicted_totals.get(class_label, 0)
        actual_total = actual_totals[class_label]

        class_precision = hits / predicted_total if predicted_total > 0 else 0.0
        class_recall = hits / actual_total if actual_total > 0 else 0.0
        denominator = class_precision + class_recall
        class_f1 = 2 * class_precision * class_recall / denominator if denominator > 0 else 0.0

        per_class[class_label] = {'precision': class_precision,
                                  'recall': class_recall,
                                  'f1_score': class_f1}
    return per_class


# One Spark job: number of rows per (true label, predicted label) pair
confusion_counts = [
    (row['SeverityIndex'], row['prediction'], row['count'])
    for row in lrPred.groupBy('SeverityIndex', 'prediction').count().collect()
]

# Per-class metrics, keyed by true label in ascending order. The loop-local names of the
# old implementation (precision / recall / f1_score) are no longer assigned here, so the
# weighted metrics stored under those names by the previous cell are not overwritten.
metrics = _class_metrics(confusion_counts)
labels = list(metrics)

# Print metrics for each class
for label, metric in metrics.items():
    print(f"Class {label} - Precision: {metric['precision']:.4f}, Recall: {metric['recall']:.4f}, F1 Score: {metric['f1_score']:.4f}")



Class 0.0 - Precision: 0.8867, Recall: 0.3610, F1 Score: 0.5131
Class 1.0 - Precision: 0.3243, Recall: 0.4958, F1 Score: 0.3921
Class 2.0 - Precision: 0.0622, Recall: 0.4262, F1 Score: 0.1085
Class 3.0 - Precision: 0.0285, Recall: 0.7772, F1 Score: 0.0549


                                                                                