In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.stat import Correlation
#import seaborn as sns
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
#import holidays
from datetime import datetime, timezone
from pyspark.ml.classification import RandomForestClassifier, BinaryLogisticRegressionSummary
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
from pyspark.ml.pipeline import PipelineModel

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by the rest of the notebook.
# The builder chain is wrapped in parentheses instead of backslash continuations.
spark = (
    SparkSession.builder
    .appName("US_Accidents")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "2g")
    .getOrCreate()
)

# Suppress warnings: only ERROR-level Spark log messages are emitted from here on.
spark.sparkContext.setLogLevel("ERROR")

/opt/conda/lib/python3.7/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/11/26 13:25:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the dataset from Parquet; the path is resolved relative to the
# working directory of the Spark driver.
DATA_PATH = "updated_dataset.parquet"
df = spark.read.parquet(DATA_PATH)

In [4]:
# Encode the 'Severity' target as a numeric label column, 'SeverityIndex'.
# NOTE: StringIndexer's default ordering is by descending frequency, so index 0.0
# is the most common Severity value -- the index does NOT follow the severity scale.
indexer = StringIndexer(inputCol="Severity", outputCol="SeverityIndex")

# Drop a previously created 'SeverityIndex' before transforming. This is a no-op
# the first time the cell runs, and it keeps the cell re-runnable: without it a
# second execution fails because the output column already exists on `df`.
df = df.drop("SeverityIndex")
df = indexer.fit(df).transform(df)

                                                                                

In [5]:
# Build the list of predictor columns: every column of `df` except the target
# ('Severity') and its derived encodings ('SeverityIndex', 'Severity_Binary').
#
# A comprehension is used instead of a module-level `for col in df.columns` loop:
# that loop variable stayed bound after the cell ran and shadowed
# `pyspark.sql.functions.col`, which the first cell star-imports. Comprehension
# variables do not leak into the notebook namespace.
NON_FEATURE_COLUMNS = {'Severity', 'SeverityIndex', 'Severity_Binary'}
feature_list = [name for name in df.columns if name not in NON_FEATURE_COLUMNS]

In [6]:
# Randomly partition the rows into an 80% training set and a 20% test set,
# seeding the split with 314.
splits = df.randomSplit(weights=[0.8, 0.2], seed=314)
train, test = splits

In [7]:
# Assemble the predictor columns into a single vector column, "features",
# which is the input format the downstream scaler / logistic regression expect.
assembler = VectorAssembler(inputCols=feature_list, outputCol="features")

# `train` and `test` are overwritten with their assembled versions, so on a
# second execution they already carry a "features" column and the transform
# would fail on the duplicate output column. Dropping it first is a no-op on
# the first run and makes the cell safe to re-run.
train = assembler.transform(train.drop("features"))
test = assembler.transform(test.drop("features"))

In [8]:
# Scale the assembled feature vector. The scaler is fitted on the training
# split only and the fitted model is then applied to both splits.
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(outputCol="scaledFeatures", inputCol="features")
scalerModel = scaler.fit(train)
scaledTrainData, scaledTestData = (
    scalerModel.transform(part) for part in (train, test)
)

                                                                                

In [9]:
# Multinomial logistic regression on the scaled features, predicting SeverityIndex.
from pyspark.ml.classification import LogisticRegression

# Estimator setup. maxIter, regParam and elasticNetParam are deliberately not
# set here, so the library defaults apply.
lr = LogisticRegression(
    family="multinomial",
    featuresCol='scaledFeatures',
    labelCol='SeverityIndex',
)

# Train on the scaled training split and report how many classes were found.
lrModel = lr.fit(scaledTrainData)
print(f"Detected number of classes: {lrModel.numClasses}")

# Dump the fitted parameters: the coefficient matrix and the intercept vector.
print(f"Coefficients: {lrModel.coefficientMatrix!s}")
print(f"Intercept: {lrModel.interceptVector!s}")

                                                                                

Detected number of classes: 4
Coefficients: DenseMatrix([[-2.27735764e-02, -6.23754106e-02,  1.53160173e-02,
              -3.28420660e-02, -6.45348773e-03,  1.39240298e-03,
              -3.16710499e-02, -4.05644817e-02, -8.38530131e-03,
               4.19694910e-02,  5.90229957e-01,  3.62096102e-01,
               1.60934640e-02, -8.00280010e-03, -9.20909433e-02,
               2.71530334e-02,  2.13969360e-02,  1.39678948e-02,
               2.53666344e-02, -6.82602470e-03, -3.16979718e-02,
               6.18419718e-02,  6.20371595e-04],
             [-1.45166515e-01, -3.97795728e-02,  2.25646949e-01,
               1.14266171e-02,  1.40053445e-01,  8.42096859e-03,
              -6.94983532e-02, -1.62892522e-02,  4.77137951e-02,
               3.01627961e-02,  2.61331848e-01,  1.76270704e-01,
              -9.12856789e-02, -1.26511156e-01,  4.85355739e-01,
               5.65162734e-02, -1.21287360e-01,  1.89641890e-03,
               2.54810209e-01,  4.64836622e-03, -2.52849638e-0

In [10]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out split; transform() appends the model's output columns,
# including "prediction", to the DataFrame.
lrPred = lrModel.transform(scaledTestData)

# List which class indices the model actually predicts.
predicted_classes = lrPred.select("prediction").distinct()
predicted_classes.show()

# Overall accuracy. The metric is fixed in the constructor, so no per-call
# override is needed when evaluating.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    predictionCol='prediction',
    labelCol='SeverityIndex',
)
accuracy = evaluator.evaluate(lrPred)
print(f'Accuracy: {accuracy}')

                                                                                

+----------+
|prediction|
+----------+
|       0.0|
|       1.0|
|       3.0|
+----------+



                                                                                

Recall for label 1.0: 0.02468129091535546




Accuracy: 0.8140694044905775


                                                                                

In [11]:
# Weighted precision and the evaluator's 'f1' metric on the test predictions.
# The existing evaluator is reused; its metric is switched per call through a
# param override. (weightedRecall could be requested the same way; it is not
# computed here.)
precision, f1_score = (
    evaluator.evaluate(lrPred, {evaluator.metricName: metric_name})
    for metric_name in ('weightedPrecision', 'f1')
)

print(f'Precision: {precision}')
print(f'F1 Score: {f1_score}')



Precision: 0.7569790103051561
F1 Score: 0.7390516151494088


                                                                                

In [12]:
# Flag every test row as a hit (1) when the predicted index equals the true
# index, otherwise as a miss (0).
is_hit = F.when(F.col('SeverityIndex') == F.col('prediction'), 1).otherwise(0)
predictions = lrPred.withColumn('is_correct', is_hit)

# Share of correctly predicted rows within each true class. Because rows are
# grouped by the true label, this is the per-class recall.
hit_rate = F.sum('is_correct') / F.count('SeverityIndex')
accuracy_by_class = predictions.groupBy('SeverityIndex').agg(hit_rate.alias('accuracy'))

# Display the per-class table.
accuracy_by_class.show()



+-------------+--------------------+
|SeverityIndex|            accuracy|
+-------------+--------------------+
|          0.0|  0.9942249248179769|
|          1.0|0.027632349548334427|
|          3.0|                 0.0|
|          2.0|                 0.0|
+-------------+--------------------+



                                                                                

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import functions as F

# Compute predictions on the held-out split
lrPred = lrModel.transform(scaledTestData)

# Show distinct predictions
lrPred.select("prediction").distinct().show()

# Evaluate overall accuracy
evaluator = MulticlassClassificationEvaluator(labelCol='SeverityIndex', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(lrPred)
print(f'Accuracy: {accuracy}')

# Build the confusion counts with ONE Spark job: number of rows for every
# (true label, predicted label) pair. The earlier approach issued four filtered
# count() actions per label, each re-evaluating the un-cached prediction
# DataFrame (16 jobs for 4 classes, plus one more to list the labels).
# Assumes neither column contains nulls (both are produced by Spark ML stages).
confusion = {
    (row['SeverityIndex'], row['prediction']): row['n']
    for row in (
        lrPred.groupBy('SeverityIndex', 'prediction')
        .agg(F.count(F.lit(1)).alias('n'))
        .collect()
    )
}
total_rows = sum(confusion.values())

# Classes are taken from the TRUE labels only, in ascending order
labels = sorted({actual for actual, _ in confusion})

# Calculate metrics by class (one-vs-rest).
# NOTE: `precision` and `f1_score` are also assigned by the weighted-metric cell
# earlier in the notebook; after this loop they hold the LAST class's values.
metrics = {}
for label in labels:
    # One-vs-rest confusion entries for the current label
    true_positive = confusion.get((label, label), 0)
    false_positive = sum(
        n for (actual, predicted), n in confusion.items()
        if actual != label and predicted == label
    )
    false_negative = sum(
        n for (actual, predicted), n in confusion.items()
        if actual == label and predicted != label
    )
    true_negative = total_rows - true_positive - false_positive - false_negative

    # Precision, Recall, and F1 Score (0.0 when the denominator is empty)
    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0.0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0.0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    # Store metrics
    metrics[label] = {'precision': precision, 'recall': recall, 'f1_score': f1_score}

# Print metrics for each class
for label, metric in metrics.items():
    print(f"Class {label} - Precision: {metric['precision']:.4f}, Recall: {metric['recall']:.4f}, F1 Score: {metric['f1_score']:.4f}")

                                                                                

+----------+
|prediction|
+----------+
|       0.0|
|       1.0|
|       3.0|
+----------+



                                                                                

Accuracy: 0.8047301584591702




Class 0.0 - Precision: 0.8078, Recall: 0.9942, F1 Score: 0.8914
Class 1.0 - Precision: 0.4769, Recall: 0.0276, F1 Score: 0.0522
Class 2.0 - Precision: 0.0000, Recall: 0.0000, F1 Score: 0.0000
Class 3.0 - Precision: 0.0000, Recall: 0.0000, F1 Score: 0.0000


                                                                                