In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.stat import Correlation
import seaborn as sns
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import holidays
from datetime import datetime, timezone
from pyspark.ml.classification import RandomForestClassifier, BinaryLogisticRegressionSummary
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
from pyspark.ml.pipeline import PipelineModel

In [2]:
# Create the notebook's Spark session, or attach to one that is already running
spark = SparkSession.builder.appName("US_Accidents").getOrCreate()

# Raise the log threshold to ERROR so warnings do not clutter cell output
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/11 13:01:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the prepared dataset from Parquet and preview its first five rows
df = spark.read.format("parquet").load("final_dataset.parquet")
df.show(n=5)

                                                                                

+--------+--------------+-----------+------------+--------------+---------------+-----------------+-------+---------+-------+----+----+-------------+--------------------------+--------------------+---------------------------------+-----------------+-----------------+-------------------+------------+--------------------------+-----------+--------------------+--------------------+-----------+
|Severity|Temperature(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Speed(mph)|Precipitation(in)|Weekday|Rush Hour|Holiday|Rain|Snow|    SeasonVec|Astronomical_TwilightIndex|Interstate Indicator|Sex ratio (males per 100 females)|Percent_Age_15-19|Percent_Age_20-24|Percent_Age_65_over|MedianIncome|MedianIncome_MarginOfError|Urban_Ratio|Traffic_Interference|Traffic_Intersection|Destination|
+--------+--------------+-----------+------------+--------------+---------------+-----------------+-------+---------+-------+----+----+-------------+--------------------------+--------------------+---------------

In [4]:
# Sanity check: count the distinct values of the label column (4 classes expected)
severity_column = df.select("Severity")
unique_labels = severity_column.distinct().count()
print(f"Number of unique labels in Severity: {unique_labels}")



Number of unique labels in Severity: 4


                                                                                

In [5]:
# Predictor columns: everything except the label ('Severity') and 'SeasonVec'.
# NOTE(review): 'SeasonVec' is excluded without explanation, so the models get no
# season information - confirm this is intentional.
#
# A comprehension is used so the loop variable stays local to the expression. A
# module-level `for col in ...` would rebind `col`, which the star import from
# pyspark.sql.functions also provides.
excluded_columns = {'Severity', 'SeasonVec'}
feature_list = [name for name in df.columns if name not in excluded_columns]

# Original Dataset

In [6]:
# Combine the predictor columns into a single vector column named "features",
# which is the input format the Spark ML estimators below expect.
assembler = VectorAssembler(inputCols=feature_list, outputCol="features")

# Drop any "features" column left behind by an earlier run of this cell so the cell
# can be re-executed without failing on an already-existing output column.
# DataFrame.drop() ignores column names that are not present, so the first run is
# unaffected.
df = assembler.transform(df.drop("features"))

In [7]:
# Re-check the number of distinct Severity values now that "features" has been added
severity_column = df.select("Severity")
unique_labels = severity_column.distinct().count()
print(f"Number of unique labels in Severity: {unique_labels}")



Number of unique labels in Severity: 4


                                                                                

In [8]:
# 80/20 train/test split with a fixed seed so the partition is repeatable
splits = df.randomSplit(weights=[0.8, 0.2], seed=314)
train, test = splits

In [9]:
from pyspark.ml.feature import StandardScaler

# Learn the scaling statistics from the training split only, then apply the same
# fitted transformation to both splits so the test data never influences the fit.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(train)
scaledTrainData, scaledTestData = (
    scalerModel.transform(part) for part in (train, test)
)

                                                                                

In [10]:
from pyspark.ml.classification import LogisticRegression

# Multinomial logistic regression with an elastic-net penalty, trained on the
# scaled feature vector with the raw Severity value as the label.
# NOTE(review): in the recorded run the printed coefficient matrix lists no entries
# and the model predicts a single class, which suggests this penalty
# (regParam=0.3, elasticNetParam=0.8) shrinks every coefficient to zero - consider
# a smaller regParam and a larger maxIter.
# NOTE(review): numClasses is reported as 5 for labels 1-4, presumably because the
# class count is derived from the largest label value - confirm, and consider
# re-indexing the labels to 0-3.
lr = LogisticRegression(
    featuresCol='scaledFeatures',
    labelCol='Severity',
    family="multinomial",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

lrModel = lr.fit(scaledTrainData)
print(f"Detected number of classes: {lrModel.numClasses}")

# Show the fitted parameters
print(f"Coefficients: {lrModel.coefficientMatrix}")
print(f"Intercept: {lrModel.interceptVector}")

                                                                                

Detected number of classes: 5
Coefficients: 5 X 23 CSRMatrix

Intercept: [-10.352852767967923,0.5063385243198986,4.972389653205689,3.3563424695647748,1.51778212087756]


In [11]:
# Fit a StringIndexer on Severity purely to inspect the label ordering it produces.
# The logistic regression in this notebook is trained on the raw Severity column,
# not on SeverityIndex, so this indexer does not affect the model.
indexer = StringIndexer(outputCol="SeverityIndex", inputCol="Severity")
indexer_model = indexer.fit(scaledTrainData)
indexer_model.labels

                                                                                

['2', '3', '4', '1']

In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out split; transform() appends the model's "prediction" column
lrPred = lrModel.transform(scaledTestData)

# List which classes the model actually predicts
predicted_classes = lrPred.select("prediction").distinct()
predicted_classes.show()

# Overall accuracy of the predictions against the Severity label
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='Severity',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(lrPred, {evaluator.metricName: "accuracy"})
print(f'Accuracy: {accuracy}')

                                                                                

+----------+
|prediction|
+----------+
|       2.0|
+----------+





Accuracy: 0.805556522845291


                                                                                

In [13]:
# Re-use the evaluator from the previous cell, overriding its metric on each call
precision, recall, f1_score = (
    evaluator.evaluate(lrPred, {evaluator.metricName: metric})
    for metric in ('weightedPrecision', 'weightedRecall', 'f1')
)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')



Precision: 0.6489213114985959
Recall: 0.805556522845291
F1 Score: 0.7188047599595404


                                                                                

# Always predicts majority class (class 2)

In [14]:
# Flag each row as 1 when the predicted class equals the true Severity, else 0
correct_flag = F.expr("CASE WHEN Severity = prediction THEN 1 ELSE 0 END")
predictions = lrPred.withColumn('is_correct', correct_flag)

# Share of correctly predicted rows within each true class
class_accuracy = (F.sum('is_correct') / F.count('Severity')).alias('accuracy')
accuracy_by_class = predictions.groupBy('Severity').agg(class_accuracy)

accuracy_by_class.show()



+--------+--------+
|Severity|accuracy|
+--------+--------+
|       1|     0.0|
|       3|     0.0|
|       4|     0.0|
|       2|     1.0|
+--------+--------+



                                                                                

# Undersampling

In [15]:
# Reload the dataset from disk so this section starts from the unmodified data
df = spark.read.format("parquet").load("final_dataset.parquet")

In [16]:
# Rebuild the "features" vector column. Any existing "features" column is dropped
# first so the cell can be re-run; drop() ignores names that are not present.
df = assembler.transform(df.drop("features"))

In [17]:
import builtins

from pyspark.sql import functions as F

# Fixed seed (same value as the train/test split) so the resampled data, and every
# result derived from it, is repeatable across runs.
SAMPLING_SEED = 314

# Step 1: count rows per Severity class; collect once and reuse the rows below
class_counts = df.groupBy("Severity").count()
class_count_rows = class_counts.collect()

# Step 2: size of the rarest class. builtins.min is called explicitly because the
# star import from pyspark.sql.functions rebinds the bare name `min`.
min_class_size = builtins.min(row['count'] for row in class_count_rows)

# Step 3: down-sample every larger class towards the size of the rarest class
undersampled_df_list = []
for row in class_count_rows:
    class_label = row['Severity']
    class_size = row['count']
    class_data = df.filter(F.col("Severity") == class_label)

    if class_size > min_class_size:
        # sample() takes a fraction, not a row count, so the resulting class size
        # is only approximately min_class_size.
        class_data_undersampled = class_data.sample(
            withReplacement=False,
            fraction=min_class_size / class_size,
            seed=SAMPLING_SEED,
        )
    else:
        # Already the smallest class: keep every row
        class_data_undersampled = class_data

    undersampled_df_list.append(class_data_undersampled)

# Combine the per-class frames. The loop variable is deliberately not named `df`,
# which would overwrite the full dataset with the last class's sample.
undersampled_df = undersampled_df_list[0]
for class_df in undersampled_df_list[1:]:
    undersampled_df = undersampled_df.union(class_df)

# Preview the combined data
undersampled_df.show()

# Step 4: class balance after undersampling
undersampled_class_counts = undersampled_df.groupBy("Severity").count()
undersampled_class_counts.show()

+--------+--------------+-----------+------------+--------------+---------------+-----------------+-------+---------+-------+----+----+-------------+--------------------------+--------------------+---------------------------------+-----------------+-----------------+-------------------+------------+--------------------------+-----------+--------------------+--------------------+-----------+--------------------+
|Severity|Temperature(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Speed(mph)|Precipitation(in)|Weekday|Rush Hour|Holiday|Rain|Snow|    SeasonVec|Astronomical_TwilightIndex|Interstate Indicator|Sex ratio (males per 100 females)|Percent_Age_15-19|Percent_Age_20-24|Percent_Age_65_over|MedianIncome|MedianIncome_MarginOfError|Urban_Ratio|Traffic_Interference|Traffic_Intersection|Destination|            features|
+--------+--------------+-----------+------------+--------------+---------------+-----------------+-------+---------+-------+----+----+-------------+---------------------



+--------+-----+
|Severity|count|
+--------+-----+
|       1|65142|
|       3|65399|
|       4|65355|
|       2|64717|
+--------+-----+



                                                                                

In [18]:
# 80/20 train/test split of the undersampled data, seeded for repeatability
splits = undersampled_df.randomSplit(weights=[0.8, 0.2], seed=314)
undersampled_train, undersampled_test = splits

In [19]:
# Class balance of the undersampled training split
train_by_class = undersampled_train.groupBy("Severity")
undersampled_class_counts = train_by_class.count()

undersampled_class_counts.show()



+--------+-----+
|Severity|count|
+--------+-----+
|       1|52009|
|       3|52184|
|       4|52291|
|       2|51736|
+--------+-----+



                                                                                

In [20]:
# Class balance of the undersampled test split
test_by_class = undersampled_test.groupBy("Severity")
undersampled_class_counts = test_by_class.count()

undersampled_class_counts.show()



+--------+-----+
|Severity|count|
+--------+-----+
|       1|13133|
|       3|13215|
|       4|13064|
|       2|12981|
+--------+-----+



                                                                                

In [21]:
from pyspark.ml.feature import StandardScaler

# Fit the scaler on the undersampled training split only, then apply that fitted
# transformation to both the training and test splits.
undersampled_scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
undersampled_scalerModel = undersampled_scaler.fit(undersampled_train)
undersampled_scaledTrainData, undersampled_scaledTestData = (
    undersampled_scalerModel.transform(part)
    for part in (undersampled_train, undersampled_test)
)

                                                                                

In [22]:
from pyspark.ml.classification import LogisticRegression

# Same elastic-net multinomial logistic regression as for the original data, now
# trained on the undersampled, scaled training split.
# NOTE(review): the recorded output again shows a coefficient matrix with no listed
# entries and a single predicted class - the penalty (regParam=0.3,
# elasticNetParam=0.8) looks too strong; confirm and consider lowering regParam.
undersampled_lr = LogisticRegression(
    featuresCol='scaledFeatures',
    labelCol='Severity',
    family="multinomial",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

undersampled_lrModel = undersampled_lr.fit(undersampled_scaledTrainData)
print(f"Detected number of classes: {undersampled_lrModel.numClasses}")

# Show the fitted parameters
print(f"Coefficients: {undersampled_lrModel.coefficientMatrix}")
print(f"Intercept: {undersampled_lrModel.interceptVector}")

                                                                                

Detected number of classes: 5
Coefficients: 5 X 23 CSRMatrix

Intercept: [-8.718887089228927,2.1789981477907783,2.1735858168777313,2.1820987362526707,2.1842043883077458]


In [23]:
# Fit a StringIndexer on the undersampled training data only to inspect the label
# ordering it produces; the model itself is trained on the raw Severity column.
indexer = StringIndexer(outputCol="SeverityIndex", inputCol="Severity")
indexer_model = indexer.fit(undersampled_scaledTrainData)
indexer_model.labels

                                                                                

['4', '3', '1', '2']

In [24]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the undersampled test split; transform() appends the "prediction" column
undersampled_lrPred = undersampled_lrModel.transform(undersampled_scaledTestData)

# List which classes the model actually predicts
predicted_classes = undersampled_lrPred.select("prediction").distinct()
predicted_classes.show()

# Overall accuracy of the predictions against the Severity label
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='Severity',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(undersampled_lrPred, {evaluator.metricName: "accuracy"})
print(f'Accuracy: {accuracy}')

                                                                                

+----------+
|prediction|
+----------+
|       4.0|
+----------+





Accuracy: 0.2493462867176913


                                                                                

In [25]:
# Re-use the evaluator from the previous cell, overriding its metric on each call
precision, recall, f1_score = (
    evaluator.evaluate(undersampled_lrPred, {evaluator.metricName: metric})
    for metric in ('weightedPrecision', 'weightedRecall', 'f1')
)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')



Precision: 0.062173570699901114
Recall: 0.2493462867176913
F1 Score: 0.0995297642629488


                                                                                

In [26]:
# Flag each row as 1 when the predicted class equals the true Severity, else 0
correct_flag = F.expr("CASE WHEN Severity = prediction THEN 1 ELSE 0 END")
predictions = undersampled_lrPred.withColumn('is_correct', correct_flag)

# Share of correctly predicted rows within each true class
class_accuracy = (F.sum('is_correct') / F.count('Severity')).alias('accuracy')
accuracy_by_class = predictions.groupBy('Severity').agg(class_accuracy)

accuracy_by_class.show()



+--------+--------+
|Severity|accuracy|
+--------+--------+
|       1|     0.0|
|       3|     0.0|
|       4|     1.0|
|       2|     0.0|
+--------+--------+



                                                                                

In [27]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import Window
from pyspark.sql import functions as F


def evaluate_model(model, data, evaluator):
    """Return `evaluator`'s metric for `model`'s predictions on `data`."""
    predictions = model.transform(data)
    return evaluator.evaluate(predictions)


def permute_feature(data, feature, seed=314):
    """Return `data` with the values of column `feature` reassigned to rows at random.

    Rows are numbered once in their existing order and once in a seeded random
    order, and the two numberings are joined. The column therefore keeps exactly the
    same set of values but loses its association with the rest of each row.

    NOTE(review): both windows are unpartitioned, so expect this to be slow on large
    inputs - confirm the cost is acceptable for the data being scored.
    """
    existing_order = Window.orderBy(F.monotonically_increasing_id())
    random_order = Window.orderBy(F.rand(seed))
    keyed_rows = data.drop(feature).withColumn("_row_id", F.row_number().over(existing_order))
    # Back-ticks keep column names containing spaces or punctuation intact
    keyed_values = data.select(F.col(f"`{feature}`")).withColumn(
        "_row_id", F.row_number().over(random_order)
    )
    return keyed_rows.join(keyed_values, on="_row_id").drop("_row_id")


# Evaluator used for every score in this cell (F1)
evaluator = MulticlassClassificationEvaluator(labelCol="Severity", predictionCol="prediction", metricName="f1")

# Baseline performance on the unmodified test data
baseline_score = evaluate_model(undersampled_lrModel, undersampled_scaledTestData, evaluator)
print(f"Baseline F1 score: {baseline_score}")

# Columns that were assembled into the feature vector
feature_names = feature_list

# The model reads the pre-built "scaledFeatures" vector, so changing a raw column
# has no effect unless the vector columns are rebuilt afterwards. Start from the
# test data without the stale vectors.
raw_test_data = undersampled_scaledTestData.drop("features", "scaledFeatures")

feature_importances = {}
for feature in feature_names:
    # Permute one raw column, then re-assemble and re-scale with the fitted scaler
    permuted_data = permute_feature(raw_test_data, feature)
    shuffled_data = undersampled_scalerModel.transform(assembler.transform(permuted_data))

    # Importance = drop in F1 caused by breaking this feature's link to the label
    score_with_permuted_feature = evaluate_model(undersampled_lrModel, shuffled_data, evaluator)
    feature_importances[feature] = baseline_score - score_with_permuted_feature

# Largest drop first
sorted_importances = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)

print("Feature Importances (by drop in F1 score):")
for feature, importance in sorted_importances:
    print(f"{feature}: {importance}")

                                                                                

Baseline F1 score: 0.0995297642629488




Feature Importances (by drop in F1 score):
Temperature(F): 0.0
Humidity(%): 0.0
Pressure(in): 0.0
Visibility(mi): 0.0
Wind_Speed(mph): 0.0
Precipitation(in): 0.0
Weekday: 0.0
Rush Hour: 0.0
Holiday: 0.0
Rain: 0.0
Snow: 0.0
Astronomical_TwilightIndex: 0.0
Interstate Indicator: 0.0
Sex ratio (males per 100 females): 0.0
Percent_Age_15-19: 0.0
Percent_Age_20-24: 0.0
Percent_Age_65_over: 0.0
MedianIncome: 0.0
MedianIncome_MarginOfError: 0.0
Urban_Ratio: 0.0
Traffic_Interference: 0.0
Traffic_Intersection: 0.0
Destination: 0.0


                                                                                

# Oversampling

In [28]:
# Reload the dataset from disk so this section starts from the unmodified data
df = spark.read.format("parquet").load("final_dataset.parquet")

In [29]:
# Rebuild the "features" vector column. Any existing "features" column is dropped
# first so the cell can be re-run; drop() ignores names that are not present.
df = assembler.transform(df.drop("features"))

In [30]:
import builtins

from pyspark.sql import functions as F

# Fixed seed (same value as the train/test split) so the resampled data, and every
# result derived from it, is repeatable across runs.
SAMPLING_SEED = 314

# Step 1: count rows per Severity class; collect once and reuse the rows below
class_counts = df.groupBy("Severity").count()
class_count_rows = class_counts.collect()

# Step 2: size of the largest class. builtins.max is called explicitly because the
# star import from pyspark.sql.functions rebinds the bare name `max`.
max_class_size = builtins.max(row['count'] for row in class_count_rows)

# Step 3: oversample every smaller class towards the size of the largest class.
# NOTE(review): rows are duplicated here before the train/test split in the next
# cell, so copies of the same original row can land in both splits - consider
# splitting first and oversampling only the training part.
oversampled_df_list = []
for row in class_count_rows:
    class_label = row['Severity']
    class_size = row['count']

    # Rows belonging to the current class
    class_data = df.filter(F.col("Severity") == class_label)

    if class_size < max_class_size:
        # sample() takes a fraction (here > 1, with replacement), not a row count,
        # so the resulting class size is only approximately max_class_size.
        class_data_oversampled = class_data.sample(
            withReplacement=True,
            fraction=max_class_size / class_size,
            seed=SAMPLING_SEED,
        )
    else:
        # Already the largest class: keep the rows as they are
        class_data_oversampled = class_data

    oversampled_df_list.append(class_data_oversampled)

# Step 4: combine the per-class frames
oversampled_df = oversampled_df_list[0]
for class_df in oversampled_df_list[1:]:
    oversampled_df = oversampled_df.union(class_df)

# Preview the combined data
oversampled_df.show()

# Step 5: class balance after oversampling
oversampled_class_counts = oversampled_df.groupBy("Severity").count()
oversampled_class_counts.show()

+--------+--------------+-----------+------------+--------------+---------------+-----------------+-------+---------+-------+----+----+-------------+--------------------------+--------------------+---------------------------------+-----------------+-----------------+-------------------+------------+--------------------------+-----------+--------------------+--------------------+-----------+--------------------+
|Severity|Temperature(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Speed(mph)|Precipitation(in)|Weekday|Rush Hour|Holiday|Rain|Snow|    SeasonVec|Astronomical_TwilightIndex|Interstate Indicator|Sex ratio (males per 100 females)|Percent_Age_15-19|Percent_Age_20-24|Percent_Age_65_over|MedianIncome|MedianIncome_MarginOfError|Urban_Ratio|Traffic_Interference|Traffic_Intersection|Destination|            features|
+--------+--------------+-----------+------------+--------------+---------------+-----------------+-------+---------+-------+----+----+-------------+---------------------



+--------+-------+
|Severity|  count|
+--------+-------+
|       1|5659217|
|       3|5656247|
|       4|5659448|
|       2|5659044|
+--------+-------+



                                                                                

In [31]:
# 80/20 train/test split of the oversampled data, seeded for repeatability
splits = oversampled_df.randomSplit(weights=[0.8, 0.2], seed=314)
oversampled_train, oversampled_test = splits

In [32]:
# Class balance of the oversampled training split
train_by_class = oversampled_train.groupBy("Severity")
oversampled_class_counts = train_by_class.count()

oversampled_class_counts.show()



+--------+-------+
|Severity|  count|
+--------+-------+
|       1|4525580|
|       3|4524452|
|       4|4526837|
|       2|4526438|
+--------+-------+



                                                                                

In [33]:
# Class balance of the oversampled test split
test_by_class = oversampled_test.groupBy("Severity")
oversampled_class_counts = test_by_class.count()

oversampled_class_counts.show()



+--------+-------+
|Severity|  count|
+--------+-------+
|       1|1133637|
|       3|1131795|
|       4|1132611|
|       2|1132606|
+--------+-------+



                                                                                

In [34]:
from pyspark.ml.feature import StandardScaler

# Fit the scaler on the oversampled training split only, then apply that fitted
# transformation to both the training and test splits.
oversampled_scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
oversampled_scalerModel = oversampled_scaler.fit(oversampled_train)
oversampled_scaledTrainData, oversampled_scaledTestData = (
    oversampled_scalerModel.transform(part)
    for part in (oversampled_train, oversampled_test)
)

                                                                                

In [35]:
from pyspark.ml.classification import LogisticRegression

# Same elastic-net multinomial logistic regression as for the original data, now
# trained on the oversampled, scaled training split.
# NOTE(review): the recorded output again shows a coefficient matrix with no listed
# entries and a single predicted class - the penalty (regParam=0.3,
# elasticNetParam=0.8) looks too strong; confirm and consider lowering regParam.
oversampled_lr = LogisticRegression(
    featuresCol='scaledFeatures',
    labelCol='Severity',
    family="multinomial",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

oversampled_lrModel = oversampled_lr.fit(oversampled_scaledTrainData)
print(f"Detected number of classes: {oversampled_lrModel.numClasses}")

# Show the fitted parameters
print(f"Coefficients: {oversampled_lrModel.coefficientMatrix}")
print(f"Intercept: {oversampled_lrModel.interceptVector}")

                                                                                

Detected number of classes: 5
Coefficients: 5 X 23 CSRMatrix

Intercept: [-12.260248820919148,3.0650077037447794,3.0651972746919434,3.064758422912057,3.065285419570369]


In [36]:
# Fit a StringIndexer on the oversampled training data only to inspect the label
# ordering it produces; the model itself is trained on the raw Severity column.
indexer = StringIndexer(outputCol="SeverityIndex", inputCol="Severity")
indexer_model = indexer.fit(oversampled_scaledTrainData)
indexer_model.labels

                                                                                

['4', '2', '1', '3']

In [37]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the oversampled test split; transform() appends the "prediction" column
oversampled_lrPred = oversampled_lrModel.transform(oversampled_scaledTestData)

# List which classes the model actually predicts
predicted_classes = oversampled_lrPred.select("prediction").distinct()
predicted_classes.show()

# Overall accuracy of the predictions against the Severity label
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='Severity',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(oversampled_lrPred, {evaluator.metricName: "accuracy"})
print(f'Accuracy: {accuracy}')

                                                                                

+----------+
|prediction|
+----------+
|       4.0|
+----------+





Accuracy: 0.2499886881548317


                                                                                

In [38]:
# Re-use the evaluator from the previous cell, overriding its metric on each call
precision, recall, f1_score = (
    evaluator.evaluate(oversampled_lrPred, {evaluator.metricName: metric})
    for metric in ('weightedPrecision', 'weightedRecall', 'f1')
)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')



Precision: 0.062494344205373684
Recall: 0.2499886881548317
F1 Score: 0.09999185560250883


                                                                                

In [39]:
# Flag each row as 1 when the predicted class equals the true Severity, else 0
correct_flag = F.expr("CASE WHEN Severity = prediction THEN 1 ELSE 0 END")
predictions = oversampled_lrPred.withColumn('is_correct', correct_flag)

# Share of correctly predicted rows within each true class
class_accuracy = (F.sum('is_correct') / F.count('Severity')).alias('accuracy')
accuracy_by_class = predictions.groupBy('Severity').agg(class_accuracy)

accuracy_by_class.show()



+--------+--------+
|Severity|accuracy|
+--------+--------+
|       1|     0.0|
|       3|     0.0|
|       4|     1.0|
|       2|     0.0|
+--------+--------+



                                                                                

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import Window
from pyspark.sql import functions as F


def evaluate_model(model, data, evaluator):
    """Return `evaluator`'s metric for `model`'s predictions on `data`."""
    predictions = model.transform(data)
    return evaluator.evaluate(predictions)


def permute_feature(data, feature, seed=314):
    """Return `data` with the values of column `feature` reassigned to rows at random.

    Rows are numbered once in their existing order and once in a seeded random
    order, and the two numberings are joined. The column therefore keeps exactly the
    same set of values but loses its association with the rest of each row.

    NOTE(review): both windows are unpartitioned, so expect this to be slow on large
    inputs such as the oversampled test split - confirm the cost is acceptable.
    """
    existing_order = Window.orderBy(F.monotonically_increasing_id())
    random_order = Window.orderBy(F.rand(seed))
    keyed_rows = data.drop(feature).withColumn("_row_id", F.row_number().over(existing_order))
    # Back-ticks keep column names containing spaces or punctuation intact
    keyed_values = data.select(F.col(f"`{feature}`")).withColumn(
        "_row_id", F.row_number().over(random_order)
    )
    return keyed_rows.join(keyed_values, on="_row_id").drop("_row_id")


# Evaluator used for every score in this cell (F1)
evaluator = MulticlassClassificationEvaluator(labelCol="Severity", predictionCol="prediction", metricName="f1")

# Baseline performance on the unmodified test data
baseline_score = evaluate_model(oversampled_lrModel, oversampled_scaledTestData, evaluator)
print(f"Baseline F1 score: {baseline_score}")

# Columns that were assembled into the feature vector
feature_names = feature_list

# The model reads the pre-built "scaledFeatures" vector, so changing a raw column
# has no effect unless the vector columns are rebuilt afterwards. Start from the
# test data without the stale vectors.
raw_test_data = oversampled_scaledTestData.drop("features", "scaledFeatures")

feature_importances = {}
for feature in feature_names:
    # Permute one raw column, then re-assemble and re-scale with the fitted scaler
    permuted_data = permute_feature(raw_test_data, feature)
    shuffled_data = oversampled_scalerModel.transform(assembler.transform(permuted_data))

    # Importance = drop in F1 caused by breaking this feature's link to the label
    score_with_permuted_feature = evaluate_model(oversampled_lrModel, shuffled_data, evaluator)
    feature_importances[feature] = baseline_score - score_with_permuted_feature

# Largest drop first
sorted_importances = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)

print("Feature Importances (by drop in F1 score):")
for feature, importance in sorted_importances:
    print(f"{feature}: {importance}")

                                                                                

Baseline F1 score: 0.09999185560250883




# Do Not Scale Data to Investigate Feature Importance

In [None]:
from pyspark.ml.classification import LogisticRegression

# Same model configuration as before, but reading the unscaled "features" vector
# and trained on the original (not resampled) training split.
# NOTE(review): this rebinds `lr` and `lrModel` from the first section.
# NOTE(review): LogisticRegression has its own `standardization` parameter, which
# is not set here - check its default before concluding that this model is fitted
# on unscaled inputs.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='Severity',
    family="multinomial",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

lrModel = lr.fit(train)
print(f"Detected number of classes: {lrModel.numClasses}")

# Show the fitted parameters
print(f"Coefficients: {lrModel.coefficientMatrix}")
print(f"Intercept: {lrModel.interceptVector}")

In [None]:
# Fit a StringIndexer on the training split only to inspect the label ordering it
# produces; the model itself is trained on the raw Severity column.
indexer = StringIndexer(outputCol="SeverityIndex", inputCol="Severity")
indexer_model = indexer.fit(train)
indexer_model.labels

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the unscaled test split; transform() appends the "prediction" column
lrPred = lrModel.transform(test)

# List which classes the model actually predicts
predicted_classes = lrPred.select("prediction").distinct()
predicted_classes.show()

# Overall accuracy of the predictions against the Severity label
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='Severity',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(lrPred, {evaluator.metricName: "accuracy"})
print(f'Accuracy: {accuracy}')

In [None]:
# Re-use the evaluator from the previous cell, overriding its metric on each call
precision, recall, f1_score = (
    evaluator.evaluate(lrPred, {evaluator.metricName: metric})
    for metric in ('weightedPrecision', 'weightedRecall', 'f1')
)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

In [None]:
# Flag each row as 1 when the predicted class equals the true Severity, else 0
correct_flag = F.expr("CASE WHEN Severity = prediction THEN 1 ELSE 0 END")
predictions = lrPred.withColumn('is_correct', correct_flag)

# Share of correctly predicted rows within each true class
class_accuracy = (F.sum('is_correct') / F.count('Severity')).alias('accuracy')
accuracy_by_class = predictions.groupBy('Severity').agg(class_accuracy)

accuracy_by_class.show()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import Window
from pyspark.sql import functions as F


def evaluate_model(model, data, evaluator):
    """Return `evaluator`'s metric for `model`'s predictions on `data`."""
    predictions = model.transform(data)
    return evaluator.evaluate(predictions)


def permute_feature(data, feature, seed=314):
    """Return `data` with the values of column `feature` reassigned to rows at random.

    Rows are numbered once in their existing order and once in a seeded random
    order, and the two numberings are joined. The column therefore keeps exactly the
    same set of values but loses its association with the rest of each row.

    NOTE(review): both windows are unpartitioned, so expect this to be slow on large
    inputs - confirm the cost is acceptable for the data being scored.
    """
    existing_order = Window.orderBy(F.monotonically_increasing_id())
    random_order = Window.orderBy(F.rand(seed))
    keyed_rows = data.drop(feature).withColumn("_row_id", F.row_number().over(existing_order))
    # Back-ticks keep column names containing spaces or punctuation intact
    keyed_values = data.select(F.col(f"`{feature}`")).withColumn(
        "_row_id", F.row_number().over(random_order)
    )
    return keyed_rows.join(keyed_values, on="_row_id").drop("_row_id")


# Evaluator used for every score in this cell (F1)
evaluator = MulticlassClassificationEvaluator(labelCol="Severity", predictionCol="prediction", metricName="f1")

# Baseline performance on the unmodified test data
baseline_score = evaluate_model(lrModel, test, evaluator)
print(f"Baseline F1 score: {baseline_score}")

# Columns that were assembled into the feature vector
feature_names = feature_list

# The model reads the pre-built "features" vector, so changing a raw column has no
# effect unless the vector is rebuilt afterwards. Start from the test data without
# the stale vector.
raw_test_data = test.drop("features")

feature_importances = {}
for feature in feature_names:
    # Permute one raw column, then re-assemble the (unscaled) feature vector
    permuted_data = permute_feature(raw_test_data, feature)
    shuffled_data = assembler.transform(permuted_data)

    # Importance = drop in F1 caused by breaking this feature's link to the label
    score_with_permuted_feature = evaluate_model(lrModel, shuffled_data, evaluator)
    feature_importances[feature] = baseline_score - score_with_permuted_feature

# Largest drop first
sorted_importances = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)

print("Feature Importances (by drop in F1 score):")
for feature, importance in sorted_importances:
    print(f"{feature}: {importance}")