# Research Question:
### Which variables most strongly influence the severity of accidents?

Useful Paper:
    https://www.sciencedirect.com/science/article/pii/S2590198223000611

In [None]:
# Inject CSS so <pre> output (e.g. Spark's DataFrame.show tables) is not line-wrapped in the notebook
from IPython.core.display import HTML

NO_WRAP_PRE_CSS = "<style>pre { white-space: pre !important; }</style>"
display(HTML(NO_WRAP_PRE_CSS))

In [None]:
#Supress Warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Dependency note: seaborn must be installed in the kernel environment (e.g. `%pip install seaborn`)

In [4]:
# Core libraries
import pandas as pd
import numpy as np

# PySpark imports
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
from pyspark.ml.pipeline import PipelineModel
from pyspark.sql.functions import col

# Additional libraries for visualization and analysis
import matplotlib.pyplot as plt
import seaborn as sns

# Import Boost-specific modules for PySpark
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator,BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics


# Load Data

In [5]:
# Build (or reuse) the SparkSession used by every later cell.
# NOTE(review): no master is set here. If this runs in local mode the executor.* settings
# below do not size the JVM heap -- spark.driver.memory does. The "Java heap space" error
# recorded later in this notebook suggests confirming the driver heap size.
spark = (
    SparkSession.builder
    .appName('GBoost-Traffic-Accidents')
    .config("spark.executor.memory", "24g")
    .config("spark.executor.cores", "8")
    # "spark.num.executors" is not a Spark property; the executor count is set
    # through "spark.executor.instances".
    .config("spark.executor.instances", "8")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")  # Only log messages at ERROR level

/opt/conda/lib/python3.7/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/12/02 18:21:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Load the prepared dataset from Parquet and preview its first five rows
df = spark.read.format("parquet").load("final_dataset_revised.parquet")
df.show(n=5)

                                                                                

+--------+-----------+--------+--------+----------+----------+-------------+-------+---------+----+----+-------------+--------------------------+--------------------+---------+-----------------+-----------------+-------------------+------------+-----------+--------------------+--------------------+-----------+
|Severity|Temperature|Humidity|Pressure|Visibility|Wind_Speed|Precipitation|Weekday|Rush_Hour|Rain|Snow|    SeasonVec|Astronomical_TwilightIndex|Interstate_Indicator|Sex_ratio|Percent_Age_15-19|Percent_Age_20-24|Percent_Age_65_over|MedianIncome|Urban_Ratio|Traffic_Interference|Traffic_Intersection|Destination|
+--------+-----------+--------+--------+----------+----------+-------------+-------+---------+----+----+-------------+--------------------------+--------------------+---------+-----------------+-----------------+-------------------+------------+-----------+--------------------+--------------------+-----------+
|       2|         21|      85|      30|         1|        10|  

In [7]:
# Dataset dimensions: `rows` triggers a full count job, `cols` is read from the schema
rows = df.count()
print(f"DataFrame Rows count : {rows}")

cols = len(df.columns)
print(f"DataFrame Columns count : {cols}")

DataFrame Rows count : 7026806
DataFrame Columns count : 23


# Sampling

In [8]:
# Class imbalance check: rows per Severity level and their share (in percent) of all rows.
# `F` is pyspark.sql.functions from the import cell; `rows` is the total row count computed earlier.
severity_share = (F.col('count') / rows) * 100
cts = (
    df.groupBy("Severity")
    .count()
    .withColumn('percent', severity_share)
)
cts.show()

+--------+-------+------------------+
|Severity|  count|           percent|
+--------+-------+------------------+
|       1|  65142|0.9270499285165977|
|       3|1123799|15.993027272988611|
|       4| 178821|2.5448404296347444|
|       2|5659044| 80.53508236886005|
+--------+-------+------------------+



                                                                                

In [9]:
# Binary target: 1 when Severity is 3 or 4, otherwise 0
severity_binary = F.when(F.col("Severity") >= 3, 1).otherwise(0)
df_with_weights = df.withColumn("Severity_Binary", severity_binary)

In [10]:
# Inverse-frequency class weights: total rows divided by the rows in each class
class_counts = df_with_weights.groupBy('Severity_Binary').count().collect()
total_count = df_with_weights.count()

# One entry per class value present in the data
class_weights = {
    row['Severity_Binary']: total_count / row['count']
    for row in class_counts
}

print(f"Class Weights: {class_weights}")

Class Weights: {1: 5.39436366707098, 0: 1.2275642335870987}


In [11]:
# Attach the per-class weight to every row; a row whose label is neither 1 nor 0 gets a null weight
label = F.col('Severity_Binary')
row_weight = (
    F.when(label == 1, class_weights[1])
     .when(label == 0, class_weights[0])
)

# Drop rows where the label or the weight is missing
df_with_weights = (
    df_with_weights
    .withColumn('weight', row_weight)
    .na.drop(subset=['Severity_Binary', 'weight'])
)

# Modeling

In [12]:
# Model input columns. The order matters: it fixes the slot order of the assembled vector.
feature_cols = [
    'Temperature',
    'Humidity',
    'Pressure',
    'Visibility',
    'Wind_Speed',
    'Precipitation',
    'Weekday',
    'Rush_Hour',
    'Rain',
    'Snow',
    'SeasonVec',
    'Astronomical_TwilightIndex',
    'Interstate_Indicator',
    'Sex_ratio',
    'Percent_Age_15-19',
    'Percent_Age_20-24',
    'Percent_Age_65_over',
    'MedianIncome',
    'Urban_Ratio',
    'Traffic_Interference',
    'Traffic_Intersection',
    'Destination',
]

# Combine the inputs into one vector column. The column is named 'scaled_features',
# but no scaling stage is applied (the StandardScaler below is left disabled).
assembler = VectorAssembler(outputCol='scaled_features', inputCols=feature_cols)
#scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

In [13]:
# Gradient-boosted tree classifier on the assembled vector, using the per-row class weight
gbt = GBTClassifier(
    featuresCol="scaled_features",
    labelCol="Severity_Binary",
    weightCol="weight",
    maxIter=10,
)

# Two-stage pipeline: assemble the feature vector, then fit the classifier
pipeline = Pipeline(stages=[assembler, gbt])

# Drop rows with a missing label or weight, then print the resulting schema
df_with_weights = df_with_weights.na.drop(subset=['Severity_Binary', 'weight'])
df_with_weights.printSchema()

root
 |-- Severity: integer (nullable = true)
 |-- Temperature: decimal(10,0) (nullable = true)
 |-- Humidity: decimal(10,0) (nullable = true)
 |-- Pressure: decimal(10,0) (nullable = true)
 |-- Visibility: decimal(10,0) (nullable = true)
 |-- Wind_Speed: decimal(10,0) (nullable = true)
 |-- Precipitation: decimal(10,0) (nullable = true)
 |-- Weekday: integer (nullable = true)
 |-- Rush_Hour: integer (nullable = true)
 |-- Rain: integer (nullable = true)
 |-- Snow: integer (nullable = true)
 |-- SeasonVec: vector (nullable = true)
 |-- Astronomical_TwilightIndex: integer (nullable = true)
 |-- Interstate_Indicator: integer (nullable = true)
 |-- Sex_ratio: float (nullable = true)
 |-- Percent_Age_15-19: float (nullable = true)
 |-- Percent_Age_20-24: float (nullable = true)
 |-- Percent_Age_65_over: float (nullable = true)
 |-- MedianIncome: float (nullable = true)
 |-- Urban_Ratio: float (nullable = true)
 |-- Traffic_Interference: integer (nullable = true)
 |-- Traffic_Intersection: 

In [14]:
# Seeded random split: 60% train, 20% validation, 20% test
train_df, val_df, test_df = df_with_weights.randomSplit(weights=[0.6, 0.2, 0.2], seed=42)

# Per-class row counts for each split, in train / validation / test order
train_class_counts, val_class_counts, test_class_counts = (
    part.groupBy("Severity_Binary").count()
    for part in (train_df, val_df, test_df)
)

# Display the class distribution of every split
for split_counts in (train_class_counts, val_class_counts, test_class_counts):
    split_counts.show()

                                                                                

+---------------+-------+
|Severity_Binary|  count|
+---------------+-------+
|              1| 781694|
|              0|3435128|
+---------------+-------+



                                                                                

+---------------+-------+
|Severity_Binary|  count|
+---------------+-------+
|              1| 260903|
|              0|1144588|
+---------------+-------+





+---------------+-------+
|Severity_Binary|  count|
+---------------+-------+
|              1| 260023|
|              0|1144470|
+---------------+-------+



                                                                                

In [15]:
# Step 1: Re-split into train / validation / test sets (70% / 15% / 15%).
# This replaces the 60/20/20 split created in the previous cell.
train_df, val_df, test_df = df_with_weights.randomSplit([0.7, 0.15, 0.15], seed=42)

# Step 2: Get the count of each class in the training set with a single aggregation job
train_counts_by_class = {
    row['Severity_Binary']: row['count']
    for row in train_df.groupBy('Severity_Binary').count().collect()
}
class_0_count = train_counts_by_class.get(0, 0)
class_1_count = train_counts_by_class.get(1, 0)
if class_0_count == 0 or class_1_count == 0:
    raise ValueError(
        f"Cannot balance the training set: class counts are {train_counts_by_class}"
    )

# Step 3: Downsample class 0 towards the size of class 1.
# sample() without replacement needs a fraction in [0, 1], and the resulting size is approximate.
majority_fraction = class_1_count / class_0_count
if majority_fraction > 1.0:
    majority_fraction = 1.0
class_0_sampled = train_df.filter(train_df['Severity_Binary'] == 0) \
    .sample(False, majority_fraction, seed=42)

# Class 1 is kept in full
class_1_df = train_df.filter(train_df['Severity_Binary'] == 1)

# Step 4: Combine into the balanced training set. cache() is applied before the first
# action so that the count below populates the cache reused by later model fitting.
balanced_train_df = class_0_sampled.union(class_1_df).cache()

# Check the class distribution in the balanced train set
train_class_counts_after = balanced_train_df.groupBy("Severity_Binary").count()
print("Train Class Counts (After Balancing):")
train_class_counts_after.show()

# Class distribution of the (unbalanced) validation and test sets
val_class_counts = val_df.groupBy("Severity_Binary").count()
test_class_counts = test_df.groupBy("Severity_Binary").count()
val_class_counts.show()
test_class_counts.show()


                                                                                

Train Class Counts (After Balancing):




24/12/02 18:22:10 ERROR Executor: Exception in task 33.0 in stage 29.0 (TID 251)
java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillReader.<init>(UnsafeSorterSpillReader.java:50)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillWriter.getReader(UnsafeSorterSpillWriter.java:159)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.getSortedIterator(UnsafeExternalSorter.java:553)
	at org.apache.spark.sql.execution.UnsafeExternalRowSorter.sort(UnsafeExternalRowSorter.java:172)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.hashAgg_doAg

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3552, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_557486/3595977965.py", line 21, in <module>
    train_class_counts_after.show()
  File "/opt/conda/lib/python3.7/site-packages/pyspark/sql/dataframe.py", line 606, in show
    print(self._jdf.showString(n, 20, vertical))
  File "/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py", line 1322, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/opt/conda/lib/python3.7/site-packages/pyspark/sql/utils.py", line 190, in deco
    return f(*a, **kw)
  File "/opt/conda/lib/python3.7/site-packages/py4j/protocol.py", line 328, in get_return_value
    format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: <unprintable Py4JJavaError object>

During handling of the above exception, another exce

ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
# Fit the baseline pipeline (assembler + GBT) on the balanced training set
gbt_model = pipeline.fit(balanced_train_df)

# Area under the ROC curve, computed from the model's raw scores. Pointing
# rawPredictionCol at the hard 0/1 "prediction" column would collapse the ROC curve
# to a single operating point. This evaluator does not compute accuracy.
evaluator = BinaryClassificationEvaluator(
    labelCol="Severity_Binary",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# Score the validation set with the fitted baseline model
val_predictions = gbt_model.transform(val_df)

In [None]:
from sklearn.metrics import precision_recall_curve, auc, precision_score, recall_score, f1_score

# Collect label and probability together in one job so that every score stays paired
# with its own label (two separate collects run the scoring plan twice).
val_rows = val_predictions.select("Severity_Binary", "probability").collect()
y_true_val = [row["Severity_Binary"] for row in val_rows]
y_scores_val = [float(row["probability"][1]) for row in val_rows]  # Probability of class 1

# Precision-Recall curve and the area under it for the validation set
precision_val, recall_val, _ = precision_recall_curve(y_true_val, y_scores_val)
pr_auc_val = auc(recall_val, precision_val)

# Precision, Recall, F1 at a 0.5 probability threshold
predictions_val_bin = [1 if score > 0.5 else 0 for score in y_scores_val]
precision_val_final = precision_score(y_true_val, predictions_val_bin)
recall_val_final = recall_score(y_true_val, predictions_val_bin)
f1_val_final = f1_score(y_true_val, predictions_val_bin)

# The evaluator is a BinaryClassificationEvaluator, so this value is its ranking metric
# (areaUnderROC by default), not accuracy. The variable name is kept for later cells.
val_accuracy = evaluator.evaluate(val_predictions)

# Print Metrics for Validation Set ("AUC" below is the area under the PR curve)
print(f"Validation Set Metrics:")
print(f"Validation {evaluator.getMetricName()}: {val_accuracy}")
print(f"Precision: {precision_val_final:.2f}")
print(f"Recall: {recall_val_final:.2f}")
print(f"F1 Score: {f1_val_final:.2f}")
print(f"AUC: {pr_auc_val:.2f}")

# Plot the Precision-Recall curve for Validation Set
plt.figure(figsize=(8, 6))
plt.plot(recall_val, precision_val, marker='.', label=f'AUC = {pr_auc_val:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (Validation Set)')
plt.legend()
plt.show()

### Hyperparameter Tuning

In [None]:
# Step 1: Define the paramGrid for hyperparameter tuning (2 x 2 = 4 candidates)
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [4, 6])
             .addGrid(gbt.maxIter, [10, 15])
             #.addGrid(gbt.stepSize, [0.05, 0.2])  # Step size options
             .build())

# Step 2: Set up the cross-validation. An explicit seed fixes the fold assignment;
# PySpark's default seed comes from a Python string hash, which varies between sessions.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          parallelism=2,
                          numFolds=4,
                          seed=42)

# Step 3: Train the model with cross-validation.
# NOTE(review): this fits on train_df (unbalanced, weighted), while the baseline model
# was fitted on balanced_train_df -- confirm which training set is intended.
cv_model = crossval.fit(train_df)

# Step 4: Make predictions with the best model on the validation set
cv_predictions = cv_model.transform(val_df)

# Step 5: Evaluate the cross-validated model on the validation set. The evaluator is a
# BinaryClassificationEvaluator, so the value is its ranking metric, not accuracy.
cv_accuracy = evaluator.evaluate(cv_predictions)
print(f"Cross-validated {evaluator.getMetricName()} on Validation Set: {cv_accuracy}")

# Step 6: Collect label and probability together so each score stays paired with its label
cv_rows = cv_predictions.select("Severity_Binary", "probability").collect()
y_true_val = [row["Severity_Binary"] for row in cv_rows]
y_scores_val = [float(row["probability"][1]) for row in cv_rows]  # Probability of class 1

# Precision-Recall curve and the area under it
precision_val, recall_val, _ = precision_recall_curve(y_true_val, y_scores_val)
pr_auc_val = auc(recall_val, precision_val)

# Precision, Recall, F1 at a 0.5 probability threshold
predictions_val_bin = [1 if score > 0.5 else 0 for score in y_scores_val]
precision_val_final = precision_score(y_true_val, predictions_val_bin)
recall_val_final = recall_score(y_true_val, predictions_val_bin)
f1_val_final = f1_score(y_true_val, predictions_val_bin)

# Print Metrics for Validation Set ("AUC" below is the area under the PR curve)
print(f"Validation Set Metrics:")
print(f"Precision: {precision_val_final:.2f}")
print(f"Recall: {recall_val_final:.2f}")
print(f"F1 Score: {f1_val_final:.2f}")
print(f"AUC: {pr_auc_val:.2f}")

# Step 7: Get and save the best model from cross-validation
bestModel = cv_model.bestModel
bestModel.write().overwrite().save("best_gb_model_v2")

# Step 8: Report parameters through the public Params API instead of the private _java_obj handle
best_model_params = {
    param.name: value
    for param, value in bestModel.stages[-1].extractParamMap().items()
}
print("Best model parameters:")
print(best_model_params)

# Only the parameters that were actually searched over in paramGrid
tuned_param_names = [param.name for param in paramGrid[0]]
best_params = {name: best_model_params.get(name) for name in tuned_param_names}
print(f"Best Hyperparameters: {best_params}")

## Load Saved Best Model

In [None]:
# Load the saved best model
#loadedGBModel = PipelineModel.load("best_gb_model_v2")

## Feature Importance

In [None]:
# Importance scores of the tuned GBT model: one entry per SLOT of the assembled vector
feature_importances = cv_model.bestModel.stages[-1].featureImportances

# The assembled vector has more slots than input columns because 'SeasonVec' is itself
# a vector column. Pairing scores with assembler.getInputCols() by position therefore
# mislabels every feature after 'SeasonVec'. Read the per-slot names from the metadata
# that VectorAssembler attaches to its output column instead.
assembled_field = assembler.transform(val_df).schema["scaled_features"]
slot_names = {}
for attr_group in assembled_field.metadata.get("ml_attr", {}).get("attrs", {}).values():
    for attr in attr_group:
        slot_names[attr["idx"]] = attr.get("name", f"feature_{attr['idx']}")

# Create a mapping between per-slot feature names and their importance scores
importance_values = feature_importances.toArray()
feature_names = [slot_names.get(i, f"feature_{i}") for i in range(len(importance_values))]
feature_importance_dict = {
    feature_name: float(importance_values[i])
    for i, feature_name in enumerate(feature_names)
}

# Sort the feature importance dictionary by score in descending order
sorted_feature_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print the feature importances
for feature_name, importance_score in sorted_feature_importances:
    print(f"{feature_name}: {importance_score}")

# Evaluation and Metrics

In [None]:
# Step 1: Evaluate Metrics for Test Set.
# NOTE(review): this scores the baseline gbt_model, not the cross-validated best model -- confirm intent.
test_predictions = gbt_model.transform(test_df)

# Collect label and probability together in one job so each score stays paired with its label
test_rows = test_predictions.select("Severity_Binary", "probability").collect()
y_true_test = [row["Severity_Binary"] for row in test_rows]
y_scores_test = [float(row["probability"][1]) for row in test_rows]  # Probability of class 1

# Precision-Recall curve and the area under it for the test set
precision_test, recall_test, _ = precision_recall_curve(y_true_test, y_scores_test)
pr_auc_test = auc(recall_test, precision_test)

# Precision, Recall, F1 at a 0.5 probability threshold (thresholded once, reused three times)
predictions_test_bin = [1 if score > 0.5 else 0 for score in y_scores_test]
precision_test_final = precision_score(y_true_test, predictions_test_bin)
recall_test_final = recall_score(y_true_test, predictions_test_bin)
f1_test_final = f1_score(y_true_test, predictions_test_bin)

# Print Metrics for Test Set ("AUC" below is the area under the PR curve)
print(f"Test Set Metrics:")
print(f"Precision: {precision_test_final:.2f}")
print(f"Recall: {recall_test_final:.2f}")
print(f"F1 Score: {f1_test_final:.2f}")
print(f"AUC: {pr_auc_test:.2f}")

# Plot the Precision-Recall curve for Test Set
plt.figure(figsize=(8, 6))
plt.plot(recall_test, precision_test, marker='.', label=f'AUC = {pr_auc_test:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (Test Set)')
plt.legend()
plt.show()

In [None]:
# Step 1: Build an RDD of (prediction, label) float pairs from the test-set predictions.
# The original referenced an undefined `predictions` variable; the scored test set
# in this notebook is `test_predictions`.
predictionAndLabels = test_predictions.select("prediction", "Severity_Binary")
rdd = predictionAndLabels.rdd.map(lambda x: (float(x[0]), float(x[1])))

# Step 2: confusionMatrix() is provided by MulticlassMetrics; BinaryClassificationMetrics
# only exposes areaUnderROC / areaUnderPR.
metrics = MulticlassMetrics(rdd)

# Step 3: Confusion Matrix (PySpark). Rows are actual classes and columns are predicted
# classes, with labels in ascending order. Counts are cast to int so fmt="d" can format them.
conf_matrix = metrics.confusionMatrix().toArray().astype(int)
print(f"Confusion Matrix (PySpark):\n{conf_matrix}")

# Step 4: Convert confusion matrix to pandas DataFrame for better visualization
conf_matrix_df = pd.DataFrame(conf_matrix, columns=['Predicted 0', 'Predicted 1'], index=['Actual 0', 'Actual 1'])
sns.heatmap(conf_matrix_df, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.title('Confusion Matrix')
plt.show()

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.7/site-packages/py4j/clientserver.py", line 540, in send_command
    "Error while sending or receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while sending or receiving
