In [1]:
from pyspark.sql import SparkSession
# Memory setting applied to both the driver and the executor configuration below.
MAX_MEMORY = '8g'

# Create the notebook's Spark session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('UK-road-safety-pred')
    .config('spark.driver.memory', MAX_MEMORY)
    .config('spark.executor.memory', MAX_MEMORY)
    .getOrCreate()
)

In [2]:
import os
# Locate the two CSV inputs relative to the directory the notebook runs in.
cwd = os.getcwd()
a_data_path = os.path.join(cwd, 'learning_spark_data', 'Accident_Information.csv')
v_data_path = os.path.join(cwd, 'learning_spark_data', 'Vehicle_Information.csv')

# Spark receives explicit file:// URIs; OS-specific separators are replaced by '/'.
a_file_path = f"file:///{a_data_path.replace(os.sep, '/')}"
v_file_path = f"file:///{v_data_path.replace(os.sep, '/')}"

# header=True: the first CSV row supplies the column names.
# inferSchema=True: Spark derives the column types from the data.
accident_df = spark.read.csv(a_file_path, inferSchema=True, header=True)
vehicle_df = spark.read.csv(v_file_path, inferSchema=True, header=True)

# printSchema() prints the schema and returns None. Calling the two as separate
# statements (instead of a comma-separated tuple expression) keeps the cell
# from echoing a meaningless `(None, None)` result.
accident_df.printSchema()
vehicle_df.printSchema()

root
 |-- Accident_Index: string (nullable = true)
 |-- 1st_Road_Class: string (nullable = true)
 |-- 1st_Road_Number: string (nullable = true)
 |-- 2nd_Road_Class: string (nullable = true)
 |-- 2nd_Road_Number: string (nullable = true)
 |-- Accident_Severity: string (nullable = true)
 |-- Carriageway_Hazards: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Day_of_Week: string (nullable = true)
 |-- Did_Police_Officer_Attend_Scene_of_Accident: string (nullable = true)
 |-- Junction_Control: string (nullable = true)
 |-- Junction_Detail: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Light_Conditions: string (nullable = true)
 |-- Local_Authority_(District): string (nullable = true)
 |-- Local_Authority_(Highway): string (nullable = true)
 |-- Location_Easting_OSGR: string (nullable = true)
 |-- Location_Northing_OSGR: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- LSOA_of_Accident_Location: string (nullable = true)
 |-- Num

(None, None)

In [3]:
# The raw column name contains dots; rename it to the plain identifier that
# the SQL join refers to (v.Engine_Capacity).
vehicle_df = vehicle_df.withColumnRenamed(
    existing="Engine_Capacity_.CC.",
    new="Engine_Capacity",
)

In [4]:
# Register both DataFrames as temp views so the SQL join can address them by name.
for view_name, frame in (("accident", accident_df), ("vehicle", vehicle_df)):
    frame.createOrReplaceTempView(view_name)

In [5]:
# Inner join of accidents and vehicles on Accident_Index, keeping only the
# columns used later as features, plus the index, the date and the
# Accident_Severity target.
query = """
    SELECT
        a.Accident_Index,
        a.1st_Road_Class,
        a.Date,
        a.Day_of_Week,
        a.Accident_Severity,
        a.Did_Police_Officer_Attend_Scene_of_Accident,
        a.Road_Type,
        a.Speed_limit,
        a.Urban_or_Rural_Area,
        v.Age_Band_of_Driver,
        v.Sex_of_Driver,
        v.Vehicle_Type,
        v.Age_of_Vehicle,
        v.Engine_Capacity,
        v.Driver_Home_Area_Type
    FROM accident a
    JOIN vehicle v
    ON a.Accident_Index = v.Accident_Index
"""

# Run the query against the registered temp views.
join_df = spark.sql(sqlQuery=query)

In [6]:
# Preprocessing: categorical columns whose nulls are replaced by a placeholder.
cat_columns = [
    "1st_Road_Class",
    "Day_of_Week",
    "Did_Police_Officer_Attend_Scene_of_Accident",
    "Road_Type",
    "Urban_or_Rural_Area",
    "Age_Band_of_Driver",
    "Sex_of_Driver",
    "Vehicle_Type",
    "Driver_Home_Area_Type",
]

# Map every categorical column to the same fill value, "Unknown".
join_df = join_df.fillna(dict.fromkeys(cat_columns, "Unknown"))

In [7]:
from pyspark.sql.functions import avg, col

# Cast the three numeric predictors to double.
_numeric_cols = ["Speed_limit", "Age_of_Vehicle", "Engine_Capacity"]
for _name in _numeric_cols:
    join_df = join_df.withColumn(_name, col(_name).cast("double"))

# Mean values. All three averages are computed in a single aggregation, so the
# (uncached) joined data is scanned once instead of once per column. The
# resulting Row unpacks in the same order as _numeric_cols.
avg_speed, avg_age, avg_engine = join_df.select(
    *[avg(_name) for _name in _numeric_cols]
).first()

# Fill remaining nulls in each numeric column with that column's mean.
# NOTE(review): the means are taken over the whole dataset, before the
# train/test split performed later in the notebook, so test rows influence the
# imputed values. Consider fitting the imputation on the training split only.
join_df = join_df.fillna({
    "Speed_limit": avg_speed,
    "Age_of_Vehicle": avg_age,
    "Engine_Capacity": avg_engine,
})

In [8]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [9]:
# Pipeline stages are appended to this list by the cells that follow.
stages = list()

In [10]:
# Categorical inputs; each one gets a StringIndexer that writes "<column>_idx".
cat_features = [
    "1st_Road_Class",
    "Day_of_Week",
    "Did_Police_Officer_Attend_Scene_of_Accident",
    "Road_Type",
    "Urban_or_Rural_Area",
    "Age_Band_of_Driver",
    "Sex_of_Driver",
    "Vehicle_Type",
    "Driver_Home_Area_Type",
]

# handleInvalid='keep' lets values not seen during fitting be indexed instead
# of raising at transform time.
stages.extend(
    StringIndexer(inputCol=name, outputCol=name + '_idx', handleInvalid='keep')
    for name in cat_features
)

In [11]:
# Index the Accident_Severity target into the numeric "label" column.
# NOTE(review): handleInvalid='keep' on the label column means null or unseen
# severities are indexed as an additional class rather than rejected.
# Confirm this is intended.
target_indexer = StringIndexer(
    inputCol="Accident_Severity",
    outputCol="label",
    handleInvalid='keep',
)
stages.append(target_indexer)

In [12]:
# Numeric columns that go into the feature vector without indexing.
num_features = [
    "Speed_limit",
    "Age_of_Vehicle",
    "Engine_Capacity",
]

In [13]:
# Assembler inputs: the indexed categorical columns first, then the numeric ones.
assembler_input = [f"{name}_idx" for name in cat_features]
assembler_input.extend(num_features)

In [14]:
# Combine all input columns into a single "feature_vector" column and
# register the assembler as the final pipeline stage.
assembler = VectorAssembler(
    outputCol="feature_vector",
    inputCols=assembler_input,
)
stages.append(assembler)

In [15]:
# 80/20 random split; the fixed seed keeps the split repeatable.
train_df, test_df = join_df.randomSplit(weights=[0.8, 0.2], seed=3)

In [16]:
from pyspark.ml import Pipeline
# Fit the preprocessing stages on the training split only, then apply the
# fitted pipeline to that same split.
pipeline = Pipeline().setStages(stages)
model = pipeline.fit(dataset=train_df)
vtrain_df = model.transform(dataset=train_df)

In [17]:
from pyspark.ml.linalg import DenseVector, VectorUDT
from pyspark.sql.functions import udf

@udf(returnType=VectorUDT())
def to_dense(v):
    """Return the given ML vector as a DenseVector with the same values."""
    return DenseVector(v.toArray())


# Add a dense copy of the assembled features for the classifier to consume.
vtrain_df = vtrain_df.withColumn(
    colName="dense_vector",
    col=to_dense("feature_vector"),
)

In [18]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest over the dense feature column; seed fixed for repeatability.
rf = RandomForestClassifier(
    featuresCol="dense_vector",
    labelCol="label",
    numTrees=300,
    maxDepth=8,
    seed=3,
)

In [19]:
# Train the random forest on the transformed training split.
rf_model = rf.fit(dataset=vtrain_df)

In [20]:
# Apply the preprocessing pipeline fitted on the training data to the test split.
vtest_df = model.transform(dataset=test_df)

In [21]:
# Add the same dense feature column to the test split that the training split has.
vtest_df = vtest_df.withColumn(
    colName="dense_vector",
    col=to_dense("feature_vector"),
)

In [22]:
# Score the test split with the trained forest.
pred = rf_model.transform(dataset=vtest_df)

In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# F1 evaluator over the "label" / "prediction" columns of the scored test set.
# Named `evaluator` rather than `eval` so that Python's built-in eval() is not
# shadowed for the rest of the kernel session.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='f1'
)

# Compute the F1 score and echo it as the cell result.
f1 = evaluator.evaluate(pred)
f1

0.7917836050024545

In [24]:
# Shut down the Spark session created in the first cell.
spark.stop()