In [2]:
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# Imports
import os
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, StringType, col
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('ML_with_spark')
    .getOrCreate()
)

In [4]:
# Read the accidents CSV: the first row provides the column names and the
# column types are inferred from the values.
data = spark.read.csv(
    'accidents_2005_to_2007.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Remove the two junction columns, discard every row holding a null in any
# remaining column, and expose the severity column under the name "label"
# (the label column name the classifiers below are configured with).
data = (
    data
    .drop("Junction_Detail", "Junction_Control")
    .dropna()  # defaults: how='any', thresh=None, subset=None
    .withColumnRenamed("Accident_Severity", "label")
)

In [6]:
# Show the column names left after the drops and the rename to "label".
data.columns

['Accident_Index',
 'Location_Easting_OSGR',
 'Location_Northing_OSGR',
 'Longitude',
 'Latitude',
 'Police_Force',
 'label',
 'Number_of_Vehicles',
 'Number_of_Casualties',
 'Date',
 'Day_of_Week',
 'Time',
 'Local_Authority_(District)',
 'Local_Authority_(Highway)',
 '1st_Road_Class',
 '1st_Road_Number',
 'Road_Type',
 'Speed_limit',
 '2nd_Road_Class',
 '2nd_Road_Number',
 'Pedestrian_Crossing-Human_Control',
 'Pedestrian_Crossing-Physical_Facilities',
 'Light_Conditions',
 'Weather_Conditions',
 'Road_Surface_Conditions',
 'Special_Conditions_at_Site',
 'Carriageway_Hazards',
 'Urban_or_Rural_Area',
 'Did_Police_Officer_Attend_Scene_of_Accident',
 'LSOA_of_Accident_Location',
 'Year']

In [7]:
# Print the inferred type and nullability of every column.
data.printSchema()

root
 |-- Accident_Index: string (nullable = true)
 |-- Location_Easting_OSGR: integer (nullable = true)
 |-- Location_Northing_OSGR: integer (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Police_Force: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- Number_of_Vehicles: integer (nullable = true)
 |-- Number_of_Casualties: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Day_of_Week: integer (nullable = true)
 |-- Time: string (nullable = true)
 |-- Local_Authority_(District): integer (nullable = true)
 |-- Local_Authority_(Highway): string (nullable = true)
 |-- 1st_Road_Class: integer (nullable = true)
 |-- 1st_Road_Number: integer (nullable = true)
 |-- Road_Type: string (nullable = true)
 |-- Speed_limit: integer (nullable = true)
 |-- 2nd_Road_Class: integer (nullable = true)
 |-- 2nd_Road_Number: integer (nullable = true)
 |-- Pedestrian_Crossing-Human_Control: string (nullable = true)


In [8]:
# DISABLED: alternative VectorAssembler configuration that would pack a wider
# set of columns into "features". Nothing in this cell executes; the active
# assembler cell that follows uses only Number_of_Vehicles and
# Number_of_Casualties.
#assembler = VectorAssembler(
#  inputCols=['Location_Easting_OSGR',
#             'Location_Northing_OSGR',
#             'Longitude',
#             'Police_Force',
#             'Number_of_Vehicles',
#             'Number_of_Casualties',
#
#             'Day_of_Week',
# 
#             'Local_Authority_(District)',
#             '1st_Road_Class',
#             '1st_Road_Number',
#  
#             'Speed_limit',
#             '2nd_Road_Class',
#             '2nd_Road_Number',
#        
#            'Urban_or_Rural_Area',
       
#            'Year',],
#              outputCol="features")

#output = assembler.transform(data)

In [9]:
# Pack the vehicle and casualty counts into a single vector column named
# "features", which is the input column the classifiers are configured with.
assembler = VectorAssembler(
    inputCols=['Number_of_Vehicles', 'Number_of_Casualties'],
    outputCol="features",
)

# Append the "features" column to the cleaned data.
output = assembler.transform(data)

In [9]:
# Columns regarded as categorical.
# NOTE(review): this list is not read by any cell visible here — the indexer
# below names its input column directly. Confirm whether it is still needed.
categorical_columns = ['Accident_Index']

# Maps each distinct Accident_Index string to a numeric index column.
indexer = StringIndexer(
    inputCol='Accident_Index',
    outputCol="Accident_Index_index",
)

In [10]:
# Fit the string indexer on the assembled data, then add its index column.
indexer_model = indexer.fit(output)
output_fixed = indexer_model.transform(output)

# Keep only the two columns the classifiers consume.
# NOTE(review): the Accident_Index_index column produced above is dropped by
# this select, so the indexer fit does not influence the models — confirm
# whether that step is still wanted.
final_data = output_fixed.select("features", "label")

In [11]:
# Split the rows roughly 70/30 into training and test sets. The fixed seed
# keeps the split, and therefore every accuracy figure derived from it, the
# same on each Restart & Run All; without it randomSplit picks a new random
# seed per call.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [12]:
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

In [13]:
# Two tree-based classifiers reading the same "features"/"label" columns.
# The seed is set explicitly so that the forest's random sampling — and the
# comparison between the two models — is repeatable across kernel restarts
# instead of depending on the estimator's per-session default seed.
dtc = DecisionTreeClassifier(labelCol='label', featuresCol='features', seed=42)
rfc = RandomForestClassifier(labelCol='label', featuresCol='features', seed=42)
#gbt = GBTClassifier(labelCol='label',featuresCol='features')

In [14]:
# Train each classifier on the training split.
dtc_model = dtc.fit(dataset=train_data)
rfc_model = rfc.fit(dataset=train_data)
#gbt_model = gbt.fit(train_data)

In [15]:
# Score the held-out test split with each fitted model.
dtc_predictions = dtc_model.transform(dataset=test_data)
rfc_predictions = rfc_model.transform(dataset=test_data)
#gbt_predictions = gbt_model.transform(test_data)

In [16]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [17]:
# Evaluator that compares the "prediction" column with "label" and reports
# accuracy (the fraction of matching rows), not the error rate.
acc_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

In [18]:
# Accuracy of each model on its test-set predictions (a float that the report
# cell multiplies by 100).
dtc_acc = acc_evaluator.evaluate(dataset=dtc_predictions)
rfc_acc = acc_evaluator.evaluate(dataset=rfc_predictions)
#gbt_acc = acc_evaluator.evaluate(gbt_predictions)

In [19]:
# Report both accuracies as percentages, separated by 50-dash rules.
# NOTE(review): the recorded output shows the same score for both models;
# that may mean both predict a single dominant class — check the prediction
# distribution before drawing conclusions.
print(
    "Here are the results!",
    '-'*50,
    'A single decision tree had an accuracy of: {0:2.2f}%'.format(dtc_acc*100),
    '-'*50,
    'A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfc_acc*100),
    sep='\n',
)

Here are the results!
--------------------------------------------------
A single decision tree had an accuracy of: 85.86%
--------------------------------------------------
A random forest ensemble had an accuracy of: 85.86%
