In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

## Load the data

In [3]:
# All three files are read with the same options: comma-separated, and every
# column loaded with dtype "str_" (as text rather than parsed numbers).
csv_options = {"delimiter": ",", "dtype": "str_"}

accidents = pd.read_csv("../data/raw/Road Safety Data - Accidents 2019.csv", **csv_options)
casualties = pd.read_csv("../data/raw/Road Safety Data - Casualties 2019.csv", **csv_options)
vehicles = pd.read_csv("../data/raw/Road Safety Data- Vehicles 2019.csv", **csv_options)

In [4]:
# pd.read_csv already returns DataFrames, so no conversion is needed here.
# Confirm that assumption explicitly instead of re-wrapping each table in a
# new DataFrame object.
for table in (accidents, casualties, vehicles):
    assert isinstance(table, pd.DataFrame)

## Report the dimensions of the data (number of tables, rows, fields).

In [5]:
# Show (rows, columns) for each table, in the order they were loaded
for table in (accidents, casualties, vehicles):
    print(table.shape)

(117536, 32)
(153158, 16)
(216381, 23)


## Masking for Birmingham

In [6]:
# Boolean mask over the accidents table: True where the district code is "300",
# the value this notebook uses to select Birmingham
birmingham_mask = accidents["Local_Authority_(District)"].eq("300")
clean_accidents = accidents.loc[birmingham_mask]

print(clean_accidents.shape)


(2623, 32)


In [7]:
# Accident IDs of the Birmingham accidents; reused to filter the other tables
birmingham_index_column = clean_accidents["Accident_Index"]
accident_ids = pd.array(birmingham_index_column)

# Keep only the casualty rows whose accident ID is one of the Birmingham IDs
casualties_mask = casualties["Accident_Index"].isin(accident_ids)
clean_casualties = casualties.loc[casualties_mask]

print(clean_casualties.shape)

(3551, 16)


In [8]:
# Keep only the vehicle rows whose accident ID is one of the Birmingham IDs
vehicles_mask = vehicles["Accident_Index"].isin(accident_ids)
clean_vehicles = vehicles.loc[vehicles_mask]

print(clean_vehicles.shape)

(4962, 23)


In [9]:
# optional
# Combine the three tables into one wide table.
#
# The tables have different numbers of rows (one row per accident, per vehicle
# and per casualty), so they cannot be glued together side by side:
# pd.concat(..., axis=1) pairs rows by their position in each table, which puts
# unrelated accidents, vehicles and casualties on the same row. They are joined
# on their shared key columns instead.
#
# Result: one row per casualty, plus one row for every vehicle that has no
# casualty attached to it, so an accident with several vehicles or casualties
# appears on several rows.
# NOTE(review): this assumes Vehicle_Reference in the casualties table refers to
# the vehicle with the same Accident_Index and Vehicle_Reference in the vehicles
# table - confirm against the data guide.
accident_vehicles = accidents.merge(
    vehicles,
    on="Accident_Index",
    how="left",
    validate="one_to_many",  # fails loudly if an Accident_Index repeats in accidents
)
result = accident_vehicles.merge(
    casualties,
    on=["Accident_Index", "Vehicle_Reference"],
    how="left",
    validate="one_to_many",  # fails loudly if a vehicle is listed twice
)

# The key columns are shared by the merges, so no column name should repeat;
# the de-duplication is kept as a safeguard.
data = result.loc[:, ~result.columns.duplicated()]

combined_mask = data["Local_Authority_(District)"] == "300"

combined_clean_data = data[combined_mask]

# Save under the same lower-case "data" folder the raw files were read from
# ("../Data" is a different folder on case-sensitive file systems) and create
# the interim folder if it does not exist yet.
interim_dir = Path("../data/interim")
interim_dir.mkdir(parents=True, exist_ok=True)
combined_clean_data.to_csv(interim_dir / "combined_clean_data.csv", index=False, header=True)

## Identify for each variable whether it is numerical or categorical.

In [10]:
# Print one "<column>  - Categorical<br>" line per column of the casualties table.
# Every column gets the same "Categorical" label here, whatever it contains.
for column_name in clean_casualties.columns:
    print(column_name, " - Categorical<br>")

Accident_Index  - Categorical<br>
Vehicle_Reference  - Categorical<br>
Casualty_Reference  - Categorical<br>
Casualty_Class  - Categorical<br>
Sex_of_Casualty  - Categorical<br>
Age_of_Casualty  - Categorical<br>
Age_Band_of_Casualty  - Categorical<br>
Casualty_Severity  - Categorical<br>
Pedestrian_Location  - Categorical<br>
Pedestrian_Movement  - Categorical<br>
Car_Passenger  - Categorical<br>
Bus_or_Coach_Passenger  - Categorical<br>
Pedestrian_Road_Maintenance_Worker  - Categorical<br>
Casualty_Type  - Categorical<br>
Casualty_Home_Area_Type  - Categorical<br>
Casualty_IMD_Decile  - Categorical<br>


References<br>
What is categorical data - http://www.stat.yale.edu/Courses/1997-98/101/catdat.htm <br>
What is an LSOA (Lower Layer Super Output Area) - https://datadictionary.nhs.uk/nhs_business_definitions/lower_layer_super_output_area.html#:~:text=A%20Lower%20Layer%20Super%20Output,Lower%20Layer%20Super%20Output%20Areas <br>
What is an IMD decile - http://mast.roadsafetyanalysis.org/wiki/index.php?title=Driver_IMD_Decile#:~:text=An%20IMD%20decile%20is%20a,the%2010%25%20least%20deprived%20areas.


Accidents <br><br>
Accident_Index  - Categorical<br>
Location_Easting_OSGR  - Numerical<br>
Location_Northing_OSGR  - Numerical<br>
Longitude  - Numerical<br>
Latitude  - Numerical<br>
Police_Force  - Categorical<br>
Accident_Severity  - Categorical<br>
Number_of_Vehicles  - Numerical<br>
Number_of_Casualties  - Numerical<br>
Date  - Numerical/Categorical<br>
Day_of_Week  - Categorical<br>
Time  - Numerical (but can be categorical)<br>
Local_Authority_(District)  - Categorical<br>
Local_Authority_(Highway)  - Categorical<br>
1st_Road_Class  - Categorical<br>
1st_Road_Number  - Categorical<br>
Road_Type  - Categorical<br>
Speed_limit  - Categorical (we questioned this)<br>
Junction_Detail  - Categorical<br>
Junction_Control  - Categorical<br>
2nd_Road_Class  - Categorical<br>
2nd_Road_Number  - Categorical<br>
Pedestrian_Crossing-Human_Control  - Categorical<br>
Pedestrian_Crossing-Physical_Facilities  - Categorical<br>
Light_Conditions  - Categorical<br>
Weather_Conditions  - Categorical<br>
Road_Surface_Conditions  - Categorical<br>
Special_Conditions_at_Site  - Categorical<br>
Carriageway_Hazards  - Categorical<br>
Urban_or_Rural_Area  - Categorical<br>
Did_Police_Officer_Attend_Scene_of_Accident  - Categorical<br>
LSOA_of_Accident_Location  - Categorical<br>

Vehicles<br><br>
Accident_Index  - Categorical<br>
Vehicle_Reference  - Categorical<br>
Vehicle_Type  - Categorical<br>
Towing_and_Articulation  - Categorical<br>
Vehicle_Manoeuvre  - Categorical<br>
Vehicle_Location-Restricted_Lane  - Categorical<br>
Junction_Location  - Categorical<br>
Skidding_and_Overturning  - Categorical<br>
Hit_Object_in_Carriageway  - Categorical<br>
Vehicle_Leaving_Carriageway  - Categorical<br>
Hit_Object_off_Carriageway  - Categorical<br>
1st_Point_of_Impact  - Categorical<br>
Was_Vehicle_Left_Hand_Drive?  - Categorical<br>
Journey_Purpose_of_Driver  - Categorical<br>
Sex_of_Driver  - Categorical<br>
Age_of_Driver  - Numerical<br>
Age_Band_of_Driver  - Categorical<br>
Engine_Capacity_(CC)  - Numerical<br>
Propulsion_Code  - Categorical (probably)<br>
Age_of_Vehicle  - Numerical<br>
Driver_IMD_Decile  - Categorical<br>
Driver_Home_Area_Type  - Categorical<br>
Vehicle_IMD_Decile  - Categorical<br>

Casualties <br><br>
Accident_Index  - Categorical<br>
Vehicle_Reference  - Categorical<br>
Casualty_Reference  - Categorical<br>
Casualty_Class  - Categorical<br>
Sex_of_Casualty  - Categorical<br>
Age_of_Casualty  - Numerical<br>
Age_Band_of_Casualty  - Categorical<br>
Casualty_Severity  - Categorical<br>
Pedestrian_Location  - Categorical<br>
Pedestrian_Movement  - Categorical<br>
Car_Passenger  - Categorical<br>
Bus_or_Coach_Passenger  - Categorical<br>
Pedestrian_Road_Maintenance_Worker  - Categorical<br>
Casualty_Type  - Categorical<br>
Casualty_Home_Area_Type  - Categorical<br>
Casualty_IMD_Decile  - Categorical<br>

## Make the following sanity check: Does every AccidentID in the casualties and vehicles tables have a corresponding AccidentID in the accidents table?

## Report a five number summary for all numerical variables (where this makes sense)

## Report a box plot for all numerical variables (where this makes sense)

## Report a frequency histogram for all numerical variables, and a frequency bar plot for all categorical variables (where this makes sense)