In [10]:
import requests

# Download the 2017 collision CSV and save it under the filename that the
# "Reading and Cleaning" cell loads ('collision2017.csv'). It used to be
# written to 'file.csv', which no cell shown in this notebook reads.
url = "https://raw.githubusercontent.com/dthomas1108/year4-data-visualisation-assignment/refs/heads/master/collision2017.csv"
r = requests.get(url, timeout=30)  # timeout so a stalled connection cannot hang the notebook
r.raise_for_status()  # stop on an HTTP error instead of saving an error page as the CSV

with open("collision2017.csv", "wb") as f:
    f.write(r.content)

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyproj import Transformer
from dash import Dash, dcc, html, Input, Output, ctx
import plotly.express as px
import webview
from threading import Thread
from time import sleep
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_moons
import plotly.graph_objects as go
import folium

<H1>Collision Data</H1>
<h2>Reading and Cleaning</h2>

In [16]:
# Load the 2017 collision records and take a first look at their structure.
csv_path = 'collision2017.csv'
dfCol = pd.read_csv(csv_path)

# Preview of the first rows, followed by the frame's dimensions and column names
display(dfCol.head(10))
collision_shape = dfCol.shape
collision_columns = dfCol.columns.tolist()
print('Shape:', collision_shape)
print('Columns:', collision_columns)

Unnamed: 0,a_year,a_ref,a_District,a_type,a_veh,a_cas,a_wkday,a_day,a_month,a_hour,...,a_jdet,a_jcont,a_pedhum,a_pedphys,a_light,a_weat,a_roadsc,a_speccs,a_chaz,a_scene
0,2017,1,NEMD,3,1,1,SUN,1,1,3,...,,,,,,,,,,
1,2017,2,MEAN,3,2,1,SUN,1,1,3,...,,,,,,,,,,
2,2017,3,ARBC,2,1,1,SUN,1,1,10,...,2.0,7.0,1.0,1.0,2.0,10.0,1.0,1.0,1.0,1.0
3,2017,4,ANTN,3,2,1,SUN,1,1,10,...,,,,,,,,,,
4,2017,5,ANTN,2,2,2,SUN,1,1,11,...,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
5,2017,6,ARND,3,2,4,SUN,1,1,12,...,,,,,,,,,,
6,2017,7,LISC,3,3,1,SUN,1,1,13,...,,,,,,,,,,
7,2017,8,MEAN,2,1,2,SUN,1,1,17,...,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0
8,2017,9,CCGL,2,1,1,MON,2,1,8,...,10.0,7.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0
9,2017,10,MEAN,2,3,4,MON,2,1,9,...,10.0,7.0,1.0,1.0,2.0,9.0,4.0,1.0,1.0,1.0


Shape: (6081, 25)
Columns: ['a_year', 'a_ref', 'a_District', 'a_type', 'a_veh', 'a_cas', 'a_wkday', 'a_day', 'a_month', 'a_hour', 'a_min', 'a_gd1', 'a_gd2', 'a_ctype', 'a_speed', 'a_jdet', 'a_jcont', 'a_pedhum', 'a_pedphys', 'a_light', 'a_weat', 'a_roadsc', 'a_speccs', 'a_chaz', 'a_scene']


In [29]:
# Show the dtype pandas assigned to each collision column.
dfCol.dtypes

a_year         int64
a_ref          int64
a_District    object
a_type         int64
a_veh          int64
a_cas          int64
a_wkday       object
a_day          int64
a_month        int64
a_hour         int64
a_min          int64
a_gd1          int64
a_gd2          int64
a_ctype        int64
a_speed        int64
a_jdet        object
a_jcont       object
a_pedhum      object
a_pedphys     object
a_light       object
a_weat        object
a_roadsc      object
a_speccs      object
a_chaz        object
a_scene       object
dtype: object

In [18]:
# Flag every row whose collision reference ('a_ref') occurs more than once.
# keep=False marks all members of a repeated group, not only the later copies.
is_repeated_ref = dfCol.duplicated(subset='a_ref', keep=False)
duplicate_rows = dfCol[is_repeated_ref]

# Show the flagged rows; an empty table means every a_ref is unique
print("Duplicate rows based on 'a_ref':")
display(duplicate_rows)

Duplicate rows based on 'a_ref':


Unnamed: 0,a_year,a_ref,a_District,a_type,a_veh,a_cas,a_wkday,a_day,a_month,a_hour,...,a_jdet,a_jcont,a_pedhum,a_pedphys,a_light,a_weat,a_roadsc,a_speccs,a_chaz,a_scene


### The above should be empty, showing 0 duplicates <br> <br>The below shows the unique string values (excluding numeric codes categorised as strings)

In [19]:
# Display the unique values of the string columns, leaving out the columns
# that hold numeric codes stored as strings.
coded_string_cols = ['a_jdet', 'a_jcont','a_pedhum','a_light','a_weat','a_roadsc','a_speccs','a_chaz','a_scene','a_pedphys']
for col in dfCol.columns:
    # Skip anything that is not an object column, and the coded columns listed above
    if dfCol[col].dtype != 'object' or col in coded_string_cols:
        continue
    print(f"Unique values for '{col}':")
    display(dfCol[col].unique()[:20])  # cap the preview at 20 values

Unique values for 'a_District':


array(['NEMD', 'MEAN', 'ARBC', 'ANTN', 'ARND', 'LISC', 'CCGL', 'DCST',
       'BELC', 'MIDU', 'FERO'], dtype=object)

Unique values for 'a_wkday':


array(['SUN', 'MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT'], dtype=object)

### adding NaN
### Step 1: Quantify Missingness

In [21]:
# Values that mark a missing entry in the raw file, alongside real NaN
missing_values = [' ','', 'Unknown', np.nan]

# Count the missing entries in each column. A cell counts once if it is one of
# the markers above OR is NaN. The two tests are OR-ed rather than summed:
# np.nan is in missing_values, so isin() can already match NaN cells and
# adding isna() on top would count those cells twice.
for col in dfCol.columns:
    if col in ['a_District','a_wkday']: continue  # these two label columns are left out of the check
    is_missing = dfCol[col].isin(missing_values) | dfCol[col].isna()
    missing_count = is_missing.sum()
    if missing_count > 0:
        print(f"Column '{col}': {missing_count} missing values")

Column 'a_jdet': 5376 missing values
Column 'a_jcont': 5376 missing values
Column 'a_pedhum': 5376 missing values
Column 'a_pedphys': 5376 missing values
Column 'a_light': 5376 missing values
Column 'a_weat': 5376 missing values
Column 'a_roadsc': 5376 missing values
Column 'a_speccs': 5376 missing values
Column 'a_chaz': 5376 missing values
Column 'a_scene': 5376 missing values


### 10 columns are each missing 5376 values (over 88% of each column), which is substantial. Could these be the accidents the police weren't called to?<br>
### Step 2: replace missing with NaN

In [26]:
# Turn every missing-value marker into a real NaN so pandas treats it as missing.
missing_values = [' ','', 'Unknown', np.nan]
dfCol = dfCol.replace(to_replace=missing_values, value=np.nan)
dfCol.head(10)

Unnamed: 0,a_year,a_ref,a_District,a_type,a_veh,a_cas,a_wkday,a_day,a_month,a_hour,...,a_jdet,a_jcont,a_pedhum,a_pedphys,a_light,a_weat,a_roadsc,a_speccs,a_chaz,a_scene
0,2017,1,NEMD,3,1,1,SUN,1,1,3,...,,,,,,,,,,
1,2017,2,MEAN,3,2,1,SUN,1,1,3,...,,,,,,,,,,
2,2017,3,ARBC,2,1,1,SUN,1,1,10,...,2.0,7.0,1.0,1.0,2.0,10.0,1.0,1.0,1.0,1.0
3,2017,4,ANTN,3,2,1,SUN,1,1,10,...,,,,,,,,,,
4,2017,5,ANTN,2,2,2,SUN,1,1,11,...,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
5,2017,6,ARND,3,2,4,SUN,1,1,12,...,,,,,,,,,,
6,2017,7,LISC,3,3,1,SUN,1,1,13,...,,,,,,,,,,
7,2017,8,MEAN,2,1,2,SUN,1,1,17,...,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0
8,2017,9,CCGL,2,1,1,MON,2,1,8,...,10.0,7.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0
9,2017,10,MEAN,2,3,4,MON,2,1,9,...,10.0,7.0,1.0,1.0,2.0,9.0,4.0,1.0,1.0,1.0


In [61]:
# Box plot of casualties per collision, used to look for impossible
# (negative) counts and extreme outliers.
fig, ax = plt.subplots()
dfCol[['a_cas']].boxplot(ax=ax)
# Title and axis label so the figure can be read on its own
ax.set_title('Casualties per collision')
ax.set_ylabel('Number of casualties');  # trailing ';' hides the text repr in the cell output

<Axes: >

### As expected there are no negative casualty counts, but there are 2 dots that are outliers, at around 15 and 22/23(?) <br>
### We will keep this data, as knowing the location of the most serious/highest-casualty collisions is of high importance

### Mapping values to decrease cognitive load
####  a_type: map 1,2,3 to Fatal,Serious,Slight

In [40]:
# Readable labels for the collision severity codes stored in a_type
a_type_map = {
    1: "Fatal",
    2: "Serious",
    3: "Slight",
}

# Swap each code for its label; values without an entry in the map are left as they are
severity_labels = dfCol['a_type'].replace(a_type_map)
dfCol['a_type'] = severity_labels
dfCol.head(5)

Unnamed: 0,a_year,a_ref,a_District,a_type,a_veh,a_cas,a_wkday,a_day,a_month,a_hour,...,a_jdet,a_jcont,a_pedhum,a_pedphys,a_light,a_weat,a_roadsc,a_speccs,a_chaz,a_scene
0,2017,1,NEMD,Slight,1,1,SUN,1,1,3,...,,,,,,,,,,
1,2017,2,MEAN,Slight,2,1,SUN,1,1,3,...,,,,,,,,,,
2,2017,3,ARBC,Serious,1,1,SUN,1,1,10,...,2.0,7.0,1.0,1.0,2.0,10.0,1.0,1.0,1.0,1.0
3,2017,4,ANTN,Slight,2,1,SUN,1,1,10,...,,,,,,,,,,
4,2017,5,ANTN,Serious,2,2,SUN,1,1,11,...,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0


#### a_ctype: Map values from the data guide (Carriage way type)
#### a_jdet: Map values from the data guide (Junction Detail)
#### a_weat: Map values from the data guide (Weather Conditions)
#### a_light: Map values from the data guide (Light Conditions)

#### (data guide @ https://admin.opendatani.gov.uk/dataset/police-recorded-injury-road-traffic-collision-statistics-northern-ireland-2017/resource/de8384e7-95c8-4e35-910b-ce919db78024 )


In [50]:
# Code -> label maps for four collision columns.
# a_ctype is an int64 column, so its keys are integers. a_jdet, a_weat and
# a_light are object columns holding the codes as text, so their keys are strings.

# Carriageway type
a_ctype_map = {
    1: "Roundabout",
    2: "One way street",
    10: "Other / unknown",
    11: "Dual carriageway",
    12: "Motorway",
    13: "Single carriageway",
    14: "Slip road",
}

# Junction detail
a_jdet_map = {
    '1': "Not at or within 20m of junction",
    '2': "Roundabout",
    '3': "Mini-roundabout",
    '6': "Crossroads",
    '8': "Multiple junction",
    '9': "Slip road",
    '10': "Private drive / entrance",
    '11': "Other junction",
    '12': "T or staggered junction",
}

# Weather conditions
# NOTE: the label for '10' is the same text as the 'Unknown' missing-value
# marker, so re-running the NaN replacement cell after this one would turn
# those rows into NaN.
a_weat_map = {
    '1': "Fine without high winds",
    '2': "Raining without high winds",
    '3': "Snowing without high winds",
    '4': "Fine with high winds",
    '5': "Raining with high winds",
    '6': "Snowing with high winds",
    '7': "Fog or mist - if hazard",
    '8': "Strong sun (glaring)",
    '9': 'Other',
    '10': "Unknown",
}

# Light conditions
# The label for '1' used to start with a stray space (" Daylight: ..."), which
# ended up in the category value; it is removed here.
a_light_map = {
    '1': "Daylight: street lights present",
    '2': "Daylight: no street lighting",
    '3': "Daylight: street lighting unknown",
    '4': "Darkness: street lights present and lit",
    '5': "Darkness: street lights present but unlit",
    '6': "Darkness: no street lighting",
    '7': "Darkness: street lighting unknown",
}

# Apply each map to its column; values without an entry in the map are left unchanged
dfCol['a_ctype'] = dfCol['a_ctype'].replace(a_ctype_map)
dfCol['a_jdet'] = dfCol['a_jdet'].replace(a_jdet_map)
dfCol['a_weat'] = dfCol['a_weat'].replace(a_weat_map)
dfCol['a_light'] = dfCol['a_light'].replace(a_light_map)
dfCol.head(5)

Unnamed: 0,a_year,a_ref,a_District,a_type,a_veh,a_cas,a_wkday,a_day,a_month,a_hour,...,a_jdet,a_jcont,a_pedhum,a_pedphys,a_light,a_weat,a_roadsc,a_speccs,a_chaz,a_scene
0,2017,1,NEMD,Slight,1,1,SUN,1,1,3,...,,,,,,,,,,
1,2017,2,MEAN,Slight,2,1,SUN,1,1,3,...,,,,,,,,,,
2,2017,3,ARBC,Serious,1,1,SUN,1,1,10,...,Roundabout,7.0,1.0,1.0,Daylight: no street lighting,Unknown,1.0,1.0,1.0,1.0
3,2017,4,ANTN,Slight,2,1,SUN,1,1,10,...,,,,,,,,,,
4,2017,5,ANTN,Serious,2,2,SUN,1,1,11,...,Not at or within 20m of junction,1.0,1.0,1.0,Daylight: no street lighting,Fine without high winds,1.0,1.0,1.0,1.0


#### Make those columns categorical

In [52]:
# Convert the label columns to the pandas 'category' dtype.
categorical_cols = ['a_type', 'a_ctype', 'a_jdet', 'a_weat','a_light']
for name in categorical_cols:
    # Columns that are not in the frame are skipped
    if name not in dfCol.columns:
        continue
    dfCol[name] = dfCol[name].astype('category')
dfCol[categorical_cols].info() # It should be category

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6081 entries, 0 to 6080
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   a_type   6081 non-null   category
 1   a_ctype  6081 non-null   category
 2   a_jdet   705 non-null    category
 3   a_weat   705 non-null    category
 4   a_light  705 non-null    category
dtypes: category(5)
memory usage: 31.2 KB


<H1>Vehicle Data</H1>
<h2>Reading and Cleaning</h2>

In [53]:
# Load the 2017 vehicle records and take a first look at their structure.
csv_path = 'vehicle2017.csv'
dfVeh = pd.read_csv(csv_path)

# Preview of the first rows, followed by the frame's dimensions and column names
display(dfVeh.head(10))
vehicle_shape = dfVeh.shape
vehicle_columns = dfVeh.columns.tolist()
print('Shape:', vehicle_shape)
print('Columns:', vehicle_columns)

Unnamed: 0,a_year,a_ref,v_id,v_type,v_tow,v_man,v_loc,v_junc,v_skid,v_hit,v_leave,v_hitoff,v_impact,v_sex,v_agegroup,v_hitr,v_forreg
0,2017,1,1,8,1,18,3,,,,,,2,1.0,6,1,
1,2017,2,1,8,1,10,3,,,,,,3,1.0,7,1,
2,2017,2,2,8,1,18,3,,,,,,2,1.0,6,2,
3,2017,3,1,8,1,18,3,3.0,1.0,1.0,1.0,1.0,2,1.0,7,1,1.0
4,2017,4,1,8,1,18,3,,,,,,2,1.0,8,1,
5,2017,4,2,8,1,2,3,,,,,,4,,9,1,
6,2017,5,1,8,1,13,3,1.0,1.0,1.0,1.0,1.0,2,1.0,5,1,1.0
7,2017,5,2,8,1,3,6,1.0,1.0,1.0,1.0,1.0,2,1.0,5,1,1.0
8,2017,6,1,8,1,9,3,,,,,,4,1.0,7,1,
9,2017,6,2,8,1,13,3,,,,,,2,1.0,8,1,


Shape: (11299, 17)
Columns: ['a_year', 'a_ref', 'v_id', 'v_type', 'v_tow', 'v_man', 'v_loc', 'v_junc', 'v_skid', 'v_hit', 'v_leave', 'v_hitoff', 'v_impact', 'v_sex', 'v_agegroup', 'v_hitr', 'v_forreg']


In [54]:
# Show the dtype pandas assigned to each vehicle column.
dfVeh.dtypes

a_year         int64
a_ref          int64
v_id           int64
v_type         int64
v_tow          int64
v_man          int64
v_loc          int64
v_junc        object
v_skid        object
v_hit         object
v_leave       object
v_hitoff      object
v_impact       int64
v_sex         object
v_agegroup    object
v_hitr         int64
v_forreg      object
dtype: object

### We already checked for duplicate collisions
### adding NaN
### Step 1: Quantify Missingness

In [58]:
# Values that mark a missing entry in the raw file, alongside real NaN
missing_values = [' ','', 'Unknown', np.nan]

# Count the missing entries in each vehicle column. A cell counts once if it is
# one of the markers above OR is NaN. The two tests are OR-ed rather than
# summed: np.nan is in missing_values, so isin() can already match NaN cells
# and adding isna() on top would count those cells twice.
# The skip for 'a_District' and 'a_wkday' that this cell used to carry is
# dropped: neither column is in the vehicle column list printed when the file
# was loaded.
for col in dfVeh.columns:
    is_missing = dfVeh[col].isin(missing_values) | dfVeh[col].isna()
    missing_count = is_missing.sum()
    if missing_count > 0:
        print(f"Column '{col}': {missing_count} missing values")

Column 'v_junc': 10157 missing values
Column 'v_skid': 10157 missing values
Column 'v_hit': 10157 missing values
Column 'v_leave': 10157 missing values
Column 'v_hitoff': 10157 missing values
Column 'v_sex': 611 missing values
Column 'v_agegroup': 1 missing values
Column 'v_forreg': 10170 missing values


### 5 columns are each missing 10157 values and `v_forreg` is missing 10170 (roughly 90% of each of these columns), which is substantial. The sex of the driver is also missing in 611 rows, and the age group of the driver in one row<br>
### Step 2: replace missing with NaN

In [62]:
# Turn every missing-value marker into a real NaN so pandas treats it as missing.
missing_values = [' ','', 'Unknown', np.nan]
dfVeh = dfVeh.replace(to_replace=missing_values, value=np.nan)
dfVeh.head(10)

Unnamed: 0,a_year,a_ref,v_id,v_type,v_tow,v_man,v_loc,v_junc,v_skid,v_hit,v_leave,v_hitoff,v_impact,v_sex,v_agegroup,v_hitr,v_forreg
0,2017,1,1,8,1,18,3,,,,,,2,1.0,6,1,
1,2017,2,1,8,1,10,3,,,,,,3,1.0,7,1,
2,2017,2,2,8,1,18,3,,,,,,2,1.0,6,2,
3,2017,3,1,8,1,18,3,3.0,1.0,1.0,1.0,1.0,2,1.0,7,1,1.0
4,2017,4,1,8,1,18,3,,,,,,2,1.0,8,1,
5,2017,4,2,8,1,2,3,,,,,,4,,9,1,
6,2017,5,1,8,1,13,3,1.0,1.0,1.0,1.0,1.0,2,1.0,5,1,1.0
7,2017,5,2,8,1,3,6,1.0,1.0,1.0,1.0,1.0,2,1.0,5,1,1.0
8,2017,6,1,8,1,9,3,,,,,,4,1.0,7,1,
9,2017,6,2,8,1,13,3,,,,,,2,1.0,8,1,


### Mapping values to decrease cognitive load
#### v_type: map 1,2...25 to Unknown,Pedal cycle...Agricultural vehicle
#### v_man: Map values from the data guide (Vehicle Manoeuvre)
#### v_impact: Map values from the data guide (First point of impact)
#### v_sex: Map values from the data guide (Sex of Driver)
#### v_agegroup: Map values from the data guide (Age of Driver)

#### (data guide @ https://admin.opendatani.gov.uk/dataset/police-recorded-injury-road-traffic-collision-statistics-northern-ireland-2017/resource/de8384e7-95c8-4e35-910b-ce919db78024 )


In [None]:
## THIS CODE NEEDS TO BE CHANGED
# TODO: placeholder copied from the collision mapping cell. As written it
# re-applies the collision code maps to dfCol and never touches dfVeh, so none
# of the vehicle columns named in the heading above (v_type, v_man, v_impact,
# v_sex, v_agegroup) are mapped yet. Replace these maps with the vehicle code
# tables from the data guide and apply them to dfVeh.
# NOTE(review): the '1' label in a_light_map below starts with a stray space.

a_ctype_map = {
    1: "Roundabout", 2: "One way street", 10: "Other / unknown",11:"Dual carriageway",12:"Motorway",13:"Single carriageway",14:"Slip road",
}
a_jdet_map = {
    '1': "Not at or within 20m of junction", '2': "Roundabout", '3': "Mini-roundabout",'6':"Crossroads",'8':"Multiple junction",'9':"Slip road",'10':"Private drive / entrance",'11':"Other junction",'12':"T or staggered junction"
}
a_weat_map = {
    '1': "Fine without high winds", '2': "Raining without high winds", '3': "Snowing without high winds",'4':"Fine with high winds",'5':"Raining with high winds",'6':"Snowing with high winds",'7':"Fog or mist - if hazard",'8':"Strong sun (glaring)",'9':'Other','10':"Unknown"
}
a_light_map = {
    '1': " Daylight: street lights present", '2': "Daylight: no street lighting", '3': "Daylight: street lighting unknown",'4':"Darkness: street lights present and lit",'5':"Darkness: street lights present but unlit",'6':"Darkness: no street lighting",'7':"Darkness: street lighting unknown",
}
dfCol['a_ctype'] = dfCol['a_ctype'].replace(a_ctype_map)
dfCol['a_jdet'] = dfCol['a_jdet'].replace(a_jdet_map)
dfCol['a_weat'] = dfCol['a_weat'].replace(a_weat_map)
dfCol['a_light'] = dfCol['a_light'].replace(a_light_map)
dfCol.head(5)

In [None]:
# Create a transformer from Irish Grid (EPSG:29903) to WGS84 (EPSG:4326).
# always_xy=True is passed so that, per the original comment, the result
# comes back as (longitude, latitude).
transformer = Transformer.from_crs("EPSG:29903", "EPSG:4326", always_xy=True)

# Convert Easting/Northing to Longitude/Latitude on the collision frame.
# This cell used to reference `df`, which no cell above defines; the columns
# a_gd1 and a_gd2 belong to dfCol.
dfCol['longitude'], dfCol['latitude'] = transformer.transform(
    dfCol['a_gd1'].values,
    dfCol['a_gd2'].values
)

# View the results
dfCol[['a_gd1', 'a_gd2', 'longitude', 'latitude']].head(10)

In [None]:
# Number of collisions recorded per district. This cell used to reference
# `df`, which no cell above defines; a_District is a column of dfCol.
dfCol['a_District'].value_counts()