In [1]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the accident records from the CSV file into the DataFrame that the later cells analyse.
road_accident = pd.read_csv('accident_data.csv')

In [54]:
# Display the DataFrame (the rendering is truncated to the first and last rows).
# NOTE(review): the saved output includes a 'severity_num' column that this notebook
# only creates in a later cell — confirm the notebook runs top-to-bottom on a fresh kernel.
road_accident

Unnamed: 0,Index,Accident_Severity,Accident Date,Latitude,Light_Conditions,District Area,Longitude,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type,Month,Year,severity_num
0,200701BS64157,Serious,2019-06-05,51.506187,Darkness - lights lit,Kensington and Chelsea,-0.209082,1,2,Dry,Single carriageway,Urban,Fine no high winds,Car,June,2019,2
1,200701BS65737,Serious,2019-07-02,51.495029,Daylight,Kensington and Chelsea,-0.173647,1,2,Wet or damp,Single carriageway,Urban,Raining no high winds,Car,July,2019,2
2,200701BS66127,Serious,2019-08-26,51.517715,Darkness - lighting unknown,Kensington and Chelsea,-0.210215,1,3,Dry,Single carriageway,Urban,Fine no high winds,Taxi/Private hire car,August,2019,2
3,200701BS66128,Serious,2019-08-16,51.495478,Daylight,Kensington and Chelsea,-0.202731,1,4,Dry,Single carriageway,Urban,Fine no high winds,Bus or coach (17 or more pass seats),August,2019,2
4,200701BS66837,Slight,2019-09-03,51.488576,Darkness - lights lit,Kensington and Chelsea,-0.192487,1,2,Dry,Single carriageway,Urban,Fine no high winds,Other vehicle,September,2019,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660674,201091NM01760,Slight,2022-02-18,57.374005,Daylight,Highland,-3.467828,2,1,Dry,Single carriageway,Rural,Fine no high winds,Car,February,2022,1
660675,201091NM01881,Slight,2022-02-21,57.232273,Darkness - no lighting,Highland,-3.809281,1,1,Frost or ice,Single carriageway,Rural,Fine no high winds,Car,February,2022,1
660676,201091NM01935,Slight,2022-02-23,57.585044,Daylight,Highland,-3.862727,1,3,Frost or ice,Single carriageway,Rural,Fine no high winds,Car,February,2022,1
660677,201091NM01964,Serious,2022-02-23,57.214898,Darkness - no lighting,Highland,-3.823997,1,2,Wet or damp,Single carriageway,Rural,Fine no high winds,Motorcycle over 500cc,February,2022,2


<h1>Data Type Conversion</h1>

In [13]:
# Identifier column is stored as plain strings.
road_accident['Index'] = road_accident['Index'].astype(str)

# Parse the day-month-year date strings, then derive the calendar features from the parsed dates.
accident_dates = pd.to_datetime(road_accident['Accident Date'], format='%d-%m-%Y')
road_accident['Accident Date'] = accident_dates
road_accident['Month'] = accident_dates.dt.month_name()
road_accident['Year'] = accident_dates.dt.year

# Descriptive text columns become pandas categoricals.
categorical_columns = [
    'Accident_Severity',
    'Light_Conditions',
    'District Area',
    'Road_Surface_Conditions',
    'Road_Type',
    'Urban_or_Rural_Area',
    'Weather_Conditions',
    'Vehicle_Type',
]
for column in categorical_columns:
    road_accident[column] = road_accident[column].astype('category')

In [9]:
# Inspect the dtype of every column to confirm the conversions.
road_accident.dtypes

Index                              object
Accident_Severity                category
Accident Date              datetime64[ns]
Latitude                          float64
Light_Conditions                 category
District Area                    category
Longitude                         float64
Number_of_Casualties                int64
Number_of_Vehicles                  int64
Road_Surface_Conditions          category
Road_Type                        category
Urban_or_Rural_Area              category
Weather_Conditions               category
Vehicle_Type                     category
Month                              object
dtype: object

<h1>Data Cleaning Procedures</h1>

In [99]:
# Numeric coordinates: replace missing values with the column median.
for column in ('Latitude', 'Longitude'):
    road_accident[column] = road_accident[column].fillna(road_accident[column].median())

# Categorical descriptors: replace missing values with the most frequent category.
for column in ('Road_Surface_Conditions', 'Road_Type', 'Urban_or_Rural_Area', 'Weather_Conditions'):
    road_accident[column] = road_accident[column].fillna(road_accident[column].mode()[0])

In [100]:
# Count the remaining missing values in each column.
road_accident.isna().sum()

Index                      0
Accident_Severity          0
Accident Date              0
Latitude                   0
Light_Conditions           0
District Area              0
Longitude                  0
Number_of_Casualties       0
Number_of_Vehicles         0
Road_Surface_Conditions    0
Road_Type                  0
Urban_or_Rural_Area        0
Weather_Conditions         0
Vehicle_Type               0
Month                      0
Year                       0
severity_num               0
dtype: int64

<h1>Analysis of the Cleaned Data</h1>

<h1>1. Do fatal accidents happen more often in daylight than in the dark, contradicting the common belief that night driving is more dangerous?</h1>

In [57]:
# Keep only the rows whose severity is 'Fatal'; reused by later cells.
fatal_accident = road_accident.loc[road_accident['Accident_Severity'] == 'Fatal']

# Number of fatal accidents per light condition, largest first.
(
    fatal_accident
    .groupby('Light_Conditions')
    .size()
    .reset_index(name='Fatal Accidents')
    .sort_values(by='Fatal Accidents', ascending=False)
)

Unnamed: 0,Light_Conditions,Fatal Accidents
4,Daylight,5076
1,Darkness - lights lit,1860
3,Darkness - no lighting,1612
0,Darkness - lighting unknown,68
2,Darkness - lights unlit,45


<h4>Insight: Fatal accidents in this dataset occur more often during daylight hours than in the dark, which contradicts the common belief that driving at night is more dangerous. While night driving has its own set of risks, such as reduced visibility and drowsy drivers, other factors associated with daylight hours, like higher traffic volume, rush hour, or certain types of driver behavior, may contribute to a higher total number of fatal crashes.</h4>

<h1>2. Are fatal accidents more common in urban areas than rural ones, despite the stereotype that “country roads are deadlier”?</h1>

In [58]:
# Number of fatal accidents per urban/rural classification, largest first.
(
    fatal_accident
    .groupby('Urban_or_Rural_Area')
    .size()
    .reset_index(name='Fatal Accidents')
    .sort_values(by='Fatal Accidents', ascending=False)
)

Unnamed: 0,Urban_or_Rural_Area,Fatal Accidents
0,Rural,5601
2,Urban,3060
1,Unallocated,0


<h3>Insight: Fatal accidents are far more common in rural areas (5,601) than in urban ones (3,060), suggesting country roads are indeed deadlier overall.</h3>

<h1>3. Do certain district areas with reputations for being “safe” actually have higher rates of serious or fatal accidents compared to “dangerous” areas?</h1>

In [59]:
# Long-format table: one row per (district, severity) pair with its accident count.
severity_counts = (
    road_accident
    .groupby(['District Area', 'Accident_Severity'])
    .size()
    .reset_index(name='Accident_Count')
)

In [60]:
# Display the per-district, per-severity accident counts.
severity_counts

Unnamed: 0,District Area,Accident_Severity,Accident_Count
0,Aberdeen City,Fatal,12
1,Aberdeen City,Serious,239
2,Aberdeen City,Slight,1072
3,Aberdeenshire,Fatal,66
4,Aberdeenshire,Serious,463
...,...,...,...
1261,Wyre Forest,Serious,132
1262,Wyre Forest,Slight,815
1263,York,Fatal,21
1264,York,Serious,255


In [61]:
# Reshape the long (district, severity, count) table to wide form:
# one row per district, one column per severity level, cells hold the accident counts.
severity_pivot = severity_counts.pivot(index='District Area',
                                       columns='Accident_Severity',
                                       values='Accident_Count')

In [62]:
# Display the wide district-by-severity count table.
severity_pivot

Accident_Severity,Fatal,Serious,Slight
District Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aberdeen City,12,239,1072
Aberdeenshire,66,463,1401
Adur,8,101,510
Allerdale,24,143,961
Alnwick,6,33,193
...,...,...,...
Wychavon,30,193,1138
Wycombe,20,216,1493
Wyre,15,186,1037
Wyre Forest,22,132,815


In [63]:
# Share of each district's accidents that were Serious or Fatal.
# The denominator sums only the three severity count columns. Summing the whole
# row would fold an already-present 'Severe_Ratio' value into the total whenever
# this cell is re-run, silently changing the ratios.
severity_pivot['Severe_Ratio'] = (
    (severity_pivot['Serious'] + severity_pivot['Fatal'])
    / severity_pivot[['Fatal', 'Serious', 'Slight']].sum(axis=1)
)

In [64]:
# Display the pivot table again, now including the 'Severe_Ratio' column.
severity_pivot

Accident_Severity,Fatal,Serious,Slight,Severe_Ratio
District Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aberdeen City,12,239,1072,0.189720
Aberdeenshire,66,463,1401,0.274093
Adur,8,101,510,0.176090
Allerdale,24,143,961,0.148050
Alnwick,6,33,193,0.168103
...,...,...,...,...
Wychavon,30,193,1138,0.163850
Wycombe,20,216,1493,0.136495
Wyre,15,186,1037,0.162359
Wyre Forest,22,132,815,0.158927


In [65]:
# Ten districts with the highest share of serious-or-fatal accidents.
(
    severity_pivot
    .sort_values(by='Severe_Ratio', ascending=False)
    .head(10)
)

Accident_Severity,Fatal,Serious,Slight,Severe_Ratio
District Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ryedale,21,223,477,0.338419
South Northamptonshire,28,219,630,0.281642
Daventry,25,226,651,0.278271
Perth and Kinross,37,283,832,0.277778
Richmondshire,22,179,525,0.27686
Aberdeenshire,66,463,1401,0.274093
Dumfries and Galloway,29,322,931,0.273791
Hambleton,36,260,835,0.261715
Craven,25,180,602,0.254027
Argyll and Bute,27,184,625,0.252392


In [66]:
# Five districts with the lowest share of serious-or-fatal accidents.
safe_districts = (
    severity_pivot
    .sort_values(by='Severe_Ratio')
    .head(5)
)

In [67]:
# Display the five lowest-ratio districts.
safe_districts

Accident_Severity,Fatal,Serious,Slight,Severe_Ratio
District Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tamworth,5,33,644,0.055718
London Airport (Heathrow),2,9,168,0.061453
Stoke-on-Trent,16,186,2942,0.064249
Lichfield,21,73,1195,0.072925
Plymouth,17,207,2801,0.07405


In [68]:
# Five districts with the highest share of serious-or-fatal accidents.
dangerous_districts = (
    severity_pivot
    .sort_values(by='Severe_Ratio', ascending=False)
    .head(5)
)

In [69]:
# Display the five highest-ratio districts.
dangerous_districts

Accident_Severity,Fatal,Serious,Slight,Severe_Ratio
District Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ryedale,21,223,477,0.338419
South Northamptonshire,28,219,630,0.281642
Daventry,25,226,651,0.278271
Perth and Kinross,37,283,832,0.277778
Richmondshire,22,179,525,0.27686


<h3>Based on the results of the analysis, we can say that it breaks the stigma that big, busy cities like London are more dangerous than smaller districts. London Airport (Heathrow) and Plymouth look relatively safe based on the results, while small rural districts like Ryedale or Richmondshire are far deadlier per accident. We arrive at this conclusion by using the Severe Ratio, which divides the number of fatal and serious accidents in each district by the total number of accidents in that district. The Severe Ratio for the safe districts is below 10%. Meanwhile, the dangerous districts show a much higher Severe Ratio of at least 27.7%.</h3>

<h1>4. Which Identified Vehicle Type causes the highest average casualties per accident?</h1>

In [70]:
# Mean casualties per accident for each vehicle type, highest first.
(
    road_accident
    .groupby('Vehicle_Type')['Number_of_Casualties']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,Vehicle_Type,Number_of_Casualties
0,Data missing or out of range,1.5
1,Pedal cycle,1.370558
2,Taxi/Private hire car,1.368663
3,Goods over 3.5t. and under 7.5t,1.362861
4,Motorcycle over 500cc,1.359434
5,Car,1.358841
6,Van / Goods 3.5 tonnes mgw or under,1.354537
7,Goods 7.5 tonnes mgw and over,1.351881
8,Bus or coach (17 or more pass seats),1.349216
9,Minibus (8 - 16 passenger seats),1.345648


<h3>Insight: Surprisingly, among the identified vehicle types, pedal cycle-related accidents have the highest average number of casualties, at 1.37 casualties per accident.</h3>

<h1>5. Is there a correlation between the number of vehicles involved and the number of casualties that shocks by being weaker than expected, suggesting single-vehicle accidents are deadlier?</h1>

In [71]:
# Correlation between vehicles involved and casualties per accident
# (reported as the Pearson coefficient in the insight below).
road_accident['Number_of_Vehicles'].corr(road_accident['Number_of_Casualties'])

np.float64(0.22888886126927557)

In [72]:
# Mean casualties for accidents involving exactly one vehicle.
road_accident.loc[road_accident['Number_of_Vehicles'] == 1, 'Number_of_Casualties'].mean()

np.float64(1.1709323810804484)

In [73]:
# Mean casualties for accidents involving more than one vehicle.
road_accident.loc[road_accident['Number_of_Vehicles'] > 1, 'Number_of_Casualties'].mean()

np.float64(1.4382942082054047)

In [74]:
# Mean casualties per accident for each number of vehicles involved.
(
    road_accident
    .groupby('Number_of_Vehicles')['Number_of_Casualties']
    .mean()
    .reset_index()
)

Unnamed: 0,Number_of_Vehicles,Number_of_Casualties
0,1,1.170932
1,2,1.37488
2,3,1.711169
3,4,1.995575
4,5,2.315341
5,6,2.612137
6,7,3.064189
7,8,3.401361
8,9,3.350877
9,10,3.62963


In [75]:
# Largest casualty count recorded for a single accident.
road_accident['Number_of_Casualties'].max()

np.int64(68)

In [76]:
# Show the record(s) with the highest casualty count. The maximum is looked up from
# the data instead of being hard-coded, so the cell still returns the right row(s)
# if the dataset or the cleaning steps change.
road_accident[road_accident['Number_of_Casualties'] == road_accident['Number_of_Casualties'].max()]

Unnamed: 0,Index,Accident_Severity,Accident Date,Latitude,Light_Conditions,District Area,Longitude,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type,Month,Year,severity_num
117980,200743N002017,Fatal,2019-01-03,51.497547,Darkness - lights lit,South Bucks,-0.496697,68,1,Wet or damp,Slip road,Rural,Raining no high winds,Car,January,2019,3


<h3>
Insight: Although the average number of casualties in multi-vehicle accidents is slightly higher than in single-vehicle accidents, the Pearson correlation coefficient of 0.23 shows that this relationship is much weaker than many would expect. This means that involving more vehicles does not automatically lead to higher casualties. In fact, the deadliest accident in this dataset was a single-vehicle crash that resulted in 68 casualties.
</h3>


<h1>6. Does weather data break the stigma that rain always increases accident severity, perhaps showing 'Fine no high winds' conditions lead to more fatal outcomes due to complacency?</h1>

In [77]:
# Mean casualties per accident for each weather condition, highest first.
(
    road_accident
    .groupby('Weather_Conditions')['Number_of_Casualties']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,Weather_Conditions,Number_of_Casualties
0,Fog or mist,1.452948
1,Snowing + high winds,1.418079
2,Raining + high winds,1.416641
3,Raining no high winds,1.408214
4,Fine + high winds,1.386018
5,Other,1.354869
6,Fine no high winds,1.347397
7,Snowing no high winds,1.341776


In [78]:
# Ordinal encoding of severity so it can be averaged and compared across groups.
severity_mapping = {'Slight': 1, 'Serious': 2, 'Fatal': 3}

# Map the labels to their scores and store them as plain integers in one step.
road_accident['severity_num'] = (
    road_accident['Accident_Severity']
    .map(severity_mapping)
    .astype(int)
)

In [79]:
# Average severity score per weather condition, most severe first.
mean_severity = (
    road_accident
    .groupby('Weather_Conditions')['severity_num']
    .mean()
    .sort_values(ascending=False)
)

In [80]:
# Display the mean severity score for each weather condition.
mean_severity

Weather_Conditions
Fine + high winds        1.186462
Fog or mist              1.183390
Fine no high winds       1.163919
Raining + high winds     1.161310
Raining no high winds    1.140082
Snowing + high winds     1.129944
Other                    1.124257
Snowing no high winds    1.102116
Name: severity_num, dtype: float64

In [81]:
# One sample of severity scores per weather condition, used as the ANOVA inputs.
groups = [
    road_accident.loc[road_accident['Weather_Conditions'] == weather, 'severity_num'].dropna()
    for weather in road_accident['Weather_Conditions'].unique()
]

In [82]:
# One-way ANOVA across the per-weather severity samples built in `groups`.
anova_result = f_oneway(*groups)

In [83]:
# F statistic of the weather-vs-severity ANOVA.
anova_result.statistic

np.float64(81.39648956054963)

In [84]:
# p-value of the weather-vs-severity ANOVA.
anova_result.pvalue

np.float64(8.833873772256261e-119)

In [85]:
# Fatal accidents per weather condition.
fatal_counts = (
    road_accident.loc[road_accident['Accident_Severity'] == 'Fatal']
    .groupby('Weather_Conditions')
    .size()
)

# All accidents per weather condition.
total_counts = road_accident.groupby('Weather_Conditions').size()

# Share of accidents that were fatal; conditions with no match are reported as 0.
fatal_proportions = (
    fatal_counts
    .div(total_counts)
    .fillna(0)
    .sort_values(ascending=False)
)

In [86]:
# Show the fatal share per weather condition as a table. The value column is named
# explicitly because a bare reset_index() on this unnamed Series labels it 0.
fatal_proportions.reset_index(name='Fatal_Proportion')

Unnamed: 0,Weather_Conditions,0
0,Fog or mist,0.023243
1,Fine + high winds,0.020458
2,Raining + high winds,0.015081
3,Fine no high winds,0.013471
4,Raining no high winds,0.01064
5,Other,0.009621
6,Snowing no high winds,0.005771
7,Snowing + high winds,0.00339


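<h3>Insight: The data does not support the idea that rain always makes accidents more severe. 'Fine no high winds' conditions have a higher mean severity score (1.164) and a higher share of fatal accidents (1.35%) than 'Raining no high winds' (1.140 and 1.06%). The most severe conditions are 'Fog or mist' and 'Fine + high winds', with fatal shares of 2.32% and 2.05%. The one-way ANOVA (F ≈ 81.4, p ≈ 8.8e-119) indicates that mean severity differs significantly across weather conditions, although the absolute differences between the group means are small.</h3>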

<h1>7. Which 10 district areas have the lowest average casualties per year?</h1>

In [87]:
# Ten districts with the lowest casualty average.
result = (
    road_accident
    # per-accident mean casualties for every (district, year) pair
    .groupby(['District Area', 'Year'])['Number_of_Casualties']
    .mean()
    # average those yearly means within each district
    .groupby('District Area')
    .mean()
    # ascending order puts the lowest averages first
    .sort_values()
    .head(10)
    .reset_index()
)

# Rank the rows 1..N instead of the default 0-based labels.
result.index = range(1, len(result) + 1)
result

Unnamed: 0,District Area,Number_of_Casualties
1,City of London,1.1266
2,Islington,1.126799
3,Camden,1.133677
4,Westminster,1.133999
5,Kensington and Chelsea,1.136303
6,Hammersmith and Fulham,1.136985
7,Aberdeen City,1.139534
8,Cambridge,1.150649
9,Wandsworth,1.15317
10,Southwark,1.154591


<h3>Insight 7: Several of the districts with the lowest average casualties per year, including City of London, Islington, Camden, Westminster, Kensington and Chelsea, Hammersmith and Fulham, Wandsworth, and Southwark, are inner London boroughs. Interestingly, despite being located in one of the busiest metropolitan areas in the UK, these districts consistently record lower casualty averages.</h3>

<h1>8. Do specific road surface conditions (Dry vs Wet vs Ice) increase casualties on urban vs rural roads differently?</h1>

In [108]:
# Casualty figures grouped by road surface condition and urban/rural classification.
surface_area_casualties = road_accident.groupby(
    ['Road_Surface_Conditions', 'Urban_or_Rural_Area']
)['Number_of_Casualties']

# Number of accident records and mean casualties per accident in each group.
casualties_count = surface_area_casualties.count()
casualties_avg = surface_area_casualties.mean()

0         Urban
1         Urban
2         Urban
3         Urban
4         Urban
          ...  
660674    Rural
660675    Rural
660676    Rural
660677    Rural
660678    Rural
Name: Urban_or_Rural_Area, Length: 660679, dtype: category
Categories (3, object): ['Rural', 'Unallocated', 'Urban']

In [110]:
# Display the mean casualties for each (surface condition, area type) combination.
casualties_avg

Road_Surface_Conditions  Urban_or_Rural_Area
Dry                      Rural                  1.470564
                         Unallocated            1.400000
                         Urban                  1.269472
Flood over 3cm. deep     Rural                  1.512102
                         Unallocated            1.000000
                         Urban                  1.441558
Frost or ice             Rural                  1.376518
                         Unallocated                 NaN
                         Urban                  1.308699
Snow                     Rural                  1.438749
                         Unallocated                 NaN
                         Urban                  1.271521
Wet or damp              Rural                  1.510575
                         Unallocated            1.000000
                         Urban                  1.338084
Name: Number_of_Casualties, dtype: float64

In [None]:
# One-way ANOVA on casualties across every (road surface, area type) combination.
# The previous call passed performance1..performance4, which are not defined anywhere
# in this notebook, so the cell raised NameError. The samples are now built from the data.
# observed=True skips category combinations with no rows, which f_oneway cannot handle.
# NOTE(review): this tests whether mean casualties differ between the combined groups;
# it does not isolate a surface x area interaction effect — confirm this is the intended test.
surface_area_groups = [
    casualties
    for _, casualties in road_accident.groupby(
        ['Road_Surface_Conditions', 'Urban_or_Rural_Area'], observed=True
    )['Number_of_Casualties']
]
f_statistic, p_value = f_oneway(*surface_area_groups)
f_statistic, p_value