In [1]:
# for getting data
import os
import zipfile

# for data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
%matplotlib inline

### Crashes

In [2]:
crashes = pd.read_csv('./data/traffic_crashes.csv', low_memory = False)
crashes.head()

Unnamed: 0,CRASH_RECORD_ID,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,...,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION
0,6c1659069e9c6285a650e70d6f9b574ed5f64c12888479...,,08/18/2023 12:50:00 PM,15,OTHER,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,OTHER,...,1.0,0.0,1.0,0.0,12,6,8,,,
1,5f54a59fcb087b12ae5b1acff96a3caf4f2d37e79f8db4...,,07/29/2023 02:45:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,DIVIDED - W/MEDIAN (NOT RAISED),...,0.0,0.0,1.0,0.0,14,7,7,41.85412,-87.665902,POINT (-87.665902342962 41.854120262952)
2,61fcb8c1eb522a6469b460e2134df3d15f82e81fd93e9c...,,08/18/2023 05:58:00 PM,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PEDALCYCLIST,NOT DIVIDED,...,1.0,0.0,1.0,0.0,17,6,8,41.942976,-87.761883,POINT (-87.761883496974 41.942975745006)
3,004cd14d0303a9163aad69a2d7f341b7da2a8572b2ab33...,,11/26/2019 08:38:00 AM,25,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PEDESTRIAN,ONE-WAY,...,0.0,0.0,1.0,0.0,8,3,11,,,
4,a1d5f0ea90897745365a4cbb06cc60329a120d89753fac...,,08/18/2023 10:45:00 AM,20,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,FIXED OBJECT,OTHER,...,0.0,0.0,1.0,0.0,10,6,8,,,


In [3]:
crashes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901446 entries, 0 to 901445
Data columns (total 48 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH_RECORD_ID                901446 non-null  object 
 1   CRASH_DATE_EST_I               66531 non-null   object 
 2   CRASH_DATE                     901446 non-null  object 
 3   POSTED_SPEED_LIMIT             901446 non-null  int64  
 4   TRAFFIC_CONTROL_DEVICE         901446 non-null  object 
 5   DEVICE_CONDITION               901446 non-null  object 
 6   WEATHER_CONDITION              901446 non-null  object 
 7   LIGHTING_CONDITION             901446 non-null  object 
 8   FIRST_CRASH_TYPE               901446 non-null  object 
 9   TRAFFICWAY_TYPE                901446 non-null  object 
 10  LANE_CNT                       199022 non-null  float64
 11  ALIGNMENT                      901446 non-null  object 
 12  ROADWAY_SURFACE_COND          

In [4]:
crashes.columns = crashes.columns.str.lower()

In [5]:
# Convert all string values in object columns to lowercase
for col in crashes.select_dtypes(include='object').columns:
    crashes[col] = crashes[col].str.lower()

In [6]:
missing_percentage = round((crashes.isna().sum()/len(crashes)*100), 2)
missing_percentage

crash_record_id                   0.00
crash_date_est_i                 92.62
crash_date                        0.00
posted_speed_limit                0.00
traffic_control_device            0.00
device_condition                  0.00
weather_condition                 0.00
lighting_condition                0.00
first_crash_type                  0.00
trafficway_type                   0.00
lane_cnt                         77.92
alignment                         0.00
roadway_surface_cond              0.00
road_defect                       0.00
report_type                       3.11
crash_type                        0.00
intersection_related_i           77.03
not_right_of_way_i               95.45
hit_and_run_i                    68.64
damage                            0.00
date_police_notified              0.00
prim_contributory_cause           0.00
sec_contributory_cause            0.00
street_no                         0.00
street_direction                  0.00
street_name              

In [7]:
# selecting all features with 90% or more of its values are null
high_null_features = crashes.columns[(crashes.isna().sum() / len(crashes) * 100) >= 90]
high_null_features

# creating a list of features with 90% or more null values
high_null_features_list = list(high_null_features)
high_null_features_list

crashes_cleaned = crashes.drop(columns=high_null_features_list)


In [8]:
# selecting all features with 90% or more of its values are null
high_null_features = crashes.columns[(crashes.isna().sum() / len(crashes) * 100) >= 90]
high_null_features

Index(['crash_date_est_i', 'not_right_of_way_i', 'photos_taken_i',
       'statements_taken_i', 'dooring_i', 'work_zone_i', 'work_zone_type',
       'workers_present_i'],
      dtype='object')

In [9]:
# creating a list of features with 90% or more null values
high_null_features_list = list(high_null_features)
high_null_features_list

['crash_date_est_i',
 'not_right_of_way_i',
 'photos_taken_i',
 'statements_taken_i',
 'dooring_i',
 'work_zone_i',
 'work_zone_type',
 'workers_present_i']

In [10]:
crashes_cleaned = crashes.drop(columns=high_null_features_list)


In [11]:
medium_null_features = crashes.columns[
    ((crashes.isna().sum() / len(crashes) * 100) >= 60) &
    ((crashes.isna().sum() / len(crashes) * 100) < 90)
]

In [12]:
medium_null_features_list = list(medium_null_features)

In [13]:
for feature in medium_null_features_list:
    print(f"Value counts for column '{feature}':")
    print(crashes_cleaned[feature].value_counts())
    print("-"* 32)

Value counts for column 'lane_cnt':
2.0          91162
4.0          49589
1.0          32550
3.0           8678
0.0           8032
6.0           4502
5.0           1940
8.0           1908
7.0            184
10.0           162
99.0           108
9.0             66
11.0            30
12.0            29
20.0            15
22.0            13
15.0             7
16.0             7
14.0             5
30.0             5
40.0             4
60.0             3
21.0             3
25.0             2
100.0            2
902.0            1
24.0             1
80.0             1
218474.0         1
45.0             1
17.0             1
299679.0         1
19.0             1
400.0            1
13.0             1
1191625.0        1
35.0             1
433634.0         1
41.0             1
28.0             1
44.0             1
Name: lane_cnt, dtype: int64
--------------------------------
Value counts for column 'intersection_related_i':
y    197181
n      9873
Name: intersection_related_i, dtype: int64
------

hit_and_run_i is an aftermath, not a contributor so we can remove that

lane_cnt could be important. Will need to reduce cardinality though.

intersection_related_i... could be helpful. A little vague and subjective according to CDOT description. "A field observation by the police officer whether an intersection played a role in the crash. Does not represent whether or not the crash occurred within the intersection."

In [14]:
crashes_cleaned['crash_date'].value_counts()

12/29/2020 05:00:00 pm    30
11/10/2017 10:30:00 am    27
02/17/2022 03:30:00 pm    21
11/21/2024 10:30:00 am    20
11/21/2024 10:00:00 am    20
                          ..
12/23/2016 12:41:00 pm     1
10/03/2020 05:32:00 pm     1
08/02/2021 05:15:00 pm     1
01/08/2020 02:35:00 pm     1
09/13/2023 01:08:00 pm     1
Name: crash_date, Length: 592919, dtype: int64

Will remove this feature. This information is captured in crash_hour, crash_day_of_the_week, crash_month

In [15]:
crashes_cleaned['posted_speed_limit'].value_counts()

30    664045
35     59626
25     57789
20     37717
15     32112
10     21096
40      8612
0       7584
45      5951
5       4957
55       883
50       276
3        221
9         96
39        95
99        66
60        53
1         41
24        38
2         31
65        20
32        20
34        16
33        14
11        11
26        11
36         8
6          7
70         7
7          6
18         4
12         4
22         4
14         4
23         3
29         3
31         2
8          2
38         2
16         2
4          2
62         1
63         1
44         1
49         1
46         1
Name: posted_speed_limit, dtype: int64

This feature could be important, but will require cardinality reduction

In [16]:
crashes_cleaned['traffic_control_device'].value_counts()

no controls                 510287
traffic signal              249882
stop sign/flasher            89361
unknown                      38328
other                         6096
yield                         1365
lane use marking              1226
other reg. sign               1103
pedestrian crossing sign       636
railroad crossing gate         581
flashing control signal        373
school zone                    353
delineators                    352
police/flagman                 309
rr crossing sign               195
other railroad crossing        192
no passing                      58
bicycle crossing sign           34
Name: traffic_control_device, dtype: int64

In [17]:
crashes_cleaned['device_condition'].value_counts()

no controls                 516329
functioning properly        307784
unknown                      63428
other                         6836
functioning improperly        4113
not functioning               2562
worn reflective material       295
missing                         99
Name: device_condition, dtype: int64

The top two categories that make up the majority of this feature are no controls and functioning properly. And then the next two frequent are unknown and other. We can drop this. won't be useful for analysis

In [18]:
crashes_cleaned['weather_condition'].value_counts()

clear                       709235
rain                         77962
unknown                      51500
snow                         28844
cloudy/overcast              26333
other                         2789
freezing rain/drizzle         1787
fog/smoke/haze                1353
sleet/hail                    1026
blowing snow                   453
severe cross wind gate         156
blowing sand, soil, dirt         8
Name: weather_condition, dtype: int64

In [19]:
crashes_cleaned['lighting_condition'].value_counts()

daylight                  578548
darkness, lighted road    197098
unknown                    42569
darkness                   42455
dusk                       25737
dawn                       15039
Name: lighting_condition, dtype: int64

In [20]:
crashes_cleaned['first_crash_type'].value_counts()

parked motor vehicle            208646
rear end                        199321
sideswipe same direction        138501
turning                         129668
angle                            97996
fixed object                     41874
pedestrian                       21320
pedalcyclist                     14331
sideswipe opposite direction     12509
rear to front                     9252
other object                      8981
head on                           7639
rear to side                      5512
other noncollision                2745
rear to rear                      1907
animal                             655
overturned                         543
train                               46
Name: first_crash_type, dtype: int64

This seems like it could be useful. 

In [21]:
crashes_cleaned['trafficway_type'].value_counts()

not divided                        388246
divided - w/median (not raised)    142466
one-way                            114072
four way                            62561
parking lot                         61011
divided - w/median barrier          50946
other                               24388
alley                               14802
t-intersection                      12409
unknown                             10603
center turn lane                     6374
driveway                             2890
ramp                                 2834
unknown intersection type            2762
five point, or more                  1385
y-intersection                       1350
traffic route                        1166
not reported                          687
roundabout                            308
l-intersection                        186
Name: trafficway_type, dtype: int64

This is important will keep this

In [22]:
crashes_cleaned['alignment'].value_counts()

straight and level       880103
straight on grade         11022
curve, level               6352
straight on hillcrest      2267
curve on grade             1313
curve on hillcrest          389
Name: alignment, dtype: int64

While this feature would ideally be helpful in analysis, the data here is not conducive for analysis. Most of the entries are 'straight and level'. will remove.

In [23]:
crashes_cleaned['roadway_surface_cond'].value_counts()

dry                667224
wet                117323
unknown             80085
snow or slush       28524
ice                  5678
other                2290
sand, mud, dirt       322
Name: roadway_surface_cond, dtype: int64

This is somewhat redundant with weather condition. Will remove weather_condition  and keep roadway_surface_cond due to its lower cardinality

In [24]:
crashes_cleaned['street_direction'].value_counts()

w    322771
s    301079
n    216752
e     60840
Name: street_direction, dtype: int64

In [25]:
crashes_cleaned['road_defect'].value_counts()

no defects           718022
unknown              166233
rut, holes             6350
other                  4893
worn surface           3741
shoulder defect        1547
debris on roadway       660
Name: road_defect, dtype: int64

The main two categories here are "no defects" and unknown. This will not be helpful for analysis. Will remove

In [26]:
crashes_cleaned['crash_type'].value_counts()

no injury / drive away              658842
injury and / or tow due to crash    242604
Name: crash_type, dtype: int64

This describes the aftermath, not helpful for contributory factors. Will remove

In [27]:
crashes_cleaned['beat_of_occurrence'].value_counts()

1834.0    10913
114.0      9281
813.0      9093
815.0      8590
1831.0     8244
          ...  
1653.0      502
1655.0      313
1652.0      241
1650.0       69
6100.0        7
Name: beat_of_occurrence, Length: 276, dtype: int64

High cardinality, unlikely to be useful for analysis. to remove

In [28]:
crashes_cleaned['most_severe_injury'].value_counts()

no indication of injury     772801
nonincapacitating injury     71130
reported, not evident        39463
incapacitating injury        15074
fatal                          985
Name: most_severe_injury, dtype: int64

In [29]:
crashes_cleaned['injuries_total'].value_counts()

0.0     772815
1.0      95189
2.0      21269
3.0       6479
4.0       2302
5.0        825
6.0        325
7.0        133
8.0         53
9.0         27
10.0        16
11.0         9
15.0         8
12.0         6
21.0         4
13.0         3
17.0         1
14.0         1
19.0         1
16.0         1
Name: injuries_total, dtype: int64

In [30]:
crashes_cleaned['injuries_fatal'].value_counts()

0.0    898482
1.0       912
2.0        64
3.0         8
4.0         1
Name: injuries_fatal, dtype: int64

In [31]:
crashes_cleaned['injuries_incapacitating'].value_counts()

0.0     884243
1.0      13370
2.0       1395
3.0        312
4.0        107
5.0         29
6.0          7
7.0          2
10.0         1
8.0          1
Name: injuries_incapacitating, dtype: int64

The 'injuries_...' features are redundant. This information is captured in the 'most_severe_injury' feature. Will remove all 'injuries_...' features and keep most_severe_injury. 

In [32]:
crashes_cleaned['prim_contributory_cause'].value_counts()

unable to determine                                                                 352689
failing to yield right-of-way                                                        99589
following too closely                                                                86950
not applicable                                                                       47632
improper overtaking/passing                                                          44963
failing to reduce speed to avoid crash                                               37868
improper backing                                                                     34796
improper lane usage                                                                  32108
driving skills/knowledge/experience                                                  30632
improper turning/no signal                                                           30203
disregarding traffic signals                                                         17608

In [33]:
crashes_cleaned['sec_contributory_cause'].value_counts()

not applicable                                                                      371652
unable to determine                                                                 324878
failing to reduce speed to avoid crash                                               33161
failing to yield right-of-way                                                        28925
driving skills/knowledge/experience                                                  28101
following too closely                                                                23735
improper overtaking/passing                                                          14021
improper lane usage                                                                  12692
weather                                                                               9915
improper turning/no signal                                                            9382
improper backing                                                                      7194

This feature is redundant to prim_contributory_cause. A high majority of values are either not applicable or unable to determine so it will be dropped. 

Damage, 'date_police_notified' both deal with aftermath, Not contributory factors. To be removed

'street_no'... unhelpful. Will remove

'num_units' not helpful in contributory factor, to remove

remove latitude and longitude as these are captured in location feature. 

In [34]:
crash_features_to_remove = [
    'crash_date',
    'hit_and_run_i',
    'device_condition',
    'weather_condition',
    'road_defect',
    'crash_type',
    'damage',
    'date_police_notified',
    'sec_contributory_cause',
    'street_no',
    'report_type',
    'beat_of_occurrence',
    'num_units',
    'alignment',
    'injuries_total',
    'injuries_fatal',
     'injuries_incapacitating',
     'injuries_non_incapacitating',
     'injuries_reported_not_evident',
     'injuries_no_indication',
    'injuries_unknown',
    'latitude',
     'longitude',
    'street_direction'
]

In [35]:
crashes_cleaned = crashes_cleaned.drop(columns=crash_features_to_remove)
crashes_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901446 entries, 0 to 901445
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   crash_record_id          901446 non-null  object 
 1   posted_speed_limit       901446 non-null  int64  
 2   traffic_control_device   901446 non-null  object 
 3   lighting_condition       901446 non-null  object 
 4   first_crash_type         901446 non-null  object 
 5   trafficway_type          901446 non-null  object 
 6   lane_cnt                 199022 non-null  float64
 7   roadway_surface_cond     901446 non-null  object 
 8   intersection_related_i   207054 non-null  object 
 9   prim_contributory_cause  901446 non-null  object 
 10  street_name              901445 non-null  object 
 11  most_severe_injury       899453 non-null  object 
 12  crash_hour               901446 non-null  int64  
 13  crash_day_of_week        901446 non-null  int64  
 14  cras

In [36]:
crashes_cleaned['posted_speed_limit'].value_counts()

30    664045
35     59626
25     57789
20     37717
15     32112
10     21096
40      8612
0       7584
45      5951
5       4957
55       883
50       276
3        221
9         96
39        95
99        66
60        53
1         41
24        38
2         31
65        20
32        20
34        16
33        14
11        11
26        11
36         8
6          7
70         7
7          6
18         4
12         4
22         4
14         4
23         3
29         3
31         2
8          2
38         2
16         2
4          2
62         1
63         1
44         1
49         1
46         1
Name: posted_speed_limit, dtype: int64

In [37]:
crashes_cleaned['traffic_control_device'].value_counts()

no controls                 510287
traffic signal              249882
stop sign/flasher            89361
unknown                      38328
other                         6096
yield                         1365
lane use marking              1226
other reg. sign               1103
pedestrian crossing sign       636
railroad crossing gate         581
flashing control signal        373
school zone                    353
delineators                    352
police/flagman                 309
rr crossing sign               195
other railroad crossing        192
no passing                      58
bicycle crossing sign           34
Name: traffic_control_device, dtype: int64

In [38]:
crashes_cleaned['lighting_condition'].value_counts()

daylight                  578548
darkness, lighted road    197098
unknown                    42569
darkness                   42455
dusk                       25737
dawn                       15039
Name: lighting_condition, dtype: int64

In [39]:
crashes_cleaned['first_crash_type'].value_counts()

parked motor vehicle            208646
rear end                        199321
sideswipe same direction        138501
turning                         129668
angle                            97996
fixed object                     41874
pedestrian                       21320
pedalcyclist                     14331
sideswipe opposite direction     12509
rear to front                     9252
other object                      8981
head on                           7639
rear to side                      5512
other noncollision                2745
rear to rear                      1907
animal                             655
overturned                         543
train                               46
Name: first_crash_type, dtype: int64

In [40]:
crashes_cleaned['trafficway_type'].value_counts()

not divided                        388246
divided - w/median (not raised)    142466
one-way                            114072
four way                            62561
parking lot                         61011
divided - w/median barrier          50946
other                               24388
alley                               14802
t-intersection                      12409
unknown                             10603
center turn lane                     6374
driveway                             2890
ramp                                 2834
unknown intersection type            2762
five point, or more                  1385
y-intersection                       1350
traffic route                        1166
not reported                          687
roundabout                            308
l-intersection                        186
Name: trafficway_type, dtype: int64

In [41]:
crashes_cleaned['lane_cnt'].value_counts()

2.0          91162
4.0          49589
1.0          32550
3.0           8678
0.0           8032
6.0           4502
5.0           1940
8.0           1908
7.0            184
10.0           162
99.0           108
9.0             66
11.0            30
12.0            29
20.0            15
22.0            13
15.0             7
16.0             7
14.0             5
30.0             5
40.0             4
60.0             3
21.0             3
25.0             2
100.0            2
902.0            1
24.0             1
80.0             1
218474.0         1
45.0             1
17.0             1
299679.0         1
19.0             1
400.0            1
13.0             1
1191625.0        1
35.0             1
433634.0         1
41.0             1
28.0             1
44.0             1
Name: lane_cnt, dtype: int64

In [42]:
crashes_cleaned.columns

Index(['crash_record_id', 'posted_speed_limit', 'traffic_control_device',
       'lighting_condition', 'first_crash_type', 'trafficway_type', 'lane_cnt',
       'roadway_surface_cond', 'intersection_related_i',
       'prim_contributory_cause', 'street_name', 'most_severe_injury',
       'crash_hour', 'crash_day_of_week', 'crash_month', 'location'],
      dtype='object')

In [43]:
crashes_categorical = ['prim_contributory_cause',
                       'traffic_control_device',
                       'lighting_condition',
                       'first_crash_type',
                       'trafficway_type',
                       'roadway_surface_cond',
                       'most_severe_injury',
                      ]

In [44]:
# Convert the specified columns to string type
crashes_cleaned[crashes_categorical] = crashes_cleaned[crashes_categorical].astype(str)

In [45]:
crashes_numeric = ['posted_speed_limit',
                   
    
]

In [46]:
crashes_cleaned['crash_hour'].value_counts()

15    69825
16    68993
17    67144
14    60189
18    55381
13    54478
12    52818
8     47683
11    45742
9     41217
10    40942
19    40838
7     38207
20    33003
21    29440
22    27107
23    23508
0     19638
6     19488
1     16760
2     14336
5     12390
3     11848
4     10471
Name: crash_hour, dtype: int64

In [47]:
crashes_cleaned['crash_day_of_week'].value_counts()

6    146122
7    133158
5    129717
3    128456
4    127880
2    123620
1    112493
Name: crash_day_of_week, dtype: int64

In [48]:
crashes_cleaned['crash_month'].value_counts()

10    86680
9     82227
8     80821
7     78568
11    78175
6     77697
5     77268
12    74429
3     67812
4     66417
1     66068
2     65284
Name: crash_month, dtype: int64

In [49]:
crashes_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901446 entries, 0 to 901445
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   crash_record_id          901446 non-null  object 
 1   posted_speed_limit       901446 non-null  int64  
 2   traffic_control_device   901446 non-null  object 
 3   lighting_condition       901446 non-null  object 
 4   first_crash_type         901446 non-null  object 
 5   trafficway_type          901446 non-null  object 
 6   lane_cnt                 199022 non-null  float64
 7   roadway_surface_cond     901446 non-null  object 
 8   intersection_related_i   207054 non-null  object 
 9   prim_contributory_cause  901446 non-null  object 
 10  street_name              901445 non-null  object 
 11  most_severe_injury       901446 non-null  object 
 12  crash_hour               901446 non-null  int64  
 13  crash_day_of_week        901446 non-null  int64  
 14  cras

In [50]:
# combining trafficway_type and lane_cnt into 

# Define intersection types
intersection_types = ['roundabout', 'l-intersection', 'y-intersection', 
                      'five point, or more', 'center turn lane', 
                      't-intersection', 'unknown intersection type']

# Conditions
conditions = [
    (crashes_cleaned['trafficway_type'] == 'one-way') & (crashes_cleaned['lane_cnt'] == 1),
    (crashes_cleaned['trafficway_type'] == 'one-way') & (crashes_cleaned['lane_cnt'] > 1),
    (crashes_cleaned['trafficway_type'].isin(intersection_types)),
    (crashes_cleaned['trafficway_type'].isin(['unknown', 'not reported'])) | 
    (pd.isnull(crashes_cleaned['trafficway_type'])) | 
    (pd.isnull(crashes_cleaned['lane_cnt'])),
    (crashes_cleaned['trafficway_type'].isin(['parking lot', 'driveway', 'ramp', 'alley', 'other'])),
    (crashes_cleaned['lane_cnt'] > 1) & 
    (~crashes_cleaned['trafficway_type'].isin(['one-way', 'four way', 'unknown', 'not reported']))
]

# Corresponding categories
choices = [
    'single-lane one way',
    'multi-lane one way',
    'intersection',
    'unknown',  # Combined "unknown" and "not reported"
    'other',
    'multi-lane bidirectional'
]

# Apply classification
crashes_cleaned['road_category'] = np.select(conditions, choices, default='unknown')

In [51]:
# Modify the condition for 'multi-lane bidirectional'
conditions = [
    (crashes_cleaned['trafficway_type'] == 'one-way') & (crashes_cleaned['lane_cnt'] == 1),
    (crashes_cleaned['trafficway_type'] == 'one-way') & (crashes_cleaned['lane_cnt'] > 1),
    (crashes_cleaned['trafficway_type'].isin(intersection_types)),
    (crashes_cleaned['trafficway_type'].isin(['unknown', 'not reported'])) | 
    (pd.isnull(crashes_cleaned['trafficway_type'])) | 
    (pd.isnull(crashes_cleaned['lane_cnt'])),
    (crashes_cleaned['trafficway_type'].isin(['parking lot', 'driveway', 'ramp', 'alley', 'other'])),
    (crashes_cleaned['lane_cnt'] > 1) & 
    (~crashes_cleaned['trafficway_type'].isin(['one-way', 'four way', 'unknown', 'not reported', 'other', 'parking lot', 'driveway', 'ramp', 'alley']))
]

# Apply classification
crashes_cleaned['road_category'] = np.select(conditions, choices, default='unknown')

In [52]:
# Check the distribution of categories in the new column
crashes_cleaned['road_category'].value_counts()

unknown                     689310
multi-lane bidirectional    138918
intersection                 24774
other                        19589
single-lane one way          17992
multi-lane one way           10863
Name: road_category, dtype: int64

In [53]:
crashes_cleaned['time_of_day'] = pd.cut(
    crashes_cleaned['crash_hour'], 
    bins=[-1, 5, 11, 17, 23], 
    labels=['Night (Late)', 'Morning', 'Afternoon', 'Night (Early)'],
    right=True
)

In [54]:
crashes_cleaned['day_of_week'] = crashes_cleaned['crash_day_of_week'].replace({
    1: 'Sun',
    2: 'Mon',
    3: 'Tues',
    4: 'Wed',
    5: 'Thur',
    6: 'Fri',
    7: 'Sat'
})

In [55]:
crashes_cleaned['season'] = pd.cut(
    crashes_cleaned['crash_month'], 
    bins=[0, 2, 5, 8, 11, 12], 
    labels=['Winter', 'Spring', 'Summer', 'Fall', 'Winter'],
    right=True,
    ordered=False
)

In [56]:
crashes_cleaned['season'].value_counts()

Fall      247082
Summer    237086
Spring    211497
Winter    205781
Name: season, dtype: int64

In [57]:
crashes_cleaned['day_of_week'].value_counts()

Fri     146122
Sat     133158
Thur    129717
Tues    128456
Wed     127880
Mon     123620
Sun     112493
Name: day_of_week, dtype: int64

In [58]:
# Categorize speed limits directly without using a function
crashes_cleaned['speed_limit_category'] = pd.cut(
    crashes_cleaned['posted_speed_limit'],
    bins=[-float('inf'), 25, 40, float('inf')],
    labels=['Low', 'Medium', 'High'],
    right=True
)

# Check the result
crashes_cleaned['speed_limit_category'].value_counts()

Medium    732454
Low       161731
High        7261
Name: speed_limit_category, dtype: int64

In [59]:
# Create a dictionary to map the original 'traffic_control_device' values to more specific categories
traffic_control_mapping = {
    'traffic signal': 'Signal',
    'flashing control signal': 'Signal',
    'pedestrian crossing sign': 'Signal',  # If it's a signal
    'railroad crossing gate': 'Signal',    # If it uses lights
    
    'stop sign/flasher': 'Sign',
    'yield': 'Sign',
    'school zone': 'Sign',
    'railroad crossing sign': 'Sign',      # If static sign
    'other warning sign': 'Sign',
    'bicycle crossing sign': 'Sign',
    'no passing': 'Sign',
    
    'lane use marking': 'Markings & Lanes',
    'delineators': 'Markings & Lanes',
    
    'no controls': 'Other',
    'unknown': 'Other',
    'other': 'Other',
    'police/flagman': 'Other',
    'other railroad crossing': 'Other'
}

# Apply the mapping to the 'traffic_control_device' column
crashes_cleaned['traffic_control_category'] = crashes_cleaned['traffic_control_device'].map(traffic_control_mapping)

# Check the value counts for the new grouped categories
crashes_cleaned['traffic_control_category'].value_counts()

Other               555212
Signal              251472
Sign                 91886
Markings & Lanes      1578
Name: traffic_control_category, dtype: int64

In [60]:
# Create a mapping for the primary contributory causes
cause_mapping = {
    'distraction - from inside vehicle': 'Distraction',
    'distraction - from outside vehicle': 'Distraction',
    'cell phone use other than texting': 'Distraction',
    'distraction - other electronic device (navigation device, dvd player, etc.)': 'Distraction',
    'texting': 'Distraction',
    'bicycle advancing legally on red light': 'Distraction',
    'motorcycle advancing legally on red light': 'Distraction',
    
    'operating vehicle in erratic, reckless, careless, negligent or aggressive manner': 'Aggressive/Reckless Driving',
    'failing to reduce speed to avoid crash': 'Aggressive/Reckless Driving',
    'exceeding authorized speed limit': 'Aggressive/Reckless Driving',
    'exceeding safe speed for conditions': 'Aggressive/Reckless Driving',
    'driving on wrong side/wrong way': 'Aggressive/Reckless Driving',
    'disregarding stop sign': 'Aggressive/Reckless Driving',
    'disregarding traffic signals': 'Aggressive/Reckless Driving',
    'disregarding yield sign': 'Aggressive/Reckless Driving',
    'passing stopped school bus': 'Aggressive/Reckless Driving',
    'improper overtaking/passing': 'Aggressive/Reckless Driving',
    'failing to yield right-of-way': 'Aggressive/Reckless Driving',
    'following too closely': 'Aggressive/Reckless Driving',
    'improper lane usage': 'Aggressive/Reckless Driving',
    'improper turning/no signal': 'Aggressive/Reckless Driving',
    
    'driving skills/knowledge/experience': 'Driver\'s Condition/Experience',
    'physical condition of driver': 'Driver\'s Condition/Experience',
    'vision obscured (signs, tree limbs, buildings, etc.)': 'Driver\'s Condition/Experience',
    'under the influence of alcohol/drugs (use when arrest is effected)': 'Driver\'s Condition/Experience',
    'had been drinking (use when arrest is not made)': 'Driver\'s Condition/Experience',
    
    'weather': 'Environmental and Road Conditions',
    'road engineering/surface/marking defects': 'Environmental and Road Conditions',
    'road construction/maintenance': 'Environmental and Road Conditions',
    'evasive action due to animal, object, nonmotorist': 'Environmental and Road Conditions',
    'animal': 'Environmental and Road Conditions',
    
    'unable to determine': 'Unknown/Other',
    'not applicable': 'Unknown/Other',
    'related to bus stop': 'Unknown/Other',
    'obstructed crosswalks': 'Unknown/Other',
    
    # Add the missing categories
    'improper backing': 'Aggressive/Reckless Driving',
    'equipment - vehicle condition': 'Driver\'s Condition/Experience',
    'disregarding other traffic signs': 'Aggressive/Reckless Driving',
    'disregarding road markings': 'Aggressive/Reckless Driving',
    'turning right on red': 'Aggressive/Reckless Driving'
}

# Apply the mapping to categorize the causes
crashes_cleaned['crash_cause_category'] = crashes_cleaned['prim_contributory_cause'].map(cause_mapping)

In [61]:
crashes_cleaned['crash_cause_category'].value_counts()

# Check the value counts in the new category column
#crashes_cleaned['crash_cause_category'].value_counts()

Aggressive/Reckless Driving          417688
Unknown/Other                        400902
Driver's Condition/Experience         51717
Environmental and Road Conditions     19367
Distraction                           11772
Name: crash_cause_category, dtype: int64

In [62]:
# Find unique values in 'prim_contributory_cause' that are not in the 'cause_mapping'
missing_values = crashes_cleaned[~crashes_cleaned['prim_contributory_cause'].isin(cause_mapping.keys())]['prim_contributory_cause'].unique()

print(missing_values)

[]


In [63]:
# Replace the string 'nan' with actual NaN values
crashes_cleaned['most_severe_injury'] = crashes_cleaned['most_severe_injury'].replace('nan', np.nan)

# Now categorize the injuries into 'Serious' and 'Non-serious'
crashes_cleaned['severity_category'] = crashes_cleaned['most_severe_injury'].replace({
    'no indication of injury': 'Non-serious',
    'nonincapacitating injury': 'Non-serious',
    'reported, not evident': 'Non-serious',
    'incapacitating injury': 'Serious',
    'fatal': 'Serious'
})

# Optionally, drop rows with NaN in 'severity_category' column
#crashes_cleaned = crashes_cleaned.dropna(subset=['severity_category'])

In [64]:
crashes_cleaned.isna().sum()

crash_record_id                  0
posted_speed_limit               0
traffic_control_device           0
lighting_condition               0
first_crash_type                 0
trafficway_type                  0
lane_cnt                    702424
roadway_surface_cond             0
intersection_related_i      694392
prim_contributory_cause          0
street_name                      1
most_severe_injury            1993
crash_hour                       0
crash_day_of_week                0
crash_month                      0
location                      6526
road_category                    0
time_of_day                      0
day_of_week                      0
season                           0
speed_limit_category             0
traffic_control_category      1298
crash_cause_category             0
severity_category             1993
dtype: int64

In [65]:
crashes_cleaned.drop(columns = [
    'lane_cnt', 
    'intersection_related_i',
    'trafficway_type', 
    'crash_hour', 
    'crash_day_of_week', 
    'crash_month', 
    'posted_speed_limit', 
    'traffic_control_device', 
    'street_name', 
    'most_severe_injury',
    'prim_contributory_cause'
], inplace = True)

In [66]:
crashes_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901446 entries, 0 to 901445
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   crash_record_id           901446 non-null  object  
 1   lighting_condition        901446 non-null  object  
 2   first_crash_type          901446 non-null  object  
 3   roadway_surface_cond      901446 non-null  object  
 4   location                  894920 non-null  object  
 5   road_category             901446 non-null  object  
 6   time_of_day               901446 non-null  category
 7   day_of_week               901446 non-null  object  
 8   season                    901446 non-null  category
 9   speed_limit_category      901446 non-null  category
 10  traffic_control_category  900148 non-null  object  
 11  crash_cause_category      901446 non-null  object  
 12  severity_category         899453 non-null  object  
dtypes: category(3), object(10)
me

In [67]:
(crashes_cleaned.isna().sum()/ len(crashes_cleaned))* 100

crash_record_id             0.000000
lighting_condition          0.000000
first_crash_type            0.000000
roadway_surface_cond        0.000000
location                    0.723948
road_category               0.000000
time_of_day                 0.000000
day_of_week                 0.000000
season                      0.000000
speed_limit_category        0.000000
traffic_control_category    0.143991
crash_cause_category        0.000000
severity_category           0.221089
dtype: float64

In [68]:
crashes_cleaned['severity_category'].value_counts()

Non-serious    883394
Serious         16059
Name: severity_category, dtype: int64

In [69]:
crashes_cleaned.dropna(inplace = True)

In [70]:
crashes_cleaned['severity_category'].value_counts()

Non-serious    875731
Serious         15938
Name: severity_category, dtype: int64

### People

In [71]:
people = pd.read_csv('./data/people.csv', low_memory = False)

In [72]:
people.head()

Unnamed: 0,PERSON_ID,PERSON_TYPE,CRASH_RECORD_ID,VEHICLE_ID,CRASH_DATE,SEAT_NO,CITY,STATE,ZIPCODE,SEX,...,EMS_RUN_NO,DRIVER_ACTION,DRIVER_VISION,PHYSICAL_CONDITION,PEDPEDAL_ACTION,PEDPEDAL_VISIBILITY,PEDPEDAL_LOCATION,BAC_RESULT,BAC_RESULT VALUE,CELL_PHONE_USE
0,O749947,DRIVER,81dc0de2ed92aa62baccab641fa377be7feb1cc47e6554...,834816.0,09/28/2019 03:30:00 AM,,CHICAGO,IL,60651.0,M,...,,UNKNOWN,UNKNOWN,UNKNOWN,,,,TEST NOT OFFERED,,
1,O871921,DRIVER,af84fb5c8d996fcd3aefd36593c3a02e6e7509eeb27568...,827212.0,04/13/2020 10:50:00 PM,,CHICAGO,IL,60620.0,M,...,,NONE,NOT OBSCURED,NORMAL,,,,TEST NOT OFFERED,,
2,O10018,DRIVER,71162af7bf22799b776547132ebf134b5b438dcf3dac6b...,9579.0,11/01/2015 05:00:00 AM,,,,,X,...,,IMPROPER BACKING,UNKNOWN,UNKNOWN,,,,TEST NOT OFFERED,,
3,O10038,DRIVER,c21c476e2ccc41af550b5d858d22aaac4ffc88745a1700...,9598.0,11/01/2015 08:00:00 AM,,,,,X,...,,UNKNOWN,UNKNOWN,UNKNOWN,,,,TEST NOT OFFERED,,
4,O10039,DRIVER,eb390a4c8e114c69488f5fb8a097fe629f5a92fd528cf4...,9600.0,11/01/2015 10:15:00 AM,,,,,X,...,,UNKNOWN,UNKNOWN,UNKNOWN,,,,TEST NOT OFFERED,,


In [73]:
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1979859 entries, 0 to 1979858
Data columns (total 29 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   PERSON_ID              object 
 1   PERSON_TYPE            object 
 2   CRASH_RECORD_ID        object 
 3   VEHICLE_ID             float64
 4   CRASH_DATE             object 
 5   SEAT_NO                float64
 6   CITY                   object 
 7   STATE                  object 
 8   ZIPCODE                object 
 9   SEX                    object 
 10  AGE                    float64
 11  DRIVERS_LICENSE_STATE  object 
 12  DRIVERS_LICENSE_CLASS  object 
 13  SAFETY_EQUIPMENT       object 
 14  AIRBAG_DEPLOYED        object 
 15  EJECTION               object 
 16  INJURY_CLASSIFICATION  object 
 17  HOSPITAL               object 
 18  EMS_AGENCY             object 
 19  EMS_RUN_NO             object 
 20  DRIVER_ACTION          object 
 21  DRIVER_VISION          object 
 22  PHYSICAL_CONDITION

In [74]:
people.columns = people.columns.str.lower()

In [75]:
# Convert all string values in object columns to lowercase
for col in people.select_dtypes(include='object').columns:
    people[col] = people[col].str.lower()

In [76]:
round((people.isna().sum()/ len(people)*100), 2)

person_id                 0.00
person_type               0.00
crash_record_id           0.00
vehicle_id                2.05
crash_date                0.00
seat_no                  79.78
city                     27.23
state                    26.13
zipcode                  33.05
sex                       1.68
age                      29.11
drivers_license_state    41.46
drivers_license_class    51.34
safety_equipment          0.28
airbag_deployed           1.98
ejection                  1.26
injury_classification     0.04
hospital                 83.81
ems_agency               90.01
ems_run_no               98.33
driver_action            20.40
driver_vision            20.43
physical_condition       20.34
pedpedal_action          98.04
pedpedal_visibility      98.04
pedpedal_location        98.04
bac_result               20.35
bac_result value         99.89
cell_phone_use           99.94
dtype: float64

In [77]:
# selecting all features with 90% or more of its values are null
ppl_high_null_features = people.columns[(people.isna().sum() / len(people) * 100) >= 90]
ppl_high_null_features

Index(['ems_agency', 'ems_run_no', 'pedpedal_action', 'pedpedal_visibility',
       'pedpedal_location', 'bac_result value', 'cell_phone_use'],
      dtype='object')

In [78]:
# creating a list of features with 90% or more null values
ppl_high_null_features_list = list(ppl_high_null_features)
ppl_high_null_features_list

['ems_agency',
 'ems_run_no',
 'pedpedal_action',
 'pedpedal_visibility',
 'pedpedal_location',
 'bac_result value',
 'cell_phone_use']

In [79]:
people_cleaned = people.drop(columns=ppl_high_null_features_list)
people_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1979859 entries, 0 to 1979858
Data columns (total 22 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   person_id              object 
 1   person_type            object 
 2   crash_record_id        object 
 3   vehicle_id             float64
 4   crash_date             object 
 5   seat_no                float64
 6   city                   object 
 7   state                  object 
 8   zipcode                object 
 9   sex                    object 
 10  age                    float64
 11  drivers_license_state  object 
 12  drivers_license_class  object 
 13  safety_equipment       object 
 14  airbag_deployed        object 
 15  ejection               object 
 16  injury_classification  object 
 17  hospital               object 
 18  driver_action          object 
 19  driver_vision          object 
 20  physical_condition     object 
 21  bac_result             object 
dtypes: float64(3), obj

In [80]:
people_cleaned['person_type'].value_counts()

driver                 1539401
passenger               400333
pedestrian               23417
bicycle                  14716
non-motor vehicle         1665
non-contact vehicle        327
Name: person_type, dtype: int64

In [81]:
people_cleaned['crash_date'].value_counts()

12/29/2020 05:00:00 pm    72
11/10/2017 10:30:00 am    64
03/16/2018 10:17:00 am    61
08/21/2024 01:45:00 am    56
06/22/2019 06:15:00 pm    55
                          ..
11/21/2023 11:50:00 pm     1
08/20/2023 01:44:00 pm     1
06/26/2022 08:35:00 pm     1
08/04/2018 10:55:00 am     1
07/07/2019 09:46:00 pm     1
Name: crash_date, Length: 591866, dtype: int64

In [82]:
people_cleaned['seat_no'].value_counts()

3.0     193103
6.0      67158
4.0      53319
5.0      18870
1.0      17966
2.0      16957
12.0     10605
7.0       9259
10.0      9017
11.0      3548
8.0        531
Name: seat_no, dtype: int64

In [83]:
people_cleaned['safety_equipment'].value_counts(normalize = True)

usage unknown                                 0.476501
safety belt used                              0.464880
none present                                  0.033816
safety belt not used                          0.005434
helmet not used                               0.005104
child restraint used                          0.003962
child restraint - forward facing              0.002597
bicycle helmet (pedacyclist involved only)    0.002004
child restraint - type unknown                0.001373
child restraint - rear facing                 0.001190
dot compliant motorcycle helmet               0.000802
helmet used                                   0.000688
booster seat                                  0.000678
child restraint not used                      0.000437
not dot compliant motorcycle helmet           0.000157
should/lap belt used improperly               0.000145
wheelchair                                    0.000123
child restraint used improperly               0.000085
stretcher 

In [84]:
people_cleaned['airbag_deployed'].value_counts()

did not deploy                            987711
not applicable                            424194
deployment unknown                        397461
deployed, front                            61565
deployed, combination                      50895
deployed, side                             17944
deployed other (knee, air, belt, etc.)       973
Name: airbag_deployed, dtype: int64

In [85]:
people_cleaned['ejection'].value_counts()

none                  1820806
unknown                125622
totally ejected          5904
partially ejected        1449
trapped/extricated       1196
Name: ejection, dtype: int64

This feature might be normally be helpful, but it is far too skewed to be helpful for this analysis. Only about 8.5k values other than 'none' or 'unknown'. This will be removed

In [86]:
people_cleaned['injury_classification'].value_counts()

no indication of injury     1803602
nonincapacitating injury      98271
reported, not evident         58267
incapacitating injury         17878
fatal                          1089
Name: injury_classification, dtype: int64

We can drop this. It contains similar information to 'most_severe_injury' but is more imbalanced so I will drop it and keep 'most_severe_injury' as my target. 

In [87]:
round(people_cleaned['driver_action'].value_counts(normalize = True), 2)

none                                 0.36
unknown                              0.25
failed to yield                      0.09
other                                0.09
followed too closely                 0.06
improper backing                     0.03
improper turn                        0.03
improper lane change                 0.03
improper passing                     0.02
disregarded control devices          0.02
too fast for conditions              0.01
wrong way/side                       0.00
improper parking                     0.00
overcorrected                        0.00
evading police vehicle               0.00
cell phone use other than texting    0.00
emergency vehicle on call            0.00
texting                              0.00
stopped school bus                   0.00
license restrictions                 0.00
Name: driver_action, dtype: float64

similar to 'prim_contributory_cause' in crashes. but this one contains 20% nulls. Will drop this and keep prim_contributory_cause.

In [88]:
people_cleaned['driver_vision'].value_counts()

not obscured              784184
unknown                   753832
other                      15389
moving vehicles             8800
parked vehicles             5429
windshield (water/ice)      4169
blinded - sunlight          1879
trees, plants                614
buildings                    558
blinded - headlights         168
blowing materials            108
hillcrest                    102
embankment                    85
signboard                     38
Name: driver_vision, dtype: int64

This feature is too skewed to provide any real analytical benefit. The top two by 500k values are 'not_obscured' and 'unknown'

In [89]:
people_cleaned['physical_condition'].value_counts()

normal                          1020511
unknown                          527441
impaired - alcohol                 6635
removed by ems                     5697
other                              4585
emotional                          4213
fatigued/asleep                    4108
illness/fainted                    1408
had been drinking                  1128
impaired - drugs                    726
impaired - alcohol and drugs        416
medicated                           193
Name: physical_condition, dtype: int64

Most are normal or unknown. Not particularly helpful. Remove

In [90]:
people_cleaned['bac_result'].value_counts()

test not offered                   1554246
test refused                         16163
test performed, results unknown       3715
test taken                            2784
Name: bac_result, dtype: int64

most are test not offered and test refused. Again, not very helpful, so will drop

In [91]:
people_cleaned.columns

Index(['person_id', 'person_type', 'crash_record_id', 'vehicle_id',
       'crash_date', 'seat_no', 'city', 'state', 'zipcode', 'sex', 'age',
       'drivers_license_state', 'drivers_license_class', 'safety_equipment',
       'airbag_deployed', 'ejection', 'injury_classification', 'hospital',
       'driver_action', 'driver_vision', 'physical_condition', 'bac_result'],
      dtype='object')

In [92]:
ppl_cols_to_remove = ['person_id',
                      'person_type',
                      'vehicle_id',
                      'drivers_license_state', 
                      'drivers_license_class',
                      'city', 
                      'state', 
                      'zipcode',
                      'hospital', 
                      'crash_date',
                      'seat_no',
                      'ejection',
                      'injury_classification',
                      'driver_vision',
                      'driver_action',
                      'physical_condition',
                      'bac_result',
                     ]

Drivers license state and class is not relevant. 

In [93]:
people_cleaned = people_cleaned.drop(columns=ppl_cols_to_remove)
people_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1979859 entries, 0 to 1979858
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   crash_record_id   object 
 1   sex               object 
 2   age               float64
 3   safety_equipment  object 
 4   airbag_deployed   object 
dtypes: float64(1), object(4)
memory usage: 75.5+ MB


In [94]:
list(people_cleaned.columns)

['crash_record_id', 'sex', 'age', 'safety_equipment', 'airbag_deployed']

In [95]:
for feature in people_cleaned.columns:
    print(people_cleaned[feature].value_counts())
    print()
    print("-" * 32)

31ecf6862c691ff12d3856213b902c146b07337b42a5692e3a176a66d684d221028bb5118ef6d67a313bcaed9e97bee1855cb1f5e8650f49e8dc17663475a1ee    61
64d6480b38e77266a4d16717d0049f9df21075fa2dbf015cbbc83e59f019e6a69d3628ae1e4a9a70ec5dbf435fbdcd3b979cab5c991af14c08509f4c337f4a3d    54
13026c7fb51566d9ca487a093e38c6f5621c2ec25be48c306b6574983b61daeee589524b96bb2bfe66ddd0f695c8d2bf3ab0297558528e9c7a70363c763d6bd1    50
3eda323ea45cd6e2b459bf5ba570dcf74e71f3fe1aa449231a47fd1dd20ce71de888840d420dc54b61ca643159b46494979dad05e407d8138438a675c615575d    48
1829f52c1281a0396ef94692331b3dc530bc4be5a54cd55e94c24a5e5e49b800fbcf9f24dabe4c8277c8964ad05aadc89e90fd94021959d6dff5fad55480d595    46
                                                                                                                                    ..
325e0c0eb47728d87eafdcaa7f1d4fe90dd7cfce4a5c572832603f7281ee5a62b661c0872a6893f0fef11b8e6241975dac44b4b18406d8c90e2acf30da8baa7f     1
973ada3483d748f779ef204aa8611f0427e0d22cb57b252a849620b

In [96]:
# Sample data (replace with your actual DataFrame)
age_bins_df = pd.DataFrame({
    'age': [5, 15, 16, 25, 30, 60, 100, 120, -5, 200]
})

# Define the bins for age groups
age_bins = [1, 16, 27, 66, 115]  # Updated bins based on your ranges
age_labels = ['1-15', '16-26', '27-65', '65+']  # Labels for the age groups

# Apply corrections for age values outside the valid range (negative, 0, or greater than 115)
people_cleaned['age'] = people_cleaned['age'].apply(lambda x: np.nan if x < 1 or x > 115 else x)

# Apply pd.cut() to create a new 'age_group' column
people_cleaned['age_group'] = pd.cut(people_cleaned['age'], bins=age_bins, labels=age_labels, right=False)

# Print the first few rows to verify the new grouping
print(people_cleaned[['age', 'age_group']].head())

    age age_group
0  25.0     16-26
1  37.0     27-65
2   NaN       NaN
3   NaN       NaN
4   NaN       NaN


In [97]:
# Create a dictionary to map the original 'safety_equipment' values to broader categories
safety_equipment_mapping = {
    # Used Equipment
    'safety belt used': 'Used',
    'child restraint used': 'Used',
    'child restraint - forward facing': 'Used',
    'bicycle helmet (pedacyclist involved only)': 'Used',
    'child restraint - type unknown': 'Used',
    'child restraint - rear facing': 'Used',
    'dot compliant motorcycle helmet': 'Used',
    'helmet used': 'Used',
    'booster seat': 'Used',
    'child restraint used improperly': 'Used',

    # Not Used Equipment
    'safety belt not used': 'Not Used',
    'helmet not used': 'Not Used',
    'child restraint not used': 'Not Used',
    'not dot compliant motorcycle helmet': 'Not Used',
    'should/lap belt used improperly': 'Not Used',

    # Unknown Equipment Usage
    'usage unknown': 'Unknown',

    # Other/Special Case Equipment
    'none present': 'Other/Special Case', 
    'wheelchair': 'Other/Special Case',
    'stretcher': 'Other/Special Case',
    'unknown': 'Other/Special Case',  # Catch-all for any unknown or missing values
}

# Apply the mapping to the 'safety_equipment' column
people_cleaned['safety_equipment_category'] = people_cleaned['safety_equipment'].map(safety_equipment_mapping)

# Check the value counts for the new grouped categories
people_cleaned['safety_equipment_category'].value_counts()


Used                  944246
Unknown               940775
Other/Special Case     67053
Not Used               22264
Name: safety_equipment_category, dtype: int64

In [127]:
people_cleaned.isna().sum()

crash_record_id                   0
sex                           33339
airbag_deployed               39116
age_group                    591812
safety_equipment_category      5521
dtype: int64

In [124]:
len(people_cleaned)

1979859

In [125]:
people_cleaned.drop(columns = ['age', 'safety_equipment'], inplace = True)

In [128]:
people_cleaned.dropna(inplace = True)

In [129]:
people_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1350223 entries, 0 to 1979858
Data columns (total 5 columns):
 #   Column                     Non-Null Count    Dtype   
---  ------                     --------------    -----   
 0   crash_record_id            1350223 non-null  object  
 1   sex                        1350223 non-null  object  
 2   airbag_deployed            1350223 non-null  object  
 3   age_group                  1350223 non-null  category
 4   safety_equipment_category  1350223 non-null  object  
dtypes: category(1), object(4)
memory usage: 52.8+ MB


In [99]:
people[['age', 'sex', 'airbag_deployed']].isna().sum()

age                576406
sex                 33339
airbag_deployed     39116
dtype: int64

### Vehicles

In [100]:
vehicles = pd.read_csv('./data/vehicles.csv', low_memory = False)
vehicles.head()

Unnamed: 0,CRASH_UNIT_ID,CRASH_RECORD_ID,CRASH_DATE,UNIT_NO,UNIT_TYPE,NUM_PASSENGERS,VEHICLE_ID,CMRC_VEH_I,MAKE,MODEL,...,TRAILER1_LENGTH,TRAILER2_LENGTH,TOTAL_VEHICLE_LENGTH,AXLE_CNT,VEHICLE_CONFIG,CARGO_BODY_TYPE,LOAD_TYPE,HAZMAT_OUT_OF_SERVICE_I,MCS_OUT_OF_SERVICE_I,HAZMAT_CLASS
0,1717556,7b1763088507f77e0e552c009a6bf89a4d6330c7527706...,12/06/2023 03:24:00 PM,1,DRIVER,,1634931.0,,NISSAN,SENTRA,...,,,,,,,,,,
1,1717574,2603ff5a88f0b9b54576934c5ed4e4a64e8278e005687b...,12/06/2023 04:00:00 PM,2,DRIVER,,1634978.0,,CHRYSLER,SEBRING,...,,,,,,,,,,
2,1717579,a52ef70e33d468b855b5be44e8638a564434dcf99c0edf...,12/06/2023 04:30:00 PM,1,DRIVER,,1634948.0,,SUBARU,OUTBACK,...,,,,,,,,,,
3,1720118,609055f4b1a72a44d6ec40ba9036cefd7c1287a755eb6c...,12/10/2023 12:12:00 PM,1,DRIVER,,1637401.0,,TOYOTA,RAV4,...,,,,,,,,,,
4,1720119,609055f4b1a72a44d6ec40ba9036cefd7c1287a755eb6c...,12/10/2023 12:12:00 PM,2,DRIVER,,1637408.0,,SUBARU,OUTBACK,...,,,,,,,,,,


In [101]:
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1838822 entries, 0 to 1838821
Data columns (total 71 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   CRASH_UNIT_ID             int64  
 1   CRASH_RECORD_ID           object 
 2   CRASH_DATE                object 
 3   UNIT_NO                   int64  
 4   UNIT_TYPE                 object 
 5   NUM_PASSENGERS            float64
 6   VEHICLE_ID                float64
 7   CMRC_VEH_I                object 
 8   MAKE                      object 
 9   MODEL                     object 
 10  LIC_PLATE_STATE           object 
 11  VEHICLE_YEAR              float64
 12  VEHICLE_DEFECT            object 
 13  VEHICLE_TYPE              object 
 14  VEHICLE_USE               object 
 15  TRAVEL_DIRECTION          object 
 16  MANEUVER                  object 
 17  TOWED_I                   object 
 18  FIRE_I                    object 
 19  OCCUPANT_CNT              float64
 20  EXCEED_SPEED_LIMIT_I    

In [102]:
vehicles.columns = vehicles.columns.str.lower()

In [103]:
# Convert all string values in object columns to lowercase
for col in vehicles.select_dtypes(include='object').columns:
    vehicles[col] = vehicles[col].str.lower()

In [104]:
# selecting all features with 90% or more of its values are null
high_null_features = vehicles.columns[(vehicles.isna().sum() / len(vehicles) * 100) >= 90]

In [105]:
# creating a list of features with 90% or more null values
high_null_features_list = list(high_null_features)

In [106]:
vehicles_cleaned = vehicles.drop(columns=high_null_features_list)

In [107]:
vehicles_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1838822 entries, 0 to 1838821
Data columns (total 29 columns):
 #   Column               Dtype  
---  ------               -----  
 0   crash_unit_id        int64  
 1   crash_record_id      object 
 2   crash_date           object 
 3   unit_no              int64  
 4   unit_type            object 
 5   num_passengers       float64
 6   vehicle_id           float64
 7   make                 object 
 8   model                object 
 9   lic_plate_state      object 
 10  vehicle_year         float64
 11  vehicle_defect       object 
 12  vehicle_type         object 
 13  vehicle_use          object 
 14  travel_direction     object 
 15  maneuver             object 
 16  towed_i              object 
 17  occupant_cnt         float64
 18  area_01_i            object 
 19  area_02_i            object 
 20  area_05_i            object 
 21  area_06_i            object 
 22  area_07_i            object 
 23  area_08_i            object 
 24

In [108]:
vehicles_cleaned['num_passengers'].value_counts()

1.0     192482
2.0      50351
3.0      19667
4.0       6490
5.0       1745
6.0        699
7.0        259
8.0        121
10.0        80
9.0         72
11.0        46
12.0        40
14.0        22
15.0        18
13.0        17
17.0        13
19.0        11
18.0        10
16.0         9
27.0         8
24.0         6
26.0         6
20.0         5
28.0         5
43.0         4
35.0         4
29.0         4
40.0         4
25.0         4
21.0         4
34.0         4
32.0         3
22.0         3
42.0         2
33.0         2
46.0         2
36.0         2
23.0         2
38.0         2
30.0         2
31.0         1
59.0         1
37.0         1
52.0         1
Name: num_passengers, dtype: int64

This is redundant information. This information does not include the driver, but this information is captured in occupant count. will drop this one. 

In [109]:
vehicles_cleaned['unit_no'].value_counts()

1          905826
2          853710
3           61407
4           12627
5            3368
6            1058
7             404
8             177
9              82
10             43
0              37
11             23
12             15
13             10
14              9
15              8
16              7
17              5
18              5
3778035         1
Name: unit_no, dtype: int64

This is aftermath. unhelpful. remove

In [110]:
vehicles_cleaned['unit_type'].value_counts()

driver                 1539401
parked                  242214
pedestrian               23417
bicycle                  14716
driverless               14540
non-motor vehicle         1665
non-contact vehicle        327
disabled vehicle           280
equestrian                   8
Name: unit_type, dtype: int64

Most of the values are drivers or parked cars. this will not be useful for analysis

In [111]:
vehicles_cleaned['make'].value_counts()

chevrolet                                                    207108
ford                                                         179385
unknown                                                      178486
nissan                                                       144424
honda                                                        132729
                                                              ...  
columbia                                                          1
medical coaches, inc.                                             1
middlebury(mfd. by coachman homes, div.of coachmen                1
warrenville trailer manufacturing, inc. (warrenville, il)         1
maverick                                                          1
Name: make, Length: 1394, dtype: int64

high cardinality

In [112]:
vehicles_cleaned['model'].value_counts()

unknown                                      180803
other (explain in narrative)                 174151
camry                                         57164
corolla                                       36302
civic                                         34455
                                              ...  
new paris traveler corp., new paris, ind.         1
mikasa                                            1
bertolini container co.                           1
mulsanne                                          1
808 series                                        1
Name: model, Length: 2643, dtype: int64

This feels like it could be helpful, but many unknowns and 'other', and very high cardinality. The important information that we'd gain from this is already included in vehicle_type. So we can drop

In [113]:
vehicles_cleaned['vehicle_defect'].value_counts()

none                958779
unknown             817496
other                10626
brakes                5517
tires                  900
steering               814
wheels                 449
suspension             284
fuel system            265
engine/motor           231
windows                116
lights                 109
cargo                   65
signals                 42
restraint system        27
trailer coupling        25
exhaust                 21
Name: vehicle_defect, dtype: int64

Most of the values are none or unknown. This will not be particularly useful for analysis. can drop

In [114]:
vehicles_cleaned['vehicle_type'].value_counts()

passenger                                 1126457
sport utility vehicle (suv)                250398
unknown/na                                 163775
van/mini-van                                84122
pickup                                      58998
truck - single unit                         33685
other                                       22103
bus over 15 pass.                           19719
tractor w/ semi-trailer                     16961
bus up to 15 pass.                           5348
motorcycle (over 150cc)                      4383
single unit truck with trailer               3047
other vehicle with trailer                   2456
tractor w/o semi-trailer                     2233
autocycle                                     680
moped or motorized bicycle                    676
motor driven cycle                            328
all-terrain vehicle (atv)                     199
farm equipment                                 87
3-wheeled motorcycle (2 rear wheels)           73


In [115]:
vehicles_cleaned['travel_direction'].value_counts()

n          421689
s          413205
w          372992
e          365800
unknown    143678
se          23025
nw          21056
sw          17411
ne          16910
Name: travel_direction, dtype: int64

Unhelpful for analysis. Remove

In [116]:
vehicles_cleaned['maneuver'].value_counts()

straight ahead                        831882
parked                                246368
unknown/na                            136331
slow/stop in traffic                  130774
turning left                          107438
backing                                72110
turning right                          60105
passing/overtaking                     43818
changing lanes                         34383
other                                  30275
entering traffic lane from parking     21177
merging                                12694
u-turn                                 10318
starting in traffic                    10011
leaving traffic lane to park            8682
avoiding vehicles/objects               7538
skidding/control loss                   6585
enter from drive/alley                  6313
parked in traffic lane                  5547
slow/stop - left turn                   3042
driving wrong way                       2699
negotiating a curve                     2167
slow/stop 

This feature could be important as it has to do with what was happening prior to the crash.

In [117]:
vehicles_cleaned['towed_i'].value_counts()

y    215790
n     12402
Name: towed_i, dtype: int64

Aftermath; Unhelpful for analysis

In [118]:
vehicles_cleaned['occupant_cnt'].value_counts()

1.0     1301432
0.0      236967
2.0      181570
3.0       47614
4.0       18898
5.0        6204
6.0        1649
7.0         661
8.0         249
9.0         117
11.0         75
10.0         70
12.0         44
13.0         39
15.0         20
14.0         17
16.0         16
18.0         13
20.0         12
19.0         10
17.0          8
28.0          8
25.0          6
26.0          6
36.0          5
21.0          5
29.0          5
44.0          4
27.0          4
41.0          4
35.0          4
33.0          3
23.0          3
30.0          3
22.0          3
39.0          2
31.0          2
43.0          2
34.0          2
47.0          2
99.0          2
38.0          1
37.0          1
60.0          1
24.0          1
32.0          1
53.0          1
Name: occupant_cnt, dtype: int64

It is unclear what the area_##_i features represent. They will be removed

In [119]:
vehicles_cleaned['first_contact_point'].value_counts()

front                 352490
rear                  237180
unknown               171216
side-left             123853
front-left-corner     120599
front-right-corner    118087
side-right            117983
front-left             81524
front-right            76903
rear-left              68089
rear-left-corner       53479
other                  41867
rear-right-corner      39230
rear-right             36488
side-left-rear         30569
total (all areas)      28268
side-right-rear        23168
side-left-front        20132
side-right-front       17338
none                   14038
roof                   11958
under carriage          5900
top                     2259
Name: first_contact_point, dtype: int64

This feature could indicate

In [120]:
vehicle_features_to_drop = ['num_passengers', 
                            'crash_unit_id',
                            'crash_date',
                            'unit_type',
                            'make', 
                            'model',
                            'vehicle_id',
                           'vehicle_defect',
                           'unit_no',
                           'lic_plate_state',
                            'vehicle_year',
                           'vehicle_use',
                           'travel_direction',
                           'towed_i',
                            'area_01_i',
                           'area_02_i', 
                            'area_05_i',
                            'area_06_i',
                            'area_07_i',
                            'area_08_i',
                            'area_10_i',
                            'area_11_i',
                            'area_12_i',
                            'area_99_i', 
                           'first_contact_point']

In [121]:
vehicles_cleaned = vehicles_cleaned.drop(columns=vehicle_features_to_drop)

In [122]:
vehicles_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1838822 entries, 0 to 1838821
Data columns (total 4 columns):
 #   Column           Dtype  
---  ------           -----  
 0   crash_record_id  object 
 1   vehicle_type     object 
 2   maneuver         object 
 3   occupant_cnt     float64
dtypes: float64(1), object(3)
memory usage: 56.1+ MB


In [123]:
list(vehicles_cleaned.columns)

['crash_record_id', 'vehicle_type', 'maneuver', 'occupant_cnt']

In [136]:
# Create a dictionary to map the original vehicle types to more specific categories
vehicle_type_mapping = {
    'passenger': 'Passenger Vehicles',
    'sport utility vehicle (suv)': 'SUVs',
    'van/mini-van': 'Passenger Vehicles',
    'pickup': 'Trucks',
    'truck - single unit': 'Trucks',
    'single unit truck with trailer': 'Trucks',
    'other': 'Other',
    'bus over 15 pass.': 'Buses',
    'bus up to 15 pass.': 'Buses',
    'tractor w/ semi-trailer': 'Trucks',
    'tractor w/o semi-trailer': 'Trucks',
    'motorcycle (over 150cc)': 'Motorcycles',
    'other vehicle with trailer': 'Other',
    'autocycle': 'Motorcycles',
    'moped or motorized bicycle': 'Motorcycles',
    'motor driven cycle': 'Motorcycles',
    'all-terrain vehicle (atv)': 'Recreational/Off-Highway Vehicles',
    'farm equipment': 'Farm and Specialized Equipment',
    '3-wheeled motorcycle (2 rear wheels)': 'Motorcycles',
    'recreational off-highway vehicle (rov)': 'Recreational/Off-Highway Vehicles',
    'snowmobile': 'Recreational/Off-Highway Vehicles',
    'unknown/na': np.nan  # Set 'unknown/na' to NaN
}

# Apply the mapping to the 'vehicle_type' column
vehicles_cleaned['vehicle_category'] = vehicles_cleaned['vehicle_type'].map(vehicle_type_mapping)

# Check the value counts for the new grouped categories
vehicles_cleaned['vehicle_category'].value_counts()

Passenger Vehicles    1210579
SUVs                   250398
Trucks                 114924
Buses                   25067
Other                   24559
Motorcycles              6140
Name: vehicle_category, dtype: int64

In [132]:
# Filter out rows with 'Recreational/Off-Highway Vehicles' and 'Farm and Specialized Equipment'
vehicles_cleaned = vehicles_cleaned[~vehicles_cleaned['vehicle_category'].isin(['Recreational/Off-Highway Vehicles', 'Farm and Specialized Equipment'])]

# Check the value counts after removing those categories
vehicles_cleaned['vehicle_category'].value_counts()


Passenger Vehicles    1210579
SUVs                   250398
Unknown/NA             163775
Trucks                 114924
Buses                   25067
Other                   24559
Motorcycles              6140
Name: vehicle_category, dtype: int64

In [133]:
vehicles_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1838498 entries, 0 to 1838821
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   crash_record_id   object 
 1   vehicle_type      object 
 2   maneuver          object 
 3   occupant_cnt      float64
 4   vehicle_category  object 
dtypes: float64(1), object(4)
memory usage: 84.2+ MB


In [135]:
# Modify the maneuver mapping to treat 'unknown/na' as NaN
maneuver_mapping = {
    'straight ahead': 'Standard Movement',
    'slow/stop in traffic': 'Standard Movement',
    'passing/overtaking': 'Standard Movement',
    'unknown/na': np.nan,  # Set 'unknown/na' to NaN
    
    'parked': 'Reversing/Stopping',
    'entering traffic lane from parking': 'Reversing/Stopping',
    'starting in traffic': 'Reversing/Stopping',
    
    'turning left': 'Turn/Change of Direction',
    'turning right': 'Turn/Change of Direction',
    'u-turn': 'Turn/Change of Direction',
    'changing lanes': 'Turn/Change of Direction',
    'turning on red': 'Turn/Change of Direction',
    
    'backing': 'Reversing/Stopping',
    'avoiding vehicles/objects': 'Avoidance/Emergency Response',
    'skidding/control loss': 'Avoidance/Emergency Response',
    'negotiating a curve': 'Avoidance/Emergency Response',
    
    'leaving traffic lane to park': 'Reversing/Stopping',
    'enter from drive/alley': 'Reversing/Stopping',
    
    'driving wrong way': 'Special Cases',
    'diverging': 'Special Cases',
    'driverless': 'Special Cases',
    'disabled': 'Special Cases',
    
    'other': 'Special Cases',
}

# Apply the mapping to the 'maneuver' column
vehicles_cleaned['maneuver_category'] = vehicles_cleaned['maneuver'].map(maneuver_mapping)

# Check the value counts for the new 'maneuver_category'
vehicles_cleaned['maneuver_category'].value_counts()

Standard Movement               1006307
Reversing/Stopping               364590
Turn/Change of Direction         212901
Special Cases                     34191
Avoidance/Emergency Response      16282
Name: maneuver_category, dtype: int64

In [137]:
# Replace 0, 99, and negative values with NaN
vehicles_cleaned['occupant_cnt'] = vehicles_cleaned['occupant_cnt'].replace([0, 99], np.nan)
vehicles_cleaned['occupant_cnt'] = vehicles_cleaned['occupant_cnt'].where(vehicles_cleaned['occupant_cnt'] > 0, np.nan)

# Now, apply pd.cut or categorization as needed
# Define the bins and corresponding labels for 'occupant_cnt'
bins = [1, 5, 10, 19, 29, 39, 49, 59, float('inf')]  # Define ranges for categories
labels = ['Small Group', 'Medium Group', 'Large Group', 
          'Very Large Group', 'Very Large Group', 'Very Large Group', 
          'Very Large Group', 'Very Large Group']

# Apply pd.cut to categorize the 'occupant_cnt' column based on the defined bins
vehicles_cleaned['occupant_category'] = pd.cut(vehicles_cleaned['occupant_cnt'], bins=bins, labels=labels, right=False)

# Handle any NaN or undefined values
#vehicles_cleaned['occupant_category'].fillna('Unknown/NA', inplace=True)

# Check the value counts for the new grouped categories
vehicles_cleaned['occupant_category'].value_counts()

ValueError: labels must be unique if ordered=True; pass ordered=False for duplicate labels