In [17]:
import numpy as np
import pandas as pd
print("Numpy Version", np. __version__)
print("Pandas Version", pd. __version__)

Numpy Version 1.24.3
Pandas Version 2.0.3


# Austin Car Crash Dataset
Crash data is obtained from the Texas Department of Transportation (TXDOT) Crash Record Information System (CRIS) database, which is populated by reports submitted by Texas Peace Officers throughout the state, including Austin Police Department (APD), and maintained by TXDOT.
This dataset contains crash-level records for crashes which have occurred in the last ten years. Crash data may take several days or weeks to be initially provided and finalized as it is furnished to the Austin Transportation & Public Works Department, therefore a two-week delay is implemented to help ensure more accurate and complete results.
Please note that the data and information on this website is for informational purposes only. While we seek to provide accurate information, please note that errors may be present and information presented may not be complete.

https://data.austintexas.gov/Transportation-and-Mobility/Austin-Crash-Report-Data-Crash-Level-Records/y2wy-tgr5

https://data.austintexas.gov/d/y2wy-tgr5?category=Transportation-and-Mobility&view_name=Austin-Crash-Report-Data-Crash-Level-Records

You can find a good description of the dataset here:
https://data.austintexas.gov/Transportation-and-Mobility/Austin-Crash-Report-Data-Crash-Level-Records/y2wy-tgr5/about_data


<img src="https://data.austintexas.gov/api/views/y2wy-tgr5/files/b5e18a1a-5071-4f7f-a658-07b498fb954e?download=true&filename=52029539143_bed94b40db_c.jpg" width="500" height="340">


In [18]:
crashes = pd.read_csv("https://github.com/kiat/Elements-of-Data-Analytics/raw/main/datasets/car_crash/Austin_Crash_Report_Data_Crash_Level_Records.csv.bz2", \
                      compression="bz2", low_memory=False) 
crashes.head()

Unnamed: 0,crash_id,crash_fatal_fl,crash_date,crash_time,case_id,rpt_latitude,rpt_longitude,rpt_block_num,rpt_street_pfx,rpt_street_name,...,pedestrian_serious_injury_count,motorcycle_death_count,motorcycle_serious_injury_count,other_death_count,other_serious_injury_count,onsys_fl,private_dr_fl,micromobility_serious_injury_count,micromobility_death_count,micromobility_fl
0,13719812,N,02/17/2014 03:03:00 AM,03:03:00,140480167,,,2000 W,,NOT REPORTED,...,0,0,0,0,0,Y,N,0,0,
1,13688137,N,01/29/2014 09:39:00 PM,21:39:00,140291668,,,10100,,DEASSAU,...,0,0,0,0,0,N,N,0,0,
2,13705309,N,02/12/2014 11:35:00 AM,11:35:00,140430690,,,2700,,NOT REPORTED,...,0,0,0,0,0,Y,N,0,0,
3,13707782,N,02/11/2014 06:02:00 PM,18:02:00,140421191,,,,,NOT REPORTED,...,0,0,0,0,0,Y,N,0,0,
4,13711454,N,02/16/2014 11:00:00 PM,23:00:00,140471694,,,,N,MOPAC NB TO EB 290,...,0,0,0,0,0,Y,N,0,0,


In [19]:
crashes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148039 entries, 0 to 148038
Data columns (total 54 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   crash_id                            148039 non-null  int64  
 1   crash_fatal_fl                      148039 non-null  object 
 2   crash_date                          148039 non-null  object 
 3   crash_time                          148039 non-null  object 
 4   case_id                             146167 non-null  object 
 5   rpt_latitude                        9480 non-null    float64
 6   rpt_longitude                       9480 non-null    float64
 7   rpt_block_num                       128195 non-null  object 
 8   rpt_street_pfx                      79891 non-null   object 
 9   rpt_street_name                     148033 non-null  object 
 10  rpt_street_sfx                      98176 non-null   object 
 11  crash_speed_limit         

In [20]:
crashes.isna().sum()

crash_id                                   0
crash_fatal_fl                             0
crash_date                                 0
crash_time                                 0
case_id                                 1872
rpt_latitude                          138559
rpt_longitude                         138559
rpt_block_num                          19844
rpt_street_pfx                         68148
rpt_street_name                            6
rpt_street_sfx                         49863
crash_speed_limit                          5
road_constr_zone_fl                        5
latitude                                2467
longitude                               2467
street_name                                5
street_nbr                             87142
street_name_2                          81601
street_nbr_2                          148039
crash_sev_id                               1
sus_serious_injry_cnt                      0
nonincap_injry_cnt                         3
poss_injry

In [21]:
crashes.shape

(148039, 54)

In [22]:
crashes[['rpt_street_name']].head()

Unnamed: 0,rpt_street_name
0,NOT REPORTED
1,DEASSAU
2,NOT REPORTED
3,NOT REPORTED
4,MOPAC NB TO EB 290


In [23]:
# We can drop a row if the value of a specific column is NA. 
crashes.dropna(subset=['rpt_street_name'], inplace=True)
crashes.shape

(148033, 54)

# Question 1 - Which top 10 streets in Austin has the highst car crashes in the last 10 years?

street_name

In [42]:
crashes[['rpt_street_name']]['rpt_street_name'].value_counts().nlargest(10)

rpt_street_name
NOT REPORTED    10509
IH 35            3937
LAMAR            3865
MOPAC            3680
PARMER           1785
CONGRESS         1780
RESEARCH         1674
BEN WHITE        1562
IH 35 SB         1463
RIVERSIDE        1386
Name: count, dtype: int64

# Time and Day of the Crash

In [25]:

# Check if the dtype is object (string-like)
is_date_format = crashes['crash_date'].dtypes == 'object'
print(is_date_format)


# https://stackoverflow.com/questions/33365055/attributeerror-can-only-use-dt-accessor-with-datetimelike-values
crashes['crash_date'] = pd.to_datetime(crashes['crash_date'], errors='coerce')


# Convert if needed
# crashes['date'] = pd.to_datetime(crashes['crash_date'])


# extract the weekday from the date column
crashes['weekday'] = crashes['crash_date'].dt.weekday

print(crashes['weekday'])

# Extact hour of the day
crashes['hour'] = crashes['crash_date'].dt.hour

crashes['hour']

True
0         0
1         2
2         2
3         1
4         6
         ..
148034    5
148035    4
148036    0
148037    3
148038    3
Name: weekday, Length: 148033, dtype: int32


0          3
1         21
2         11
3         18
4         23
          ..
148034    16
148035    18
148036     0
148037    15
148038     8
Name: hour, Length: 148033, dtype: int32

# Question 2 - Which weekday has the highest car crashes?

**Thoughts:** How will we count crashes? We can simply choose any column with no NA values as they are included in every row. The most logical variable to use to count crashes is crash_id.

Is there duplicates of crash_id?

In [46]:
# Will return true if row is duplicated
crashes[['crash_id']].duplicated().value_counts()

False    148033
Name: count, dtype: int64

In [33]:
crashes.groupby('weekday')['crash_id'].count().nlargest(1)

weekday
4    24112
Name: crash_id, dtype: int64

# Question 3 - Which hour of the day has the highest car crashes?

In [34]:
crashes.columns

Index(['crash_id', 'crash_fatal_fl', 'crash_date', 'crash_time', 'case_id',
       'rpt_latitude', 'rpt_longitude', 'rpt_block_num', 'rpt_street_pfx',
       'rpt_street_name', 'rpt_street_sfx', 'crash_speed_limit',
       'road_constr_zone_fl', 'latitude', 'longitude', 'street_name',
       'street_nbr', 'street_name_2', 'street_nbr_2', 'crash_sev_id',
       'sus_serious_injry_cnt', 'nonincap_injry_cnt', 'poss_injry_cnt',
       'non_injry_cnt', 'unkn_injry_cnt', 'tot_injry_cnt', 'death_cnt',
       'contrib_factr_p1_id', 'contrib_factr_p2_id', 'units_involved',
       'atd_mode_category_metadata', 'pedestrian_fl', 'motor_vehicle_fl',
       'motorcycle_fl', 'bicycle_fl', 'other_fl', 'point',
       'apd_confirmed_fatality', 'apd_confirmed_death_count',
       'motor_vehicle_death_count', 'motor_vehicle_serious_injury_count',
       'bicycle_death_count', 'bicycle_serious_injury_count',
       'pedestrian_death_count', 'pedestrian_serious_injury_count',
       'motorcycle_death_count

In [37]:
crashes.groupby('hour')['crash_id'].count().nlargest(1)

hour
17    11405
Name: crash_id, dtype: int64

# Question 4 - Which Month of Year has the highest car crashes?

In [38]:
crashes.columns

Index(['crash_id', 'crash_fatal_fl', 'crash_date', 'crash_time', 'case_id',
       'rpt_latitude', 'rpt_longitude', 'rpt_block_num', 'rpt_street_pfx',
       'rpt_street_name', 'rpt_street_sfx', 'crash_speed_limit',
       'road_constr_zone_fl', 'latitude', 'longitude', 'street_name',
       'street_nbr', 'street_name_2', 'street_nbr_2', 'crash_sev_id',
       'sus_serious_injry_cnt', 'nonincap_injry_cnt', 'poss_injry_cnt',
       'non_injry_cnt', 'unkn_injry_cnt', 'tot_injry_cnt', 'death_cnt',
       'contrib_factr_p1_id', 'contrib_factr_p2_id', 'units_involved',
       'atd_mode_category_metadata', 'pedestrian_fl', 'motor_vehicle_fl',
       'motorcycle_fl', 'bicycle_fl', 'other_fl', 'point',
       'apd_confirmed_fatality', 'apd_confirmed_death_count',
       'motor_vehicle_death_count', 'motor_vehicle_serious_injury_count',
       'bicycle_death_count', 'bicycle_serious_injury_count',
       'pedestrian_death_count', 'pedestrian_serious_injury_count',
       'motorcycle_death_count

In [40]:
crashes.groupby(crashes['crash_date'].dt.month)['crash_id'].count().nlargest(1)

crash_date
10    13625
Name: crash_id, dtype: int64

# Question 5 - Are the total number of car crashes different in different years? 
Show a table of the total car crashes in the past 10 years!

In [51]:
crashes.groupby(crashes['crash_date'].dt.year)['crash_id'].count()

crash_date
2014    13433
2015    15228
2016    16878
2017    16349
2018    16611
2019    16957
2020    12528
2021    13284
2022    13719
2023    12839
2024      207
Name: crash_id, dtype: int64

# Reason and Involvment 

# Question 6 - Which ratio of crashes occurred in or was related to a construction, maintenance, or utility work zone?

Cosnider the colum

'road_constr_zone_fl'
Construction Zone - Indicates whether the crash occurred in or was related to a construction, maintenance, or utility work zone, regardless of whether or not workers were actually present at the time of the crash



In [57]:
print(((crashes[crashes['road_constr_zone_fl'] == 'Y']['road_constr_zone_fl'].count())/(crashes['crash_id'].count()))*100,'%')

5.196814223855491 %


In [None]:
count = crashes[crashes['road_constr_zone_fl']== 'Y']['road_constr_zone_fl'].count()
total = crashes['crash_id'].count()
print((count/total) * 100)

5.196814223855491


# Question 7 - Which ratio of crashes include involvement of pedestrain, motorcycle, and bicycles? What is the ratio of each in comparision to total number of crashes?

Use the columns of:

pedestrian_fl  , motor_vehicle_fl , motorcycle_fl  and bicycle_fl


In [63]:
(crashes[(crashes['pedestrian_fl'] == 'Y') & (crashes['motor_vehicle_fl'] == 'Y') & (crashes['motorcycle_fl'] == 'Y') & (crashes['bicycle_fl'] == 'Y')]['crash_id'].count())/(crashes['crash_id'].count())*100

0.0

In [None]:
df = crashes[(crashes['pedestrian_fl']=='Y') & (crashes['motor_vehicle_fl']=='Y') & (crashes['motorcycle_fl']=='Y') &(crashes['bicycle_fl']=='Y') ]
all_three = df['crash_id'].count()
all_three
print((all_three / crashes['crash_id'].count())*100)

0.0


# Deaths and Fatalities &#x1F622;  


# Question 8 - What are the death counts of each year?

cosider the 'death_cnt' 

In [None]:
crashes.groupby('year')['death_cnt'].sum()

year
2014     56
2015    102
2016     78
2017     75
2018     71
2019     88
2020     92
2021    115
2022    117
2023     89
2024      1
Name: death_cnt, dtype: int64

# Question 9 - What are the death counts of each year?

cosider the 'death_cnt'

In [None]:
crashes.groupby('year')['death_cnt'].sum()

year
2014     56
2015    102
2016     78
2017     75
2018     71
2019     88
2020     92
2021    115
2022    117
2023     89
2024      1
Name: death_cnt, dtype: int64

# Question 10 - What is the ratio of crashes with death count>=1?

cosider the 'death_cnt'

In [None]:
count = crashes[crashes['death_cnt']>=1]['death_cnt'].count()
total = crashes['death_cnt'].count()
print((count/total) * 100)

0.5667655184992536


# Question 11 - What are the total death counts of crashes of each different accident involvements ( pedestrain, motorcycle, and bicycles) ?

cosider the 'death_cnt'
and 

'pedestrian_fl' , 'motor_vehicle_fl' , 'motorcycle_fl' and 'bicycle_fl'

In [None]:
total_deaths_by_involvement = crashes.groupby(['pedestrian_fl', 'motorcycle_fl', 'bicycle_fl'])['death_cnt'].sum('death_cnt')
print(total_deaths_by_involvement)

Series([], Name: death_cnt, dtype: int64)


In [None]:
print(crashes.groupby('pedestrian_fl')['death_cnt'].sum())
print(crashes.groupby('motor_vehicle_fl')['death_cnt'].sum())
print(crashes.groupby('motorcycle_fl')['death_cnt'].sum())
print(crashes.groupby('bicycle_fl')['death_cnt'].sum())



pedestrian_fl
Y    319
Name: death_cnt, dtype: int64
motor_vehicle_fl
Y    845
Name: death_cnt, dtype: int64
motorcycle_fl
Y    126
Name: death_cnt, dtype: int64
bicycle_fl
Y    28
Name: death_cnt, dtype: int64
