# NYC traffic accidents over a 3 year period 
## Filter and Subset

Ingest <a href="https://raw.githubusercontent.com/sandeepmj/datasets/main/nyc-accidents.csv">this dataset</a> stored GitHub.

In [1]:
## import necessary libraries
import pandas as pd

In [2]:
## read the dataset into notebook
df = pd.read_csv("https://raw.githubusercontent.com/sandeepmj/datasets/main/nyc-accidents.csv")

In [3]:
## see the overall info about this dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324383 entries, 0 to 324382
Data columns (total 17 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   CRASH DATE                     324383 non-null  object
 1   CRASH TIME                     324383 non-null  object
 2   BOROUGH                        210698 non-null  object
 3   LOCATION                       298090 non-null  object
 4   NUMBER OF PERSONS INJURED      324383 non-null  int64 
 5   NUMBER OF PERSONS KILLED       324383 non-null  int64 
 6   NUMBER OF PEDESTRIANS INJURED  324383 non-null  int64 
 7   NUMBER OF PEDESTRIANS KILLED   324383 non-null  int64 
 8   NUMBER OF CYCLIST INJURED      324383 non-null  int64 
 9   NUMBER OF CYCLIST KILLED       324383 non-null  int64 
 10  NUMBER OF MOTORIST INJURED     324383 non-null  int64 
 11  NUMBER OF MOTORIST KILLED      324383 non-null  int64 
 12  CONTRIBUTING FACTOR VEHICLE 1  323137 non-nu

In [4]:
## create a series for borough using dot notation
df.BOROUGH

0          BROOKLYN
1               NaN
2               NaN
3               NaN
4               NaN
            ...    
324378     BROOKLYN
324379        BRONX
324380        BRONX
324381     BROOKLYN
324382    MANHATTAN
Name: BOROUGH, Length: 324383, dtype: object

In [5]:
## create a series of crash dates.
df["CRASH DATE"]

0          5/21/19
1          1/21/20
2         12/31/20
3         12/25/20
4          4/15/20
            ...   
324378      1/1/19
324379      1/1/19
324380      1/1/19
324381      1/1/19
324382      1/1/19
Name: CRASH DATE, Length: 324383, dtype: object

In [6]:
## Which borough had the most crashes?
df["BOROUGH"].value_counts()

#Brooklyn did

BOROUGH
BROOKLYN         69944
QUEENS           60529
MANHATTAN        37446
BRONX            36741
STATEN ISLAND     6038
Name: count, dtype: int64

In [7]:
## which type of vehicle was primary vehicle involved in crashes?
## SHOW ONLY THE TOP 7
df["VEHICLE TYPE CODE 1"].value_counts().head(7)

#Sedan

VEHICLE TYPE CODE 1
Sedan                                  147440
Station Wagon/Sport Utility Vehicle    120571
Taxi                                    13592
Pick-up Truck                            8958
Box Truck                                6266
Bus                                      4980
Bike                                     3124
Name: count, dtype: int64

In [8]:
## these top 7 but as percentages as a dataframe, with the header "pct"
(df["VEHICLE TYPE CODE 1"].value_counts(normalize = True).to_frame("pct")*100).head(7)

Unnamed: 0_level_0,pct
VEHICLE TYPE CODE 1,Unnamed: 1_level_1
Sedan,45.798492
Station Wagon/Sport Utility Vehicle,37.452319
Taxi,4.22201
Pick-up Truck,2.782575
Box Truck,1.946374
Bus,1.546911
Bike,0.970391


In [9]:
## What were a 15 unusual primary vehicles to get into a crash?

## what where the top 15 least frequent reasons for the crashes.

df["CONTRIBUTING FACTOR VEHICLE 1"].value_counts(ascending = True).head(15)

CONTRIBUTING FACTOR VEHICLE 1
Windshield Inadequate                5
Listening/Using Headphones           5
Texting                              6
Shoulders Defective/Improper        14
Cell Phone (hands-free)             16
Headlights Defective                24
Other Lighting Defects              25
Other Electronic Device             26
Vehicle Vandalism                   30
Prescription Medication             30
Eating or Drinking                  31
Using On Board Navigation Device    32
Tinted Windows                      35
Tow Hitch Defective                 43
Physical Disability                 96
Name: count, dtype: int64

In [10]:
## create a subset of data for only Queens
## place it in a dataframe called df_q

df["BOROUGH"].unique() ## first step: check spelling of Queens - whether it's in all caps or lowercase
df_q = df.query(' BOROUGH == "QUEENS" ') ## create a subset of data only for Queens
df_q ## check to see if df_q number of rows matches frequency of Queens in step 6, 60529

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LOCATION,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2
15,5/2/20,17:30,QUEENS,"(40.67376, -73.79473)",0,0,0,0,0,0,0,0,Unsafe Lane Changing,Unspecified,4412513,Station Wagon/Sport Utility Vehicle,Pick-up Truck
74,12/16/20,16:20,QUEENS,"(40.7139, -73.7539)",0,0,0,0,0,0,0,0,Driver Inexperience,,4376676,Box Truck,
97,6/4/20,7:30,QUEENS,"(40.744232, -73.861275)",0,0,0,0,0,0,0,0,Unspecified,Unspecified,4423984,Bus,Station Wagon/Sport Utility Vehicle
105,12/3/20,15:10,QUEENS,,0,0,0,0,0,0,0,0,Passing or Lane Usage Improper,Unspecified,4373032,Sedan,
129,12/19/20,16:15,QUEENS,"(40.72362, -73.88802)",1,0,1,0,0,0,0,0,Backing Unsafely,,4379293,Station Wagon/Sport Utility Vehicle,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324366,1/1/19,11:00,QUEENS,"(40.698704, -73.89974)",0,0,0,0,0,0,0,0,Unspecified,,4065827,Sedan,
324367,1/1/19,12:30,QUEENS,"(40.74114, -73.85747)",0,0,0,0,0,0,0,0,Unspecified,,4062343,Sedan,
324371,1/1/19,2:18,QUEENS,"(40.716507, -73.84711)",0,0,0,0,0,0,0,0,Unspecified,Unspecified,4060413,Sedan,
324372,1/1/19,13:00,QUEENS,"(40.665497, -73.75573)",0,0,0,0,0,0,0,0,Unspecified,Unspecified,4060511,Sedan,Sedan


In [11]:
## number of people killed but return as a frame with a label "number_killed"
df_killed = df["NUMBER OF PERSONS KILLED"].to_frame("number_killed")
df_killed

Unnamed: 0,number_killed
0,0
1,0
2,0
3,0
4,0
...,...
324378,0
324379,0
324380,0
324381,0


In [12]:
## create a dataset for Manhattan that involved taxi cabs as the primary vehicle cause
df_m = df.query("(BOROUGH == 'MANHATTAN') and (`VEHICLE TYPE CODE 1` == 'Taxi') ") 
df_m

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LOCATION,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2
343,12/22/20,23:55,MANHATTAN,"(40.78911, -73.96656)",1,0,1,0,0,0,0,0,Unspecified,,4378322,Taxi,
528,12/17/20,3:40,MANHATTAN,"(40.821636, -73.93909)",0,0,0,0,0,0,0,0,Traffic Control Disregarded,Unspecified,4377075,Taxi,
697,12/23/20,21:38,MANHATTAN,"(40.8188, -73.95603)",1,0,1,0,0,0,0,0,Failure to Yield Right-of-Way,,4378603,Taxi,
1330,12/30/20,13:25,MANHATTAN,"(40.748512, -73.98872)",0,0,0,0,0,0,0,0,Driver Inattention/Distraction,Unspecified,4380283,Taxi,Taxi
1486,12/19/20,13:45,MANHATTAN,"(40.79844, -73.96509)",0,0,0,0,0,0,0,0,Driver Inexperience,Unspecified,4377284,Taxi,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324013,1/1/19,1:30,MANHATTAN,"(40.770573, -73.982155)",0,0,0,0,0,0,0,0,Other Vehicular,Driver Inattention/Distraction,4060445,Taxi,Station Wagon/Sport Utility Vehicle
324093,1/1/19,16:00,MANHATTAN,"(40.74147, -73.985435)",2,0,0,0,0,0,2,0,Traffic Control Disregarded,Unspecified,4061524,Taxi,Station Wagon/Sport Utility Vehicle
324261,1/1/19,16:15,MANHATTAN,"(40.741524, -73.97979)",0,0,0,0,0,0,0,0,Passenger Distraction,Passing Too Closely,4060796,Taxi,Sedan
324296,1/1/19,20:30,MANHATTAN,"(40.821636, -73.93909)",0,0,0,0,0,0,0,0,Unspecified,,4060662,Taxi,


In [13]:
## What were the top 8 causes of accidents across all the boroughs?
## by primary vehicle cause
df["CONTRIBUTING FACTOR VEHICLE 1"].value_counts().head(8)

CONTRIBUTING FACTOR VEHICLE 1
Driver Inattention/Distraction    83809
Unspecified                       78621
Following Too Closely             26689
Failure to Yield Right-of-Way     21943
Backing Unsafely                  13497
Passing or Lane Usage Improper    13406
Passing Too Closely               12649
Other Vehicular                    9646
Name: count, dtype: int64

In [14]:
## What were the top 8 causes of accidents across all the boroughs?
## by primary vehicle cause
### as a percent in a frame with header pct_

(df["CONTRIBUTING FACTOR VEHICLE 1"].value_counts(normalize = True)*100).head(8).to_frame("pct")

Unnamed: 0_level_0,pct
CONTRIBUTING FACTOR VEHICLE 1,Unnamed: 1_level_1
Driver Inattention/Distraction,25.936058
Unspecified,24.330547
Following Too Closely,8.259345
Failure to Yield Right-of-Way,6.790618
Backing Unsafely,4.176866
Passing or Lane Usage Improper,4.148705
Passing Too Closely,3.914439
Other Vehicular,2.985112


In [15]:
## What were the top 8 causes of accidents across all the boroughs?
## by secondary vehicle cause

df["CONTRIBUTING FACTOR VEHICLE 2"].value_counts().head(8)

CONTRIBUTING FACTOR VEHICLE 2
Unspecified                       221742
Driver Inattention/Distraction     18105
Following Too Closely               4507
Other Vehicular                     4268
Passing or Lane Usage Improper      2919
Failure to Yield Right-of-Way       2522
Passing Too Closely                 2124
Unsafe Lane Changing                1502
Name: count, dtype: int64

In [16]:
## What were the 5 fewest causes for primary vehicles causing the accident

df["CONTRIBUTING FACTOR VEHICLE 1"].value_counts(ascending = True).head(5)

CONTRIBUTING FACTOR VEHICLE 1
Windshield Inadequate            5
Listening/Using Headphones       5
Texting                          6
Shoulders Defective/Improper    14
Cell Phone (hands-free)         16
Name: count, dtype: int64

In [17]:
## list ALL the causes for vehicle 1 as unique values (in other words, create a list of the causes)

df["CONTRIBUTING FACTOR VEHICLE 2"].unique()

array(['Unspecified', nan, 'Following Too Closely',
       'Passing or Lane Usage Improper', 'Unsafe Lane Changing',
       'Other Vehicular', 'Driver Inattention/Distraction',
       'Obstruction/Debris', 'Traffic Control Disregarded',
       'Failure to Yield Right-of-Way', 'Unsafe Speed',
       'Turning Improperly', 'Pavement Slippery', 'Driver Inexperience',
       'Passing Too Closely', 'Failure to Keep Right',
       'Reaction to Uninvolved Vehicle',
       'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion',
       'Aggressive Driving/Road Rage', 'View Obstructed/Limited',
       'Alcohol Involvement', 'Tire Failure/Inadequate',
       'Backing Unsafely', 'Traffic Control Device Improper/Non-Working',
       'Listening/Using Headphones', 'Outside Car Distraction',
       'Oversized Vehicle', 'Passenger Distraction',
       'Driverless/Runaway Vehicle', 'Pavement Defective',
       'Lane Marking Improper/Inadequate', 'Brakes Defective',
       'Vehicle Vandalism', 'Headlight

In [18]:
## find all incidents of defective pavements causing accidents 
## and sort by borough

df["CONTRIBUTING FACTOR VEHICLE 1"].value_counts() #to see pavement variable name
df_defect = df.query(' `CONTRIBUTING FACTOR VEHICLE 1` == "Pavement Defective" ').sort_values(by= "BOROUGH") ## to see list

In [19]:
## find all incidents in which more than 3 people were killed

df_over3 = df.query(' `NUMBER OF PERSONS KILLED` > 3 ')
df_over3

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LOCATION,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2
91050,1/25/20,5:35,,"(40.748398, -73.72879)",0,4,0,0,0,0,0,4,Unsafe Speed,Unspecified,4278634,Sedan,Sedan


In [20]:
## find all incidents in which between 2 and 3 people were killed

df_btwn23 = df.query(' 2 <= `NUMBER OF PERSONS KILLED` <= 3 ')
df_btwn23

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LOCATION,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2
4348,12/12/20,22:52,BROOKLYN,"(40.652752, -73.88629)",0,2,0,0,0,0,0,2,Traffic Control Disregarded,Unspecified,4375377,Sedan,Sedan
7071,10/30/20,17:18,QUEENS,"(40.705147, -73.7957)",3,2,3,2,0,0,0,0,Driver Inexperience,,4362781,Station Wagon/Sport Utility Vehicle,
9559,11/20/20,11:45,QUEENS,"(40.651405, -73.758896)",2,2,0,0,0,0,2,2,Driver Inattention/Distraction,,4369300,Sedan,
20179,10/26/20,16:01,,"(40.865314, -73.836365)",1,2,0,0,0,0,1,2,Illnes,Unspecified,4362350,Sedan,Station Wagon/Sport Utility Vehicle
39644,10/6/20,3:52,QUEENS,"(40.66549, -73.819534)",1,3,0,0,0,0,1,3,Unsafe Speed,,4355333,Sedan,
47224,7/11/20,20:18,BROOKLYN,"(40.58993, -73.90059)",4,3,0,0,0,0,4,3,Driver Inexperience,Driver Inexperience,4327676,Sedan,Sedan
54863,7/20/20,23:46,,"(40.835087, -73.82538)",5,2,0,0,0,0,5,2,Unsafe Speed,Unspecified,4330670,Sedan,Sedan
59719,6/21/20,21:23,,,5,2,0,0,0,0,5,2,Failure to Keep Right,Unspecified,4322266,Sedan,Sedan
69748,4/26/20,3:35,,"(40.811428, -73.90091)",2,2,0,0,0,0,2,2,Unsafe Speed,,4310365,Sedan,
72195,6/11/20,1:34,,,0,2,0,1,0,0,0,1,Unsafe Speed,,4319230,Sedan,


In [21]:
## extra step: check value counts to assess results for two previous entries
df["NUMBER OF PERSONS KILLED"].value_counts()

NUMBER OF PERSONS KILLED
0    323891
1       475
2        14
3         2
4         1
Name: count, dtype: int64