In [37]:
# NYC Car Crash Injuries for 2016 and 2017

In [38]:
import pandas as pd
from config import db_password
from pathlib import Path
from sqlalchemy import create_engine, inspect
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [39]:
# create enginge instance to connect to postgresDB to retrieve data
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/NYC_Vehicle_Collision_Prediction"
engine = create_engine(db_string)
connection = engine.connect()

In [40]:
# retrieve tables from postgres connection
inspector = inspect(engine)
inspector.get_table_names()

# Query ALL Records in the the Database
collision_df = pd.read_sql("SELECT * from collisions", connection)
collision_df.drop(columns=["index"])

Unnamed: 0,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,COLLISION_ID
0,2.0,0.0,4455765
1,1.0,0.0,4513547
2,0.0,0.0,4541903
3,0.0,0.0,4456314
4,0.0,0.0,4486609
...,...,...,...
1936663,1.0,0.0,4572947
1936664,0.0,0.0,4572848
1936665,2.0,0.0,4573314
1936666,2.0,0.0,4572706


In [41]:
# retrieve tables from postgres connection
inspector = inspect(engine)
inspector.get_table_names()

# Query ALL Records in the the Database
datetime_df = pd.read_sql("SELECT * from date_time", connection)
datetime_df.drop(columns=["index"])


Unnamed: 0,CRASH DATE,CRASH TIME,COLLISION_ID
0,2021-09-11,2:39,4455765
1,2022-03-26,11:45,4513547
2,2022-06-29,6:55,4541903
3,2021-09-11,9:35,4456314
4,2021-12-14,8:13,4486609
...,...,...,...
1936700,2022-10-14,20:22,4572947
1936701,2022-10-12,16:30,4572848
1936702,2022-10-14,22:00,4573314
1936703,2022-10-14,1:00,4572706


In [42]:
# retrieve tables from postgres connection
inspector = inspect(engine)
inspector.get_table_names()

# Query ALL Records in the the Database
locations_df = pd.read_sql("SELECT * from locations", connection)
locations_df.drop(columns=["index"])

Unnamed: 0,BOROUGH,ZIP CODE,COLLISION_ID
0,BROOKLYN,11208.0,4456314
1,BROOKLYN,11233.0,4486609
2,BRONX,10475.0,4486660
3,BROOKLYN,11207.0,4487074
4,MANHATTAN,10017.0,4486519
...,...,...,...
1335440,QUEENS,11385.0,4573422
1335441,QUEENS,11362.0,4572694
1335442,BROOKLYN,11234.0,4573135
1335443,QUEENS,11366.0,4573314


In [43]:
## After importing Collision, Datetime, and Locations tables from postgres, we have to merge all tables to create one unique data set.

In [44]:
# First merge of collision and datetime tables using "COLLISION_ID"
merge_df = pd.merge(collision_df, datetime_df, how="left", on=["COLLISION_ID", "COLLISION_ID"])
merge_df

Unnamed: 0,index_x,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,COLLISION_ID,index_y,CRASH DATE,CRASH TIME
0,0,2.0,0.0,4455765,0,2021-09-11,2:39
1,1,1.0,0.0,4513547,1,2022-03-26,11:45
2,2,0.0,0.0,4541903,2,2022-06-29,6:55
3,3,0.0,0.0,4456314,3,2021-09-11,9:35
4,4,0.0,0.0,4486609,4,2021-12-14,8:13
...,...,...,...,...,...,...,...
1936663,1936663,1.0,0.0,4572947,1936700,2022-10-14,20:22
1936664,1936664,0.0,0.0,4572848,1936701,2022-10-12,16:30
1936665,1936665,2.0,0.0,4573314,1936702,2022-10-14,22:00
1936666,1936666,2.0,0.0,4572706,1936703,2022-10-14,1:00


In [45]:
# Deleting extra indices
merge_df.drop(columns=["index_x","index_y"])

Unnamed: 0,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,COLLISION_ID,CRASH DATE,CRASH TIME
0,2.0,0.0,4455765,2021-09-11,2:39
1,1.0,0.0,4513547,2022-03-26,11:45
2,0.0,0.0,4541903,2022-06-29,6:55
3,0.0,0.0,4456314,2021-09-11,9:35
4,0.0,0.0,4486609,2021-12-14,8:13
...,...,...,...,...,...
1936663,1.0,0.0,4572947,2022-10-14,20:22
1936664,0.0,0.0,4572848,2022-10-12,16:30
1936665,2.0,0.0,4573314,2022-10-14,22:00
1936666,2.0,0.0,4572706,2022-10-14,1:00


In [46]:
# Second merge with locations table using "COLLISION_ID"
secondmerge_df = pd.merge(merge_df, locations_df, how="left", on=["COLLISION_ID", "COLLISION_ID"])
secondmerge_df

Unnamed: 0,index_x,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,COLLISION_ID,index_y,CRASH DATE,CRASH TIME,index,BOROUGH,ZIP CODE
0,0,2.0,0.0,4455765,0,2021-09-11,2:39,,,
1,1,1.0,0.0,4513547,1,2022-03-26,11:45,,,
2,2,0.0,0.0,4541903,2,2022-06-29,6:55,,,
3,3,0.0,0.0,4456314,3,2021-09-11,9:35,0.0,BROOKLYN,11208.0
4,4,0.0,0.0,4486609,4,2021-12-14,8:13,1.0,BROOKLYN,11233.0
...,...,...,...,...,...,...,...,...,...,...
1936663,1936663,1.0,0.0,4572947,1936700,2022-10-14,20:22,,,
1936664,1936664,0.0,0.0,4572848,1936701,2022-10-12,16:30,,,
1936665,1936665,2.0,0.0,4573314,1936702,2022-10-14,22:00,1335443.0,QUEENS,11366.0
1936666,1936666,2.0,0.0,4572706,1936703,2022-10-14,1:00,,,


In [47]:
# Creating dataframe to include only number of persons injured.
injured_df=secondmerge_df.drop(columns=["index_x","index_y", "index", "COLLISION_ID", "NUMBER OF PERSONS KILLED"])

In [48]:
# Removing n/a values from table
injured_df.dropna(axis=0,how='any',inplace=True)

In [49]:
#Renaming column name
injured_df.rename(columns={'CRASH TIME': 'CRASHTIME'})

Unnamed: 0,NUMBER OF PERSONS INJURED,CRASH DATE,CRASHTIME,BOROUGH,ZIP CODE
3,0.0,2021-09-11,9:35,BROOKLYN,11208.0
4,0.0,2021-12-14,8:13,BROOKLYN,11233.0
7,2.0,2021-12-14,8:17,BRONX,10475.0
8,0.0,2021-12-14,21:10,BROOKLYN,11207.0
9,0.0,2021-12-14,14:58,MANHATTAN,10017.0
...,...,...,...,...,...
1936657,1.0,2022-10-14,14:55,QUEENS,11385.0
1936660,0.0,2022-10-13,16:07,QUEENS,11362.0
1936661,0.0,2022-10-14,8:00,BROOKLYN,11234.0
1936665,2.0,2022-10-14,22:00,QUEENS,11366.0


In [50]:
# Dataframe types
injured_df.dtypes

NUMBER OF PERSONS INJURED    float64
CRASH DATE                    object
CRASH TIME                    object
BOROUGH                       object
ZIP CODE                     float64
dtype: object

In [51]:
# Changing 'CRASH DATE' from object to datetime64
injured_df['CRASH DATE']= pd.to_datetime(injured_df['CRASH DATE'])

In [52]:
# Filtering 2016 and 2017 data
year2016_2017_df = injured_df.loc[(injured_df['CRASH DATE'].dt.year == 2016) | (injured_df['CRASH DATE'].dt.year == 2017)]
year2016_2017_df

Unnamed: 0,NUMBER OF PERSONS INJURED,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE
844,0.0,2016-04-16,14:20,BROOKLYN,11214.0
42999,1.0,2016-06-02,7:28,BROOKLYN,11210.0
45817,0.0,2016-06-06,14:56,BROOKLYN,11212.0
45818,0.0,2016-09-26,16:58,BROOKLYN,11203.0
64161,0.0,2017-02-19,16:10,BROOKLYN,11203.0
...,...,...,...,...,...
1200671,0.0,2016-01-07,16:45,QUEENS,11373.0
1200672,1.0,2016-01-19,12:00,BROOKLYN,11214.0
1200673,0.0,2016-01-08,15:05,MANHATTAN,10017.0
1200674,0.0,2016-01-08,13:34,BRONX,10473.0


In [53]:
# Renaming columns in filtered data
df=year2016_2017_df.rename(columns={'NUMBER OF PERSONS INJURED': 'INJURED', 'CRASH DATE': 'DATE', 'CRASH TIME':'TIME'})

In [54]:
df.head()

Unnamed: 0,INJURED,DATE,TIME,BOROUGH,ZIP CODE
844,0.0,2016-04-16,14:20,BROOKLYN,11214.0
42999,1.0,2016-06-02,7:28,BROOKLYN,11210.0
45817,0.0,2016-06-06,14:56,BROOKLYN,11212.0
45818,0.0,2016-09-26,16:58,BROOKLYN,11203.0
64161,0.0,2017-02-19,16:10,BROOKLYN,11203.0


In [55]:
df.dtypes

INJURED            float64
DATE        datetime64[ns]
TIME                object
BOROUGH             object
ZIP CODE           float64
dtype: object

In [56]:
# Changing 'DATE' from object to float64
df['DATE']= pd.to_datetime(df['DATE'])
df['DATE'] = (df['DATE'] - df['DATE'].min())  / np.timedelta64(1,'D')

In [57]:
# Changing 'TIME' from object to float64
df['TIME']= pd.to_datetime(df['TIME'])
df['TIME'] = (df['TIME'] - df['TIME'].min())  / np.timedelta64(1,'D')

In [58]:
df.dtypes

INJURED     float64
DATE        float64
TIME        float64
BOROUGH      object
ZIP CODE    float64
dtype: object

In [59]:
# Hot encoding boroughs
Injuries_binary_encoded = pd.get_dummies(df, columns=["BOROUGH"])
Injuries_binary_encoded

Unnamed: 0,INJURED,DATE,TIME,ZIP CODE,BOROUGH_BRONX,BOROUGH_BROOKLYN,BOROUGH_MANHATTAN,BOROUGH_QUEENS,BOROUGH_STATEN ISLAND
844,0.0,106.0,0.597222,11214.0,0,1,0,0,0
42999,1.0,153.0,0.311111,11210.0,0,1,0,0,0
45817,0.0,157.0,0.622222,11212.0,0,1,0,0,0
45818,0.0,269.0,0.706944,11203.0,0,1,0,0,0
64161,0.0,415.0,0.673611,11203.0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
1200671,0.0,6.0,0.697917,11373.0,0,0,0,1,0
1200672,1.0,18.0,0.500000,11214.0,0,1,0,0,0
1200673,0.0,7.0,0.628472,10017.0,0,0,1,0,0
1200674,0.0,7.0,0.565278,10473.0,1,0,0,0,0


In [60]:
# Define the features set.
X = Injuries_binary_encoded.copy()
X = X.drop("INJURED", axis=1)
X.head()

Unnamed: 0,DATE,TIME,ZIP CODE,BOROUGH_BRONX,BOROUGH_BROOKLYN,BOROUGH_MANHATTAN,BOROUGH_QUEENS,BOROUGH_STATEN ISLAND
844,106.0,0.597222,11214.0,0,1,0,0,0
42999,153.0,0.311111,11210.0,0,1,0,0,0
45817,157.0,0.622222,11212.0,0,1,0,0,0
45818,269.0,0.706944,11203.0,0,1,0,0,0
64161,415.0,0.673611,11203.0,0,1,0,0,0


In [61]:
# Define the target set.
y = Injuries_binary_encoded["INJURED"].ravel()
y[:5]

array([0., 1., 0., 0., 0.])

In [62]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [63]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [64]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [65]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [66]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([0., 0., 0., ..., 0., 0., 1.])

In [67]:
# Counting predicted injuries
zeros = 0
ones = 0
for i in predictions:
    if i==0:
        zeros = zeros + 1
    else:
        ones = ones + 1
    
print("accidents with no injuries: ", zeros)
print("injuries: ", ones)

accidents with no injuries:  71341
injuries:  2697


In [68]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [69]:
# Displaying results
print("Data")
display(Injuries_binary_encoded)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Data


Unnamed: 0,INJURED,DATE,TIME,ZIP CODE,BOROUGH_BRONX,BOROUGH_BROOKLYN,BOROUGH_MANHATTAN,BOROUGH_QUEENS,BOROUGH_STATEN ISLAND
844,0.0,106.0,0.597222,11214.0,0,1,0,0,0
42999,1.0,153.0,0.311111,11210.0,0,1,0,0,0
45817,0.0,157.0,0.622222,11212.0,0,1,0,0,0
45818,0.0,269.0,0.706944,11203.0,0,1,0,0,0
64161,0.0,415.0,0.673611,11203.0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
1200671,0.0,6.0,0.697917,11373.0,0,0,0,1,0
1200672,1.0,18.0,0.500000,11214.0,0,1,0,0,0
1200673,0.0,7.0,0.628472,10017.0,0,0,1,0,0
1200674,0.0,7.0,0.565278,10473.0,1,0,0,0,0


Accuracy Score : 0.7923903941219373
Classification Report
              precision    recall  f1-score   support

         0.0       0.82      0.97      0.88     60218
         1.0       0.20      0.04      0.07     10878
         2.0       0.09      0.01      0.02      1914
         3.0       0.11      0.01      0.02       615
         4.0       0.04      0.00      0.01       236
         5.0       0.14      0.01      0.02       105
         6.0       0.20      0.03      0.05        37
         7.0       0.00      0.00      0.00        18
         8.0       0.00      0.00      0.00         5
         9.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00         3
        11.0       0.00      0.00      0.00         1
        13.0       0.00      0.00      0.00         1
        19.0       0.00      0.00      0.00         2
        31.0       0.00      0.00      0.00         1

    accuracy                           0.79     74038
   macro avg       0.1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([4.78444856e-01, 3.67904510e-01, 1.51902848e-01, 1.86382071e-04,
       3.98843151e-04, 7.88969233e-04, 2.23909266e-04, 1.49682336e-04])

In [71]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.4784448559633917, 'DATE'),
 (0.3679045097207584, 'TIME'),
 (0.15190284826064268, 'ZIP CODE'),
 (0.0007889692327753584, 'BOROUGH_MANHATTAN'),
 (0.00039884315054678697, 'BOROUGH_BROOKLYN'),
 (0.00022390926577285194, 'BOROUGH_QUEENS'),
 (0.00018638207051945745, 'BOROUGH_BRONX'),
 (0.0001496823355926966, 'BOROUGH_STATEN ISLAND')]

# Validation of Model with 2018 Data

In [72]:
# Filtering data to only include accidents from 2018.
year2018_df = injured_df.loc[(injured_df['CRASH DATE'].dt.year == 2018)]
year2018_df

Unnamed: 0,NUMBER OF PERSONS INJURED,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE
53401,1.0,2018-05-28,7:00,BRONX,10467.0
64162,2.0,2018-03-26,0:06,BROOKLYN,11203.0
106424,0.0,2018-12-13,21:50,MANHATTAN,10075.0
123876,1.0,2018-03-22,17:00,BROOKLYN,11238.0
203713,0.0,2018-08-30,14:00,BROOKLYN,11204.0
...,...,...,...,...,...
736066,0.0,2018-01-13,17:30,QUEENS,11385.0
736067,0.0,2018-01-26,22:36,BROOKLYN,11203.0
736069,1.0,2018-01-14,12:20,MANHATTAN,10034.0
736071,0.0,2018-01-17,5:49,BROOKLYN,11222.0


In [79]:
# Renaming column names
df_2018 = year2018_df.rename(columns={'NUMBER OF PERSONS INJURED': 'INJURED', 'CRASH DATE': 'DATE', 'CRASH TIME':'TIME'})

In [80]:
# Hot encoding boroughs
Injuries_binary_encoded_2018 = pd.get_dummies(df_2018, columns=["BOROUGH"])
Injuries_binary_encoded_2018

Unnamed: 0,INJURED,DATE,TIME,ZIP CODE,BOROUGH_BRONX,BOROUGH_BROOKLYN,BOROUGH_MANHATTAN,BOROUGH_QUEENS,BOROUGH_STATEN ISLAND
53401,1.0,2018-05-28,7:00,10467.0,1,0,0,0,0
64162,2.0,2018-03-26,0:06,11203.0,0,1,0,0,0
106424,0.0,2018-12-13,21:50,10075.0,0,0,1,0,0
123876,1.0,2018-03-22,17:00,11238.0,0,1,0,0,0
203713,0.0,2018-08-30,14:00,11204.0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
736066,0.0,2018-01-13,17:30,11385.0,0,0,0,1,0
736067,0.0,2018-01-26,22:36,11203.0,0,1,0,0,0
736069,1.0,2018-01-14,12:20,10034.0,0,0,1,0,0
736071,0.0,2018-01-17,5:49,11222.0,0,1,0,0,0


In [82]:
# Changing 'DATE' from object to float64
Injuries_binary_encoded_2018['DATE']= pd.to_datetime(Injuries_binary_encoded_2018['DATE'])
Injuries_binary_encoded_2018['DATE'] = (Injuries_binary_encoded_2018['DATE'] - Injuries_binary_encoded_2018['DATE'].min())  / np.timedelta64(1,'D')

In [83]:
# Changing 'TIME' from object to float64
Injuries_binary_encoded_2018['TIME']= pd.to_datetime(Injuries_binary_encoded_2018['TIME'])
Injuries_binary_encoded_2018['TIME'] = (Injuries_binary_encoded_2018['TIME'] - Injuries_binary_encoded_2018['TIME'].min())  / np.timedelta64(1,'D')

In [84]:
# Define the features set.
X_validation = Injuries_binary_encoded_2018.copy()
X_validation = X_validation.drop("INJURED", axis=1)
X_validation.head()

Unnamed: 0,DATE,TIME,ZIP CODE,BOROUGH_BRONX,BOROUGH_BROOKLYN,BOROUGH_MANHATTAN,BOROUGH_QUEENS,BOROUGH_STATEN ISLAND
53401,1.701389e-12,0.291667,10467.0,1,0,0,0,0
64162,9.722222e-13,0.004167,11203.0,0,1,0,0,0
106424,4.00463e-12,0.909722,10075.0,0,0,1,0,0
123876,9.259259e-13,0.708333,11238.0,0,1,0,0,0
203713,2.789352e-12,0.583333,11204.0,0,1,0,0,0


In [85]:
# Define the target set.
y_validation = Injuries_binary_encoded_2018["INJURED"].ravel()
y_validation[:5]

array([1., 2., 0., 1., 0.])

In [86]:
# Splitting into Train and Test sets.
X_validation_train, X_validation_test, y_validation_train, y_validation_test = train_test_split(X_validation, y_validation, random_state=78)

In [87]:
len(y_validation_test)

37296

In [88]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler_validation = scaler.fit(X_validation_train)

# Scaling the data.
X_train_scaled_validation = X_scaler_validation.transform(X_validation_train)
X_test_scaled_validation = X_scaler_validation.transform(X_validation_test)

In [89]:
# Making predictions using the testing data.
validation = rf_model.predict(X_test_scaled_validation)

In [90]:
# Counting predicted injuries for 2018 data
zeros = 0
ones = 0
for i in validation:
    if i==0:
        zeros = zeros + 1
    else:
        ones = ones + 1
    
print("accidents with no injuries: ", zeros)
print("injuries: ", ones)

accidents with no injuries:  35150
injuries:  2146


In [91]:
# Calculating the accuracy score.
validation_score= accuracy_score(y_validation_test, validation)
validation_score

0.7710478335478336