In [1]:
import pandas as pd
from config import db_password
from pathlib import Path
from sqlalchemy import create_engine, inspect
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# create enginge instance to connect to postgresDB to retrieve data
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/NYC_Vehicle_Collision_Prediction"
engine = create_engine(db_string)
connection = engine.connect()

In [3]:
# retrieve tables from postgres connection
inspector = inspect(engine)
inspector.get_table_names()

# Query ALL Records in the the Database
collision_df = pd.read_sql("SELECT * from collisions", connection)
collision_df.drop(columns=["index"])

Unnamed: 0,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,COLLISION_ID
0,2.0,0.0,4455765
1,1.0,0.0,4513547
2,0.0,0.0,4541903
3,0.0,0.0,4456314
4,0.0,0.0,4486609
...,...,...,...
1936663,1.0,0.0,4572947
1936664,0.0,0.0,4572848
1936665,2.0,0.0,4573314
1936666,2.0,0.0,4572706


In [4]:
# retrieve tables from postgres connection
inspector = inspect(engine)
inspector.get_table_names()

# Query ALL Records in the the Database
datetime_df = pd.read_sql("SELECT * from date_time", connection)
datetime_df.drop(columns=["index"])


Unnamed: 0,CRASH DATE,CRASH TIME,COLLISION_ID
0,2021-09-11,2:39,4455765
1,2022-03-26,11:45,4513547
2,2022-06-29,6:55,4541903
3,2021-09-11,9:35,4456314
4,2021-12-14,8:13,4486609
...,...,...,...
1936700,2022-10-14,20:22,4572947
1936701,2022-10-12,16:30,4572848
1936702,2022-10-14,22:00,4573314
1936703,2022-10-14,1:00,4572706


In [5]:
# retrieve tables from postgres connection
inspector = inspect(engine)
inspector.get_table_names()

# Query ALL Records in the the Database
locations_df = pd.read_sql("SELECT * from locations", connection)
locations_df.drop(columns=["index"])

Unnamed: 0,BOROUGH,ZIP CODE,COLLISION_ID
0,BROOKLYN,11208.0,4456314
1,BROOKLYN,11233.0,4486609
2,BRONX,10475.0,4486660
3,BROOKLYN,11207.0,4487074
4,MANHATTAN,10017.0,4486519
...,...,...,...
1335440,QUEENS,11385.0,4573422
1335441,QUEENS,11362.0,4572694
1335442,BROOKLYN,11234.0,4573135
1335443,QUEENS,11366.0,4573314


In [6]:
merge_df = pd.merge(collision_df, datetime_df, how="left", on=["COLLISION_ID", "COLLISION_ID"])
merge_df

Unnamed: 0,index_x,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,COLLISION_ID,index_y,CRASH DATE,CRASH TIME
0,0,2.0,0.0,4455765,0,2021-09-11,2:39
1,1,1.0,0.0,4513547,1,2022-03-26,11:45
2,2,0.0,0.0,4541903,2,2022-06-29,6:55
3,3,0.0,0.0,4456314,3,2021-09-11,9:35
4,4,0.0,0.0,4486609,4,2021-12-14,8:13
...,...,...,...,...,...,...,...
1936663,1936663,1.0,0.0,4572947,1936700,2022-10-14,20:22
1936664,1936664,0.0,0.0,4572848,1936701,2022-10-12,16:30
1936665,1936665,2.0,0.0,4573314,1936702,2022-10-14,22:00
1936666,1936666,2.0,0.0,4572706,1936703,2022-10-14,1:00


In [7]:
merge_df.drop(columns=["index_x","index_y"])

Unnamed: 0,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,COLLISION_ID,CRASH DATE,CRASH TIME
0,2.0,0.0,4455765,2021-09-11,2:39
1,1.0,0.0,4513547,2022-03-26,11:45
2,0.0,0.0,4541903,2022-06-29,6:55
3,0.0,0.0,4456314,2021-09-11,9:35
4,0.0,0.0,4486609,2021-12-14,8:13
...,...,...,...,...,...
1936663,1.0,0.0,4572947,2022-10-14,20:22
1936664,0.0,0.0,4572848,2022-10-12,16:30
1936665,2.0,0.0,4573314,2022-10-14,22:00
1936666,2.0,0.0,4572706,2022-10-14,1:00


In [8]:
secondmerge_df = pd.merge(merge_df, locations_df, how="left", on=["COLLISION_ID", "COLLISION_ID"])
secondmerge_df

Unnamed: 0,index_x,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,COLLISION_ID,index_y,CRASH DATE,CRASH TIME,index,BOROUGH,ZIP CODE
0,0,2.0,0.0,4455765,0,2021-09-11,2:39,,,
1,1,1.0,0.0,4513547,1,2022-03-26,11:45,,,
2,2,0.0,0.0,4541903,2,2022-06-29,6:55,,,
3,3,0.0,0.0,4456314,3,2021-09-11,9:35,0.0,BROOKLYN,11208.0
4,4,0.0,0.0,4486609,4,2021-12-14,8:13,1.0,BROOKLYN,11233.0
...,...,...,...,...,...,...,...,...,...,...
1936663,1936663,1.0,0.0,4572947,1936700,2022-10-14,20:22,,,
1936664,1936664,0.0,0.0,4572848,1936701,2022-10-12,16:30,,,
1936665,1936665,2.0,0.0,4573314,1936702,2022-10-14,22:00,1335443.0,QUEENS,11366.0
1936666,1936666,2.0,0.0,4572706,1936703,2022-10-14,1:00,,,


In [9]:
injured_df=secondmerge_df.drop(columns=["index_x","index_y", "index", "COLLISION_ID", "NUMBER OF PERSONS KILLED"])

In [10]:
injured_df.dropna(axis=0,how='any',inplace=True)

In [11]:
injured_df.rename(columns={'CRASH TIME': 'CRASHTIME'})

Unnamed: 0,NUMBER OF PERSONS INJURED,CRASH DATE,CRASHTIME,BOROUGH,ZIP CODE
3,0.0,2021-09-11,9:35,BROOKLYN,11208.0
4,0.0,2021-12-14,8:13,BROOKLYN,11233.0
7,2.0,2021-12-14,8:17,BRONX,10475.0
8,0.0,2021-12-14,21:10,BROOKLYN,11207.0
9,0.0,2021-12-14,14:58,MANHATTAN,10017.0
...,...,...,...,...,...
1936657,1.0,2022-10-14,14:55,QUEENS,11385.0
1936660,0.0,2022-10-13,16:07,QUEENS,11362.0
1936661,0.0,2022-10-14,8:00,BROOKLYN,11234.0
1936665,2.0,2022-10-14,22:00,QUEENS,11366.0


In [12]:
injured_df.dtypes

NUMBER OF PERSONS INJURED    float64
CRASH DATE                    object
CRASH TIME                    object
BOROUGH                       object
ZIP CODE                     float64
dtype: object

In [13]:
injured_df['CRASH DATE']= pd.to_datetime(injured_df['CRASH DATE'])

In [14]:
year2016_df = injured_df.loc[injured_df['CRASH DATE'].dt.year == 2016]
year2016_df

Unnamed: 0,NUMBER OF PERSONS INJURED,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE
844,0.0,2016-04-16,14:20,BROOKLYN,11214.0
42999,1.0,2016-06-02,7:28,BROOKLYN,11210.0
45817,0.0,2016-06-06,14:56,BROOKLYN,11212.0
45818,0.0,2016-09-26,16:58,BROOKLYN,11203.0
919973,0.0,2016-12-29,13:15,BRONX,10467.0
...,...,...,...,...,...
1200671,0.0,2016-01-07,16:45,QUEENS,11373.0
1200672,1.0,2016-01-19,12:00,BROOKLYN,11214.0
1200673,0.0,2016-01-08,15:05,MANHATTAN,10017.0
1200674,0.0,2016-01-08,13:34,BRONX,10473.0


In [15]:
df=year2016_df.rename(columns={'NUMBER OF PERSONS INJURED': 'INJURED', 'CRASH DATE': 'DATE', 'CRASH TIME':'TIME'})

In [16]:
df.head()

Unnamed: 0,INJURED,DATE,TIME,BOROUGH,ZIP CODE
844,0.0,2016-04-16,14:20,BROOKLYN,11214.0
42999,1.0,2016-06-02,7:28,BROOKLYN,11210.0
45817,0.0,2016-06-06,14:56,BROOKLYN,11212.0
45818,0.0,2016-09-26,16:58,BROOKLYN,11203.0
919973,0.0,2016-12-29,13:15,BRONX,10467.0


In [17]:
df.dtypes

INJURED            float64
DATE        datetime64[ns]
TIME                object
BOROUGH             object
ZIP CODE           float64
dtype: object

In [18]:
df['DATE']= pd.to_datetime(df['DATE'])
df['DATE'] = (df['DATE'] - df['DATE'].min())  / np.timedelta64(1,'D')

In [19]:
df['TIME']= pd.to_datetime(df['TIME'])
df['TIME'] = (df['TIME'] - df['TIME'].min())  / np.timedelta64(1,'D')

In [20]:
df.dtypes

INJURED     float64
DATE        float64
TIME        float64
BOROUGH      object
ZIP CODE    float64
dtype: object

In [21]:
borough_mapping = {
    "BROOKLYN": 0,
    "BRONX": 1,
    "QUEENS": 2,
    "MANHATTAN": 3,
    "STATEN ISLAND": 4
}

In [22]:
df["BOROUGH"] = df["BOROUGH"].apply(lambda x: borough_mapping[x])

In [23]:
df

Unnamed: 0,INJURED,DATE,TIME,BOROUGH,ZIP CODE
844,0.0,106.0,0.597222,0,11214.0
42999,1.0,153.0,0.311111,0,11210.0
45817,0.0,157.0,0.622222,0,11212.0
45818,0.0,269.0,0.706944,0,11203.0
919973,0.0,363.0,0.552083,1,10467.0
...,...,...,...,...,...
1200671,0.0,6.0,0.697917,2,11373.0
1200672,1.0,18.0,0.500000,0,11214.0
1200673,0.0,7.0,0.628472,3,10017.0
1200674,0.0,7.0,0.565278,1,10473.0


In [24]:
# Define the features set.
X = df.copy()
X = X.drop("INJURED", axis=1)
X.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE
844,106.0,0.597222,0,11214.0
42999,153.0,0.311111,0,11210.0
45817,157.0,0.622222,0,11212.0
45818,269.0,0.706944,0,11203.0
919973,363.0,0.552083,1,10467.0


In [25]:
# Define the target set.
y = df["INJURED"].ravel()
y[:5]

array([0., 1., 0., 0., 0.])

In [26]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [27]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [29]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [30]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [31]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [32]:
# Displaying results
print("Confusion Matrix")
display(df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,INJURED,DATE,TIME,BOROUGH,ZIP CODE
844,0.0,106.0,0.597222,0,11214.0
42999,1.0,153.0,0.311111,0,11210.0
45817,0.0,157.0,0.622222,0,11212.0
45818,0.0,269.0,0.706944,0,11203.0
919973,0.0,363.0,0.552083,1,10467.0
...,...,...,...,...,...
1200671,0.0,6.0,0.697917,2,11373.0
1200672,1.0,18.0,0.500000,0,11214.0
1200673,0.0,7.0,0.628472,3,10017.0
1200674,0.0,7.0,0.565278,1,10473.0


Accuracy Score : 0.7930053503849668
Classification Report
              precision    recall  f1-score   support

         0.0       0.81      0.97      0.88     31078
         1.0       0.20      0.04      0.06      5652
         2.0       0.08      0.01      0.01      1059
         3.0       0.04      0.00      0.01       341
         4.0       0.00      0.00      0.00       121
         5.0       0.00      0.00      0.00        35
         6.0       0.00      0.00      0.00        20
         7.0       0.00      0.00      0.00         6
         8.0       0.00      0.00      0.00         1
         9.0       0.00      0.00      0.00         1
        11.0       0.00      0.00      0.00         1

    accuracy                           0.79     38315
   macro avg       0.10      0.09      0.09     38315
weighted avg       0.69      0.79      0.73     38315



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.41253156, 0.35709898, 0.01008846, 0.220281  ])

In [34]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.4125315606791572, 'DATE'),
 (0.35709897784524125, 'TIME'),
 (0.22028100123992322, 'ZIP CODE'),
 (0.010088460235678191, 'BOROUGH')]

In [35]:
y = df.INJURED
X = df.drop(columns=['INJURED'])

In [36]:
y

844        0.0
42999      1.0
45817      0.0
45818      0.0
919973     0.0
          ... 
1200671    0.0
1200672    1.0
1200673    0.0
1200674    0.0
1200675    0.0
Name: INJURED, Length: 153258, dtype: float64

In [37]:
X.dtypes

DATE        float64
TIME        float64
BOROUGH       int64
ZIP CODE    float64
dtype: object

In [38]:
model = LinearRegression()

In [39]:
model.fit(X, y)

LinearRegression()

In [40]:
y_pred = model.predict(X)
print(y_pred.shape)

(153258,)


In [41]:
y_pred

array([0.2855035 , 0.2773512 , 0.29258856, ..., 0.17352449, 0.22534364,
       0.25452145])

## 2017 Data

In [42]:
year2017_df = injured_df.loc[injured_df['CRASH DATE'].dt.year == 2017]
year2017_df

Unnamed: 0,NUMBER OF PERSONS INJURED,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE
64161,0.0,2017-02-19,16:10,BROOKLYN,11203.0
153856,0.0,2017-11-22,4:17,BRONX,10456.0
153859,0.0,2017-11-15,17:05,BRONX,10455.0
153860,0.0,2017-11-30,14:20,MANHATTAN,10168.0
153862,2.0,2017-11-09,7:12,MANHATTAN,10019.0
...,...,...,...,...,...
969982,0.0,2017-01-07,19:30,QUEENS,11427.0
969983,0.0,2017-01-17,5:06,QUEENS,11422.0
969984,0.0,2017-01-15,7:00,QUEENS,11373.0
969987,1.0,2017-01-10,20:22,BROOKLYN,11229.0


In [43]:
df_2017=year2017_df.rename(columns={'NUMBER OF PERSONS INJURED': 'INJURED', 'CRASH DATE': 'DATE', 'CRASH TIME':'TIME'})
df_2017

Unnamed: 0,INJURED,DATE,TIME,BOROUGH,ZIP CODE
64161,0.0,2017-02-19,16:10,BROOKLYN,11203.0
153856,0.0,2017-11-22,4:17,BRONX,10456.0
153859,0.0,2017-11-15,17:05,BRONX,10455.0
153860,0.0,2017-11-30,14:20,MANHATTAN,10168.0
153862,2.0,2017-11-09,7:12,MANHATTAN,10019.0
...,...,...,...,...,...
969982,0.0,2017-01-07,19:30,QUEENS,11427.0
969983,0.0,2017-01-17,5:06,QUEENS,11422.0
969984,0.0,2017-01-15,7:00,QUEENS,11373.0
969987,1.0,2017-01-10,20:22,BROOKLYN,11229.0


In [44]:
#Updating dates to float64
df_2017['DATE']= pd.to_datetime(df_2017['DATE'])
df_2017['DATE'] = (df_2017['DATE'] - df_2017['DATE'].min())  / np.timedelta64(1,'D')

In [45]:
#Updating dates to float64
df_2017['TIME']= pd.to_datetime(df_2017['TIME'])
df_2017['TIME'] = (df_2017['TIME'] - df_2017['TIME'].min())  / np.timedelta64(1,'D')

In [46]:
df_2017.dtypes

INJURED     float64
DATE        float64
TIME        float64
BOROUGH      object
ZIP CODE    float64
dtype: object

In [47]:
borough_mapping_2017= {
    "BROOKLYN": 0,
    "BRONX": 1,
    "QUEENS": 2,
    "MANHATTAN": 3,
    "STATEN ISLAND": 4
}

In [48]:
df_2017["BOROUGH"] = df_2017["BOROUGH"].apply(lambda x: borough_mapping_2017[x])

In [49]:
df_2017

Unnamed: 0,INJURED,DATE,TIME,BOROUGH,ZIP CODE
64161,0.0,49.0,0.673611,0,11203.0
153856,0.0,325.0,0.178472,1,10456.0
153859,0.0,318.0,0.711806,1,10455.0
153860,0.0,333.0,0.597222,3,10168.0
153862,2.0,312.0,0.300000,3,10019.0
...,...,...,...,...,...
969982,0.0,6.0,0.812500,2,11427.0
969983,0.0,16.0,0.212500,2,11422.0
969984,0.0,14.0,0.291667,2,11373.0
969987,1.0,9.0,0.848611,0,11229.0


In [50]:
# Define the features set.
X = df_2017.copy()
X = X.drop("INJURED", axis=1)
X.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE
64161,49.0,0.673611,0,11203.0
153856,325.0,0.178472,1,10456.0
153859,318.0,0.711806,1,10455.0
153860,333.0,0.597222,3,10168.0
153862,312.0,0.3,3,10019.0


In [51]:
# Define the target set.
y = df_2017["INJURED"].ravel()
y[:5]

array([0., 0., 0., 0., 2.])

In [52]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [53]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [54]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [55]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [56]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [57]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [58]:
# Displaying results
print("Confusion Matrix")
display(df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,INJURED,DATE,TIME,BOROUGH,ZIP CODE
844,0.0,106.0,0.597222,0,11214.0
42999,1.0,153.0,0.311111,0,11210.0
45817,0.0,157.0,0.622222,0,11212.0
45818,0.0,269.0,0.706944,0,11203.0
919973,0.0,363.0,0.552083,1,10467.0
...,...,...,...,...,...
1200671,0.0,6.0,0.697917,2,11373.0
1200672,1.0,18.0,0.500000,0,11214.0
1200673,0.0,7.0,0.628472,3,10017.0
1200674,0.0,7.0,0.565278,1,10473.0


Accuracy Score : 0.7965176496934748
Classification Report
              precision    recall  f1-score   support

         0.0       0.81      0.97      0.89     29000
         1.0       0.21      0.04      0.06      5300
         2.0       0.13      0.01      0.02       916
         3.0       0.23      0.02      0.03       314
         4.0       0.00      0.00      0.00       104
         5.0       0.00      0.00      0.00        54
         6.0       0.00      0.00      0.00        17
         7.0       0.00      0.00      0.00         5
         8.0       0.00      0.00      0.00         3
         9.0       0.00      0.00      0.00         2
        10.0       0.00      0.00      0.00         3
        11.0       0.00      0.00      0.00         1
        12.0       0.00      0.00      0.00         1
        15.0       0.00      0.00      0.00         2
        27.0       0.00      0.00      0.00         1

    accuracy                           0.80     35723
   macro avg       0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
