In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
df = pd.read_csv(Path('CSV/Injured_year2016.csv'))
df.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,NUMBER OF PERSONS INJURED
0,2016-04-16,14:20,BROOKLYN,11214.0,0.0
1,2016-06-02,7:28,BROOKLYN,11210.0,1.0
2,2016-06-06,14:56,BROOKLYN,11212.0,0.0
3,2016-09-26,16:58,BROOKLYN,11203.0,0.0
4,2016-12-29,13:15,BRONX,10467.0,0.0


In [3]:
df =df.rename(columns={'NUMBER OF PERSONS INJURED': 'INJURED', 'CRASH DATE': 'DATE', 'CRASH TIME':'TIME'})

In [4]:
df.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,INJURED
0,2016-04-16,14:20,BROOKLYN,11214.0,0.0
1,2016-06-02,7:28,BROOKLYN,11210.0,1.0
2,2016-06-06,14:56,BROOKLYN,11212.0,0.0
3,2016-09-26,16:58,BROOKLYN,11203.0,0.0
4,2016-12-29,13:15,BRONX,10467.0,0.0


In [5]:
df.dtypes

DATE         object
TIME         object
BOROUGH      object
ZIP CODE    float64
INJURED     float64
dtype: object

In [6]:
df['DATE']= pd.to_datetime(df['DATE'])
df['DATE'] = (df['DATE'] - df['DATE'].min())  / np.timedelta64(1,'D')

In [7]:
df['TIME']= pd.to_datetime(df['TIME'])
df['TIME'] = (df['TIME'] - df['TIME'].min())  / np.timedelta64(1,'D')

In [8]:
df.dtypes

DATE        float64
TIME        float64
BOROUGH      object
ZIP CODE    float64
INJURED     float64
dtype: object

In [9]:
borough_mapping = {
    "BROOKLYN": 0,
    "BRONX": 1,
    "QUEENS": 2,
    "MANHATTAN": 3,
    "STATEN ISLAND": 4
}

In [10]:
df["BOROUGH"] = df["BOROUGH"].apply(lambda x: borough_mapping[x])

In [11]:
df

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,INJURED
0,106.0,0.597222,0,11214.0,0.0
1,153.0,0.311111,0,11210.0,1.0
2,157.0,0.622222,0,11212.0,0.0
3,269.0,0.706944,0,11203.0,0.0
4,363.0,0.552083,1,10467.0,0.0
...,...,...,...,...,...
153253,6.0,0.697917,2,11373.0,0.0
153254,18.0,0.500000,0,11214.0,1.0
153255,7.0,0.628472,3,10017.0,0.0
153256,7.0,0.565278,1,10473.0,0.0


In [12]:
# Define the features set.
X = df.copy()
X = X.drop("INJURED", axis=1)
X.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE
0,106.0,0.597222,0,11214.0
1,153.0,0.311111,0,11210.0
2,157.0,0.622222,0,11212.0
3,269.0,0.706944,0,11203.0
4,363.0,0.552083,1,10467.0


In [13]:
# Define the target set.
y = df["INJURED"].ravel()
y[:5]

array([0., 1., 0., 0., 0.])

In [14]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [15]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [17]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [18]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [19]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [20]:
# Displaying results
print("Confusion Matrix")
display(df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,INJURED
0,106.0,0.597222,0,11214.0,0.0
1,153.0,0.311111,0,11210.0,1.0
2,157.0,0.622222,0,11212.0,0.0
3,269.0,0.706944,0,11203.0,0.0
4,363.0,0.552083,1,10467.0,0.0
...,...,...,...,...,...
153253,6.0,0.697917,2,11373.0,0.0
153254,18.0,0.500000,0,11214.0,1.0
153255,7.0,0.628472,3,10017.0,0.0
153256,7.0,0.565278,1,10473.0,0.0


Accuracy Score : 0.7930053503849668
Classification Report
              precision    recall  f1-score   support

         0.0       0.81      0.97      0.88     31078
         1.0       0.20      0.04      0.06      5652
         2.0       0.08      0.01      0.01      1059
         3.0       0.04      0.00      0.01       341
         4.0       0.00      0.00      0.00       121
         5.0       0.00      0.00      0.00        35
         6.0       0.00      0.00      0.00        20
         7.0       0.00      0.00      0.00         6
         8.0       0.00      0.00      0.00         1
         9.0       0.00      0.00      0.00         1
        11.0       0.00      0.00      0.00         1

    accuracy                           0.79     38315
   macro avg       0.10      0.09      0.09     38315
weighted avg       0.69      0.79      0.73     38315



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.41253156, 0.35709898, 0.01008846, 0.220281  ])

In [22]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.4125315606791572, 'DATE'),
 (0.35709897784524125, 'TIME'),
 (0.22028100123992322, 'ZIP CODE'),
 (0.010088460235678191, 'BOROUGH')]

In [None]:
y = df.INJURED
X = df.drop(columns=['INJURED'])

In [106]:
y

0         0.0
1         1.0
2         0.0
3         0.0
4         0.0
         ... 
153253    0.0
153254    1.0
153255    0.0
153256    0.0
153257    0.0
Name: INJURED, Length: 153258, dtype: float64

In [107]:
X.dtypes

DATE        float64
TIME        float64
BOROUGH       int64
ZIP CODE    float64
dtype: object

In [108]:
model = LinearRegression()

In [109]:
model.fit(X, y)

LinearRegression()

In [110]:
y_pred = model.predict(X)
print(y_pred.shape)

(153258,)


In [112]:
y_pred

array([0.2855035 , 0.2773512 , 0.29258856, ..., 0.17352449, 0.22534364,
       0.25452145])

In [None]:
# Define the features set.
X = df_loans.copy()
X = X.drop("bad", axis=1)
X.head()