# 44688-Data Analytics Capstone Project

## FDIP (Fire Department Incident Prediction)

##### 03/13/23 - 04/28/23

##### Debra D. "DeeDee" Walker

##### Northwest Missouri State University, Maryville MO 64468, USA

In [1]:
#Import the pandas framework to import and work with the dataset
import pandas as pd

#Import numpy
import numpy as np

#Import datetime so we can work with dates and times
import datetime as dt

#Original data was accessed through the csv file download due to the limitations of the API on the site for this large dataset. The API endpoint limits hits and requires data to be requested by page
# Data was pulled from https://data.cityofnewyork.us/Public-Safety/Incidents-Responded-to-by-Fire-Companies/tm6d-hbzd on March 16, 2023 then cleaned for use.
#Use the function read_csv from pandas and create a dataframe assigned to variable df
df = pd.read_csv('fdip_clean.csv', sep=',')

#convert INCIDENT_DATE_TIME from object to datetime format
df['INCIDENT_DATE_TIME'] = pd.to_datetime(df['INCIDENT_DATE_TIME'], infer_datetime_format = True)

df.head(n=5)

Unnamed: 0,IM_INCIDENT_KEY,INCIDENT_DATE_TIME,UNITS_ONSCENE,TOTAL_INCIDENT_DURATION,ZIP_CODE,BOROUGH_DESC,INCIDENT_CATEGORY,INCIDENT_CATNUM,LEVEL_CATEGORY,BOROUGH_NUM,Day_of_week,Hour_of_day,MONTH,DAY_NUM,INCIDENT_LENGTH,UNITS_CATEGORY
0,63583742,2018-07-21 12:18:19,1,0.66,10475,2 - Bronx,RESCUE & EMS,3,1,2,Saturday,12,7,6,30min-45min,1
1,63583743,2018-07-21 12:18:26,1,0.12,11230,4 - Brooklyn,RESCUE & EMS,3,1,4,Saturday,12,7,6,<=15min,1
2,63584267,2018-07-21 14:16:40,2,0.02,11204,4 - Brooklyn,HAZARDOUS CONDITION-NO FIRE,4,1,4,Saturday,14,7,6,<=15min,2
3,63584484,2018-07-21 15:07:51,1,0.37,11235,4 - Brooklyn,SERVICE CALL,5,1,4,Saturday,15,7,6,15min-30min,1
4,63584485,2018-07-21 15:08:36,1,0.32,11208,4 - Brooklyn,RESCUE & EMS,3,1,4,Saturday,15,7,6,15min-30min,1


##### Level Catergory is being dropped from the predictions because it is too skewed to the level of one to gain valid information and we can gain enough
#####  information from incident category, incident length, and units catergory

#### Split Data and extract features

In [2]:
from sklearn.model_selection import train_test_split

Xfeatures = df[['ZIP_CODE','BOROUGH_NUM','MONTH','Hour_of_day','DAY_NUM']]
ylabels = df[['INCIDENT_CATEGORY','INCIDENT_LENGTH','UNITS_CATEGORY']]
Xtrain, Xtest, ytrain, ytest = train_test_split(Xfeatures, ylabels, train_size=0.8, random_state=50)
print('Train size: ', len(Xtrain), len(ytrain), 'Test size: ', len(Xtest), len(ytest))

Train size:  1872331 1872331 Test size:  468083 468083


#### Import sklearn classifiers, pipelines, etc. for models

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import MultiOutputClassifier

#### Random Forest with MultiOutput

In [4]:
forest_model = RandomForestClassifier(random_state=50, max_depth=6)
multi_forest = MultiOutputClassifier(forest_model)
multi_forest.fit(Xtrain,ytrain)
y_pred = multi_forest.predict(Xtrain)
print("Predicted y:", y_pred)
print()
print("Predicted probabilities:", multi_forest.predict_proba(Xtrain))
print()
print("The mean accuracy on the given test data and labels:", multi_forest.score(Xtrain,ytrain))

Predicted y: [['RESCUE & EMS' '<=15min' '1']
 ['RESCUE & EMS' '<=15min' '1']
 ['RESCUE & EMS' '<=15min' '1']
 ...
 ['RESCUE & EMS' '<=15min' '1']
 ['RESCUE & EMS' '<=15min' '1']
 ['RESCUE & EMS' '<=15min' '1']]

Predicted probabilities: [array([[7.46755184e-02, 1.45133085e-01, 7.44992014e-02, ...,
        5.04652065e-01, 1.08171309e-01, 1.90378311e-04],
       [7.49288513e-02, 1.49601942e-01, 7.51066994e-02, ...,
        5.10025674e-01, 9.38335868e-02, 2.20612037e-04],
       [7.99319510e-02, 1.32734845e-01, 8.65142016e-02, ...,
        4.88052641e-01, 1.08306071e-01, 3.92011471e-04],
       ...,
       [5.88627105e-02, 1.93682580e-01, 6.41083245e-02, ...,
        5.49007001e-01, 6.38361123e-02, 1.88244162e-04],
       [7.47025701e-02, 1.46160315e-01, 7.90306671e-02, ...,
        5.17219042e-01, 9.22862831e-02, 1.49119335e-04],
       [8.40275022e-02, 1.36774438e-01, 8.37151418e-02, ...,
        4.80295737e-01, 1.09656609e-01, 2.83628793e-04]]), array([[0.02798953, 0.39026281, 0.004401

##### Neural Network (MLP Classifier) with MultiOutput

In [5]:
mlp_model = MLPClassifier(random_state=50)
multi_mlp = MultiOutputClassifier(mlp_model)
multi_mlp.fit(Xtrain,ytrain)
y_pred = multi_mlp.predict(Xtrain)
print("Predicted y:", y_pred)
print()
print("Predicted probabilities:", multi_mlp.predict_proba(Xtrain))
print()
print("The mean accuracy on the given test data and labels:", multi_mlp.score(Xtrain,ytrain))

Predicted y: [['RESCUE & EMS' '15min-30min' '1']
 ['RESCUE & EMS' '<=15min' '1']
 ['RESCUE & EMS' '<=15min' '1']
 ...
 ['RESCUE & EMS' '<=15min' '1']
 ['RESCUE & EMS' '<=15min' '1']
 ['RESCUE & EMS' '<=15min' '1']]

Predicted probabilities: [array([[7.95953745e-02, 1.64549882e-01, 7.35318745e-02, ...,
        4.91311608e-01, 1.03594490e-01, 1.50535983e-04],
       [7.95953745e-02, 1.64549882e-01, 7.35318745e-02, ...,
        4.91311608e-01, 1.03594490e-01, 1.50535983e-04],
       [7.95953745e-02, 1.64549882e-01, 7.35318745e-02, ...,
        4.91311608e-01, 1.03594490e-01, 1.50535983e-04],
       ...,
       [7.95953745e-02, 1.64549882e-01, 7.35318745e-02, ...,
        4.91311608e-01, 1.03594490e-01, 1.50535983e-04],
       [7.95953745e-02, 1.64549882e-01, 7.35318745e-02, ...,
        4.91311608e-01, 1.03594490e-01, 1.50535983e-04],
       [7.95953745e-02, 1.64549882e-01, 7.35318745e-02, ...,
        4.91311608e-01, 1.03594490e-01, 1.50535983e-04]]), array([[0.0441957 , 0.41383952, 0.00

##### Stochastic gradient boosting (SGDClassifier) with MultiOutput

In [11]:
sgd_model = SGDClassifier(random_state=50, early_stopping=True, loss='modified_huber')
multi_sgd = MultiOutputClassifier(sgd_model)
multi_sgd.fit(Xtrain,ytrain)
y_pred = multi_sgd.predict(Xtrain)
print("Predicted y:", y_pred)
print()
print("Predicted probabilities:", multi_sgd.predict_proba(Xtrain))
print()
print("The mean accuracy on the given test data and labels:", multi_sgd.score(Xtrain,ytrain))

Predicted y: [['RESCUE & EMS' '15min-30min' '1']
 ['RESCUE & EMS' '15min-30min' '1']
 ['RESCUE & EMS' '15min-30min' '1']
 ...
 ['RESCUE & EMS' '15min-30min' '1']
 ['RESCUE & EMS' '15min-30min' '1']
 ['RESCUE & EMS' '15min-30min' '1']]

Predicted probabilities: [array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]]), array([[0.14285714, 0.14285714, 0.14285714, ..., 0.14285714, 0.14285714,
        0.14285714],
       [0.14285714, 0.14285714, 0.14285714, ..., 0.14285714, 0.14285714,
        0.14285714],
       [0.14285714, 0.14285714, 0.14285714, ..., 0.14285714, 0.14285714,
        0.14285714],
       ...,
       [0.14285714, 0.14285714, 0.14285714, ..., 0.14285714, 0.14285714,
        0.14285714],
       [0.14285714, 0.14285714, 0.14285714, ..., 0.14285714, 0.14285714,
        0.14285714],
       [0.1428