In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

In [3]:
df = pd.read_csv('data/ems_weather.csv')

## **Linear Regression Models**

In [20]:
df.head()

Unnamed: 0.1,Unnamed: 0,CAD_INCIDENT_ID,INITIAL_SEVERITY_LEVEL_CODE,FINAL_SEVERITY_LEVEL_CODE,FIRST_ASSIGNMENT_DATETIME,VALID_DISPATCH_RSPNS_TIME_INDC,DISPATCH_RESPONSE_SECONDS_QY,FIRST_ACTIVATION_DATETIME,VALID_INCIDENT_RSPNS_TIME_INDC,INCIDENT_RESPONSE_SECONDS_QY,...,CALL_MONTH_OCT,CALL_MONTH_SEP,CALL_MONTH,AWND,PRCP,SNOW,SNWD,TMAX,TMIN,TAVG_CALC
0,0,110011884,4,4,9.0,1,9,16.0,1,480.0,...,0,0,1,3.13,0.0,0.0,12.0,53.0,40.0,46.5
1,1,110013013,6,6,28.0,1,28,127.0,1,674.0,...,0,0,1,3.13,0.0,0.0,12.0,53.0,40.0,46.5
2,2,110014121,4,4,10.0,1,10,19.0,1,399.0,...,0,0,1,3.13,0.0,0.0,12.0,53.0,40.0,46.5
3,3,110015204,2,2,16.0,1,16,44.0,1,272.0,...,0,0,1,3.13,0.0,0.0,12.0,53.0,40.0,46.5
4,4,110020794,4,6,27.0,1,27,46.0,1,920.0,...,0,0,1,5.14,0.01,0.0,9.0,52.0,35.0,43.5


#### Setting up y and X (incl removing dispatch time)

In [4]:
y = df['INCIDENT_RESPONSE_SECONDS_QY']

features = df.columns
# definitely drop these:
drop_features = ['Unnamed: 0',
                 'INCIDENT_RESPONSE_SECONDS_QY', 
                 'CAD_INCIDENT_ID', 
                 'FIRST_ASSIGNMENT_DATETIME', 
                 'DISPATCH_RESPONSE_SECONDS_QY',
]
# suggest dropping these: 
drop_features += list(features[230:594]) # zip codes
drop_features += list(features[670:831]) # city council district, community district, community school dist, congressional district
drop_features += ['AWND']

X = df.drop(columns=drop_features)

#### Setting up new X (removing travel time instead of dispatch time)

In [5]:
# definitely drop these:
drop_features1 = ['Unnamed: 0',
                 'INCIDENT_RESPONSE_SECONDS_QY', 
                 'CAD_INCIDENT_ID', 
                 'FIRST_ASSIGNMENT_DATETIME', 
                 'INCIDENT_TRAVEL_TM_SECONDS_QY']
# suggest dropping these: 
drop_features1 += list(features[230:594]) # zip codes
drop_features1 += list(features[670:831]) # city council district, community district, community school dist, congressional district
drop_features1 += ['AWND']

X1 = df.drop(columns=drop_features1)

In [23]:
X.head()

Unnamed: 0,INITIAL_SEVERITY_LEVEL_CODE,FINAL_SEVERITY_LEVEL_CODE,VALID_DISPATCH_RSPNS_TIME_INDC,FIRST_ACTIVATION_DATETIME,VALID_INCIDENT_RSPNS_TIME_INDC,INCIDENT_TRAVEL_TM_SECONDS_QY,INCIDENT_CLOSE_DATETIME,HELD_INDICATOR,REOPEN_INDICATOR,SPECIAL_EVENT_INDICATOR,...,CALL_MONTH_NOV,CALL_MONTH_OCT,CALL_MONTH_SEP,CALL_MONTH,PRCP,SNOW,SNWD,TMAX,TMIN,TAVG_CALC
0,4,4,1,16.0,1,471.0,1583.0,0,0,0,...,0,0,0,1,0.0,0.0,12.0,53.0,40.0,46.5
1,6,6,1,127.0,1,646.0,48712.0,0,0,0,...,0,0,0,1,0.0,0.0,12.0,53.0,40.0,46.5
2,4,4,1,19.0,1,389.0,2654.0,0,0,0,...,0,0,0,1,0.0,0.0,12.0,53.0,40.0,46.5
3,2,2,1,44.0,1,256.0,591.0,0,0,0,...,0,0,0,1,0.0,0.0,12.0,53.0,40.0,46.5
4,4,6,1,46.0,1,893.0,5868.0,0,0,0,...,0,0,0,1,0.01,0.0,9.0,52.0,35.0,43.5


In [16]:
X1.head()

Unnamed: 0,INITIAL_SEVERITY_LEVEL_CODE,FINAL_SEVERITY_LEVEL_CODE,VALID_DISPATCH_RSPNS_TIME_INDC,DISPATCH_RESPONSE_SECONDS_QY,FIRST_ACTIVATION_DATETIME,VALID_INCIDENT_RSPNS_TIME_INDC,INCIDENT_CLOSE_DATETIME,HELD_INDICATOR,REOPEN_INDICATOR,SPECIAL_EVENT_INDICATOR,...,CALL_MONTH_NOV,CALL_MONTH_OCT,CALL_MONTH_SEP,CALL_MONTH,PRCP,SNOW,SNWD,TMAX,TMIN,TAVG_CALC
0,4,4,1,9,16.0,1,1583.0,0,0,0,...,0,0,0,1,0.0,0.0,12.0,53.0,40.0,46.5
1,6,6,1,28,127.0,1,48712.0,0,0,0,...,0,0,0,1,0.0,0.0,12.0,53.0,40.0,46.5
2,4,4,1,10,19.0,1,2654.0,0,0,0,...,0,0,0,1,0.0,0.0,12.0,53.0,40.0,46.5
3,2,2,1,16,44.0,1,591.0,0,0,0,...,0,0,0,1,0.0,0.0,12.0,53.0,40.0,46.5
4,4,6,1,27,46.0,1,5868.0,0,0,0,...,0,0,0,1,0.01,0.0,9.0,52.0,35.0,43.5


#### Train test split on both Xs

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

In [6]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y, test_size=.3)

#### Scale both Xs

In [25]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [26]:
sc1 = StandardScaler()
X_train_sc1 = sc1.fit_transform(X_train1)
X_test_sc1 = sc1.transform(X_test1)

### Linear Regression

In [54]:
linreg = LinearRegression()

linreg.fit(X_train_sc, y_train)

print(f'Training score: {linreg.score(X_train_sc, y_train)}')
print(f'Testing score: {linreg.score(X_test_sc, y_test)}')

Training score: 0.8155436514925168
Testing score: -1.2719641180298604e+25


In [28]:
linreg.coef_

array([ 1.26480095e+01,  1.24041366e-01, -3.65555211e+14,  1.92860365e+02,
        4.18604939e+14,  3.14591142e+02,  3.52846140e+00,  9.02812333e+01,
        5.61339771e+00,  5.53609736e+14,  8.95416530e+14, -2.10030317e+14,
        1.01445312e+01,  1.01074219e+00,  6.93750000e+00,  2.75087214e+14,
       -3.05664062e-01, -1.06576014e+14,  7.94921875e+00,  1.96854555e+14,
        8.04687500e+00,  1.09159162e+15,  1.72656250e+00,  9.70312500e+00,
        3.32617188e+00, -2.17122328e+14,  9.04687500e+00,  7.26562500e-01,
       -6.07964939e+14,  4.15395286e+14,  3.58886719e+00,  2.69921875e+00,
        2.03515625e+00, -1.38476562e+00,  4.85156250e+00,  2.11093750e+01,
        2.53515625e+00,  1.78710938e+00,  1.03320312e+01,  4.46422086e+14,
        8.90625000e-01,  1.24804688e+00,  2.53477193e+14, -6.67968750e-01,
        9.43359375e-01,  2.89843750e+00,  8.13476562e-01,  3.67187500e-01,
        8.25390625e+00,  2.47644215e+14,  3.84570312e+00, -5.81132201e+14,
        3.28906250e+00,  

Linear Regression did not do well most likely due to the huge volume of columns in our dataset. We are going to try Lasso and Ridge to regularize and reduce overfitting.

### Lasso

#### Fitting & scoring lasso models on both Xs

In [29]:
lasso = Lasso()

lasso.fit(X_train_sc, y_train)

print(f'Training: {lasso.score(X_train_sc, y_train)}')
print(f'Testing: {lasso.score(X_test_sc, y_test)}')

Training: 0.8153672724024563
Testing: 0.8368473327311271


In [34]:
preds = lasso.predict(X_test_sc)
mse = mean_squared_error(y_test, preds)
np.sqrt(mse)

225.62225731476641

In [40]:
lasso1 = Lasso()

lasso1.fit(X_train_sc1, y_train1)

print(f'Training: {lasso1.score(X_train_sc1, y_train1)}')
print(f'Testing: {lasso1.score(X_test_sc1, y_test1)}')

Training: 0.6333629288083515
Testing: -0.32140595507828484


Score was better using the first X (no dispatch time) vs the second X (no travel time). Have to decide how to handle travel time as it is highly correlated to response time.

#### Looking at features the Lasso models kept

In [37]:
features = X_train.columns

In [54]:
features1 = X_train1.columns

In [48]:
lasso_coefs = lasso.coef_

In [49]:
lasso1_coefs = lasso1.coef_

In [51]:
coef_df = pd.DataFrame({'features':features, 'coefficients':lasso_coefs})

In [55]:
coef_df1 = pd.DataFrame({'features':features1, 'coefficients':lasso1_coefs})

In [52]:
coef_df[coef_df['coefficients'] != 0]

Unnamed: 0,features,coefficients
0,INITIAL_SEVERITY_LEVEL_CODE,3.472085
1,FINAL_SEVERITY_LEVEL_CODE,0.449342
3,FIRST_ACTIVATION_DATETIME,193.018453
5,INCIDENT_TRAVEL_TM_SECONDS_QY,313.779920
6,INCIDENT_CLOSE_DATETIME,2.349293
...,...,...
312,CALL_MONTH_MAR,3.093972
316,CALL_MONTH_SEP,-3.338527
317,CALL_MONTH,-0.234961
318,PRCP,2.538173


In [58]:
coef_df1[coef_df1['coefficients'] != 0]

Unnamed: 0,features,coefficients
0,INITIAL_SEVERITY_LEVEL_CODE,35.901861
1,FINAL_SEVERITY_LEVEL_CODE,4.469111
3,DISPATCH_RESPONSE_SECONDS_QY,5.702355
4,FIRST_ACTIVATION_DATETIME,319.166665
6,INCIDENT_CLOSE_DATETIME,9.538882
...,...,...
314,CALL_MONTH_NOV,-0.779306
316,CALL_MONTH_SEP,2.454233
318,PRCP,4.373495
319,SNOW,4.841491


In [53]:
coef_df.to_csv('data/lasso_coefs.csv')

In [59]:
coef_df1.to_csv('data/lasso_coefs_no-travel-time.csv')

### Ridge

#### Fitting & scoring ridge models on first X (no dispatch time)

In [36]:
ridge = Ridge()

ridge.fit(X_train_sc, y_train)

print(f'Training: {ridge.score(X_train_sc, y_train)}')
print(f'Testing: {ridge.score(X_test_sc, y_test)}')

Training: 0.8160104388710144
Testing: 0.8371544566275841


In [56]:
preds = ridge.predict(X_test_sc)
mse = mean_squared_error(y_test, preds)
np.sqrt(mse)

225.40979793853137

In [24]:
ridge.coef_

array([-7.09507021e-12,  1.25536982e-11, -8.09927613e-08,  1.90851390e-07,
       -7.57802073e-10,  0.00000000e+00, -8.83723170e-10,  2.67956195e-09,
        0.00000000e+00, -7.52536490e-10,  1.15193135e-09,  1.94076051e-12,
        1.11935599e-07,  3.24807889e-07,  0.00000000e+00,  1.76488806e-07,
        2.75997629e-07, -6.84088092e-08,  0.00000000e+00,  1.14336279e-07,
       -1.70039261e-06,  1.37465962e-07,  0.00000000e+00, -8.79668974e-07,
        0.00000000e+00, -5.32456812e-07,  1.58475206e-06,  1.17857027e-06,
        5.89575939e-07, -1.08049929e-06,  2.60996751e-06,  7.78611146e-07,
        1.30834717e-07,  1.19882550e-06,  1.25492446e-07, -2.56271554e-06,
       -1.84260955e-06,  6.75613536e-08, -1.00928853e-06,  4.83664042e-07,
        2.54820323e-07,  2.03594277e-06,  3.05799652e-06, -3.91564489e-07,
        0.00000000e+00,  1.05013540e-07,  8.63530607e-07, -1.15731155e-06,
       -7.80555813e-07,  1.79524756e-06,  1.39671369e-06, -1.00721927e-06,
        1.68072720e-06,  

We got decent scores for Lasso and Ridge (~82% for train, ~84% for test). Next we're going to try hyperparamater tuning.

In [60]:
l_alphas = np.logspace(-3, 0, 100)

lasso_cv = LassoCV(alphas=[0.01, 0.1, 1], max_iter=10000, cv=5)

lasso_cv.fit(X_train_sc1, y_train1)

print(f'Training: {lasso_cv.score(X_train_sc1, y_train1)}')
print(f'Testing: {lasso_cv.score(X_test_sc1, y_test1)}')

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


Training: 0.6333629288083515
Testing: -0.32140595507828484
