In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

In [2]:
# Read the combined calls/weather table from the project's data directory.
# The path is relative, so the notebook must be run from the repository root
# (or wherever the `data/` folder lives).
csv_path = 'data/calls_weather_tfk.csv'
df = pd.read_csv(csv_path)

In [4]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0.1,Unnamed: 0,year,month,day,hour,num_calls,BRONX,BROOKLYN,MANHATTAN,QUEENS,...,STATION,NAME,DATE,PRCP,SNOW,SNWD,TMAX,TMIN,TAVG_CALC,Incidences
0,0,2010,1,2,2,93,25,23,20,20,...,USW00094728,"NY CITY CENTRAL PARK, NY US",2010-01-02,0.02,0.2,0.0,34.0,17.0,25.5,1
1,1,2010,1,2,3,88,22,28,19,15,...,USW00094728,"NY CITY CENTRAL PARK, NY US",2010-01-02,0.02,0.2,0.0,34.0,17.0,25.5,1
2,2,2010,1,3,12,144,43,42,33,26,...,USW00094728,"NY CITY CENTRAL PARK, NY US",2010-01-03,0.0,0.0,0.0,22.0,17.0,19.5,1
3,3,2010,1,4,1,94,23,34,20,11,...,USW00094728,"NY CITY CENTRAL PARK, NY US",2010-01-04,0.0,0.0,0.0,30.0,19.0,24.5,2
4,4,2010,1,4,10,219,53,67,54,34,...,USW00094728,"NY CITY CENTRAL PARK, NY US",2010-01-04,0.0,0.0,0.0,30.0,19.0,24.5,2


In [7]:
# List every column with its non-null count and dtype to spot missing values
# and non-numeric columns before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25241 entries, 0 to 25240
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                25241 non-null  int64  
 1   year                      25241 non-null  int64  
 2   month                     25241 non-null  int64  
 3   day                       25241 non-null  int64  
 4   hour                      25241 non-null  int64  
 5   num_calls                 25241 non-null  int64  
 6   BRONX                     25241 non-null  int64  
 7   BROOKLYN                  25241 non-null  int64  
 8   MANHATTAN                 25241 non-null  int64  
 9   QUEENS                    25241 non-null  int64  
 10  RICHMOND / STATEN ISLAND  25241 non-null  int64  
 11  UNKNOWN                   25241 non-null  int64  
 12  STATION                   25241 non-null  object 
 13  NAME                      25241 non-null  object 
 14  DATE  

In [9]:
# Remove columns that are not used as model inputs:
#   - 'Unnamed: 0' looks like a row index saved into the CSV (presumably from an
#     earlier to_csv call; verify against the data-prep step),
#   - 'STATION' / 'NAME' are object-typed station identifiers,
#   - 'DATE' repeats what the year/month/day columns already encode.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: running it a second time is a no-op rather than a KeyError.
df = df.drop(columns=['Unnamed: 0', 'STATION', 'NAME', 'DATE'], errors='ignore')

In [19]:
# Show the frame again to confirm which columns remain after the drop
# (pandas truncates the display to the first and last rows).
df

Unnamed: 0,year,month,day,hour,num_calls,BRONX,BROOKLYN,MANHATTAN,QUEENS,RICHMOND / STATEN ISLAND,UNKNOWN,PRCP,SNOW,SNWD,TMAX,TMIN,TAVG_CALC,Incidences
0,2010,1,2,2,93,25,23,20,20,5,0,0.02,0.2,0.0,34.0,17.0,25.5,1
1,2010,1,2,3,88,22,28,19,15,4,0,0.02,0.2,0.0,34.0,17.0,25.5,1
2,2010,1,3,12,144,43,42,33,26,0,0,0.00,0.0,0.0,22.0,17.0,19.5,1
3,2010,1,4,1,94,23,34,20,11,6,0,0.00,0.0,0.0,30.0,19.0,24.5,2
4,2010,1,4,10,219,53,67,54,34,11,0,0.00,0.0,0.0,30.0,19.0,24.5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25236,2016,8,23,20,221,59,59,50,44,9,0,0.00,0.0,0.0,82.0,61.0,71.5,13
25237,2016,8,23,21,213,51,55,55,44,8,0,0.00,0.0,0.0,82.0,61.0,71.5,8
25238,2016,8,23,22,176,54,44,42,29,7,0,0.00,0.0,0.0,82.0,61.0,71.5,11
25239,2016,8,23,23,152,35,43,39,31,4,0,0.00,0.0,0.0,82.0,61.0,71.5,8


In [24]:
# Target is the total call count per row. The per-borough counts are removed
# from the features as well: in the rows displayed above they add up to
# num_calls, so keeping them would leak the target into the inputs.
target_col = 'num_calls'
borough_cols = ['BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'RICHMOND / STATEN ISLAND', 'UNKNOWN']

y = df[target_col]
X = df.drop(columns=[target_col, *borough_cols])

In [25]:
# Random train/test partition using scikit-learn's default split proportion;
# the fixed seed makes the partition identical on every run.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [26]:
# Standardize the features. The scaler learns its mean/variance from the
# training split only and the same statistics are then applied to both splits,
# so no information from the test set reaches the fitted transform.
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

### Linear Regression

In [27]:
# Baseline: ordinary least squares on the standardized features.
lr = LinearRegression().fit(X_train_sc, y_train)

# (train R^2, test R^2) -- .score() on a regressor returns the coefficient of determination.
tuple(lr.score(features, target) for features, target in ((X_train_sc, y_train), (X_test_sc, y_test)))

(0.486040721285123, 0.4753389745647403)

### Support Vector Machines

In [33]:
# Linear support vector regressor on the standardized features.
# LinearSVR's solver uses a pseudo-random number generator, so without a seed
# the fitted coefficients (and the scores below) can change between runs.
# The seed matches the one used for the train/test split.
svr = LinearSVR(random_state=42)
svr.fit(X_train_sc, y_train)

# (train R^2, test R^2)
svr.score(X_train_sc, y_train), svr.score(X_test_sc, y_test)

(0.4774411388863039, 0.46537026788389635)

### AdaBoost Regressor

In [34]:
# AdaBoost ensemble with scikit-learn's default base estimator and settings.
# Boosting resamples the training data at each round, so the estimator is
# seeded to make the reported scores reproducible on a fresh kernel.
abr = AdaBoostRegressor(random_state=42)
abr.fit(X_train_sc, y_train)

# (train R^2, test R^2)
abr.score(X_train_sc, y_train), abr.score(X_test_sc, y_test)

(0.662246359452032, 0.6570242277228019)

### Random Forest Regressor

In [36]:
# Random forest with default hyperparameters (trees are grown without a depth limit).
# Bootstrapping and feature sub-sampling are random, so the forest is seeded
# to make the reported scores reproducible on a fresh kernel.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_sc, y_train)

# (train R^2, test R^2) -- a large gap between the two indicates overfitting.
rf.score(X_train_sc, y_train), rf.score(X_test_sc, y_test)

(0.9740959255797729, 0.8099825603558211)

In [39]:
# Hyperparameter search for the random forest: 3 forest sizes x 4 depth limits,
# each evaluated with 5-fold cross-validation on the training split.
rf_param = {
    'n_estimators': [200, 250, 300],
    'max_depth': [2, 3, 4, 5]
}

# A fresh, seeded base estimator is passed in so the search neither depends on
# the state of the `rf` object from the previous cell nor changes between runs.
# n_jobs=-1 spreads the 12 x 5 fits over all CPU cores; it does not alter results.
# NOTE(review): the recorded best max_depth is the largest value in this grid,
# so deeper trees (or max_depth=None) may be worth adding -- TODO confirm.
gs = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=rf_param,
    cv=5,
    n_jobs=-1,
)
gs.fit(X_train_sc, y_train)

# (train R^2, test R^2) of the best model refit on the whole training split.
gs.score(X_train_sc, y_train), gs.score(X_test_sc, y_test)

(0.7921547036032153, 0.7786378774864823)

In [40]:
# Mean cross-validated score of the winning parameter combination (printed),
# followed by the combination itself (shown as the cell's output).
best_cv_score = gs.best_score_
print(best_cv_score)
gs.best_params_

0.7881969183291739


{'max_depth': 5, 'n_estimators': 200}

In [42]:
# Root-mean-squared error of the tuned model on the held-out split,
# expressed in the same units as the target (number of calls).
preds = gs.predict(X_test_sc)
test_mse = mean_squared_error(y_test, preds)
np.sqrt(test_mse)

24.270542279490748