In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Allows plots to appear directly in the notebook.
%matplotlib inline

from patsy import dmatrices
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle

In [2]:
# Read CSV files into Data Frames:
weather = pd.read_csv('working_weatherHistoryResampled_asof_2022.04.06.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)
station = pd.read_csv('working_stationStateResampled_asof_2022.04.06.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)

In [3]:
weather.shape

(684, 7)

In [4]:
station.shape

(75240, 6)

In [5]:
weather.head()

Unnamed: 0,weatherHour,latitude,longitude,description,temp,humidity,wind_speed
0,2022-02-22 12:00:00,53.35,-6.26,few clouds,282.81,71,10.8
1,2022-02-24 10:00:00,53.35,-6.26,scattered clouds,276.31,80,9.77
2,2022-03-01 09:00:00,53.35,-6.26,clear sky,277.21,86,0.98
3,2022-03-01 10:00:00,53.35,-6.26,scattered clouds,279.13,83,1.78
4,2022-03-01 11:00:00,53.35,-6.26,broken clouds,280.61,80,3.65


In [6]:
station.head()

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes
0,143661,1,2022-02-22 12:00:00,OPEN,12,18
1,143662,2,2022-02-22 12:00:00,OPEN,18,2
2,143663,3,2022-02-22 12:00:00,OPEN,20,13
3,143664,4,2022-02-22 12:00:00,OPEN,22,13
4,143665,5,2022-02-22 12:00:00,OPEN,38,2


In [7]:
stationWeather = pd.merge(station, weather, how="left", on=["weatherHour"])

In [8]:
stationWeather.head()

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed
0,143661,1,2022-02-22 12:00:00,OPEN,12,18,53.35,-6.26,few clouds,282.81,71,10.8
1,143662,2,2022-02-22 12:00:00,OPEN,18,2,53.35,-6.26,few clouds,282.81,71,10.8
2,143663,3,2022-02-22 12:00:00,OPEN,20,13,53.35,-6.26,few clouds,282.81,71,10.8
3,143664,4,2022-02-22 12:00:00,OPEN,22,13,53.35,-6.26,few clouds,282.81,71,10.8
4,143665,5,2022-02-22 12:00:00,OPEN,38,2,53.35,-6.26,few clouds,282.81,71,10.8


In [9]:
stationWeather.tail()

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed
75235,218896,106,2022-04-04 10:00:00,OPEN,3,17,53.35,-6.26,broken clouds,285.57,86,10.84
75236,218897,107,2022-04-04 10:00:00,OPEN,29,11,53.35,-6.26,broken clouds,285.57,86,10.84
75237,218898,108,2022-04-04 10:00:00,OPEN,37,3,53.35,-6.26,broken clouds,285.57,86,10.84
75238,218899,109,2022-04-04 10:00:00,OPEN,4,24,53.35,-6.26,broken clouds,285.57,86,10.84
75239,218900,110,2022-04-04 10:00:00,OPEN,28,2,53.35,-6.26,broken clouds,285.57,86,10.84


In [10]:
stationWeather.dtypes

ID                         int64
stationId                  int64
weatherHour               object
status                    object
available_bike_stands      int64
available_bikes            int64
latitude                 float64
longitude                float64
description               object
temp                     float64
humidity                   int64
wind_speed               float64
dtype: object

In [12]:
stationWeather[['weatherDate', 'weatherHour']] = stationWeather['weatherHour'].str.split(' ', expand=True)
stationWeather

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed,weatherDate
0,143661,1,12:00:00,OPEN,12,18,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22
1,143662,2,12:00:00,OPEN,18,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22
2,143663,3,12:00:00,OPEN,20,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22
3,143664,4,12:00:00,OPEN,22,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22
4,143665,5,12:00:00,OPEN,38,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75235,218896,106,10:00:00,OPEN,3,17,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04
75236,218897,107,10:00:00,OPEN,29,11,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04
75237,218898,108,10:00:00,OPEN,37,3,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04
75238,218899,109,10:00:00,OPEN,4,24,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04


In [15]:
stationWeather[['weatherHour', 'weatherMinute', 'weatherSecond']] = stationWeather['weatherHour'].str.split(':', expand=True)
stationWeather

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed,weatherDate,weatherMinute,weatherSecond
0,143661,1,12,OPEN,12,18,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00
1,143662,2,12,OPEN,18,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00
2,143663,3,12,OPEN,20,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00
3,143664,4,12,OPEN,22,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00
4,143665,5,12,OPEN,38,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75235,218896,106,10,OPEN,3,17,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00
75236,218897,107,10,OPEN,29,11,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00
75237,218898,108,10,OPEN,37,3,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00
75238,218899,109,10,OPEN,4,24,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00


In [34]:
stationWeather['cal_bike_stands'] = stationWeather.available_bike_stands + stationWeather.available_bikes
stationWeather

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed,weatherDate,weatherMinute,weatherSecond,cal_bike_stands
0,143661,1,12,OPEN,12,18,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,30
1,143662,2,12,OPEN,18,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,20
2,143663,3,12,OPEN,20,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,33
3,143664,4,12,OPEN,22,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,35
4,143665,5,12,OPEN,38,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75235,218896,106,10,OPEN,3,17,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,20
75236,218897,107,10,OPEN,29,11,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,40
75237,218898,108,10,OPEN,37,3,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,40
75238,218899,109,10,OPEN,4,24,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,28


In [35]:
# Change description to category type for numeric encoding
stationWeather['description'] = stationWeather['description'].astype('category')
stationWeather.dtypes

ID                          int64
stationId                   int64
weatherHour                object
status                     object
available_bike_stands       int64
available_bikes             int64
latitude                  float64
longitude                 float64
description              category
temp                      float64
humidity                    int64
wind_speed                float64
weatherDate                object
weatherMinute              object
weatherSecond              object
cal_bike_stands             int64
dtype: object

In [37]:
# Catergorical Encoding
stationWeather['num_desc'] = stationWeather['description'].cat.codes
stationWeather

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed,weatherDate,weatherMinute,weatherSecond,cal_bike_stands,num_desc
0,143661,1,12,OPEN,12,18,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,30,2
1,143662,2,12,OPEN,18,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,20,2
2,143663,3,12,OPEN,20,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,33,2
3,143664,4,12,OPEN,22,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,35,2
4,143665,5,12,OPEN,38,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,40,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75235,218896,106,10,OPEN,3,17,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,20,0
75236,218897,107,10,OPEN,29,11,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,40,0
75237,218898,108,10,OPEN,37,3,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,40,0
75238,218899,109,10,OPEN,4,24,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,28,0


In [38]:
# Checking correlations for all the contious features
stationWeather.corr()

Unnamed: 0,ID,stationId,available_bike_stands,available_bikes,latitude,longitude,temp,humidity,wind_speed,cal_bike_stands,num_desc
ID,1.0,0.001461928,0.000109,-0.002508,,,0.2304128,-0.1411975,-0.2389431,-0.002657,0.008602108
stationId,0.001462,1.0,-0.012042,-0.042931,,,8.552269e-16,-6.187358e-16,-2.477227e-16,-0.064043,2.009989e-16
available_bike_stands,0.000109,-0.01204187,1.0,-0.680484,,,0.01946609,-0.01353588,0.001009944,0.57403,-0.001162099
available_bikes,-0.002508,-0.04293149,-0.680484,1.0,,,-0.02569532,0.02459045,-0.004632753,0.209393,-0.003387794
latitude,,,,,,,,,,,
longitude,,,,,,,,,,,
temp,0.230413,8.552269e-16,0.019466,-0.025695,,,1.0,-0.467591,0.1368502,-0.002737,-0.03380526
humidity,-0.141197,-6.187358e-16,-0.013536,0.02459,,,-0.467591,1.0,-0.1789565,0.009416,-0.04910726
wind_speed,-0.238943,-2.477227e-16,0.00101,-0.004633,,,0.1368502,-0.1789565,1.0,-0.003829,0.01195065
cal_bike_stands,-0.002657,-0.06404345,0.57403,0.209393,,,-0.002737095,0.009415986,-0.003829206,1.0,-0.005336483


# Multiple Linear Regression

In [39]:
X = stationWeather[['stationId', 'weatherHour', 'temp', 'humidity', 'wind_speed', 'cal_bike_stands', 'num_desc']]
y = stationWeather.available_bikes

<h2>Training</h2>

In [40]:
linreg = LinearRegression().fit(X, y)

# Weights for each Feature
print("Features: \n", X)
print("Coeficients: \n", linreg.coef_)
print("\nIntercept: \n", linreg.intercept_)

feature_importance = pd.DataFrame({'feature': ['stationId', 'weatherHour', 'temp', 'humidity', 'wind_speed', 'cal_bike_stands', 'num_desc'], 'importance':linreg.coef_})
feature_importance.sort_values('importance', ascending=False)

Features: 
        stationId weatherHour    temp  humidity  wind_speed  cal_bike_stands  \
0              1          12  282.81        71       10.80               30   
1              2          12  282.81        71       10.80               20   
2              3          12  282.81        71       10.80               33   
3              4          12  282.81        71       10.80               35   
4              5          12  282.81        71       10.80               40   
...          ...         ...     ...       ...         ...              ...   
75235        106          10  285.57        86       10.84               20   
75236        107          10  285.57        86       10.84               40   
75237        108          10  285.57        86       10.84               40   
75238        109          10  285.57        86       10.84               28   
75239        110          10  285.57        86       10.84               30   

       num_desc  
0             2  
1  

Unnamed: 0,feature,importance
5,cal_bike_stands,0.231645
3,humidity,0.012002
4,wind_speed,0.006122
1,weatherHour,0.00504
6,num_desc,-0.004279
0,stationId,-0.008338
2,temp,-0.052692


In [41]:
# Serialize model object into a file called model.pkl on disk using pickle
with open('dwmb_resample_linReg_model.pkl', 'wb') as handle:
    pickle.dump(linreg, handle, pickle.HIGHEST_PROTOCOL)

<h2>Testing</h2>

In [42]:
linreg_predictions = linreg.predict(X)
print(type(linreg_predictions))

print("\nPredictions with linear regression: \n")
actual_vs_predicted_linreg = pd.concat([y, pd.DataFrame(linreg_predictions, columns=['Predicted'], index=y.index)], axis=1)
print(actual_vs_predicted_linreg)

<class 'numpy.ndarray'>

Predictions with linear regression: 

       available_bikes  Predicted
0                   18  12.461936
1                    2  10.137147
2                   13  13.140196
3                   13  13.595148
4                    2  14.745036
...                ...        ...
75235               17   9.303346
75236               11  13.927910
75237                3  13.919572
75238               24  11.131493
75239                2  11.586446

[75240 rows x 2 columns]


In [43]:
print(X.columns)
small_linreg = [X.iloc[0]]
Small_linreg_predictions = linreg.predict(small_linreg)
print(type(linreg_predictions))

Index(['stationId', 'weatherHour', 'temp', 'humidity', 'wind_speed',
       'cal_bike_stands', 'num_desc'],
      dtype='object')
<class 'numpy.ndarray'>


  return f(*args, **kwargs)


In [44]:
# Prototyping the results for /predict
# convert numpy array to dictionary
temp_results = dict(enumerate(Small_linreg_predictions.flatten(), 1))

# print numpy array
print(Small_linreg_predictions)
print(type(Small_linreg_predictions))
 
# print dictionary
print(temp_results)
print(type(temp_results))

[12.4619359]
<class 'numpy.ndarray'>
{1: 12.46193590044821}
<class 'dict'>


In [45]:
prediction_errors = y - linreg_predictions
print("Actual - Predicted:\n", prediction_errors)
print("\n(Actual - Predicted) squared:\n", prediction_errors**2)
print("\n Sum of (Actual - Predicted) squared:\n", (prediction_errors**2).sum())

Actual - Predicted:
 0         5.538064
1        -8.137147
2        -0.140196
3        -0.595148
4       -12.745036
           ...    
75235     7.696654
75236    -2.927910
75237   -10.919572
75238    12.868507
75239    -9.586446
Name: available_bikes, Length: 75240, dtype: float64

(Actual - Predicted) squared:
 0         30.670154
1         66.213167
2          0.019655
3          0.354201
4        162.435942
            ...    
75235     59.238485
75236      8.572656
75237    119.237057
75238    165.598460
75239     91.899946
Name: available_bikes, Length: 75240, dtype: float64

 Sum of (Actual - Predicted) squared:
 5723513.4247439075


In [46]:
# Mean Squared Error
mse = (prediction_errors** 2).mean()
rmse = ((prediction_errors** 2).mean())**0.5

print("\nMean Squared Error:\n", mse)
print("\nRoot Mean Squared Error:\n", rmse)


Mean Squared Error:
 76.0700880481648

Root Mean Squared Error:
 8.72181678597784


In [47]:
print("|Actual - Predicted|:\n", abs(prediction_errors))

|Actual - Predicted|:
 0         5.538064
1         8.137147
2         0.140196
3         0.595148
4        12.745036
           ...    
75235     7.696654
75236     2.927910
75237    10.919572
75238    12.868507
75239     9.586446
Name: available_bikes, Length: 75240, dtype: float64


In [48]:
# Mean Absolute Error
mae = abs(prediction_errors).mean()
print("\nMean Absolute Error:\n", mae)


Mean Absolute Error:
 7.19919414900611


In [49]:
# R2 Score
prediction_errors = y - linreg_predictions
print("Actual - Predicted:\n", prediction_errors)
print("\n(Actual - Predicted) squared:\n", prediction_errors**2)
print("\n Sum of squared errors:\n", sum(prediction_errors**2))

avg_predictions =np.ones(y.shape[0])* stationWeather.available_bikes.mean()

print("\nAverageModelPredictions:\n", avg_predictions)
avgpredictions_errors = y - avg_predictions
print("Actual - AvgPredictions:\n", avgpredictions_errors)
print("\n(Actual - AvgPredictions) squared:\n", avgpredictions_errors**2)
print("\n Total sum of squared errors:\n", sum(avgpredictions_errors**2))

r2 = 1 - sum(prediction_errors**2)/sum(avgpredictions_errors**2)
print("\n R2:\n", r2)

Actual - Predicted:
 0         5.538064
1        -8.137147
2        -0.140196
3        -0.595148
4       -12.745036
           ...    
75235     7.696654
75236    -2.927910
75237   -10.919572
75238    12.868507
75239    -9.586446
Name: available_bikes, Length: 75240, dtype: float64

(Actual - Predicted) squared:
 0         30.670154
1         66.213167
2          0.019655
3          0.354201
4        162.435942
            ...    
75235     59.238485
75236      8.572656
75237    119.237057
75238    165.598460
75239     91.899946
Name: available_bikes, Length: 75240, dtype: float64

 Sum of squared errors:
 5723513.42474392

AverageModelPredictions:
 [12.5333865 12.5333865 12.5333865 ... 12.5333865 12.5333865 12.5333865]
Actual - AvgPredictions:
 0         5.466614
1       -10.533386
2         0.466614
3         0.466614
4       -10.533386
           ...    
75235     4.466614
75236    -1.533386
75237    -9.533386
75238    11.466614
75239   -10.533386
Name: available_bikes, Length: 7524

In [50]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print('\n==============================================================================')
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    #print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [51]:
printMetrics(y, linreg_predictions)


MAE:  7.19919414900611
RMSE:  8.721816785977829
R2:  0.04552179282340796
