In [19]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Allows plots to appear directly in the notebook.
%matplotlib inline

from patsy import dmatrices
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle

In [20]:
# Read CSV files into Data Frames:
weather = pd.read_csv('working_weatherHistory_asof_2022.03.26.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)
station = pd.read_csv('working_stationState_asof_2022.03.26.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)

In [21]:
weather.shape

(14443, 19)

In [22]:
station.shape

(1579820, 8)

In [23]:
weather.head()

Unnamed: 0,weatherTime,latitude,longitude,main,description,temp,feels_like,temp_min,temp_max,pressure,humidity,sea_level,grnd_level,wind_speed,wind_deg,wind_gust,clouds_all,country,name
0,2022-02-22 12:53:29,53.3498,-6.2603,Clouds,few clouds,282.81,278.75,281.86,283.69,1015,71,,,10.8,270,,20,IE,Mountjoy
1,2022-02-22 12:55:30,53.3498,-6.2603,Clouds,few clouds,282.81,278.75,281.86,283.69,1015,71,,,10.8,270,,20,IE,Mountjoy
2,2022-02-24 10:56:43,53.3515,-6.2553,Clouds,scattered clouds,276.31,270.37,275.31,276.99,1007,80,,,9.77,240,,40,IE,Mountjoy
3,2022-03-01 09:16:10,53.3498,-6.2603,Clear,clear sky,275.03,275.03,272.52,276.98,1030,91,,,1.03,0,,0,IE,Mountjoy
4,2022-03-01 09:30:03,53.3498,-6.2603,Clear,clear sky,276.8,276.8,273.53,279.76,1030,88,,,1.03,0,,0,IE,Mountjoy


In [24]:
station.head()

Unnamed: 0,ID,stationId,weatherTime,status,bike_stands,available_bike_stands,available_bikes,lastUpdate
0,2,1,2022-02-22 12:53:29,OPEN,30,12,18,1645534152000
1,3,2,2022-02-22 12:53:29,OPEN,20,18,2,1645534249000
2,4,3,2022-02-22 12:53:29,OPEN,33,20,13,1645534112000
3,5,4,2022-02-22 12:53:29,OPEN,35,22,13,1645534063000
4,6,5,2022-02-22 12:53:29,OPEN,40,38,2,1645533938000


In [25]:
stationWeather = pd.merge(station, weather, how="left", on=["weatherTime"])

In [26]:
stationWeather.head()

Unnamed: 0,ID,stationId,weatherTime,status,bike_stands,available_bike_stands,available_bikes,lastUpdate,latitude,longitude,...,pressure,humidity,sea_level,grnd_level,wind_speed,wind_deg,wind_gust,clouds_all,country,name
0,2,1,2022-02-22 12:53:29,OPEN,30,12,18,1645534152000,53.3498,-6.2603,...,1015,71,,,10.8,270,,20,IE,Mountjoy
1,3,2,2022-02-22 12:53:29,OPEN,20,18,2,1645534249000,53.3498,-6.2603,...,1015,71,,,10.8,270,,20,IE,Mountjoy
2,4,3,2022-02-22 12:53:29,OPEN,33,20,13,1645534112000,53.3498,-6.2603,...,1015,71,,,10.8,270,,20,IE,Mountjoy
3,5,4,2022-02-22 12:53:29,OPEN,35,22,13,1645534063000,53.3498,-6.2603,...,1015,71,,,10.8,270,,20,IE,Mountjoy
4,6,5,2022-02-22 12:53:29,OPEN,40,38,2,1645533938000,53.3498,-6.2603,...,1015,71,,,10.8,270,,20,IE,Mountjoy


In [27]:
stationWeather.tail()

Unnamed: 0,ID,stationId,weatherTime,status,bike_stands,available_bike_stands,available_bikes,lastUpdate,latitude,longitude,...,pressure,humidity,sea_level,grnd_level,wind_speed,wind_deg,wind_gust,clouds_all,country,name
1579815,1579817,106,2022-03-26 14:16:02,OPEN,20,20,0,1648303888000,53.3498,-6.2603,...,1034,57,,,4.63,70,,0,IE,Dublin
1579816,1579818,107,2022-03-26 14:16:02,OPEN,40,26,14,1648303696000,53.3498,-6.2603,...,1034,57,,,4.63,70,,0,IE,Dublin
1579817,1579819,108,2022-03-26 14:16:02,OPEN,40,19,21,1648304114000,53.3498,-6.2603,...,1034,57,,,4.63,70,,0,IE,Dublin
1579818,1579820,109,2022-03-26 14:16:02,OPEN,30,0,30,1648303984000,53.3498,-6.2603,...,1034,57,,,4.63,70,,0,IE,Dublin
1579819,1579821,110,2022-03-26 14:16:02,OPEN,30,23,7,1648303774000,53.3498,-6.2603,...,1034,57,,,4.63,70,,0,IE,Dublin


In [28]:
stationWeather.dtypes

ID                         int64
stationId                  int64
weatherTime               object
status                    object
bike_stands                int64
available_bike_stands      int64
available_bikes            int64
lastUpdate                 int64
latitude                 float64
longitude                float64
main                      object
description               object
temp                     float64
feels_like               float64
temp_min                 float64
temp_max                 float64
pressure                   int64
humidity                   int64
sea_level                float64
grnd_level               float64
wind_speed               float64
wind_deg                   int64
wind_gust                float64
clouds_all                 int64
country                   object
name                      object
dtype: object

In [29]:
# Checking correlations for all the contious features
stationWeather.corr()

Unnamed: 0,ID,stationId,bike_stands,available_bike_stands,available_bikes,lastUpdate,latitude,longitude,temp,feels_like,temp_min,temp_max,pressure,humidity,sea_level,grnd_level,wind_speed,wind_deg,wind_gust,clouds_all
ID,1.0,6.962531e-05,-5.733909e-06,-0.007876,-0.007164,0.999885,-0.0521985,-0.06669467,0.5787783,0.6416315,0.5107304,0.6217527,0.360961,-0.2465816,,,-0.2281555,-0.1976061,-0.2247464,-0.285643
stationId,7e-05,1.0,-0.0823538,-0.012317,-0.040503,-4e-06,4.756277e-14,-1.573345e-15,-2.948827e-14,-3.429772e-14,-2.511134e-14,-3.306301e-14,-2.949105e-14,1.67529e-14,,,1.692705e-14,1.000546e-14,5.977285e-15,2.217877e-14
bike_stands,-6e-06,-0.0823538,1.0,0.546971,0.192535,3e-06,-5.844746e-15,1.084427e-15,1.165076e-15,-8.706614e-16,1.861637e-15,4.62114e-16,-1.675242e-14,-1.5764640000000002e-17,,,1.066009e-14,6.659596e-15,-8.880239e-15,3.658268e-15
available_bike_stands,-0.007876,-0.01231685,0.546971,1.0,-0.671588,-0.007862,-0.002447613,-0.001601674,0.01836374,0.01576218,0.01727147,0.02036356,-0.005737444,-0.01200775,,,0.001521675,-0.01157648,-0.001080894,0.002167116
available_bikes,-0.007164,-0.04050273,0.1925351,-0.671588,1.0,-0.0072,0.0004607384,0.0007861884,-0.02494123,-0.02412586,-0.02416974,-0.02496945,-0.01310691,0.02577782,,,-0.002658852,0.001700013,0.007430698,0.002680608
lastUpdate,0.999885,-4.131345e-06,3.368798e-06,-0.007862,-0.0072,1.0,-0.05217601,-0.06659996,0.5786581,0.6414084,0.510277,0.6218825,0.362919,-0.2461845,,,-0.2276228,-0.1988107,-0.225123,-0.2866757
latitude,-0.052198,4.756277e-14,-5.844746e-15,-0.002448,0.000461,-0.052176,1.0,0.5630411,-0.04246294,-0.04614369,-0.02645525,-0.05306859,-0.03616734,0.00671859,,,0.04056297,0.01821222,0.009552176,0.05506622
longitude,-0.066695,-1.573345e-15,1.084427e-15,-0.001602,0.000786,-0.0666,0.5630411,1.0,-0.04560366,-0.05586599,-0.02885004,-0.05302979,-0.04261718,0.02218992,,,0.06706614,0.002985883,0.07657471,0.05874112
temp,0.578778,-2.948827e-14,1.165076e-15,0.018364,-0.024941,0.578658,-0.04246294,-0.04560366,1.0,0.9630745,0.968782,0.9684643,0.1030317,-0.5652473,,,0.1484113,-0.1834336,0.08721784,-0.03759393
feels_like,0.641632,-3.429772e-14,-8.706614e-16,0.015762,-0.024126,0.641408,-0.04614369,-0.05586599,0.9630745,1.0,0.9053898,0.9574601,0.2141231,-0.5385302,,,-0.0637046,-0.1876907,-0.2633662,-0.1171979


# Multiple Linear Regression

In [30]:
X = stationWeather[['temp', 'feels_like', 'humidity', 'wind_speed']]
y = stationWeather.available_bikes

<h2>Training</h2>

In [31]:
linreg = LinearRegression().fit(X, y)

# Weights for each Feature
print("Features: \n", X)
print("Coeficients: \n", linreg.coef_)
print("\nIntercept: \n", linreg.intercept_)

feature_importance = pd.DataFrame({'feature': ['temp', 'feels_like', 'humidity', 'wind_speed'], 'importance':linreg.coef_})
feature_importance.sort_values('importance', ascending=False)

Features: 
            temp  feels_like  humidity  wind_speed
0        282.81      278.75        71       10.80
1        282.81      278.75        71       10.80
2        282.81      278.75        71       10.80
3        282.81      278.75        71       10.80
4        282.81      278.75        71       10.80
...         ...         ...       ...         ...
1579815  288.55      287.63        57        4.63
1579816  288.55      287.63        57        4.63
1579817  288.55      287.63        57        4.63
1579818  288.55      287.63        57        4.63
1579819  288.55      287.63        57        4.63

[1579820 rows x 4 columns]
Coeficients: 
 [-0.05920869  0.01217033  0.01527799  0.01029389]

Intercept: 
 24.548652433756217


Unnamed: 0,feature,importance
2,humidity,0.015278
1,feels_like,0.01217
3,wind_speed,0.010294
0,temp,-0.059209


In [32]:
# Serialize model object into a file called model.pkl on disk using pickle
with open('dwmb_linReg_model.pkl', 'wb') as handle:
    pickle.dump(linreg, handle, pickle.HIGHEST_PROTOCOL)

<h2>Testing</h2>

In [33]:
linreg_predictions = linreg.predict(X)
print(type(linreg_predictions))

print("\nPredictions with linear regression: \n")
actual_vs_predicted_linreg = pd.concat([y, pd.DataFrame(linreg_predictions, columns=['Predicted'], index=y.index)], axis=1)
print(actual_vs_predicted_linreg)

<class 'numpy.ndarray'>

Predictions with linear regression: 

         available_bikes  Predicted
0                     18  12.392234
1                      2  12.392234
2                     13  12.392234
3                     13  12.392234
4                      2  12.392234
...                  ...        ...
1579815                0  11.883044
1579816               14  11.883044
1579817               21  11.883044
1579818               30  11.883044
1579819                7  11.883044

[1579820 rows x 2 columns]


In [45]:
print(X.columns)
small_linreg = [X.iloc[0]]
Small_linreg_predictions = linreg.predict(small_linreg)
print(type(linreg_predictions))

Index(['temp', 'feels_like', 'humidity', 'wind_speed'], dtype='object')
<class 'numpy.ndarray'>


In [46]:
# Prototyping the results for /predict
# convert numpy array to dictionary
temp_results = dict(enumerate(Small_linreg_predictions.flatten(), 1))

# print numpy array
print(Small_linreg_predictions)
print(type(Small_linreg_predictions))
 
# print dictionary
print(temp_results)
print(type(temp_results))

[12.39223438]
<class 'numpy.ndarray'>
{1: 12.392234375734047}
<class 'dict'>


In [16]:
prediction_errors = y - linreg_predictions
print("Actual - Predicted:\n", prediction_errors)
print("\n(Actual - Predicted) squared:\n", prediction_errors**2)
print("\n Sum of (Actual - Predicted) squared:\n", (prediction_errors**2).sum())

Actual - Predicted:
 0           5.607766
1         -10.392234
2           0.607766
3           0.607766
4         -10.392234
             ...    
1579815   -11.883044
1579816     2.116956
1579817     9.116956
1579818    18.116956
1579819    -4.883044
Name: available_bikes, Length: 1579820, dtype: float64

(Actual - Predicted) squared:
 0           31.447035
1          107.998535
2            0.369379
3            0.369379
4          107.998535
              ...    
1579815    141.206731
1579816      4.481503
1579817     83.118890
1579818    328.224100
1579819     23.844117
Name: available_bikes, Length: 1579820, dtype: float64

 Sum of (Actual - Predicted) squared:
 127029972.69695903


In [17]:
# Mean Squared Error
mse = (prediction_errors** 2).mean()
rmse = ((prediction_errors** 2).mean())**0.5

print("\nMean Squared Error:\n", mse)
print("\nRoot Mean Squared Error:\n", rmse)


Mean Squared Error:
 80.40787728789726

Root Mean Squared Error:
 8.967043954832455


In [18]:
print("|Actual - Predicted|:\n", abs(prediction_errors))

|Actual - Predicted|:
 0           5.607766
1          10.392234
2           0.607766
3           0.607766
4          10.392234
             ...    
1579815    11.883044
1579816     2.116956
1579817     9.116956
1579818    18.116956
1579819     4.883044
Name: available_bikes, Length: 1579820, dtype: float64


In [19]:
# Mean Absolute Error
mae = abs(prediction_errors).mean()
print("\nMean Absolute Error:\n", mae)


Mean Absolute Error:
 7.361837571001831


In [20]:
# R2 Score
prediction_errors = y - linreg_predictions
print("Actual - Predicted:\n", prediction_errors)
print("\n(Actual - Predicted) squared:\n", prediction_errors**2)
print("\n Sum of squared errors:\n", sum(prediction_errors**2))

avg_predictions =np.ones(y.shape[0])* stationWeather.available_bikes.mean()

print("\nAverageModelPredictions:\n", avg_predictions)
avgpredictions_errors = y - avg_predictions
print("Actual - AvgPredictions:\n", avgpredictions_errors)
print("\n(Actual - AvgPredictions) squared:\n", avgpredictions_errors**2)
print("\n Total sum of squared errors:\n", sum(avgpredictions_errors**2))

r2 = 1 - sum(prediction_errors**2)/sum(avgpredictions_errors**2)
print("\n R2:\n", r2)

Actual - Predicted:
 0           5.607766
1         -10.392234
2           0.607766
3           0.607766
4         -10.392234
             ...    
1579815   -11.883044
1579816     2.116956
1579817     9.116956
1579818    18.116956
1579819    -4.883044
Name: available_bikes, Length: 1579820, dtype: float64

(Actual - Predicted) squared:
 0           31.447035
1          107.998535
2            0.369379
3            0.369379
4          107.998535
              ...    
1579815    141.206731
1579816      4.481503
1579817     83.118890
1579818    328.224100
1579819     23.844117
Name: available_bikes, Length: 1579820, dtype: float64

 Sum of squared errors:
 127029972.69696586

AverageModelPredictions:
 [12.50698181 12.50698181 12.50698181 ... 12.50698181 12.50698181
 12.50698181]
Actual - AvgPredictions:
 0           5.493018
1         -10.506982
2           0.493018
3           0.493018
4         -10.506982
             ...    
1579815   -12.506982
1579816     1.493018
1579817     8.49301

In [21]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print('\n==============================================================================')
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    #print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [22]:
printMetrics(y, linreg_predictions)


MAE:  7.3618375710012485
RMSE:  8.967043954832214
R2:  0.0008265485304528308
