In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Allows plots to appear directly in the notebook.
%matplotlib inline

from patsy import dmatrices
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle

In [2]:
# Read CSV files into Data Frames:
weather = pd.read_csv('working_weatherHistoryResampled_asof_2022.04.06.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)
station = pd.read_csv('working_stationStateResampled_asof_2022.04.06.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)

In [3]:
weather.shape

(684, 7)

In [4]:
station.shape

(75240, 6)

In [5]:
weather.head()

Unnamed: 0,weatherHour,latitude,longitude,description,temp,humidity,wind_speed
0,2022-02-22 12:00:00,53.35,-6.26,few clouds,282.81,71,10.8
1,2022-02-24 10:00:00,53.35,-6.26,scattered clouds,276.31,80,9.77
2,2022-03-01 09:00:00,53.35,-6.26,clear sky,277.21,86,0.98
3,2022-03-01 10:00:00,53.35,-6.26,scattered clouds,279.13,83,1.78
4,2022-03-01 11:00:00,53.35,-6.26,broken clouds,280.61,80,3.65


In [6]:
station.head()

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes
0,143661,1,2022-02-22 12:00:00,OPEN,12,18
1,143662,2,2022-02-22 12:00:00,OPEN,18,2
2,143663,3,2022-02-22 12:00:00,OPEN,20,13
3,143664,4,2022-02-22 12:00:00,OPEN,22,13
4,143665,5,2022-02-22 12:00:00,OPEN,38,2


In [7]:
stationWeather = pd.merge(station, weather, how="left", on=["weatherHour"])

In [8]:
stationWeather.head()

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed
0,143661,1,2022-02-22 12:00:00,OPEN,12,18,53.35,-6.26,few clouds,282.81,71,10.8
1,143662,2,2022-02-22 12:00:00,OPEN,18,2,53.35,-6.26,few clouds,282.81,71,10.8
2,143663,3,2022-02-22 12:00:00,OPEN,20,13,53.35,-6.26,few clouds,282.81,71,10.8
3,143664,4,2022-02-22 12:00:00,OPEN,22,13,53.35,-6.26,few clouds,282.81,71,10.8
4,143665,5,2022-02-22 12:00:00,OPEN,38,2,53.35,-6.26,few clouds,282.81,71,10.8


In [9]:
stationWeather.tail()

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed
75235,218896,106,2022-04-04 10:00:00,OPEN,3,17,53.35,-6.26,broken clouds,285.57,86,10.84
75236,218897,107,2022-04-04 10:00:00,OPEN,29,11,53.35,-6.26,broken clouds,285.57,86,10.84
75237,218898,108,2022-04-04 10:00:00,OPEN,37,3,53.35,-6.26,broken clouds,285.57,86,10.84
75238,218899,109,2022-04-04 10:00:00,OPEN,4,24,53.35,-6.26,broken clouds,285.57,86,10.84
75239,218900,110,2022-04-04 10:00:00,OPEN,28,2,53.35,-6.26,broken clouds,285.57,86,10.84


In [10]:
stationWeather.dtypes

ID                         int64
stationId                  int64
weatherHour               object
status                    object
available_bike_stands      int64
available_bikes            int64
latitude                 float64
longitude                float64
description               object
temp                     float64
humidity                   int64
wind_speed               float64
dtype: object

In [11]:
stationWeather[['weatherDate', 'weatherHour']] = stationWeather['weatherHour'].str.split(' ', expand=True)
stationWeather

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed,weatherDate
0,143661,1,12:00:00,OPEN,12,18,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22
1,143662,2,12:00:00,OPEN,18,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22
2,143663,3,12:00:00,OPEN,20,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22
3,143664,4,12:00:00,OPEN,22,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22
4,143665,5,12:00:00,OPEN,38,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75235,218896,106,10:00:00,OPEN,3,17,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04
75236,218897,107,10:00:00,OPEN,29,11,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04
75237,218898,108,10:00:00,OPEN,37,3,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04
75238,218899,109,10:00:00,OPEN,4,24,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04


In [12]:
stationWeather[['weatherHour', 'weatherMinute', 'weatherSecond']] = stationWeather['weatherHour'].str.split(':', expand=True)
stationWeather

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed,weatherDate,weatherMinute,weatherSecond
0,143661,1,12,OPEN,12,18,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00
1,143662,2,12,OPEN,18,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00
2,143663,3,12,OPEN,20,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00
3,143664,4,12,OPEN,22,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00
4,143665,5,12,OPEN,38,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75235,218896,106,10,OPEN,3,17,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00
75236,218897,107,10,OPEN,29,11,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00
75237,218898,108,10,OPEN,37,3,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00
75238,218899,109,10,OPEN,4,24,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00


In [13]:
stationWeather['cal_bike_stands'] = stationWeather.available_bike_stands + stationWeather.available_bikes
stationWeather

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed,weatherDate,weatherMinute,weatherSecond,cal_bike_stands
0,143661,1,12,OPEN,12,18,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,30
1,143662,2,12,OPEN,18,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,20
2,143663,3,12,OPEN,20,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,33
3,143664,4,12,OPEN,22,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,35
4,143665,5,12,OPEN,38,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75235,218896,106,10,OPEN,3,17,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,20
75236,218897,107,10,OPEN,29,11,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,40
75237,218898,108,10,OPEN,37,3,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,40
75238,218899,109,10,OPEN,4,24,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,28


In [14]:
# Change description to category type for numeric encoding
stationWeather['description'] = stationWeather['description'].astype('category')
stationWeather.dtypes

ID                          int64
stationId                   int64
weatherHour                object
status                     object
available_bike_stands       int64
available_bikes             int64
latitude                  float64
longitude                 float64
description              category
temp                      float64
humidity                    int64
wind_speed                float64
weatherDate                object
weatherMinute              object
weatherSecond              object
cal_bike_stands             int64
dtype: object

In [15]:
# Catergorical Encoding
stationWeather['num_desc'] = stationWeather['description'].cat.codes
stationWeather

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed,weatherDate,weatherMinute,weatherSecond,cal_bike_stands,num_desc
0,143661,1,12,OPEN,12,18,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,30,2
1,143662,2,12,OPEN,18,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,20,2
2,143663,3,12,OPEN,20,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,33,2
3,143664,4,12,OPEN,22,13,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,35,2
4,143665,5,12,OPEN,38,2,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,40,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75235,218896,106,10,OPEN,3,17,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,20,0
75236,218897,107,10,OPEN,29,11,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,40,0
75237,218898,108,10,OPEN,37,3,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,40,0
75238,218899,109,10,OPEN,4,24,53.35,-6.26,broken clouds,285.57,86,10.84,2022-04-04,00,00,28,0


In [18]:
station1Weather = stationWeather.query('stationId == 1')
station1Weather

Unnamed: 0,ID,stationId,weatherHour,status,available_bike_stands,available_bikes,latitude,longitude,description,temp,humidity,wind_speed,weatherDate,weatherMinute,weatherSecond,cal_bike_stands,num_desc
0,143661,1,12,OPEN,12,18,53.35,-6.26,few clouds,282.81,71,10.80,2022-02-22,00,00,30,2
110,143771,1,10,OPEN,17,13,53.35,-6.26,scattered clouds,276.31,80,9.77,2022-02-24,00,00,30,13
220,143881,1,09,OPEN,8,22,53.35,-6.26,clear sky,277.21,86,0.98,2022-03-01,00,00,30,1
330,143991,1,10,OPEN,12,18,53.35,-6.26,scattered clouds,279.13,83,1.78,2022-03-01,00,00,30,13
440,144101,1,11,OPEN,14,16,53.35,-6.26,broken clouds,280.61,80,3.65,2022-03-01,00,00,30,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74690,218351,1,06,OPEN,17,13,53.35,-6.26,broken clouds,283.60,91,9.39,2022-04-04,00,00,30,0
74800,218461,1,07,OPEN,22,8,53.35,-6.26,broken clouds,283.86,90,8.45,2022-04-04,00,00,30,0
74910,218571,1,08,OPEN,24,5,53.35,-6.26,broken clouds,284.18,89,9.12,2022-04-04,00,00,29,0
75020,218681,1,09,OPEN,26,3,53.35,-6.26,broken clouds,284.66,89,10.12,2022-04-04,00,00,29,0


In [19]:
# Checking correlations for all the contious features
station1Weather.corr()

Unnamed: 0,ID,stationId,available_bike_stands,available_bikes,latitude,longitude,temp,humidity,wind_speed,cal_bike_stands,num_desc
ID,1.0,,0.171404,-0.171881,,,0.230413,-0.141198,-0.238943,-0.006992,0.008602
stationId,,,,,,,,,,,
available_bike_stands,0.171404,,1.0,-0.998649,,,0.415894,-0.290735,0.214522,0.038776,-0.061299
available_bikes,-0.171881,,-0.998649,1.0,,,-0.414172,0.288218,-0.215903,0.013194,0.059812
latitude,,,,,,,,,,,
longitude,,,,,,,,,,,
temp,0.230413,,0.415894,-0.414172,,,1.0,-0.467591,0.13685,0.038439,-0.033805
humidity,-0.141198,,-0.290735,0.288218,,,-0.467591,1.0,-0.178957,-0.05214,-0.049107
wind_speed,-0.238943,,0.214522,-0.215903,,,0.13685,-0.178957,1.0,-0.023807,0.011951
cal_bike_stands,-0.006992,,0.038776,0.013194,,,0.038439,-0.05214,-0.023807,1.0,-0.029395


# Multiple Linear Regression

In [20]:
X = station1Weather[['stationId', 'weatherHour', 'temp', 'humidity', 'wind_speed', 'cal_bike_stands', 'num_desc']]
y = station1Weather.available_bikes

<h2>Training</h2>

In [21]:
linreg = LinearRegression().fit(X, y)

# Weights for each Feature
print("Features: \n", X)
print("Coeficients: \n", linreg.coef_)
print("\nIntercept: \n", linreg.intercept_)

feature_importance = pd.DataFrame({'feature': ['stationId', 'weatherHour', 'temp', 'humidity', 'wind_speed', 'cal_bike_stands', 'num_desc'], 'importance':linreg.coef_})
feature_importance.sort_values('importance', ascending=False)

Features: 
        stationId weatherHour    temp  humidity  wind_speed  cal_bike_stands  \
0              1          12  282.81        71       10.80               30   
110            1          10  276.31        80        9.77               30   
220            1          09  277.21        86        0.98               30   
330            1          10  279.13        83        1.78               30   
440            1          11  280.61        80        3.65               30   
...          ...         ...     ...       ...         ...              ...   
74690          1          06  283.60        91        9.39               30   
74800          1          07  283.86        90        8.45               30   
74910          1          08  284.18        89        9.12               29   
75020          1          09  284.66        89       10.12               29   
75130          1          10  285.57        86       10.84               29   

       num_desc  
0             2  
110

Unnamed: 0,feature,importance
5,cal_bike_stands,0.438476
6,num_desc,0.090702
3,humidity,0.083771
1,weatherHour,0.07919
0,stationId,0.0
4,wind_speed,-0.460905
2,temp,-0.854095


In [22]:
# Serialize model object into a file called model.pkl on disk using pickle
with open('dwmb_resample_singleStation_linReg_model.pkl', 'wb') as handle:
    pickle.dump(linreg, handle, pickle.HIGHEST_PROTOCOL)

<h2>Testing</h2>

In [23]:
linreg_predictions = linreg.predict(X)
print(type(linreg_predictions))

print("\nPredictions with linear regression: \n")
actual_vs_predicted_linreg = pd.concat([y, pd.DataFrame(linreg_predictions, columns=['Predicted'], index=y.index)], axis=1)
print(actual_vs_predicted_linreg)

<class 'numpy.ndarray'>

Predictions with linear regression: 

       available_bikes  Predicted
0                   18  15.362488
110                 13  22.982123
220                 22  25.599808
330                 18  24.507521
440                 16  21.030317
...                ...        ...
74690               13  16.356509
74800                8  16.563114
74910                5  15.537940
75020                3  14.746258
75130                1  13.465056

[684 rows x 2 columns]


In [24]:
print(X.columns)
small_linreg = [X.iloc[0]]
Small_linreg_predictions = linreg.predict(small_linreg)
print(type(linreg_predictions))

Index(['stationId', 'weatherHour', 'temp', 'humidity', 'wind_speed',
       'cal_bike_stands', 'num_desc'],
      dtype='object')
<class 'numpy.ndarray'>


  return f(*args, **kwargs)


In [25]:
# Prototyping the results for /predict
# convert numpy array to dictionary
temp_results = dict(enumerate(Small_linreg_predictions.flatten(), 1))

# print numpy array
print(Small_linreg_predictions)
print(type(Small_linreg_predictions))
 
# print dictionary
print(temp_results)
print(type(temp_results))

[15.36248817]
<class 'numpy.ndarray'>
{1: 15.362488169148492}
<class 'dict'>


In [26]:
prediction_errors = y - linreg_predictions
print("Actual - Predicted:\n", prediction_errors)
print("\n(Actual - Predicted) squared:\n", prediction_errors**2)
print("\n Sum of (Actual - Predicted) squared:\n", (prediction_errors**2).sum())

Actual - Predicted:
 0         2.637512
110      -9.982123
220      -3.599808
330      -6.507521
440      -5.030317
           ...    
74690    -3.356509
74800    -8.563114
74910   -10.537940
75020   -11.746258
75130   -12.465056
Name: available_bikes, Length: 684, dtype: float64

(Actual - Predicted) squared:
 0          6.956469
110       99.642772
220       12.958615
330       42.347832
440       25.304093
            ...    
74690     11.266151
74800     73.326917
74910    111.048171
75020    137.974587
75130    155.377630
Name: available_bikes, Length: 684, dtype: float64

 Sum of (Actual - Predicted) squared:
 33993.644956226126


In [27]:
# Mean Squared Error
mse = (prediction_errors** 2).mean()
rmse = ((prediction_errors** 2).mean())**0.5

print("\nMean Squared Error:\n", mse)
print("\nRoot Mean Squared Error:\n", rmse)


Mean Squared Error:
 49.6983113395119

Root Mean Squared Error:
 7.049702925621186


In [28]:
print("|Actual - Predicted|:\n", abs(prediction_errors))

|Actual - Predicted|:
 0         2.637512
110       9.982123
220       3.599808
330       6.507521
440       5.030317
           ...    
74690     3.356509
74800     8.563114
74910    10.537940
75020    11.746258
75130    12.465056
Name: available_bikes, Length: 684, dtype: float64


In [29]:
# Mean Absolute Error
mae = abs(prediction_errors).mean()
print("\nMean Absolute Error:\n", mae)


Mean Absolute Error:
 5.839607618943632


In [30]:
# R2 Score
prediction_errors = y - linreg_predictions
print("Actual - Predicted:\n", prediction_errors)
print("\n(Actual - Predicted) squared:\n", prediction_errors**2)
print("\n Sum of squared errors:\n", sum(prediction_errors**2))

avg_predictions =np.ones(y.shape[0])* station1Weather.available_bikes.mean()

print("\nAverageModelPredictions:\n", avg_predictions)
avgpredictions_errors = y - avg_predictions
print("Actual - AvgPredictions:\n", avgpredictions_errors)
print("\n(Actual - AvgPredictions) squared:\n", avgpredictions_errors**2)
print("\n Total sum of squared errors:\n", sum(avgpredictions_errors**2))

r2 = 1 - sum(prediction_errors**2)/sum(avgpredictions_errors**2)
print("\n R2:\n", r2)

Actual - Predicted:
 0         2.637512
110      -9.982123
220      -3.599808
330      -6.507521
440      -5.030317
           ...    
74690    -3.356509
74800    -8.563114
74910   -10.537940
75020   -11.746258
75130   -12.465056
Name: available_bikes, Length: 684, dtype: float64

(Actual - Predicted) squared:
 0          6.956469
110       99.642772
220       12.958615
330       42.347832
440       25.304093
            ...    
74690     11.266151
74800     73.326917
74910    111.048171
75020    137.974587
75130    155.377630
Name: available_bikes, Length: 684, dtype: float64

 Sum of squared errors:
 33993.64495622614

AverageModelPredictions:
 [19.89327485 19.89327485 19.89327485 19.89327485 19.89327485 19.89327485
 19.89327485 19.89327485 19.89327485 19.89327485 19.89327485 19.89327485
 19.89327485 19.89327485 19.89327485 19.89327485 19.89327485 19.89327485
 19.89327485 19.89327485 19.89327485 19.89327485 19.89327485 19.89327485
 19.89327485 19.89327485 19.89327485 19.89327485 19.8

In [31]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print('\n==============================================================================')
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    #print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [32]:
printMetrics(y, linreg_predictions)


MAE:  5.839607618943638
RMSE:  7.049702925621184
R2:  0.21232985882944178
