In [None]:
# Predict available bikes with linear regression, using hour of day as a numeric feature and day of week as a categorical (dummy-encoded) feature

In [13]:
import os
import pickle
from urllib.parse import quote

import numpy as np
import pandas as pd
import sklearn.metrics as sm
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

In [2]:
# Read the database password from the environment so it never appears in the notebook.
# A missing SQLPW variable raises KeyError here.
SQLPW = os.environ["SQLPW"]


In [3]:
# Connect and query.
# The password is percent-encoded before it is placed in the URL: characters
# such as '@', ':', '/' or '%' would otherwise be read as URL syntax and the
# connection string would be parsed incorrectly.
engine = create_engine(
    "mysql+mysqlconnector://softies:" + quote(SQLPW, safe="")
    + "@db-bikes.ck7tnbvjxsza.eu-west-1.rds.amazonaws.com:3306/db-bikes"
)

# Test with just station 108: fetch its bike counts and their update timestamps.
stations = pd.read_sql_query(
    "SELECT dynamic.available_bikes, dynamic.last_update "
    "from dynamic JOIN static ON static.address=dynamic.address "
    "WHERE static.number=108",
    engine,
)

In [4]:
# Show the rows returned by the query (pandas truncates long frames in the display).
stations

Unnamed: 0,available_bikes,last_update
0,9,2022-02-22 15:43:46
1,9,2022-02-22 15:53:52
2,18,2022-02-23 18:03:46
3,17,2022-02-23 18:17:57
4,17,2022-02-23 18:28:04
...,...,...
8241,14,2022-04-11 19:06:13
8242,14,2022-04-11 19:16:19
8243,15,2022-04-11 19:22:50
8244,16,2022-04-11 19:32:53


In [5]:
# Model inputs: hour of day (cast to a number in a later cell) and day of week.
features = ['time', 'day']
# Day names with Sunday first; not referenced by the active code in this cell.
days_of_week = ["Sunday","Monday", "Tuesday","Wednesday", "Thursday", "Friday", "Saturday"]

# Split last_update into a day-of-week name ('%A') and a zero-padded hour
# string ('%H'), using the vectorised .dt accessor rather than calling
# strftime once per row through a Python-level lambda.
last_update = pd.to_datetime(stations['last_update'])
stations['day'] = last_update.dt.strftime('%A')
stations['time'] = last_update.dt.strftime('%H')

stations

Unnamed: 0,available_bikes,last_update,day,time
0,9,2022-02-22 15:43:46,Tuesday,15
1,9,2022-02-22 15:53:52,Tuesday,15
2,18,2022-02-23 18:03:46,Wednesday,18
3,17,2022-02-23 18:17:57,Wednesday,18
4,17,2022-02-23 18:28:04,Wednesday,18
...,...,...,...,...
8241,14,2022-04-11 19:06:13,Monday,19
8242,14,2022-04-11 19:16:19,Monday,19
8243,15,2022-04-11 19:22:50,Monday,19
8244,16,2022-04-11 19:32:53,Monday,19


In [6]:
# Take an independent copy of the feature columns so that later writes to X
# neither touch `stations` nor trigger pandas' SettingWithCopyWarning.
X = stations[features].copy()
X

Unnamed: 0,time,day
0,15,Tuesday
1,15,Tuesday
2,18,Wednesday
3,18,Wednesday
4,18,Wednesday
...,...,...
8241,19,Monday
8242,19,Monday
8243,19,Monday
8244,19,Monday


In [7]:
# Make time a numeric variable.
# assign() returns a new frame with the converted column, which avoids the
# chained assignment that raises SettingWithCopyWarning when X is a slice of
# `stations`. Column order is unchanged because 'time' already exists.
X = X.assign(time=X['time'].astype('int'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['time'] = X['time'].astype('int')


In [8]:
# Check the dtype of each column in X.
X.dtypes

time     int32
day     object
dtype: object

In [9]:
# Replace the categorical columns with dummy (indicator) columns.
# drop_first=True keeps k-1 indicator columns for a variable with k levels.
X = pd.get_dummies(X, drop_first=True)
X.shape

(8246, 7)

In [10]:
# Regression target: the available_bikes column of the queried rows.
Y = stations["available_bikes"]

In [11]:
# Inspect the feature matrix: time is numeric and each day is a dummy variable
X

Unnamed: 0,time,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday
0,15,0,0,0,0,1,0
1,15,0,0,0,0,1,0
2,18,0,0,0,0,0,1
3,18,0,0,0,0,0,1
4,18,0,0,0,0,0,1
...,...,...,...,...,...,...,...
8241,19,1,0,0,0,0,0
8242,19,1,0,0,0,0,0
8243,19,1,0,0,0,0,0
8244,19,1,0,0,0,0,0


In [16]:
# Split features and target into train and test sets.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=40,
)


In [17]:
# Fit a linear regression on the training split (fit() returns the estimator
# itself), then predict the target for the held-out test rows.
model = LinearRegression().fit(X_train, Y_train)
y_test_pred = model.predict(X_test)

In [18]:
# Report each evaluation metric on the test split, rounded to two decimals.
# Each entry pairs the printed label with the sklearn.metrics function to call.
metric_report = (
    ("Mean absolute error =", sm.mean_absolute_error),
    ("Mean squared error =", sm.mean_squared_error),
    ("Median absolute error =", sm.median_absolute_error),
    ("Explain variance score =", sm.explained_variance_score),
    ("R2 score =", sm.r2_score),
)
for label, metric in metric_report:
    print(label, round(metric(Y_test, y_test_pred), 2))

Mean absolute error = 6.06
Mean squared error = 53.04
Median absolute error = 5.74
Explain variance score = 0.12
R2 score = 0.12
