In [127]:
import numpy as np
import pandas as pd
import pymysql
import datetime
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pickle
import matplotlib.pyplot as plt
from constants import * 

In [19]:
# make database connection
try:
    # Connect to the database
    connection = pymysql.connect(host=DB,
                                user=DB_USER,
                                password=DB_PW,
                                database=CITY)
except pymysql.Error as e:
    print(e)

In [20]:
# get availability
get_availability = "SELECT * FROM availability"
df_availability = pd.read_sql_query(get_availability, connection)

# get weather
get_weather = "SELECT * FROM weather"
df_weather = pd.read_sql_query(get_weather, connection)

connection.close()

  df_availability = pd.read_sql_query(get_availability, connection)
  df_weather = pd.read_sql_query(get_weather, connection)


In [43]:
df_availability.head()

Unnamed: 0,number,available_bikes,available_bike_stands,last_update
0,1,15,16,1710576480
1,1,15,16,1710577085
2,1,16,15,1710577691
3,1,16,15,1710578296
4,1,16,15,1710578901


In [44]:
df_weather.head()

Unnamed: 0,last_update,rain,temp,hum
0,1712652300,0.03,7.0,81.0
1,1712653200,0.11,7.0,81.0
2,1712654100,0.11,8.0,76.0
3,1712655000,0.11,8.0,76.0
4,1712655900,0.11,8.0,76.0


In [45]:
# merge availability with weather data
df = pd.merge_asof(df_availability.sort_values('last_update'), 
                          df_weather.sort_values('last_update'), 
                          on='last_update')

df.head()

Unnamed: 0,number,available_bikes,available_bike_stands,last_update,rain,temp,hum
0,10,11,5,1710576107,,,
1,95,38,2,1710576111,,,
2,60,14,16,1710576117,,,
3,20,1,29,1710576118,,,
4,105,3,33,1710576118,,,


Unfortunately, the scraper that gathered the weather data was failing for the first few weeks and we weren't aware, so the data we have available isn't as comprehensive as we would like

In [46]:
df = df.dropna()
df.shape

(149328, 7)

The day of the week and the hour of the day are likely useful predictors for bike availability, and as such we want to impute these from the data collected

In [47]:
def get_day_and_hour(timestamp):
    datetime_obj = pd.to_datetime(timestamp, unit='s')
    return datetime_obj.hour, datetime_obj.dayofweek

In [48]:
df['day'], df['hour'] = zip(*df['last_update'].apply(get_day_and_hour))

Additionally, instead of prpedicting both `available_bikes` and `available_parking`, it makes instead to predict for just `availability`, which is effectively a ratio of the available bikes to the overall capacity of the station

This simplifies the analytics pipeline and will make the predictions more intutive: high availability is desirable in a departure station and low availability (high parking) is desirable in an arrival station

In [49]:
df['availability'] = df['available_bikes'] / (df['available_bikes'] + df['available_bike_stands'])

In [50]:
df.head()

Unnamed: 0,number,available_bikes,available_bike_stands,last_update,rain,temp,hum,day,hour,availability
460665,97,8,32,1712652301,0.03,7.0,81.0,8,1,0.2
460666,53,19,21,1712652306,0.03,7.0,81.0,8,1,0.475
460667,58,38,2,1712652313,0.03,7.0,81.0,8,1,0.95
460668,24,10,10,1712652319,0.03,7.0,81.0,8,1,0.5
460669,44,2,28,1712652325,0.03,7.0,81.0,8,1,0.066667


We will be making predictions for each station based on `rain`, `temp`, `hum`, `day` and `hour`: `day` is a categorical variable and we will treat the others as continuous

The first stage of our pipeline will be declaring `day` as a categorical variable

It is also best practise to standardise the scale of the continuous features

And finally, to add some non-linearity, we will impute some polynomial features for the continuous features

In [106]:
categorical = ['day']
continuous = ['rain', 'temp', 'hum', 'hour']

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categorical),
        ('scaling', StandardScaler(), continuous),
        ('poly', PolynomialFeatures(degree=2), continuous)
    ],
    remainder='passthrough'
)

We will also want a way to compare the accuracy of different models across iterations, so we will store them in a dataframe for now

In [107]:
station_numbers = df['number'].unique()
df_results = pd.DataFrame({"number": station_numbers})

## Linear Regression

Let's keep it simple and start with a linear regression

In [108]:
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())  # Add Linear Regression as the final step
])

In [122]:
lr_results = []

for station_number in station_numbers:
    df_station = df[df['number'] == station_number]
    X_train, X_test, y_train, y_test = train_test_split(df_station[categorical + continuous], df_station['availability'], test_size=0.2, random_state=69)
    score = lr_pipeline.fit(X_train, y_train).score(X_test, y_test)
    print("Station Number", station_number, "R2 Score:", score)

    with open(f'models/lr/lr_{station_number}.pkl','wb') as handle:
        pickle.dump(lr_pipeline, handle, pickle.HIGHEST_PROTOCOL)
    
    lr_results.append(score)

df_results['linear_regression'] = lr_results

Station Number 97 R2 Score: 0.6292366667416187
Station Number 53 R2 Score: 0.5969795802533711
Station Number 58 R2 Score: 0.6218622058086514
Station Number 24 R2 Score: 0.43404677435193617
Station Number 44 R2 Score: 0.6902700676728959
Station Number 20 R2 Score: 0.7651677876813636
Station Number 74 R2 Score: 0.6112123697744438
Station Number 80 R2 Score: 0.3553049343798558
Station Number 29 R2 Score: 0.6469902428360788
Station Number 47 R2 Score: 0.6095194680644069
Station Number 73 R2 Score: 0.5715964479041684
Station Number 88 R2 Score: 0.46754362110248415
Station Number 3 R2 Score: 0.6308924477076583
Station Number 42 R2 Score: 0.5622654551803173
Station Number 71 R2 Score: 0.15302434514291263
Station Number 117 R2 Score: 0.6263216383165278
Station Number 78 R2 Score: 0.3721581544429252
Station Number 91 R2 Score: 0.650393498056892
Station Number 22 R2 Score: 0.4096474223567308
Station Number 33 R2 Score: 0.5859321337796066
Station Number 62 R2 Score: 0.507988271220206
Station Numb

## Neural Network

When we included polynomial features, it seemed to be at odds with the non-linearity already introduced by the neural network, so we will use a different preprocessor here and drop the polynomial transformation

In [110]:
nn_preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categorical),
        ('scaling', StandardScaler(), continuous)
    ],
    remainder='passthrough'
)

In [116]:
nn_pipeline = Pipeline([
    ('preprocessor', nn_preprocessor),
    ('regressor', MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=69))  # Add Linear Regression as the final step
])

In [123]:
nn_results = []

for station_number in station_numbers:
    df_station = df[df['number'] == station_number]
    X_train, X_test, y_train, y_test = train_test_split(df_station[categorical + continuous], df_station['availability'], test_size=0.2, random_state=69)
    score = nn_pipeline.fit(X_train, y_train).score(X_test, y_test)
    print("Station Number", station_number, "R2 Score:", score)

    with open(f'models/nn/nn_{station_number}.pkl','wb') as handle:
        pickle.dump(nn_pipeline, handle, pickle.HIGHEST_PROTOCOL)

    nn_results.append(score)

df_results['neural_network'] = nn_results

Station Number 97 R2 Score: 0.9068224471955162
Station Number 53 R2 Score: 0.8227392483330356
Station Number 58 R2 Score: 0.9432230701574884
Station Number 24 R2 Score: 0.8686696116181719
Station Number 44 R2 Score: 0.8774788707386383
Station Number 20 R2 Score: 0.9287467277586812
Station Number 74 R2 Score: 0.8624672413632072
Station Number 80 R2 Score: 0.6896725846414924
Station Number 29 R2 Score: 0.8562173175042755
Station Number 47 R2 Score: 0.9264635789385537
Station Number 73 R2 Score: 0.903419305904878
Station Number 88 R2 Score: 0.8502235228269908
Station Number 3 R2 Score: 0.9131892136368268
Station Number 42 R2 Score: 0.8947154490092641
Station Number 71 R2 Score: 0.5203293378827203
Station Number 117 R2 Score: 0.8881105606873879
Station Number 78 R2 Score: 0.6367895027162387
Station Number 91 R2 Score: 0.8683351057258478
Station Number 22 R2 Score: 0.8525298406812793
Station Number 33 R2 Score: 0.7731045690762555
Station Number 62 R2 Score: 0.9065413031815319
Station Number

Granted, very little model tweaking was done here to improve the neural network, at a glance the R2 score performance is impressive seems to outperform the linear regression

## Random Forest

The other option for regression is a random forest, it will take the same scaling and polynomial features as the linear regression

In [131]:
rfr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=50))  # Add Linear Regression as the final step
])

In [134]:
rfr_results = []

for station_number in station_numbers:
    df_station = df[df['number'] == station_number]
    X_train, X_test, y_train, y_test = train_test_split(df_station[categorical + continuous], df_station['availability'], test_size=0.2, random_state=69)
    score = rfr_pipeline.fit(X_train, y_train).score(X_test, y_test)
    print("Station Number", station_number, "R2 Score:", score)

    with open(f'models/rfr/rfr_{station_number}.pkl','wb') as handle:
        pickle.dump(nn_pipeline, handle, pickle.HIGHEST_PROTOCOL)

    rfr_results.append(score)

df_results['random_forest_regressor'] = rfr_results

Station Number 97 R2 Score: 0.9673571390045093
Station Number 53 R2 Score: 0.9344992593378499
Station Number 58 R2 Score: 0.9586892218111595
Station Number 24 R2 Score: 0.9360906032445688
Station Number 44 R2 Score: 0.9381593396018112
Station Number 20 R2 Score: 0.9408717551062459
Station Number 74 R2 Score: 0.9497999562491009
Station Number 80 R2 Score: 0.9158470005434084
Station Number 29 R2 Score: 0.966194677585284
Station Number 47 R2 Score: 0.9782411538740217
Station Number 73 R2 Score: 0.8724127486576215
Station Number 88 R2 Score: 0.9339349297231847
Station Number 3 R2 Score: 0.9638438401012632
Station Number 42 R2 Score: 0.9630375747792563
Station Number 71 R2 Score: 0.8742330441431594
Station Number 117 R2 Score: 0.9440414753867267
Station Number 78 R2 Score: 0.9570479304019712
Station Number 91 R2 Score: 0.9515734472896479
Station Number 22 R2 Score: 0.9273420987086627
Station Number 33 R2 Score: 0.8984304689157241
Station Number 62 R2 Score: 0.9221786646977299
Station Number

I was sceptical that we could do much better than the neural network but the random forest achieved exceptional results

In [137]:
"Linear Regression avg. R^2 Score: " + str(df_results['linear_regression'].mean())

'Linear Regression avg. R^2 Score: 0.5626720175803154'

In [138]:
"Neural Network avg. R^2 Score: " + str(df_results['neural_network'].mean())

'Neural Network avg. R^2 Score: 0.8494494464764162'

In [139]:
"Random Forest Regressor avg. R^2 Score: " + str(df_results['random_forest_regressor'].mean())

'Random Forest Regressor avg. R^2 Score: 0.9369386461152293'