In [51]:
# Import pandas and scikit-learn for the machine learning models
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, HuberRegressor, ElasticNetCV
from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor

# Pickle saves the model
import pickle

import math

In [52]:
# Load the cleaned results from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='bike_weather_data.csv')

In [60]:
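# Optional: uncomment to keep only the rows whose 'number' column equals 2
# (presumably a single station) before building the features.
# NOTE(review): if this filter leaves no rows, the later cells fail on empty
# arrays - possibly the cause of the stored "0 sample(s)" error; confirm.
#df = df.loc[df['number'] == 2]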

In [54]:
# Build the model inputs as arrays: a 2D feature matrix and a 1D target vector.
# The target is the number of available bikes; a second model could be trained
# for available stands if needed.
feature_columns = [
    # station number (presumably) and time of day
    'number', 'hour', 'minute',
    # numeric weather readings
    'main_temp', 'main_wind_speed', 'main_rain_volume_1h', 'main_snow_volume_1h',
    # day-of-week columns
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
    # weather-condition columns
    'clouds', 'atmosphere', 'snow', 'light_rain',
    'rain', 'light_drizzle', 'drizzle', 'thunderstorm',
]
features = df[feature_columns].values
targets = df['available_bikes'].values


In [55]:
# Builds a grid search that cycles through neural network options and selects
# the most accurate combination.
def mlp_regressor_grid_search():
    """Return an unfitted GridSearchCV over MLPRegressor hyper-parameters.

    The base estimator is an MLPRegressor with logistic activation. The grid
    varies the hidden layer size (5, 10, 15) and the learning-rate schedule;
    every other listed parameter is pinned to a single value.

    Returns:
        GridSearchCV: the search object, run with n_jobs=1 and verbose=100.
    """
    base_estimator = MLPRegressor(activation='logistic')

    search_space = {
        'solver': ['lbfgs'],                            # other candidates: 'sgd', 'adam'
        # NOTE(review): scikit-learn documents learning_rate as only used when
        # solver='sgd'; with 'lbfgs' these two values likely fit the same model.
        'learning_rate': ['adaptive', 'invscaling'],
        'alpha': [0.0001],                              # other candidates: 0.0005, 0.001, 0.005, 0.01
        'max_iter': [200],                              # other candidates: np.arange(200, 300, 20)
        'tol': [0.0001],                                # other candidates: np.arange(0.00001, 0.0001, 0.00001)
        'hidden_layer_sizes': list(range(5, 20, 5)),
    }
    return GridSearchCV(base_estimator, param_grid=search_space, n_jobs=1, verbose=100)


In [56]:
# Candidate regressors to compare, each constructed with its default settings.
models = [
    regressor_cls()
    for regressor_cls in (
        # linear models
        LinearRegression,
        Ridge,
        HuberRegressor,
        ElasticNetCV,
        # tree-based models and ensembles
        DecisionTreeRegressor,
        ExtraTreesRegressor,
        GradientBoostingRegressor,
        RandomForestRegressor,
        BaggingRegressor,
    )
]


In [57]:
# Split the data set into training and test data by row position (no shuffling).
def split_by_position(features, targets, train_fraction=0.80):
    """
    Split features and targets into leading training rows and trailing test rows.

    The first ``int(train_fraction * len(features))`` rows become the training
    set and the remaining rows become the test set. Row order is preserved.

    Parameters:
        features: sliceable sequence of samples (e.g. a 2D array or a list).
        targets: sliceable sequence of target values, one per sample.
        train_fraction: share of rows used for training, strictly between
            0 and 1. Defaults to 0.80 (train 0.80, test 0.20).

    Returns:
        tuple: (train_features, test_features, train_targets, test_targets).

    Raises:
        ValueError: if features and targets differ in length, or if
            train_fraction is not strictly between 0 and 1.
    """
    if len(features) != len(targets):
        # Fail here rather than letting mismatched splits reach the estimator.
        raise ValueError(
            "features and targets must have the same number of samples: "
            "got %d and %d" % (len(features), len(targets))
        )
    if not 0.0 < train_fraction < 1.0:
        raise ValueError(
            "train_fraction must be strictly between 0 and 1: got %r" % (train_fraction,)
        )

    len_train = int(train_fraction * len(features))
    return (
        features[:len_train],
        features[len_train:],
        targets[:len_train],
        targets[len_train:],
    )


In [58]:
# Unpack the positional split into training and test features/targets.
(
    train_features,
    test_features,
    train_targets,
    test_targets,
) = split_by_position(features, targets)



In [59]:
# Standardise the features: fit the scaler on the training rows only and
# scale those same rows in one step.
scaler = StandardScaler()
scaled_train_features = scaler.fit_transform(train_features)


ValueError: Found array with 0 sample(s) (shape=(0, 22)) while a minimum of 1 is required by StandardScaler.

In [50]:
# Build the MLP grid search and fit it on the scaled training features and
# training targets, trying the neural network options defined in
# mlp_regressor_grid_search().
# NOTE(review): both arrays must have the same number of rows; the stored
# "inconsistent numbers of samples" error suggests the inputs came from
# different runs of the earlier cells - confirm by re-running from the top.
est = mlp_regressor_grid_search()
est.fit(scaled_train_features, train_targets)




ValueError: Found input variables with inconsistent numbers of samples: [5384, 0]

In [34]:
# Print the best estimator selected by the grid search; its repr lists the
# chosen hyper-parameters.
print(est.best_estimator_)

MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=15, learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)


In [35]:
# Score the best estimator on the test data, scaled with the scaler that was
# fitted on the training data.
scaled_test_features = scaler.transform(test_features)
best_test_score = est.score(scaled_test_features, test_targets)
print(best_test_score)


0.657077895715389


In [36]:
# Test each candidate algorithm for performance on the held-out test data.
# The test features are scaled once, outside the loop, because the scaled
# values do not depend on which model is being evaluated.
scaled_test_features = scaler.transform(test_features)
for model in models:
    # Fit the model directly instead of rebinding `est`, so the fitted grid
    # search from the earlier cell is not overwritten by this loop.
    model.fit(scaled_train_features, train_targets)
    # Label each score with the model's class name so the printed values can
    # be matched to their algorithm.
    print(type(model).__name__, model.score(scaled_test_features, test_targets))

-0.017231634688306352
-0.01931832973064318
-0.06061747864602762
0.10912041379977921
0.693071793894358
0.8029339769593492




0.7413657912020581
0.7943451173887663




0.7995582661107054


ExtraTrees - no alteration

0.11825679195687633

For all models:

0.012663067202476341
0.012660621258087978
-0.029618304713565413

0.012624214970299086
0.6871957436347627

0.8140946056581033
0.23435545657540113

0.7898846535251685
0.7895787539140001

In [37]:
# Mean squared error regression loss of a default ExtraTreesRegressor on the
# test data.
est = ExtraTreesRegressor()
est.fit(scaled_train_features, train_targets)
scaled_test_features = scaler.transform(test_features)
test_predictions = est.predict(scaled_test_features)
mean_squared_error(y_true=test_targets, y_pred=test_predictions)

# Previously recorded values: 11.49 for station 54, 21.6 for all stations.



13.155820341499629

In [38]:
# Mean absolute error of the predictions computed in the previous cell.
mean_absolute_error(y_true=test_targets, y_pred=test_predictions)
# Previously recorded values: 2.086 for station 54, 2.84 for all stations.

2.1787676317743134

In [26]:
# Use one hand-built sample to print a prediction.
# The values must follow the same column order used to build `features`:
#   number, hour, minute,
#   main_temp, main_wind_speed, main_rain_volume_1h, main_snow_volume_1h,
#   Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday,
#   clouds, atmosphere, snow, light_rain, rain, light_drizzle, drizzle, thunderstorm
est = ExtraTreesRegressor()
est.fit(scaled_train_features, train_targets)
predict_data = [[54, 19, 0,                  # number, hour, minute
                 277.31, 9.3, 0.51, 0,       # temp, wind speed, rain 1h, snow 1h
                 0, 0, 1, 0, 0, 0, 0,        # Monday..Sunday (Wednesday set)
                 0, 0, 0, 1, 0, 0, 0, 0]]    # clouds..thunderstorm (light_rain set)
# Scale the sample with the scaler fitted on the training data.
scaled_predict = scaler.transform(predict_data)

# predict() returns an array with one entry per input row. Index the single
# entry before flooring: passing the whole array to math.floor relies on an
# array-to-scalar conversion that NumPy has deprecated.
prediction = est.predict(scaled_predict)
print("PREDICTION:", math.floor(prediction[0]))


PREDICTION: 1




In [40]:
# Train the final model (an ExtraTreesRegressor with default settings) and
# save it together with the fitted scaler using pickle.
est = ExtraTreesRegressor()
est.fit(scaled_train_features, train_targets)
filename_est = '../app/model.sav'
filename_scaler = '../app/scaler.sav'
# Use context managers so each file is flushed and closed even if dumping fails.
with open(filename_est, 'wb') as model_file:
    pickle.dump(est, model_file)
with open(filename_scaler, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


