In [11]:
#Import pandas and scikitlearn for Machine Learning Models
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, HuberRegressor, ElasticNetCV
from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor

# Pickle saves the model
import pickle

import math

In [2]:
# Read in cleaned results
df = pd.read_csv('bike+weather_to_28_03_19.csv')

In [12]:
# Transform to a 2D array and assign features and target
# Available bikes as target. Another model can be done for available stands if needed.
features = df[['number', 'hour', 'minute',
              'main_temp', 'main_wind_speed', 'main_rain_volume_1h', 'main_snow_volume_1h',
              'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
              'Sunday', 'light_rain', 'shower', 'light_drizzle', 'clear_sky',
              'fog', 'mist', 'moderate_rain', 'very_heavy_rain', 'sleet', 
              'snow', 'light_snow', 'drizzle', 'clouds', 'shower_sleet', 'heavy_snow']].values
targets = df['available_bikes'].values


In [13]:
# Cycles through neural network options and selects the most accurate.
def mlp_regressor_grid_search():
    est = MLPRegressor(activation= 'logistic')
    sizes = []
    for size1 in range(5, 20, 5):
        sizes.append(size1)

    param_grid = dict(solver=['lbfgs'], #'sgd','adam'
                      learning_rate=['adaptive','invscaling'],
                      alpha=[0.0001], # [0.0005, 0.001, 0.005, 0.01]
                      max_iter=[200], #np.arange(200, 300, 20)
                      tol=[0.0001], #np.arange(0.00001, 0.0001, 0.00001)
                      hidden_layer_sizes=sizes)
    return GridSearchCV(est, param_grid=param_grid, n_jobs=1, verbose=100)


In [14]:
models = [LinearRegression(),
          Ridge(),
          HuberRegressor(),
          ElasticNetCV(),
          DecisionTreeRegressor(), 
          ExtraTreesRegressor(),
          GradientBoostingRegressor(),
          RandomForestRegressor(),
          BaggingRegressor()]


In [15]:
# Split the data set into test and training data.
def split_by_position(features, targets):
    """
    train 0.80
    test 0.20
    """
    len_train = int(0.80 * len(features))
    train_features = features[0:len_train]
    train_targets = targets[0:len_train]
    test_features = features[len_train:]
    test_targets = targets[len_train:]
    return train_features, test_features, train_targets, test_targets


In [16]:
# Assign training features and target and test features and target
train_features, test_features, train_targets, test_targets = split_by_position(features, targets)



In [17]:
# Normalise the features for training
scaler = StandardScaler()
scaler.fit(train_features)
scaled_train_features = scaler.transform(train_features)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count


In [18]:
# Tests training features and targets with different neural network options
est = mlp_regressor_grid_search()
est.fit(scaled_train_features, train_targets)




Fitting 3 folds for each of 6 candidates, totalling 18 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] alpha=0.0001, hidden_layer_sizes=5, learning_rate=adaptive, max_iter=200, solver=lbfgs, tol=0.0001 




ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [10]:
# Print best hyper parameters for model
print(est.best_estimator_)

MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=5, learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)


In [11]:
# Print score for best estimator on test data
print(est.score(scaler.transform(test_features), test_targets))

0.03694047286592561


In [12]:
# Test algorithms for performance.
for model in models:
    est = model
    est.fit(scaled_train_features, train_targets)
    print(est.score(scaler.transform(test_features), test_targets))

-0.062333034862789294
-0.06233280617312143
-0.014559521222305525




-0.057212782207644475
-0.4409851318085789




-0.3737735935040265
-0.24131868469258966




-0.29773723426883003
-0.29974566153494475


In [13]:
# Use features to print a prediction
predict_data = [[2.0,4.0,17.0,44.0]]
scaled_predict = scaler.transform(predict_data)


prediction = est.predict(scaled_predict)
print("PREDICTION:", math.floor(prediction))


PREDICTION: 1


In [16]:
# Using elastic net CV as seems to be the best fit currently
# Pickle saves the model
est = MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=5, learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
est.fit(scaled_train_features, train_targets)
filename_est = '../app/model.sav'
filename_scaler = '../app/scaler.sav'
pickle.dump(est, open(filename_est, 'wb'))
pickle.dump(scaler, open(filename_scaler, 'wb'))
