## Reading Data

In [None]:
# Library Imports.
import operator

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from patsy import dmatrices
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
from sklearn.model_selection import KFold

from scipy.optimize import curve_fit

import sqlalchemy as sqla
import pymysql
from sqlalchemy import create_engine

import csv
import datetime as dt

In [None]:
# Database connection settings.
# Each value can be overridden with an environment variable so the notebook
# can be pointed at another database without editing it.
import os
from getpass import getpass

URI = os.environ.get('BIKES_DB_HOST', 'database-comp30830.c2kwpm1jk01q.us-east-1.rds.amazonaws.com')
PORT = os.environ.get('BIKES_DB_PORT', '3306')
DB = os.environ.get('BIKES_DB_NAME', 'comp30830_db')
USER = os.environ.get('BIKES_DB_USER', 'admin')
# The password is deliberately not stored in the notebook (it would leak to
# anyone who can read the file or its history).  It is taken from the
# BIKES_DB_PASSWORD environment variable, or typed in without echo.
PASSWORD = os.environ.get('BIKES_DB_PASSWORD') or getpass('Database password: ')

In [None]:
# Create the SQLAlchemy engine from the connection settings defined above.
# echo=True makes SQLAlchemy log every SQL statement it issues.
engine = create_engine(
    "mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URI, PORT, DB),
    echo=True,
)

In [None]:
# Pull the complete bike-station table from the database.
bikes = pd.read_sql_table(table_name='live_bike_data', con=engine)

In [None]:
# Save the bike table to a local CSV file (without the index column) so the
# notebook can later be re-run without querying the database.
bikes.to_csv(path_or_buf='allBikes.csv', index=False)

In [None]:
# Reload the bike data from the local CSV copy.
bikes = pd.read_csv(filepath_or_buffer='allBikes.csv')

In [None]:
# Pull the complete weather table from the database.
weather = pd.read_sql_table(table_name='live_weather_data', con=engine)

In [None]:
# Save the weather table to a local CSV file (without the index column) so the
# notebook can later be re-run without querying the database.
weather.to_csv(path_or_buf='allWeather.csv', index=False)

In [None]:
# Reload the weather data from the local CSV copy.
weather = pd.read_csv(filepath_or_buffer='allWeather.csv')

In [None]:
# Combine the separate 'date' and 'time' text columns of each table into a
# single parsed 'datetime' column.
bikes['datetime'] = pd.to_datetime(bikes['date'].str.cat(bikes['time'], sep=' '))
weather['datetime'] = pd.to_datetime(weather['date'].str.cat(weather['time'], sep=' '))

In [None]:
# Order both tables chronologically; the as-of merge below is keyed on
# 'datetime' and needs sorted keys.
bikes = bikes.sort_values('datetime')
weather = weather.sort_values('datetime')

## This can be used as a checkpoint, start from here if you want to run again without having to call from the database.

In [None]:
# Attach to every bike observation the weather observation whose timestamp is
# closest to it (either earlier or later).
full_df = pd.merge_asof(
    left=bikes,
    right=weather,
    left_on="datetime",
    right_on="datetime",
    direction="nearest",
)

In [None]:
# Preview the first three merged rows.
full_df.head(n=3)

In [None]:
# Restrict the dataframe to the times in which the service is available to
# users: every row recorded during hours 1, 2, 3 or 4 is removed.
# NOTE(review): rows from hour 0 (00:00-00:59) are kept - confirm this matches
# the real opening hours.
full_df = full_df.drop(
    index=full_df.index[full_df['datetime'].dt.hour.between(1, 4).to_numpy()]
)

In [None]:
## Start and end times of the four stages of the day
## (morning, afternoon, evening, night) used to build the flags below.
morning_start, morning_end = dt.time(5, 0, 0), dt.time(12, 0, 0)
afternoon_start, afternoon_end = dt.time(12, 1, 0), dt.time(16, 59, 0)
evening_start, evening_end = dt.time(17, 0, 0), dt.time(20, 0, 0)
night_start, night_end = dt.time(20, 1, 0), dt.time(23, 59, 59)

In [None]:
# Flag every observation with the stage of the day it falls in.
#
# Each stage is the half-open interval [its start, the next stage's start).
# The previous strict '>' / '<' comparisons against each stage's own start and
# end left observations recorded exactly on a boundary (e.g. 05:00:00), or in
# the small gaps between one stage's end and the next stage's start
# (e.g. 12:00:30), with all four flags set to 0.
# Observations earlier than morning_start still receive no flag.
time_of_day = full_df['datetime'].dt.time

full_df['morning'] = np.where((time_of_day >= morning_start)
                              & (time_of_day < afternoon_start),
                              1, 0)

full_df['afternoon'] = np.where((time_of_day >= afternoon_start)
                                & (time_of_day < evening_start),
                                1, 0)

full_df['evening'] = np.where((time_of_day >= evening_start)
                              & (time_of_day < night_start),
                              1, 0)

# Night runs from night_start to the end of the day, so no upper bound is needed.
full_df['night'] = np.where(time_of_day >= night_start, 1, 0)

In [None]:
# Replace the day names with numbers (Mon=0 ... Sun=6).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on full_df["day_x"]: the in-place form acts on a
# temporary selection, which pandas warns about and which does not update the
# dataframe at all when Copy-on-Write is enabled.
full_df["day_x"] = full_df["day_x"].replace(
    {'Mon': 0, 'Tue': 1, 'Wed': 2, 'Thu': 3, 'Fri': 4, 'Sat': 5, 'Sun': 6}
)

In [None]:
# Time-of-day feature: the hour (0-23) taken from each row's timestamp.
full_df['tod'] = full_df['datetime'].dt.hour

- Observe the usage on a given weekday at a particular station:

In [None]:
# Station 84, day code 2 only.
weekday_test_df = full_df[full_df['ID'].eq(84) & full_df['day_x'].eq(2)]

In [None]:
# Scatter plot of the observed available bikes against the hour of the day.
weekday_test_df.plot.scatter(x='tod', y='availableBikes')

In [None]:
# Station 84, day code 4 only.
weekday_test_df = full_df[full_df['ID'].eq(84) & full_df['day_x'].eq(4)]

In [None]:
# Scatter plot of the observed available bikes against the hour of the day.
weekday_test_df.plot.scatter(x='tod', y='availableBikes')

- Observe the usage on a given weekend day at the same station:

In [None]:
# Station 84, day code 5 only.
weekend_test_df = full_df[full_df['ID'].eq(84) & full_df['day_x'].eq(5)]

In [None]:
# Scatter plot of the observed available bikes against the hour of the day.
weekend_test_df.plot.scatter(x='tod', y='availableBikes')

### We can see different patterns for weekdays compared to weekend days; therefore two separate models will be developed: one for Monday to Friday and another for Saturday and Sunday.

In [None]:
# Collapse the numeric weather condition codes in the "number" column into a
# handful of named groups.  Codes that are not listed are left unchanged.
weather_groups = {
    'clouds': [801, 802, 803, 804],
    'clear': [800],
    'Atmosphere': [701, 711, 721, 731, 741, 751, 761, 762, 771, 781],
    'snow': [600, 601, 602, 611, 612, 613, 615, 616, 620, 621, 622],
    'rainfall': [500, 501, 502, 503, 504, 511, 520, 521, 522, 531],
    'drizzle': [300, 301, 302, 310, 311, 312, 313, 314, 321],
    'thunderstorm': [200, 201, 202, 210, 211, 212, 221, 230, 231, 232],
}
code_to_group = {code: group
                 for group, codes in weather_groups.items()
                 for code in codes}

# A single replace whose result is assigned back to the column.  Calling
# replace(..., inplace=True) on full_df["number"] acts on a temporary
# selection, which pandas warns about and which does not update the dataframe
# at all when Copy-on-Write is enabled.
full_df["number"] = full_df["number"].replace(code_to_group)

In [None]:
# Preview the first five rows after grouping the weather codes.
full_df.head(n=5)

In [None]:
# Remove the columns that are not carried forward into the model data.
full_df = full_df.drop(columns=[
    "date_x", "time_x", "status", "epoch", "main",
    "description", "icon", "tempMin", "tempMax", "tempFeels", "humidity",
    "pressure", "windSpeed", "windDeg", "sunrise", "sunset",
    "date_y", "time_y", "day_y",
])

In [None]:
# Flag that is 1 when the row's 'rain' value is exactly zero and 0 otherwise.
# The flag is computed per observation, not per calendar day.
full_df['dry_day'] = full_df['rain'].eq(0).astype(int)

In [None]:
# Ask which quantity to predict.  0 selects availableBikes; any other number
# selects availableBikeStands.  The chosen column is renamed to "target" and
# the other one is removed from the dataframe.
choice = int(input("Please enter a number to predict for either availableBikes - (0) or availableBikeStands - (1): "))
if choice != 0:
    full_df = full_df.rename(columns={"availableBikeStands": "target"}).drop(columns=["availableBikes"])
else:
    full_df = full_df.rename(columns={"availableBikes": "target"}).drop(columns=["availableBikeStands"])

In [None]:
# Rows whose day code is 0-4 (Monday to Friday).
week_df = full_df.loc[full_df['day_x'].between(0, 4)]

In [None]:
# Rows whose day code is 5-6 (Saturday and Sunday).
weekend_df = full_df.loc[full_df['day_x'].between(5, 6)]

In [None]:
# Ask for the station and for the data set to model.
station = int(input("Please enter station ID: "))
week_or_weekend = int(input("Please choose to predict for week - (0) or weekend - (1): "))

# Keep only the chosen station's rows, taken from the weekday data when 0 was
# entered and from the weekend data for any other number.
if week_or_weekend != 0:
    new_df = weekend_df[weekend_df['ID'] == station]
else:
    new_df = week_df[week_df['ID'] == station]

In [None]:
## Drop the columns that are not needed by the predictive model.
## new_df is a filtered selection of another dataframe, and dropping columns
## from such a selection in place makes pandas emit a SettingWithCopyWarning;
## building a new dataframe avoids that.
new_df = new_df.drop(columns=["ID", "datetime"])

In [None]:
# Save the selected station's model data to a CSV file (without the index column).
new_df.to_csv(path_or_buf='comp303830_model_multipleLinearRegression.csv', index=False)

In [None]:
# Load the model data back from the CSV file written above.
df = pd.read_csv(filepath_or_buffer='comp303830_model_multipleLinearRegression.csv')

In [None]:
# Preview the first three rows of the model data.
df.head(n=3)

In [None]:
# Average value of the target (availableBikes or availableBikeStands),
# rounded to the nearest integer.  Always predicting this value is a very
# simple baseline that a useful model should at least improve on.
round(df['target'].mean())

### Observing the data:
- Trying to find correlations between continuous data and the target feature:

In [None]:
# Scatter plot of the target against rainfall.
df.plot.scatter(x='rain', y='target')

In [None]:
# Scatter plot of the target against temperature.
df.plot.scatter(x='temp', y='target')

- There does not appear to be a clear correlation between the target feature and the continuous data

In [None]:
## Plain Python lists of the hour of day and of the target, taken before the
## dataframe is one-hot encoded.  They are used again further down.
x_list, y_list = df['tod'].tolist(), df['target'].tolist()

In [None]:
## Keep the 'tod' column for later, as a single-column DataFrame
## (the double brackets select a DataFrame, not a Series).
tod_placeholder = df[['tod']].copy()

# Training with continuous and categorical features

In [None]:
# Replace the numeric day codes with day names (0=Mon ... 6=Sun), turning
# "day_x" into a text column.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df["day_x"]: the in-place form acts on a
# temporary selection, which pandas warns about and which does not update the
# dataframe at all when Copy-on-Write is enabled.
df["day_x"] = df["day_x"].replace(
    {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
)

In [None]:
# One-hot encode every categorical column of the dataframe in a single call.
# drop_first=True leaves out the first level of each encoded column.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Input features must exclude the target feature.
# The target is excluded by name rather than by position: slicing off the
# first column is only correct while 'target' happens to be the first column,
# and would otherwise leave the target among the input features.
column_names = [column for column in df.columns if column != 'target']

In [None]:
# Split the data into the target vector y and the input feature matrix X.
y = df['target']
X = df[column_names]

In [None]:
# One-hot encode the hour of day so that each hour gets its own coefficient.
# drop_first=True leaves out the first hour to avoid multi-collinearity.
add_var = pd.get_dummies(X['tod'], prefix='tod', drop_first=True)
# Append the encoded columns and remove the original 'tod' column they replace.
X = X.join(add_var).drop(columns=['tod'])

In [None]:
# Preview the first five rows of the encoded feature matrix.
X.head(n=5)

In [None]:
# Drop any rows with null values.
# X and y were built from df before this point and share its index, so the
# same rows have to be removed from all three.  Calling dropna on df alone
# left the nulls in X and y (which makes fit fail) and could leave df with
# fewer rows than there are predictions.
complete_rows = df.notna().all(axis=1)
df = df.loc[complete_rows].copy()
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# NOTE(review): fit_intercept=False fits the model without an intercept term -
# confirm this is intended, the train/test model further down uses the default.
model = LinearRegression(fit_intercept=False)
model.fit(X, y)
# Store the in-sample predictions next to the actual target values.
df['predicted'] = model.predict(X)

In [None]:
# This function is used repeatedly to compute all metrics
def printMetrics(testActualVal, predictions):
    """Print MAE, RMSE and R2 for a set of regression predictions.

    testActualVal: the actual target values.
    predictions: the predicted values, in the same order as testActualVal.

    Nothing is returned; the three scores are printed below a separator line.
    """
    # regression evaluation measures
    print('\n==============================================================================')
    mae = metrics.mean_absolute_error(testActualVal, predictions)
    print("MAE: ", mae)
    # RMSE is the square root of the mean squared error.
    rmse = metrics.mean_squared_error(testActualVal, predictions)**0.5
    print("RMSE: ", rmse)
    r2 = metrics.r2_score(testActualVal, predictions)
    print("R2: ", r2)

In [None]:
# Error metrics of the model on the same data it was fitted on.
printMetrics(testActualVal=y, predictions=model.predict(X))

In [None]:
# Line plot of the actual and predicted values, row by row.
df[['target', 'predicted']].plot.line(figsize=(20, 5), alpha=0.5)

<ref: https://jakevdp.github.io/PythonDataScienceHandbook/05.06-linear-regression.html >

In [None]:
# Preview the first five rows, now including the 'predicted' column.
df.head(n=5)

## TODO: Document the errors between the predicted and actual values. The model, while not perfect, will suffice for the time being.

In [None]:
# Fitted coefficients, labelled with the feature each one belongs to.
params = pd.Series(data=model.coef_, index=X.columns)
params

In [None]:
from sklearn.base import clone
from sklearn.utils import resample

# Bootstrap estimate of the uncertainty of each coefficient: refit the model
# on 1000 resamples (drawn with replacement) of the data and take the standard
# deviation of every coefficient across the fits.
np.random.seed(1)
# Each fit uses an unfitted clone of `model` with the same settings.  Refitting
# `model` itself would leave it holding the coefficients of the last random
# resample instead of those fitted on the full data.
err = np.std([clone(model).fit(*resample(X, y)).coef_
              for _ in range(1000)], 0)

In [None]:
# Coefficient ('effect') and its bootstrap standard deviation ('error') for
# every feature, both rounded to whole numbers.
print(pd.DataFrame(dict(effect=params.round(0),
                        error=err.round(0))))

<ref: https://becominghuman.ai/linear-regression-in-python-with-pandas-scikit-learn-72574a2ec1a5 >

##  Linear regression with nonlinear data: sidebar - we will not actually be using this model due to time constraints faced with implementing it.

In [None]:
# Hour of day against target, taken from the lists saved earlier.
x, y_new = np.asarray(x_list), np.asarray(y_list)
plt.scatter(x=x, y=y_new);

In [None]:
# Fit a straight line to the target as a function of the hour of day.
# scikit-learn expects a 2-D feature array, hence the reshape to one column.
model = LinearRegression(fit_intercept=True).fit(x.reshape(-1, 1), y_new)

# Evaluate the fitted line on an evenly spaced grid covering the whole day.
xfit = np.linspace(0, 24, 1000)
yfit = model.predict(xfit.reshape(-1, 1))

# Observed points with the fitted line drawn on top.
plt.scatter(x, y_new)
plt.plot(xfit, yfit);

In [None]:
# Report the parameters of the fitted line.
for label, value in (("Model slope:    ", model.coef_[0]),
                     ("Model intercept:", model.intercept_)):
    print(label, value)

In [None]:
# Transform the data into column vectors of shape (n_samples, 1).
# The arrays are rebuilt from the saved lists, so re-running this cell always
# gives the same result; x = x[:, np.newaxis] added one more axis on every run.
x = np.asarray(x_list).reshape(-1, 1)
y_new = np.asarray(y_list).reshape(-1, 1)

In [None]:
# Expand the hour of day into degree-2 polynomial features.
polynomial_features = PolynomialFeatures(degree=2)
x_poly = polynomial_features.fit_transform(x)

# Fit a linear model on the expanded features and predict on the same data.
model = LinearRegression().fit(x_poly, y_new)
y_poly_pred = model.predict(x_poly)

# In-sample error of the polynomial fit.
rmse = np.sqrt(mean_squared_error(y_new, y_poly_pred))
r2 = r2_score(y_new, y_poly_pred)
print("Root Mean Square Error: ", rmse)
print("Coefficient of Determination (R2): ", r2)

In [None]:
plt.scatter(x, y_new, s=10)
# Sort the points by x before drawing the line so the curve runs left to right.
# The sorted values are kept in their own variables: rebinding x and
# y_poly_pred to tuples broke any re-run of the cells above that expect arrays.
x_flat = np.ravel(x)
sort_order = np.argsort(x_flat, kind='stable')
x_sorted = x_flat[sort_order]
y_poly_pred_sorted = np.ravel(y_poly_pred)[sort_order]
plt.plot(x_sorted, y_poly_pred_sorted, color='m')
plt.show()

# Explore this one more time before submitting.

<ref: https://jakevdp.github.io/PythonDataScienceHandbook/05.06-linear-regression.html >

# Evaluation with train/test split

In [None]:
# Split the data into train and test sets
# Hold out a random 30% of the samples as test data (test_size=0.3) and use the remaining 70% as training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# No random_state is passed above. To force the same train/test split on every run, set random_state to a fixed value:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Uncomment to inspect the two samples:
# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [None]:
# Names of the features the model is trained on, in coefficient order.
# They are taken from the training matrix itself.  df.columns no longer
# matches it: df still has the raw 'tod' column and the added 'predicted'
# column, and lacks the one-hot 'tod_*' columns, so pairing it with the
# coefficients attached them to the wrong names.
features = list(X_train.columns)

In [None]:
# Fit a linear regression on the training sample only.
linreg = LinearRegression()
linreg.fit(X_train, y_train)
# Show each feature name paired with the weight learned for it.
print("Features and coeficients:", list(zip(features, linreg.coef_)))

In [None]:
# Predict on the training set and report the error metrics on it.
train_predictions = linreg.predict(X_train)
printMetrics(testActualVal=y_train, predictions=train_predictions)

In [None]:
# Predict on the held-out test set and report the error metrics on it.
test_predictions = linreg.predict(X_test)
printMetrics(testActualVal=y_test, predictions=test_predictions)

# Evaluation with cross-validation

In [None]:
# List the names of the scorers that can be passed as `scoring`.
# metrics.SCORERS was removed in scikit-learn 1.3; get_scorer_names() is its
# replacement.  The old attribute is kept as a fallback for older versions.
sorted(metrics.get_scorer_names()) if hasattr(metrics, "get_scorer_names") else sorted(metrics.SCORERS.keys())

In [None]:
# 5-fold cross-validated mean absolute error.  The scorer returns negated
# errors, so the sign is flipped to get positive MAE values.
scores = -cross_val_score(
    estimator=LinearRegression(),
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=5,
)
scores

In [None]:
# 5-fold cross-validation reporting several metrics at once.
# The list is called scoring_metrics rather than metrics: assigning it to
# `metrics` replaced the imported sklearn `metrics` module, so printMetrics
# and the scorer-listing cell failed when run after this cell.
scoring_metrics = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
scores = cross_validate(LinearRegression(), X, y, scoring=scoring_metrics, cv=5)
scores

In [None]:
# Names of the entries returned by cross_validate, in alphabetical order.
sorted(scores)

# Training with continuous and categorical features

<ref: https://stackoverflow.com/questions/34007308/linear-regression-analysis-with-string-categorical-features-variables >

<ref: https://towardsdatascience.com/simple-and-multiple-linear-regression-in-python-c928425168f9 >