In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn.metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from cleaningScript import cleanDatav2 # I <3 encapsulation

In [2]:
#function to calculate metrics quickly
def printMetrics(testActualVal, predictions):
    print("MAE: ", sklearn.metrics.mean_absolute_error(testActualVal, predictions))
    print("RMSE: ", sklearn.metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", sklearn.metrics.r2_score(testActualVal, predictions))

In [3]:
route, direction = '84', 1

In [None]:
df = cleanDatav2(route, direction)

In [None]:
print(len(df),  "rows")
df.head(3)

In [None]:
df['weekday'] = df.date.dt.weekday
df['month'] = df.date.dt.month
df['hour'] = (df.stopActualArr//3600)
df.head(3)

## Journey time vs Progrnumber

In [None]:
plt.figure(figsize=(10, 5))
plt.scatter(df.progrnumber, df.journeytime)
plt.grid()

Journey times are sometimes negative. Removing any stop from a journey that had a negative journey time anywhere as the whole trip could be offset

# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv

dftmp = df[(df.journeytime <= 0) & (df.progrnumber > 1)]
df = df[~df['date'].isin(dftmp['date']) | ~df['tripid'].isin(dftmp['tripid'])]
print(len(df),  "rows")

# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

plt.figure(figsize=(10,5))
plt.scatter(df.progrnumber, df.journeytime)
plt.grid()

## Journey time vs Month

In [None]:
dfTotalJourney = df.groupby(['date','tripid']).max() #get the max journey time for a stop on a trip ,i.e. the last stop
dfTotalJourney

In [None]:
plt.rcParams["figure.figsize"] = (10, 7)
dfTotalJourney.assign(index=dfTotalJourney.groupby('month').cumcount()).pivot('index','month','journeytime').plot(kind='box')
plt.grid()

some small variation in the boxplots. Doesn't look very promising but not reason enough to exclude yet

## Journey time vs Weekday

In [None]:
plt.rcParams["figure.figsize"] = (10, 7)
dfTotalJourney.assign(index=dfTotalJourney.groupby('weekday').cumcount()).pivot('index','weekday','journeytime').plot(kind='box')
plt.grid()

more variation here for weekday, more promising particularly for the weekend

## Journey time vs Hour

In [None]:
dfTotalJourney.hour %= 24

In [None]:
plt.rcParams["figure.figsize"] = (20, 7)
dfTotalJourney.assign(index=dfTotalJourney.groupby('hour').cumcount()).pivot('index','hour','journeytime').plot(kind='box')
plt.grid()

some variation in the hour of the day the route finishes. appears to be cyclical, with 2 peaks and troughs. Maybe ausing some sort of trigonometric function or polynomial to model as well as having separate factors for each month.

## Journey time vs Rain

In [None]:
plt.figure(figsize=(10,5))
plt.scatter(dfTotalJourney.rain, dfTotalJourney.journeytime)
plt.grid()

looks like a normal distribution of journey times. The heavier rain is just clumped around the mean. This doesn't appear to have much predictive power.

## Journey time vs Temperature

In [None]:
plt.figure(figsize=(10,5))
plt.scatter(dfTotalJourney.temp, dfTotalJourney.journeytime)
plt.grid()

looks like temperature doesn't have a correlation with the total journey time either, just clumped around the middle with no apparent relationship

## Journey time vs Humidity & Journey time vs Pressure

In [None]:
plt.figure(figsize=(10,5))
plt.scatter(dfTotalJourney.humidity, dfTotalJourney.journeytime)
plt.grid()

In [None]:
plt.figure(figsize=(10,5))
plt.scatter(dfTotalJourney.pressure, dfTotalJourney.journeytime)
plt.grid()

First plot is humidity and appears to have a slight negative relationship with journey time so will look to see if it's useful.

Second plot is pressure and doesn't appear to have any meaningful relationship to journey time. It's clumped aroun=d the centre like temperature

# Model for total trip time

In [None]:
dfTotalJourney.head(3)

In [None]:
#removing rain, temp, pressure as no correlation is apparent
#removing stopActualArr as journeytime is calculated from it
#removing journeytime as I am modelling the total journey
#dwelltime also removed as it is part of the journeytime and so can't be used to predict ahead of time
#keeping date and tripid for ID purposes
dfTotalJourney = dfTotalJourney.drop(columns = ['stopActualArr','rain','temp','pressure','progrnumber','dwelltime'])

In [None]:
dfTotalJourney.head(3)

In [None]:
dfTotalJourney.month = dfTotalJourney.month.astype('category')
dfTotalJourney.hour = dfTotalJourney.hour.astype('category')
dfTotalJourney.weekday = dfTotalJourney.weekday.astype('category')

In [None]:
monthlyDummies = pd.get_dummies(dfTotalJourney.month, prefix='m', drop_first=True)
hourlyDummies = pd.get_dummies(dfTotalJourney.hour,prefix='h', drop_first=True)
dailyDummies = pd.get_dummies(dfTotalJourney.weekday, prefix='d', drop_first=True)

In [None]:
dfTotalJourney = pd.concat([dfTotalJourney,monthlyDummies,hourlyDummies,dailyDummies], axis = 1)

In [None]:
dfTotalJourney.drop(columns=['month','hour','weekday'], inplace=True)

In [None]:
dfTotalJourney.head(3)

In [None]:
for i in dfTotalJourney:
    dfTotalJourney[i] = dfTotalJourney[i].astype('int')

In [None]:
Xfeatures = dfTotalJourney.columns[dfTotalJourney.columns != 'journeytime']
Xfeatures = Xfeatures.drop(['humidity'])
X = dfTotalJourney[Xfeatures] #separate target feature from predicitve features for train test split
y = dfTotalJourney.journeytime

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
lmodel = LinearRegression().fit(Xtrain, ytrain)
rfc = RandomForestRegressor(oob_score=True, random_state=1)
rfc.fit(Xtrain, ytrain)
knn = KNeighborsRegressor().fit(Xtrain, ytrain)

In [None]:
featureImportance = pd.DataFrame({'feature': Xfeatures, 'importance':lmodel.coef_})
featureImportance.sort_values('importance', ascending=False)

In [None]:
trainPredictions = lmodel.predict(Xtest)
rfcPrediction = rfc.predict(Xtest)
knnPrediction = knn.predict(Xtest)

In [None]:
trueVsPredicted = pd.concat([ytest, pd.DataFrame(trainPredictions, columns=['Linear'], index=Xtest.index)], axis=1)
trueVsPredicted.head(5)

In [None]:
print('Linear Regression:')
printMetrics(ytest, trainPredictions)
print('\nRandom Forest:')
printMetrics(ytest, rfcPrediction)
print('\nk-Nearest Neighbours:')
printMetrics(ytest,knnPrediction)

# Models for time taken to reach each stop

In [None]:
dfAllStops = df.drop(columns = ['stopActualArr','rain','temp','pressure','dwelltime'])

In [None]:
dfAllStops.hour %= 24
dfAllStops.head(3)

In [None]:
dfAllStops.month = dfAllStops.month.astype('category')
dfAllStops.hour = dfAllStops.hour.astype('category')
dfAllStops.weekday = dfAllStops.weekday.astype('category')

In [None]:
monthDummies = pd.get_dummies(dfAllStops.month, prefix='m', drop_first=True)
hourDummies = pd.get_dummies(dfAllStops.hour,prefix='h', drop_first=True)
dayDummies = pd.get_dummies(dfAllStops.weekday, prefix='d', drop_first=True)

In [None]:
dfAllStops = pd.concat([dfAllStops,monthDummies,hourDummies,dayDummies], axis=1)
dfAllStops.drop(columns=['month','hour','weekday'], inplace=True)
dfAllStops.head(3)

In [None]:
for i in dfAllStops:
    dfAllStops[i] = dfAllStops[i].astype('int')

In [None]:
Xfeatures = dfAllStops.columns[dfAllStops.columns != 'journeytime']
Xfeatures = Xfeatures.drop(['date','tripid','humidity']) #removed humidity due to low effect
X = dfAllStops[Xfeatures] #separate target feature from predicitve features for train test split
y = dfAllStops.journeytime

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.7, random_state=1)

In [None]:
l2model = LinearRegression().fit(Xtrain, ytrain)
rfc = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=1)
rfc.fit(Xtrain,ytrain)

In [None]:
knn = KNeighborsRegressor().fit(Xtrain, ytrain)

In [None]:
trainPredictions = l2model.predict(Xtest)
rfcPrediction = rfc.predict(Xtest)

In [None]:
knnPrediction = knn.predict(Xtest)

In [None]:
featureImportance = pd.DataFrame({'feature': Xfeatures, 'importance':l2model.coef_})
featureImportance.sort_values('importance', ascending=False)

In [None]:
trueVsPredicted = pd.concat([ytest, pd.DataFrame(trainPredictions, columns=['Predicted'], index=Xtest.index)], axis=1)
trueVsPredicted.head(5)

In [43]:
print('Linear Regression:')
printMetrics(ytest, trainPredictions)
print('\nRandom Forest:')
printMetrics(ytest, rfcPrediction)
# print('\nk-Nearest Neighbours:')
# printMetrics(ytest,knnPrediction)

Linear Regression:
MAE:  315.85570469084615
RMSE:  452.67629733580327
R2:  0.9185367661119411

Random Forest:
MAE:  184.6873186101612
RMSE:  303.88273904347244
R2:  0.9632888320867619


In [76]:
print('Linear Regression:')
printMetrics(ytest, trainPredictions)
print('\nRandom Forest:')
printMetrics(ytest, rfcPrediction)
print('\nk-Nearest Neighbours:')
printMetrics(ytest,knnPrediction)

Linear Regression:
MAE:  316.06802088909933
RMSE:  451.36491830460415
R2:  0.9187967304287481

Random Forest:
MAE:  258.4394287954046
RMSE:  412.7695674612088
R2:  0.9320900744524375

k-Nearest Neighbours:
MAE:  297.7337470999046
RMSE:  456.1813729548733
R2:  0.9170544655756778
