In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.font_manager as fm
import matplotlib
import timeit
import feather
import statsmodels as stm
import autotime
%matplotlib inline
%load_ext autotime
import seaborn as sns
import statsmodels.formula.api as sm
from sklearn.linear_model import LinearRegression as LinR
from sklearn.svm import SVR
from sklearn.cross_validation import train_test_split

In [None]:
# Replace the XXXX in Route_XXXX_travel_time.csv below with one route number.
# This is a template cell: the path is a placeholder until a route number is filled in.
# The loaded frame is bound to `res`, which the following cells filter by SSID.

res = pd.read_csv('Route_XXXX_travel_time_csvs/Route_XXXX_travel_time.csv')

In [None]:
# If there is more than one route, list them in the first line below.
# If there is only one route, do not run the below code.
# NOTE: this cell appends to `res`, so running it twice duplicates the extra routes.

route_list_SSIDXXXXXXXX = ['list', 'here', 'of', 'all', 'routes', 'required']

# Read every extra route first, then concatenate once: calling pd.concat inside
# the loop re-copies the accumulated frame on each pass.
route_frames = [
    pd.read_csv('Route_XXXX_travel_time_csvs/Route_%s_travel_time.csv' % route)
    for route in route_list_SSIDXXXXXXXX
]

# Reversed so that the rows end up in the same order the incremental
# concat([df, res]) loop used to produce (last route first, original `res` last).
res = pd.concat(route_frames[::-1] + [res], axis=0)
res

Description:  

In [None]:
# Print the number of distinct values held by every column of `res`.

print("Feature, UniqueValues")
for column in res.columns:
    n_unique = len(res[column].unique())
    print(column + "\t" + str(n_unique))

In [None]:
# Build the working dataframe for SSID XXXXXXXX (placeholder - fill in the real SSID).
# 'leeson' is only a variable name: use Edit > 'Find and Replace' in the menu above to rename it.

res['SSID'] = res['SSID'].astype('category')
ssid_mask = res['SSID'] == XXXXXXXX
leeson = res[ssid_mask]
leeson.shape

In [None]:
# Print the number of distinct values held by every column of `leeson`.

print("Feature, UniqueValues")
for column in leeson.columns:
    n_unique = len(leeson[column].unique())
    print(column + "\t" + str(n_unique))

In [None]:
# Discard the stop-ID columns, which are not used in the rest of the analysis.

unneeded_columns = ['SourceStopID', 'DestStopID']
leeson = leeson.drop(labels=unneeded_columns, axis='columns')

In [None]:
# Put the target (TravelTime) first, followed by the descriptive features and identifiers.

column_order = [
    'TravelTime', 'HourFrame', 'Day', 'SchoolHoliday', 'Rain', 'WindSpeed',
    'TimeFrame', 'JourneyPatternID', 'VehicleJourneyID', 'SSID',
]
leeson = leeson[column_order]

In [None]:
# Left-pad every SSID with zeroes so that each one is an 8-character string.
leeson['SSID'] = leeson['SSID'].apply(lambda ssid: str(int(ssid)).rjust(8, '0') if int(ssid) >= 0 else str(int(ssid)).zfill(8))

In [None]:
# Renumber the rows 0..n-1. drop=True discards the old index directly, instead of
# turning it into an 'index' column that then has to be dropped in a second step.
leeson = leeson.reset_index(drop=True)
leeson

In [None]:
# save as csv
# The SSID in the row labelled 0 is used to build the file name.
# to_csv is called without index=False, so the index is written as an unnamed
# first column; it reappears as 'Unnamed: 0' on reload and is dropped further down.

r = leeson.loc[0, 'SSID'] 
leeson.to_csv('SSID_CSVs/SSID_%s.csv' % r)

In [None]:
# load csv
# Reloads the file written by the save cell; `r` is the SSID value set in that cell,
# so the save cell must have been run first in this kernel session.

leeson = pd.read_csv('SSID_CSVs/SSID_%s.csv' % r)

In [None]:
# Left-pad every SSID with zeroes so that each one is an 8-character string.
# (Presumably needed again because the CSV round trip parses SSID as a number - TODO confirm.)

leeson['SSID'] = leeson['SSID'].apply(lambda ssid: str(int(ssid)).rjust(8, '0') if int(ssid) >= 0 else str(int(ssid)).zfill(8))

In [None]:
# Column dtypes and non-null counts; memory_usage='deep' makes pandas inspect
# object columns so the reported memory figure reflects their real size.
leeson.info(memory_usage='deep')

In [None]:
# Remove the 'Unnamed: 0' column picked up when the CSV was reloaded.
leeson = leeson.drop(labels='Unnamed: 0', axis='columns')

In [None]:
# Give each column a more suitable dtype: categoricals for the discrete
# features and a proper datetime for TimeFrame.
for categorical_col in ['Day', 'JourneyPatternID', 'VehicleJourneyID', 'SSID', 'HourFrame']:
    leeson[categorical_col] = leeson[categorical_col].astype('category')

leeson['TimeFrame'] = pd.to_datetime(leeson['TimeFrame'], format='%Y-%m-%d')

In [None]:
# necessary for later operation
# Drop JourneyPatternID categories that no longer occur in the data.
# The result is assigned back because the `inplace` argument of
# remove_unused_categories was deprecated and later removed from pandas.

leeson['JourneyPatternID'] = leeson['JourneyPatternID'].cat.remove_unused_categories()

In [None]:
# Per-column memory usage in bytes; deep=True measures the actual contents of object columns.
leeson.memory_usage(deep=True)

## Analysis of target feature TravelTime

In [None]:
# Summary statistics for TravelTime, as a reference for reading the graphs that follow.

leeson['TravelTime'].describe()

Histogram of values (x-axis is number of seconds)

In [None]:
# Distribution of TravelTime in 50 bins.
leeson['TravelTime'].hist(bins=50, figsize=(16, 8))

Boxplot to check for outliers

In [None]:
# Box plot of TravelTime with the outlier points (fliers) drawn.
leeson['TravelTime'].plot.box(figsize=(8, 8), showfliers=True)

In [None]:
# Sort by TravelTime (then TimeFrame), both descending, to see how isolated the
# highest-value outliers are in time/date and how dense the low-value ones are.

leeson = leeson.sort_values(by=['TravelTime', 'TimeFrame'], ascending=False)
leeson

Observation:  

## Weather features

In [None]:
# checking correlations between weather and TravelTime

cont_columns = ['TravelTime', 'Rain', 'WindSpeed']

# Correlation matrix using code found on https://stanford.edu/~mwaskom/software/seaborn/examples/many_pairwise_correlations.html
sns.set(style="white")

# Calculate correlation of all pairs of continuous features
corr = leeson[cont_columns].corr()

# Generate a mask for the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias has been removed from NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom colormap - blue and red
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, vmax=1, vmin=-1,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.yticks(rotation = 0)
plt.xticks(rotation = 45)

In [None]:
# Side-by-side scatterplots of TravelTime against each weather feature, sharing the y-axis.
fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True)
wind_ax, rain_ax = axs[0], axs[1]
leeson.plot.scatter(x='WindSpeed', y='TravelTime', ax=wind_ax, figsize=(16, 8))
leeson.plot.scatter(x='Rain', y='TravelTime', ax=rain_ax)

Observation:  

## TravelTime and categorical features

In [None]:
# Overall mean and median TravelTime, drawn as horizontal reference lines in the charts below.

leeson_mean = leeson['TravelTime'].mean()
leeson_median = leeson['TravelTime'].median()

## JourneyPatternID strip plot

#### Note: In the charts below, the horizontal line is the overall mean/median

In [None]:
# Strip plot of every TravelTime observation, grouped by JourneyPatternID.
fig, ax = plt.subplots(figsize=(15, 6))
sns.stripplot(x='JourneyPatternID', y="TravelTime", data=leeson, jitter=True, ax=ax)

# Rotate the labels after plotting, once the category ticks exist.
plt.xticks(rotation=45)

# Overall mean (solid) and median (dashed) reference lines. axhline spans the
# whole width of the axes, so no hard-coded x-range is needed however many
# JourneyPatternIDs there are.
ax.axhline(leeson_mean, color='k', linestyle='-')
ax.axhline(leeson_median, color='k', linestyle='--');

Observation:  

### Bar plot for mean TravelTime per JourneyPatternID

In [None]:
# Mean TravelTime for each JourneyPatternID.
mean_JPID = leeson.groupby('JourneyPatternID')['TravelTime'].mean()
ax = mean_JPID.plot(kind='bar', figsize=(15, 6), rot=45)

# Overall mean as a reference line. axhline spans the full width of the axes,
# so it does not depend on a hard-coded number of bars.
ax.axhline(leeson_mean, color='k');

Observation:  

### Bar plot for median TravelTime per JourneyPatternID

In [None]:
# Median TravelTime for each JourneyPatternID.
median_JPID = leeson.groupby('JourneyPatternID')['TravelTime'].median()
ax = median_JPID.plot(kind='bar', figsize=(15, 6), rot=45)

# Overall median as a reference line. axhline spans the full width of the axes,
# so it does not depend on a hard-coded number of bars.
ax.axhline(leeson_median, color='k');

Observation:  

### Bar plot for mean TravelTime per HourFrame

In [None]:
# Mean TravelTime for each HourFrame.
mean_HF = leeson.groupby('HourFrame')['TravelTime'].mean()
ax = mean_HF.plot(kind='bar', figsize=(15, 6), rot=0)

# Overall mean as a reference line. axhline spans the full width of the axes,
# so it does not depend on a hard-coded number of bars.
ax.axhline(leeson_mean, color='k');

Observation:  

### Bar plot for median TravelTime per HourFrame

In [None]:
# Median TravelTime for each HourFrame.
med_HF = leeson.groupby('HourFrame')['TravelTime'].median()
ax = med_HF.plot(kind='bar', figsize=(15, 6), rot=0)

# Overall median as a reference line. axhline spans the full width of the axes,
# so it does not depend on a hard-coded number of bars.
ax.axhline(leeson_median, color='k');

Observation:  

### Bar plot for mean TravelTime per Day

In [None]:
# Mean TravelTime for each Day, reindexed into calendar order (Monday first)
# rather than the order groupby returns.
mean_Day = leeson.groupby('Day')['TravelTime'].mean()
mean_Day = mean_Day.reindex(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])

ax = mean_Day.plot(kind='bar', figsize=(15, 6), rot=0)

# Overall mean as a reference line; axhline spans the full width of the axes.
ax.axhline(leeson_mean, color='k');

Observation:  

### Bar plot for median TravelTime per Day

In [None]:
# Median TravelTime for each Day, reindexed into calendar order (Monday first)
# rather than the order groupby returns.
med_Day = leeson.groupby('Day')['TravelTime'].median()
med_Day = med_Day.reindex(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])

ax = med_Day.plot(kind='bar', figsize=(15, 6), rot=0)

# Overall median as a reference line; axhline spans the full width of the axes.
ax.axhline(leeson_median, color='k');

Observation:  

### Bar plot for mean TravelTime when SchoolHoliday true/false

In [None]:
# check how many rows are true/false for SchoolHoliday
# value_counts gives the row count for each distinct value directly
# (dropna=False so that any missing values are counted too).
leeson['SchoolHoliday'].value_counts(dropna=False)

In [None]:
# Mean TravelTime for each SchoolHoliday value.
mean_SH = leeson.groupby('SchoolHoliday')['TravelTime'].mean()
ax = mean_SH.plot(kind='bar', figsize=(15, 6), rot=0)

# Overall mean as a reference line; axhline spans the full width of the axes.
ax.axhline(leeson_mean, color='k');

Observation:  

### Bar plot for median TravelTime when SchoolHoliday true/false

In [None]:
# Median TravelTime for each SchoolHoliday value.
med_SH = leeson.groupby('SchoolHoliday')['TravelTime'].median()
ax = med_SH.plot(kind='bar', figsize=(15, 6), rot=0)

# Overall median as a reference line; axhline spans the full width of the axes.
ax.axhline(leeson_median, color='k');

Observation:  

In [None]:
# new feature: does the bus have an X in its name?
# DELETE THIS CELL if no bus with X travels here

# 1 when the JourneyPatternID contains an 'X' anywhere in the string, 0 otherwise.
# str.find returns -1 when 'X' is absent, so `>= 0` is the "contains" test; a
# missing JourneyPatternID compares as False and therefore becomes 0.
# The flag is derived from the ID alone: it no longer depends on a row-wise sum
# over unrelated (and partly non-numeric) columns.
leeson['XBuses'] = (leeson['JourneyPatternID'].str.find('X') >= 0).astype('int')

# How many rows fall into each group.
leeson['XBuses'].value_counts().plot(kind='bar', figsize=(15, 6), rot=0)


Observation:  

## Looking at dropping outliers

In [None]:
# make a copy of original df
trimleeson = leeson.copy()

# Remove TravelTime outliers beyond a conservative 3 x IQR
# Code adapted from here: http://nbviewer.jupyter.org/urls/bitbucket.org/hrojas/learn-pandas/raw/master/lessons/07%20-%20Lesson.ipynb
# Also got some help here: https://www.reddit.com/r/learnpython/comments/65sz8g/trying_to_get_highlight_outliers_in_a_dataframe/

# Quartiles are taken from the TravelTime column only: asking for quantiles of
# the whole frame also touches the categorical/datetime columns, which newer
# pandas versions refuse to do. Selecting with a list keeps the result a Series
# keyed by column name, so lowerOp['TravelTime'] / upperOp['TravelTime'] still work.
travel_time = trimleeson[['TravelTime']]
first_quartile = travel_time.quantile(q=.25)
third_quartile = travel_time.quantile(q=.75)
iqr = third_quartile - first_quartile

# Lower and upper bounds: 3 x IQR below the first / above the third quartile.
lowerOp = first_quartile - (3 * iqr)
upperOp = third_quartile + (3 * iqr)

# New boolean column created where values beyond the bounds for 'TravelTime' are tagged as True.
trimleeson['OutlierTT'] = (trimleeson['TravelTime'] < lowerOp['TravelTime']) | (trimleeson['TravelTime'] > upperOp['TravelTime'])

# Outlier rows counted (summing a boolean column counts its True values)
print("There would be", trimleeson['OutlierTT'].sum(), "outliers dropped.")

In [None]:
# Keep only the rows not flagged as outliers, highest TravelTime first.
# Filtering and sorting are chained into one new frame, which avoids an
# in-place sort on a filtered slice.
trimleeson = trimleeson[~trimleeson['OutlierTT']].sort_values(['TravelTime'], ascending=False)

In [None]:
# Display the trimmed frame (outlier rows removed, sorted by TravelTime descending).
trimleeson

NOTE - Outliers not removed.

Observation:  

### Linear Regression model

In [None]:
# Prepare the data for a first model that uses all features.

# SSID and TimeFrame are left out as they have no predictive value here
LRleeson = leeson.drop(labels=['SSID', 'TimeFrame'], axis='columns')

# SchoolHoliday is a binary categorical column; store it as an integer.
LRleeson['SchoolHoliday'] = LRleeson['SchoolHoliday'].astype(int)

In [None]:
# preliminary training to see which variables are worth selecting
# delete XBuses from the formula if the XBuses column was not created
# Note: `sm` is statsmodels.formula.api (see the import cell), so sm.ols takes a
# formula string; C(...) marks a column to be treated as categorical.
lrle = sm.ols(formula="TravelTime ~  SchoolHoliday + WindSpeed + Rain + XBuses + C(JourneyPatternID)+ C(HourFrame) + C(Day)", data=LRleeson).fit()

In [None]:
# Print the regression summary table for the all-features model.
print(lrle.summary())

Observation:  

In [None]:
# Adjusted R-squared of the all-features model. It is an attribute of the
# fitted results object, so it is read from `lrle` directly rather than by
# calling the property through the RegressionResults class.
rsqa = lrle.rsquared_adj

print("Many JourneyPatternIDs have p-values too high (over 0.05), so they will be cut next.")
print("The adjusted R-squared value of", rsqa, "means that Linear Regression is unlikely to be useful in the long run though.")

In [None]:
# Second model: keep only the features whose p-value is under 0.05.

# One indicator (dummy) column per category of HourFrame, Day and JourneyPatternID.
# code adapted from: https://github.com/justmarkham/DAT4/blob/master/notebooks/08_linear_regression.ipynb
HF_dummies = pd.get_dummies(LRleeson['HourFrame'], prefix='HF')
Day_dummies = pd.get_dummies(LRleeson['Day'], prefix='Day')
JPID_dummies = pd.get_dummies(LRleeson['JourneyPatternID'], prefix='JPID')

# Attach the dummy columns to the right of the existing ones (axis=1 means columns)
dummy_frames = [HF_dummies, Day_dummies, JPID_dummies]
LRleeson = pd.concat([LRleeson] + dummy_frames, axis=1)

In [None]:
# List every column name, including the newly added dummy columns.
print(LRleeson.columns.tolist())

In [None]:
# remove any features with p-values over 0.05 (i.e. those that are not statistically significant)

#LRleeson = LRleeson.drop(['delete', 'any', 'features', 'you', 'wish'], axis = 1)


In [None]:
# Show the first 25 rows to check the columns that will go into the model.
LRleeson.head(25)

In [None]:
# Second model, using the dummy columns explicitly.
# Replace AnyJPIDsOrOtherFeaturesNotAlreadyIncludedHere with the JPID_ dummies (or other
# features) to keep, and remove any HF_/Day_ terms that do not exist in LRleeson.
# The X-bus flag is spelled XBuses, matching the column name created earlier;
# the formula looks columns up by exact, case-sensitive name.
# NOTE(review): all seven Day_ dummies are listed alongside the default intercept,
# which makes them perfectly collinear - consider leaving one day out.
lrle1 = sm.ols(formula="TravelTime ~  AnyJPIDsOrOtherFeaturesNotAlreadyIncludedHere + XBuses + SchoolHoliday + WindSpeed + Rain + HF_0 + HF_6 + HF_7 + HF_8 + HF_9 + HF_10 + HF_11 + HF_12 + HF_13 + HF_14 + HF_15 + HF_16 + HF_17 + HF_18 + HF_19 + HF_20 + HF_21 + HF_22 + HF_23 + Day_Friday + Day_Monday + Day_Saturday + Day_Sunday + Day_Thursday + Day_Tuesday + Day_Wednesday", data=LRleeson).fit()

In [None]:
# Print the regression summary table for the reduced model.
print(lrle1.summary())

Observation:  