### Create Timetable
Creates timetable based on historic data provided.

In [2]:
# Imports and general data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import re

from sklearn import preprocessing
from scipy.stats import chi2_contingency
from matplotlib.backends.backend_pdf import PdfPages


# hide ipykernel warnings 
import warnings
warnings.filterwarnings('ignore')

#Plotting inline
%matplotlib inline

In [3]:
# Import necessary data
leavetimes_df = pd.read_csv(r'C:\Users\turlo\OneDrive\Documents\MSC_Computer_Science\Summer_Project\Data\Feature_Engineering_Sets\trim_leavetimes.csv', sep=';')
trips_df = pd.read_csv(r'C:\Users\turlo\OneDrive\Documents\MSC_Computer_Science\Summer_Project\Data\Feature_Engineering_Sets\trim_trips.csv', sep=';')

In [4]:
leavetimes_df.head()

Unnamed: 0,DATASOURCE,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,PASSENGERS,PASSENGERSIN,PASSENGERSOUT,DISTANCE,SUPPRESSED,JUSTIFICATIONID,LASTUPDATE,NOTE
0,DB,15-JAN-18 00:00:00,6114847,40,60,39356,39356,39469,39482,1000912,,,,,,,23-JAN-18 10:24:03,
1,DB,15-JAN-18 00:00:00,6119911,40,60,44156,44156,44246,44259,1000183,,,,,,,23-JAN-18 10:24:03,
2,DB,15-JAN-18 00:00:00,6107234,40,60,45356,45356,45172,45188,2868369,,,,,,,23-JAN-18 10:24:03,
3,DB,15-JAN-18 00:00:00,6107787,40,60,41756,41756,41719,41734,2693269,,,,,,,23-JAN-18 10:24:03,
4,DB,15-JAN-18 00:00:00,6109346,40,60,40556,40556,40355,40373,1001137,,,,,,,23-JAN-18 10:24:03,


In [5]:
trips_df.head()

Unnamed: 0,DATASOURCE,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,BASIN,TENDERLOT,SUPPRESSED,JUSTIFICATIONID,LASTUPDATE,NOTE
0,DB,15-JAN-18 00:00:00,6110593,18,18_3,1,71807,68700,71391.0,68781.0,BasDef,,,,23-JAN-18 10:24:03,",1718118,"
1,DB,15-JAN-18 00:00:00,6120971,140,140_21,2,64464,60000,63578.0,,BasDef,,,,23-JAN-18 10:24:03,",2416375,"
2,DB,15-JAN-18 00:00:00,6121776,140,140_21,2,65664,61200,65096.0,61173.0,BasDef,,,,23-JAN-18 10:24:03,",2425389,"
3,DB,15-JAN-18 00:00:00,6120834,140,140_21,2,66264,61800,65700.0,61842.0,BasDef,,,,23-JAN-18 10:24:03,",2420795,"
4,DB,15-JAN-18 00:00:00,6110904,140,140_21,2,65064,60600,65362.0,60652.0,BasDef,,,,23-JAN-18 10:24:03,",2412709,"


In [6]:
# Combine data tables and keep only the relevant headings
result = pd.merge(trips_df[['DAYOFSERVICE','TRIPID','LINEID','ROUTEID','DIRECTION']],
                  leavetimes_df[['PROGRNUMBER','STOPPOINTID', 'TRIPID', 'PLANNEDTIME_ARR','PLANNEDTIME_DEP','ACTUALTIME_ARR','ACTUALTIME_DEP']],
                 on='TRIPID', 
                 how='left')

In [7]:
result = pd.merge(trips_df[['DAYOFSERVICE','TRIPID','LINEID','ROUTEID','DIRECTION']],
                  leavetimes_df[['PROGRNUMBER','STOPPOINTID','PLANNEDTIME_ARR','PLANNEDTIME_DEP','ACTUALTIME_ARR','ACTUALTIME_DEP', 'TRIPID']],
                 on='TRIPID', 
                 how='left')

In [8]:
result.sort_values(["PROGRNUMBER"])

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP
3430774,18-JAN-18 00:00:00,6116742,63,63_26,1,1,2039,56700,56700,56623,56623
4628591,17-JAN-18 00:00:00,6107087,16,16_24,2,1,5171,30600,30600,30777,30777
1127085,17-JAN-18 00:00:00,6116968,43,43_84,1,1,1184,30900,30900,30859,30859
9119848,15-JAN-18 00:00:00,6121399,41,41_7,2,1,4843,22500,22500,22459,22459
4628649,17-JAN-18 00:00:00,6107087,16,16_24,2,1,5171,30600,30600,30580,30580
...,...,...,...,...,...,...,...,...,...,...,...
227235,15-JAN-18 00:00:00,6120958,33,33_71,2,103,292,22320,22320,21757,21757
7966584,15-JAN-18 00:00:00,6107731,33,33_69,1,103,3802,32183,32183,32033,32033
6386844,19-JAN-18 00:00:00,6107731,33,33_69,1,103,3802,32183,32183,33005,33005
6732368,18-JAN-18 00:00:00,6120958,33,33_71,2,103,292,22320,22320,21757,21757


In [9]:
is_first =  result['PROGRNUMBER']==1
first_stop = result[is_first]

In [10]:
first_stop.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP
24,15-JAN-18 00:00:00,6110593,18,18_3,1,1,4359,68700,68700,68781,68781
114,15-JAN-18 00:00:00,6110593,18,18_3,1,1,4359,68700,68700,68710,68710
165,15-JAN-18 00:00:00,6110593,18,18_3,1,1,4359,68700,68700,68655,68655
176,15-JAN-18 00:00:00,6110593,18,18_3,1,1,4359,68700,68700,68950,68950
252,15-JAN-18 00:00:00,6110593,18,18_3,1,1,4359,68700,68700,68638,68638


In [11]:
timetable = first_stop[['DAYOFSERVICE', 'LINEID', 'ROUTEID', 'DIRECTION', 'PROGRNUMBER', 
                          'STOPPOINTID', 'PLANNEDTIME_DEP']]

In [12]:
timetable

Unnamed: 0,DAYOFSERVICE,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_DEP
24,15-JAN-18 00:00:00,18,18_3,1,1,4359,68700
114,15-JAN-18 00:00:00,18,18_3,1,1,4359,68700
165,15-JAN-18 00:00:00,18,18_3,1,1,4359,68700
176,15-JAN-18 00:00:00,18,18_3,1,1,4359,68700
252,15-JAN-18 00:00:00,18,18_3,1,1,4359,68700
...,...,...,...,...,...,...,...
9862951,16-JAN-18 00:00:00,37,37_14,1,1,7340,40500
9863044,16-JAN-18 00:00:00,37,37_14,1,1,7340,40500
9863088,16-JAN-18 00:00:00,37,37_14,1,1,7340,40500
9863147,16-JAN-18 00:00:00,37,37_14,1,1,7340,40500


In [44]:
timetable['DAYOFSERVICE'] = pd.to_datetime(timetable['DAYOFSERVICE'])
timetable['WEEKDAY'] = timetable['DAYOFSERVICE'].dt.day_name()

In [45]:
timetable.head()

Unnamed: 0,DAYOFSERVICE,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_DEP,WEEKDAY
24,2018-01-15,18,18_3,1,1,4359,68700,Monday
114,2018-01-15,18,18_3,1,1,4359,68700,Monday
165,2018-01-15,18,18_3,1,1,4359,68700,Monday
176,2018-01-15,18,18_3,1,1,4359,68700,Monday
252,2018-01-15,18,18_3,1,1,4359,68700,Monday


In [46]:
timetable.drop(["DAYOFSERVICE"], axis=1, inplace=True)

In [47]:
timetable

Unnamed: 0,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_DEP,WEEKDAY
24,18,18_3,1,1,4359,68700,Monday
114,18,18_3,1,1,4359,68700,Monday
165,18,18_3,1,1,4359,68700,Monday
176,18,18_3,1,1,4359,68700,Monday
252,18,18_3,1,1,4359,68700,Monday
...,...,...,...,...,...,...,...
9862951,37,37_14,1,1,7340,40500,Tuesday
9863044,37,37_14,1,1,7340,40500,Tuesday
9863088,37,37_14,1,1,7340,40500,Tuesday
9863147,37,37_14,1,1,7340,40500,Tuesday


In [48]:
timetable.drop_duplicates(inplace=True)

In [49]:
timetable

Unnamed: 0,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_DEP,WEEKDAY
24,18,18_3,1,1,4359,68700,Monday
430,140,140_21,2,1,896,60000,Monday
574,140,140_21,2,1,896,61200,Monday
793,140,140_21,2,1,896,61800,Monday
1034,140,140_21,2,1,896,60600,Monday
...,...,...,...,...,...,...,...
9861924,40,40_27,1,1,7132,61200,Tuesday
9862127,122,122_16,2,1,1423,27000,Tuesday
9862378,122,122_14,1,1,4525,31800,Tuesday
9862639,145,145_105,2,1,7574,64800,Tuesday


In [54]:
timetable.sort_values(["ROUTEID", "PLANNEDTIME_DEP", "DIRECTION"])

Unnamed: 0,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_DEP,WEEKDAY
1174222,102,102_10,2,1,1073,06:45:00,Wednesday
2023193,102,102_10,2,1,1073,06:45:00,Friday
3716689,102,102_10,2,1,1073,06:45:00,Thursday
8158383,102,102_10,2,1,1073,06:45:00,Monday
9458905,102,102_10,2,1,1073,06:45:00,Tuesday
...,...,...,...,...,...,...,...
1728952,9,9_8,2,1,4392,23:20:00,Wednesday
2792726,9,9_8,2,1,4392,23:20:00,Tuesday
4352543,9,9_8,2,1,4392,23:20:00,Thursday
7259919,9,9_8,2,1,4392,23:20:00,Friday


In [52]:
timetable['PLANNEDTIME_DEP'] = pd.to_datetime(timetable['PLANNEDTIME_DEP'], unit='s').dt.strftime("%H:%M:%S")

In [53]:
timetable

Unnamed: 0,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_DEP,WEEKDAY
24,18,18_3,1,1,4359,19:05:00,Monday
430,140,140_21,2,1,896,16:40:00,Monday
574,140,140_21,2,1,896,17:00:00,Monday
793,140,140_21,2,1,896,17:10:00,Monday
1034,140,140_21,2,1,896,16:50:00,Monday
...,...,...,...,...,...,...,...
9861924,40,40_27,1,1,7132,17:00:00,Tuesday
9862127,122,122_16,2,1,1423,07:30:00,Tuesday
9862378,122,122_14,1,1,4525,08:50:00,Tuesday
9862639,145,145_105,2,1,7574,18:00:00,Tuesday


In [55]:
timetable.to_csv("timetable.csv")

In [59]:
# Import necessary data
leavetimes_df = pd.read_csv(r'C:\Users\turlo\OneDrive\Documents\MSC_Computer_Science\Summer_Project\JupyterNotebooks\FeatureEngineering\stop_times.txt', sep=",")
trips_df = pd.read_csv(r'C:\Users\turlo\OneDrive\Documents\MSC_Computer_Science\Summer_Project\JupyterNotebooks\FeatureEngineering\trips.txt', sep=',')

In [60]:
leavetimes_df.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,5242.y1005.60-1-d12-1.1.O,21:00:00,21:00:00,8240DB000226,1,Sandymount,0,0,0.0
1,5242.y1005.60-1-d12-1.1.O,21:00:38,21:00:38,8220DB000228,2,Sandymount,0,0,267.48
2,5242.y1005.60-1-d12-1.1.O,21:01:11,21:01:11,8240DB000229,3,Sandymount,0,0,483.53
3,5242.y1005.60-1-d12-1.1.O,21:02:02,21:02:02,8240DB000227,4,Sandymount,0,0,834.47
4,5242.y1005.60-1-d12-1.1.O,21:02:35,21:02:35,8240DB000230,5,Sandymount,0,0,1063.49


In [61]:
trips_df.head()

Unnamed: 0,route_id,service_id,trip_id,shape_id,trip_headsign,direction_id
0,60-116-b12-1,y1003,3736.y1003.60-116-b12-1.57.I,60-116-b12-1.57.I,Church of Our Lady of Good Council - Parnell S...,1
1,60-116-b12-1,y1003,3737.y1003.60-116-b12-1.56.O,60-116-b12-1.56.O,Sussex Road - Church of Our Lady of Good Council,0
2,60-116-d12-1,y1005,3907.y1005.60-116-d12-1.57.I,60-116-d12-1.57.I,Church of Our Lady of Good Council - Parnell S...,1
3,60-116-d12-1,y1005,3908.y1005.60-116-d12-1.56.O,60-116-d12-1.56.O,Sussex Road - Church of Our Lady of Good Council,0
4,60-118-b12-1,y1003,3770.y1003.60-118-b12-1.58.I,60-118-b12-1.58.I,Enniskerry Road (Ballycorus Rd) - Eden Quay,1
