In [224]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import csv
import os
import sqlalchemy
import pymysql
from urllib.parse import quote
from dotenv import load_dotenv

In [225]:
# Let pandas render up to 100 rows before truncating a displayed frame.
pd.options.display.max_rows = 100

In [226]:
#getting database password
load_dotenv()  # presumably picks up a local .env file, if one exists
password = os.getenv('DB_PASSWORD')
if password is None:
    # Fail here with a clear message: a missing password would otherwise only
    # surface later as an opaque TypeError from quote(None) when the
    # connection URL is built.
    raise RuntimeError(
        "DB_PASSWORD is not set; define it in the environment or in a .env file"
    )

In [227]:
# Percent-encode the password before embedding it in the connection URL so that
# reserved URL characters in it are not misread as URL syntax.
DBengine = sqlalchemy.create_engine(
    f"mysql+pymysql://student:{quote(password)}@localhost:3306/dublinbus"
)

In [228]:
line = '84'    # bus line
direction = 1  # values of 1 or 2

# Load the base data through the stored procedure (defined in the database,
# not shown here) for the chosen line and direction.
dfBase = pd.read_sql(
    f'call dublinbus.trip_leavetimes_join("{line}", {direction});',
    DBengine,
)

In [255]:
# Report the size of the raw extract and preview its first rows.
print(f"{len(dfBase)} rows")
dfBase.head(3)

394884 rows


Unnamed: 0,dayofservice,tripid,lineid,direction,progrnumber,stoppointid,tripPlannedArr,tripPlannedDep,tripActualArr,tripActualDep,stopPlannedArr,stopPlannedDep,stopActualArr,stopActualDep,vehicleid
0,01-JAN-18 00:00:00,5955785,84,1,2,3082,40879,36600,41351.0,36583.0,36643,36643,36617,36629,2534807
1,01-JAN-18 00:00:00,5955785,84,1,4,3114,40879,36600,41351.0,36583.0,36762,36762,36782,36790,2534807
2,01-JAN-18 00:00:00,5955785,84,1,5,3115,40879,36600,41351.0,36583.0,36798,36798,36819,36819,2534807


In [259]:
# Work on a trimmed copy: the planned times, the trip's actual arrival and the
# vehicle id are not used by the cells shown below.
df = dfBase.drop(
    ['vehicleid', 'tripPlannedArr', 'tripPlannedDep', 'tripActualArr',
     'stopPlannedDep', 'stopPlannedArr'],
    axis='columns',
)
df.head(3)

Unnamed: 0,dayofservice,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep
0,01-JAN-18 00:00:00,5955785,84,1,2,3082,36583.0,36617,36629
1,01-JAN-18 00:00:00,5955785,84,1,4,3114,36583.0,36782,36790
2,01-JAN-18 00:00:00,5955785,84,1,5,3115,36583.0,36819,36819


In [260]:
# Rename the service-day column and parse its "01-JAN-18 00:00:00"-style
# strings into real datetimes.
df = df.rename(columns={'dayofservice': 'date'})
df['date'] = pd.to_datetime(df['date'], format="%d-%b-%y %H:%M:%S")
print(f"{len(df)} rows")
df.head(3)

394884 rows


Unnamed: 0,date,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep
0,2018-01-01,5955785,84,1,2,3082,36583.0,36617,36629
1,2018-01-01,5955785,84,1,4,3114,36583.0,36782,36790
2,2018-01-01,5955785,84,1,5,3115,36583.0,36819,36819


In [261]:
# Inspect the column dtypes after the date conversion.
df.dtypes

date             datetime64[ns]
tripid                    int64
lineid                   object
direction                 int64
progrnumber               int64
stoppointid               int64
tripActualDep           float64
stopActualArr             int64
stopActualDep             int64
dtype: object

In [262]:
# Store the trip departure as a nullable integer (capital-I "Int64") so that
# missing values can coexist with whole-second integers.
df['tripActualDep'] = df['tripActualDep'].astype(pd.Int64Dtype())

In [263]:
# Confirm that tripActualDep now uses the nullable Int64 dtype.
df.dtypes

date             datetime64[ns]
tripid                    int64
lineid                   object
direction                 int64
progrnumber               int64
stoppointid               int64
tripActualDep             Int64
stopActualArr             int64
stopActualDep             int64
dtype: object

In [264]:
# Order rows by service day and trip departure time, then by stop sequence
# within the trip. tripid is included as a tie-breaker so that two trips with
# the same date and departure second are not interleaved stop by stop.
df.sort_values(by=['date', 'tripActualDep', 'tripid', 'progrnumber'], inplace=True)

In [265]:
# Drop rows with no recorded trip departure first, and only then renumber, so
# the resulting index is contiguous (resetting before dropping would leave
# gaps). subset is passed as a list, which every pandas version accepts.
df = df.dropna(subset=['tripActualDep']).reset_index(drop=True)

In [266]:
# Preview the sorted frame.
df.head(3)

Unnamed: 0,date,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep
0,2018-01-01,5955785,84,1,1,7188,36583,36583,36583
1,2018-01-01,5955785,84,1,2,3082,36583,36617,36629
2,2018-01-01,5955785,84,1,3,7660,36583,36721,36721


In [267]:
#getting the usual starting stop, will drop rows that have other stops
# Take the mode of the stoppointid column alone, restricted to first-stop rows
# (progrnumber == 1). Series.mode() returns its values sorted, so .iloc[0] is
# the smallest stop id if several are tied for most frequent.
modeStartStop = df.loc[df.progrnumber == 1, 'stoppointid'].mode().iloc[0]

In [268]:
#df of unusual starting stops
dftmp = df[(df.progrnumber == 1) & (df.stoppointid != modeStartStop)]

#removing any entries from a trip with an unusual starting stop
# The date and tripid must match as a pair. Testing each column with its own
# isin() would also drop a normal trip whenever its date and its tripid each
# appear, separately, among the unusual trips.
unusualTrips = dftmp[['date', 'tripid']].drop_duplicates()
# A left merge keeps one output row per row of df, in df's order (the right
# side is de-duplicated), so the indicator column lines up positionally.
isUnusual = (
    df[['date', 'tripid']]
    .merge(unusualTrips, on=['date', 'tripid'], how='left', indicator=True)['_merge']
    == 'both'
).to_numpy()
df = df[~isUnusual]
print(len(df),  "rows")
df.head(3)

# usually the different starting stop is a stop further up the line, something could be done with this data
# but whether it's a good use of time is debatable. 

366356 rows


Unnamed: 0,date,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep
0,2018-01-01,5955785,84,1,1,7188,36583,36583,36583
1,2018-01-01,5955785,84,1,2,3082,36583,36617,36629
2,2018-01-01,5955785,84,1,3,7660,36583,36721,36721


In [269]:
# journeytime: how long after the trip's actual departure the bus arrived at this stop.
# dwelltime: how long the bus stayed at the stop (departure minus arrival).
# Both are differences of the source time columns, so they share their unit.
df['journeytime'] = df['stopActualArr'].sub(df['tripActualDep'])
df['dwelltime'] = df['stopActualDep'].sub(df['stopActualArr'])
df.head(10)

Unnamed: 0,date,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep,journeytime,dwelltime
0,2018-01-01,5955785,84,1,1,7188,36583,36583,36583,0,0
1,2018-01-01,5955785,84,1,2,3082,36583,36617,36629,34,12
2,2018-01-01,5955785,84,1,3,7660,36583,36721,36721,138,0
3,2018-01-01,5955785,84,1,4,3114,36583,36782,36790,199,8
4,2018-01-01,5955785,84,1,5,3115,36583,36819,36819,236,0
5,2018-01-01,5955785,84,1,6,3116,36583,36867,36867,284,0
6,2018-01-01,5955785,84,1,7,3117,36583,36905,36905,322,0
7,2018-01-01,5955785,84,1,8,3118,36583,36915,36915,332,0
8,2018-01-01,5955785,84,1,9,3119,36583,36936,36936,353,0
9,2018-01-01,5955785,84,1,10,3120,36583,36996,36996,413,0


In [270]:
# The derived journeytime/dwelltime columns replace the raw departure times,
# and line, direction and stop id are no longer needed in this frame.
df = df.drop(
    columns=['tripActualDep', 'stopActualDep', 'lineid', 'direction', 'stoppointid']
)
df.head(3)

Unnamed: 0,date,tripid,progrnumber,stopActualArr,journeytime,dwelltime
0,2018-01-01,5955785,1,36583,0,0
1,2018-01-01,5955785,2,36617,34,12
2,2018-01-01,5955785,3,36721,138,0
