In [1]:
import pandas as pd
import os
import sqlalchemy
import pymysql
from urllib.parse import quote
from dotenv import load_dotenv

In [2]:
pd.set_option('display.max_rows', 100)

In [3]:
#getting database password
load_dotenv()
password = os.getenv('DB_PASSWORD')

In [4]:
DBengine = sqlalchemy.create_engine("mysql+pymysql://student:{}@localhost:3306/dublinbus".format(quote(password)))

In [7]:
line = '40E' #bus line
direction = 1 #values of 1 or 2
dfBase = pd.read_sql('call dublinbus.trip_leavetimes_join("{}", {});'.format(line, direction) #using stored procedure
                 , DBengine)

In [8]:
print(len(dfBase), "rows")
dfBase.head(3)

0 rows


Unnamed: 0,dayofservice,tripid,lineid,direction,progrnumber,stoppointid,tripPlannedArr,tripPlannedDep,tripActualArr,tripActualDep,stopPlannedArr,stopPlannedDep,stopActualArr,stopActualDep,vehicleid


In [441]:
#redundant columns
df = dfBase.drop(columns = ['vehicleid','tripPlannedArr','tripPlannedDep','tripActualArr','stopPlannedDep', 'stopPlannedArr'])
df.head(3)

Unnamed: 0,dayofservice,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep
0,01-JAN-18 00:00:00,5955785,84,1,2,3082,36583.0,36617,36629
1,01-JAN-18 00:00:00,5955785,84,1,4,3114,36583.0,36782,36790
2,01-JAN-18 00:00:00,5955785,84,1,5,3115,36583.0,36819,36819


In [442]:
#
df.rename(columns={'dayofservice': 'date'}, inplace=True)
df.date = pd.to_datetime(df.date, format="%d-%b-%y %H:%M:%S")
print(len(df),  "rows")
df.head(3)

394884 rows


Unnamed: 0,date,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep
0,2018-01-01,5955785,84,1,2,3082,36583.0,36617,36629
1,2018-01-01,5955785,84,1,4,3114,36583.0,36782,36790
2,2018-01-01,5955785,84,1,5,3115,36583.0,36819,36819


In [443]:
df.dtypes

date             datetime64[ns]
tripid                    int64
lineid                   object
direction                 int64
progrnumber               int64
stoppointid               int64
tripActualDep           float64
stopActualArr             int64
stopActualDep             int64
dtype: object

In [444]:
df.tripActualDep = df.tripActualDep.astype('Int64')

In [445]:
df.dtypes

date             datetime64[ns]
tripid                    int64
lineid                   object
direction                 int64
progrnumber               int64
stoppointid               int64
tripActualDep             Int64
stopActualArr             int64
stopActualDep             int64
dtype: object

In [446]:
df.sort_values(by = ['date', 'tripActualDep', 'progrnumber'], inplace=True)

In [447]:
df = df.reset_index(drop=True).dropna(subset='tripActualDep')

In [448]:
df.head(3)

Unnamed: 0,date,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep
0,2018-01-01,5955785,84,1,1,7188,36583,36583,36583
1,2018-01-01,5955785,84,1,2,3082,36583,36617,36629
2,2018-01-01,5955785,84,1,3,7660,36583,36721,36721


In [449]:
#getting the usual starting stop, will srop rows that have other stops
modeStartStop = df[df.progrnumber == 1].mode().stoppointid[0].astype('int64')

In [450]:
#df of unusual starting stops
dftmp = df[(df.progrnumber == 1) & (df.stoppointid != modeStartStop)]

#removing any entries from a trip with an unusual starting stop
df = df[(~df['date'].isin(dftmp['date'])) | (~df['tripid'].isin(dftmp['tripid']))].reset_index(drop=True) #should result in 375798-9442 rows
print(len(df),  "rows")
dftmp

# usually the different starting stop is a stop further up the line, something could be done with this data
# but whether it's a good use of time is debatable. 

366356 rows


Unnamed: 0,date,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep
1636,2018-01-02,5965755,84,1,1,4168,56408,56408,56408
3239,2018-01-03,5965755,84,1,1,4168,56394,56394,56394
4918,2018-01-04,5965755,84,1,1,4168,56522,56522,56522
6428,2018-01-05,5965755,84,1,1,4168,56371,56371,56371
9746,2018-01-08,6103348,84,1,1,4168,56414,56414,56414
...,...,...,...,...,...,...,...,...,...
378366,2018-10-11,8030352,84,1,1,4168,56334,56334,56334
380538,2018-10-16,8030352,84,1,1,4168,56427,56427,56427
381991,2018-10-17,8030352,84,1,1,4168,56353,56353,56353
383223,2018-10-18,8030352,84,1,1,4168,56432,56432,56432


In [453]:
#dealing with unusual, usually premature, last stop
#getting the index of the last stop for each trip
dftmp = df.groupby(['date','tripid']).progrnumber.idxmax().reset_index()

dftmp = df.iloc[dftmp.progrnumber] #all rows corresponding to a last stop
dftmp

# lastStopCounts = dftmp.stoppointid.value_counts()
# modeLastStop = dftmp.stoppointid.mode()[0]
# print(modeLastStop, lastStopCounts[modeLastStop])
# print('other', sum(lastStopCounts)-lastStopCounts[modeLastStop])
# max(lastStopCounts)/sum(lastStopCounts) #the mode last stop dominates so removing any other would be good

Unnamed: 0,date,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep
85,2018-01-01,5955785,84,1,86,4956,36583,41351,41351
255,2018-01-01,5955787,84,1,89,4956,46212,51000,51000
427,2018-01-01,5955789,84,1,86,4956,55864,60201,60201
600,2018-01-01,5955791,84,1,86,4956,65420,70188,70188
771,2018-01-01,5955793,84,1,86,4956,74927,79206,79206
...,...,...,...,...,...,...,...,...,...
365522,2018-10-27,8059369,84,1,87,4956,24010,28872,28872
365727,2018-10-27,8059371,84,1,90,4956,38515,43511,43511
365867,2018-10-27,8059373,84,1,86,7272,50365,56025,56025
366064,2018-10-27,8059375,84,1,87,4956,63235,68567,68567


In [414]:
dftmp = dftmp[dftmp.stoppointid != modeLastStop].reset_index(drop=True)
dftmp = df.merge(dftmp, how='left', on=['date','tripid'], suffixes=(None,'Y'), indicator=True)
dftmp = dftmp[dftmp['_merge'] == "left_only"].drop(columns=['_merge'])
df = dftmp[list(df.columns)]

In [415]:
df

Unnamed: 0,date,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep
0,2018-01-01,5955785,84,1,1,7188,36583,36583,36583
1,2018-01-01,5955785,84,1,2,3082,36583,36617,36629
2,2018-01-01,5955785,84,1,3,7660,36583,36721,36721
3,2018-01-01,5955785,84,1,4,3114,36583,36782,36790
4,2018-01-01,5955785,84,1,5,3115,36583,36819,36819
...,...,...,...,...,...,...,...,...,...
366351,2018-10-27,8048174,84,1,83,4257,79168,83849,83849
366352,2018-10-27,8048174,84,1,84,4258,79168,83897,83897
366353,2018-10-27,8048174,84,1,85,4259,79168,83944,83944
366354,2018-10-27,8048174,84,1,86,7272,79168,83955,83955


In [417]:
df[(df.date == '2018-01-08')&(df.tripid==6096287)]

Unnamed: 0,date,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep
9762,2018-01-08,6096287,84,1,1,7188,84441,84441,84441
9763,2018-01-08,6096287,84,1,2,3082,84441,84506,84600
9764,2018-01-08,6096287,84,1,3,7660,84441,84636,84636
9765,2018-01-08,6096287,84,1,4,3114,84441,84683,84696
9766,2018-01-08,6096287,84,1,5,3115,84441,84725,84725
9767,2018-01-08,6096287,84,1,6,3116,84441,84764,84764
9768,2018-01-08,6096287,84,1,7,3117,84441,84807,84817
9769,2018-01-08,6096287,84,1,8,3118,84441,84835,84835
9770,2018-01-08,6096287,84,1,9,3119,84441,84861,84861
9771,2018-01-08,6096287,84,1,10,3120,84441,84929,84936


In [419]:
df['journeytime'] = df.stopActualArr - df.tripActualDep
df['dwelltime'] = df.stopActualDep - df.stopActualArr
df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['journeytime'] = df.stopActualArr - df.tripActualDep
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dwelltime'] = df.stopActualDep - df.stopActualArr


Unnamed: 0,date,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep,journeytime,dwelltime
0,2018-01-01,5955785,84,1,1,7188,36583,36583,36583,0,0
1,2018-01-01,5955785,84,1,2,3082,36583,36617,36629,34,12
2,2018-01-01,5955785,84,1,3,7660,36583,36721,36721,138,0
3,2018-01-01,5955785,84,1,4,3114,36583,36782,36790,199,8
4,2018-01-01,5955785,84,1,5,3115,36583,36819,36819,236,0
5,2018-01-01,5955785,84,1,6,3116,36583,36867,36867,284,0
6,2018-01-01,5955785,84,1,7,3117,36583,36905,36905,322,0
7,2018-01-01,5955785,84,1,8,3118,36583,36915,36915,332,0
8,2018-01-01,5955785,84,1,9,3119,36583,36936,36936,353,0
9,2018-01-01,5955785,84,1,10,3120,36583,36996,36996,413,0


In [420]:
df[(df.tripid == 6111779) & (df.progrnumber <3)]

Unnamed: 0,date,tripid,lineid,direction,progrnumber,stoppointid,tripActualDep,stopActualArr,stopActualDep,journeytime,dwelltime
17983,2018-01-15,6111779,84,1,1,7188,31419,31419,31419,0,0
17984,2018-01-15,6111779,84,1,2,3082,31419,31488,31513,69,25
19464,2018-01-16,6111779,84,1,1,7188,32477,32477,32477,0,0
19465,2018-01-16,6111779,84,1,2,3082,32477,30780,30780,-1697,0
21004,2018-01-17,6111779,84,1,1,7188,32071,32071,32071,0,0
21005,2018-01-17,6111779,84,1,2,3082,32071,32163,32395,92,232
22463,2018-01-18,6111779,84,1,1,7188,32020,32020,32020,0,0
22464,2018-01-18,6111779,84,1,2,3082,32020,32211,32211,191,0
24015,2018-01-19,6111779,84,1,1,7188,31237,31237,31237,0,0
24016,2018-01-19,6111779,84,1,2,3082,31237,31281,31326,44,45


In [421]:
df.drop(columns=['tripActualDep','stopActualDep', 'lineid','direction','stoppointid'], inplace=True)
df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['tripActualDep','stopActualDep', 'lineid','direction','stoppointid'], inplace=True)


Unnamed: 0,date,tripid,progrnumber,stopActualArr,journeytime,dwelltime
0,2018-01-01,5955785,1,36583,0,0
1,2018-01-01,5955785,2,36617,34,12
2,2018-01-01,5955785,3,36721,138,0
3,2018-01-01,5955785,4,36782,199,8
4,2018-01-01,5955785,5,36819,236,0
5,2018-01-01,5955785,6,36867,284,0
6,2018-01-01,5955785,7,36905,322,0
7,2018-01-01,5955785,8,36915,332,0
8,2018-01-01,5955785,9,36936,353,0
9,2018-01-01,5955785,10,36996,413,0


In [422]:
max(df.stopActualArr)

90289

In [423]:
#getting weather dataframe
dfWeather = pd.read_sql('select * from weather;', DBengine)
dfWeather

Unnamed: 0,date,rain,temp,humidity,pressure
0,01/01/2018 00:00,0.0,4.6,82,991.0
1,01/01/2018 01:00,0.1,4.7,81,991.1
2,01/01/2018 02:00,0.0,4.8,81,991.1
3,01/01/2018 03:00,0.0,4.9,82,990.7
4,01/01/2018 04:00,0.0,5.3,81,990.3
...,...,...,...,...,...
8756,31/12/2018 19:00,0.0,9.9,74,1034.9
8757,31/12/2018 20:00,0.0,9.9,75,1035.0
8758,31/12/2018 21:00,0.0,9.9,75,1035.0
8759,31/12/2018 22:00,0.0,9.9,76,1035.1


In [424]:
dfWeather.dtypes

date         object
rain        float64
temp        float64
humidity      int64
pressure    float64
dtype: object

In [425]:
dfWeather['date'] = pd.to_datetime(dfWeather.date, format="%d/%m/%Y %H:%M")
dfWeather.head(3)

Unnamed: 0,date,rain,temp,humidity,pressure
0,2018-01-01 00:00:00,0.0,4.6,82,991.0
1,2018-01-01 01:00:00,0.1,4.7,81,991.1
2,2018-01-01 02:00:00,0.0,4.8,81,991.1


In [426]:
#new columns to make merging into bus data easier
dfWeather['hour'] = dfWeather.date.dt.hour
dfWeather['date'] = dfWeather.date.dt.date
dfWeather['date'] = pd.to_datetime(dfWeather.date, format = "%Y-%m-%d")

In [427]:
dfWeather.head(3)

Unnamed: 0,date,rain,temp,humidity,pressure,hour
0,2018-01-01,0.0,4.6,82,991.0,0
1,2018-01-01,0.1,4.7,81,991.1,1
2,2018-01-01,0.0,4.8,81,991.1,2


In [428]:
dfWeather.dtypes

date        datetime64[ns]
rain               float64
temp               float64
humidity             int64
pressure           float64
hour                 int64
dtype: object

In [429]:
#temporary columns to join to weather to deal with some stops happening in hours 24 and 25 of a day
#I think this happens as they don't want the date to rollover in the middle of a bus trip
df['hour'] = df.stopActualArr//3600
df['day'] = df['date']

#in new column rollover the date when hour is >23, also subtract 24 from hour
#now all rows should be able to match to a weather row
df.loc[df.hour > 23, ['day']] += pd.Timedelta(days=1)
df.loc[df.hour > 23, ['hour']] -=24

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hour'] = df.stopActualArr//3600
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day'] = df['date']


In [430]:
df = df.merge(dfWeather, how='left', left_on = ['day','hour'], right_on = ['date','hour'])

In [431]:
df.rename(columns={'date_x':'date'},inplace=True)
df.drop(columns=['hour','day','date_y'], inplace=True)

In [432]:
#checking if any nulls exist
df.isna().sum()

date             0
tripid           0
progrnumber      0
stopActualArr    0
journeytime      0
dwelltime        0
rain             0
temp             0
humidity         0
pressure         0
dtype: int64

In [433]:
df.head(3)

Unnamed: 0,date,tripid,progrnumber,stopActualArr,journeytime,dwelltime,rain,temp,humidity,pressure
0,2018-01-01,5955785,1,36583,0,0,0.0,5.6,79,992.9
1,2018-01-01,5955785,2,36617,34,12,0.0,5.6,79,992.9
2,2018-01-01,5955785,3,36721,138,0,0.0,5.6,79,992.9


In [434]:
#removing trips that have a negative trip time along any point
dftmp = df[(df.journeytime <= 0) & (df.progrnumber > 1)]
df = df[~df['date'].isin(dftmp['date']) | ~df['tripid'].isin(dftmp['tripid'])].reset_index()
df

Unnamed: 0,index,date,tripid,progrnumber,stopActualArr,journeytime,dwelltime,rain,temp,humidity,pressure
0,0,2018-01-01,5955785,1,36583,0,0,0.0,5.6,79,992.9
1,1,2018-01-01,5955785,2,36617,34,12,0.0,5.6,79,992.9
2,2,2018-01-01,5955785,3,36721,138,0,0.0,5.6,79,992.9
3,3,2018-01-01,5955785,4,36782,199,8,0.0,5.6,79,992.9
4,4,2018-01-01,5955785,5,36819,236,0,0.0,5.6,79,992.9
...,...,...,...,...,...,...,...,...,...,...,...
345481,347924,2018-10-27,8048174,83,83849,4681,0,0.0,2.6,78,1026.1
345482,347925,2018-10-27,8048174,84,83897,4729,0,0.0,2.6,78,1026.1
345483,347926,2018-10-27,8048174,85,83944,4776,0,0.0,2.6,78,1026.1
345484,347927,2018-10-27,8048174,86,83955,4787,0,0.0,2.6,78,1026.1
