In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.font_manager as fm
import matplotlib
import timeit
import feather
%matplotlib inline

In [2]:
res = pd.read_feather('DBus_data_first_clean_ian.feather')

In [5]:
res.shape

(70904023, 11)

In [4]:
# Attaching leading zeroes to VehicleJourneyIDs and StopIDs of less than 4 digits
# code from: https://stackoverflow.com/questions/33243763/pandas-add-leading-0-to-string-values-so-all-values-are-equal-len

res.VehicleJourneyID = res.VehicleJourneyID.apply(lambda x: str(int(x)).zfill(4))
res.StopID = res.StopID.apply(lambda x: str(int(x)).zfill(4))

In [6]:
res.head(4)

Unnamed: 0,Timestamp,JourneyPatternID,TimeFrame,VehicleJourneyID,Lon,Lat,Delay,VehicleID,StopID,AtStop,Day
0,2013-01-31 00:00:01,131005,2013-01-30,3406,-6.273923,53.343307,-235,33608,1998,False,Wednesday
1,2013-01-31 00:00:03,71003,2013-01-30,6069,-6.2307,53.317432,0,43003,8,True,Wednesday
2,2013-01-31 00:00:03,671001,2013-01-30,905,-6.325533,53.346302,-396,33438,2248,False,Wednesday
3,2013-01-31 00:00:03,831001,2013-01-30,5613,-6.284449,53.420197,-490,40012,1552,False,Wednesday


## ------------------------Start: The same as last version ---------------------------- ##

### Part I. Second stage cleaning

#### 1. Delete those rows of same TimeFrame + JourneyPatternID  +  VehicleJourneyID +  VehicleID happen under n times
0711: test n = 10

In [7]:
gb = res.groupby(['TimeFrame', 'JourneyPatternID', 'VehicleJourneyID', 'VehicleID'])

In [8]:
# Delete those count is less than 10

gbcount = gb['AtStop'].count()
idx = pd.DataFrame(gbcount[gbcount < 10])

In [9]:
# Set the index back to data

idx.reset_index(['TimeFrame', 'JourneyPatternID', 'VehicleJourneyID', 'VehicleID'], inplace=True)

In [10]:
# Merge back to res to see which one should be keep

merge = pd.merge(res, idx, on=['TimeFrame', 'JourneyPatternID', 'VehicleJourneyID', 'VehicleID'], how='outer',suffixes='oa')

# If null means that row should keep

res = merge[merge['AtStopa'].isnull()]

# Clean up

res.drop('AtStopa', axis=1, inplace=True)
res = res.rename(columns = {'AtStopo':'AtStop'})
del merge

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


#### 2. Delete those rows of same TimeFrame + VehicleJourneyID  but exist over one JourneyPatternID

In [11]:
# Due to make following function work need to change datatype here

res["JourneyPatternID"] = res["JourneyPatternID"].astype("object")

# Check how many unique JourneyPatternID under that group (if over 1 is abnormal)

gb = res.groupby(['TimeFrame', 'VehicleJourneyID'])
idx = pd.DataFrame(gb['JourneyPatternID'].unique())
idx['JourneyPatternID'] = idx['JourneyPatternID'].apply(lambda x: len(x))

In [12]:
# Get those rows which unique JourneyPatternID are over 1

idx = idx[idx.JourneyPatternID > 1]

# Set the index back to data

idx.reset_index(['TimeFrame', 'VehicleJourneyID'], inplace=True)


In [13]:
# Merge with res then can see if is null at the column JourneyPatternIDA then should keep
# suffixes='OA' : is set column name to JourneyPatternIDO and JourneyPatternIDA

merge = pd.merge(res, idx, on=['TimeFrame', 'VehicleJourneyID'], how='outer',suffixes='OA')

# Keep those JourneyPatternIDA is null

res = merge[merge['JourneyPatternIDA'].isnull()]

# Clean up

res.drop('JourneyPatternIDA', axis=1, inplace=True)
res = res.rename(columns = {'JourneyPatternIDO':'JourneyPatternID'})
del merge

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [15]:
# Store as feather

res = res.reset_index()
res.to_feather('DBus_stage_three_clean_v1_step1.feather')

In [78]:
res.shape

(70555585, 12)

## ------------------------End: The same as last version ---------------------------- ##

### Part II. Delete stopID not in that trip_id (same meaning as JourneyPatternID)

In [2]:
res = pd.read_feather('DBus_stage_three_clean_v1_step1.feather')
res.shape

(70555585, 12)

#### Step 1. Prepare datafram to compare

#### 1. Get the dataframe that has shape_id, route, sequence of stops and stopID base on the "googletransit20130315-1546/stop_times.csv" data

In [3]:
# Read in stop_times.csv and trips.csv
stop_times_2012 = pd.read_csv('stop_times.csv')
trips_2012 = pd.read_csv('trips.csv')

# Merge by trip_id
# trip_id here has the same meaning of JourneyPatternID
merge = pd.merge(stop_times_2012, trips_2012, on='trip_id', how='outer')

# Keep only necessary columns
merge.drop(['arrival_time','departure_time','pickup_type','drop_off_type','service_id','shape_dist_traveled'], axis=1, inplace=True)

In [4]:
# Seporate trip_id to route, stop_id to StopID
# Route
merge['route_short'] = merge['trip_id'].apply(lambda x: x[x.index('-')+1:])
merge['route_short'] = merge['route_short'].apply(lambda x: x[: x.index('-')])
merge['route_short'] = merge['route_short'].apply(lambda x: str(x).zfill(4))

# StopID
merge['StopID'] = merge['stop_id'].apply(lambda x: x[-4:])

In [5]:
# Find StopID and the sequence of that shape_id
gb = merge.groupby(['shape_id', 'route_short', 'direction_id','stop_sequence', 'StopID'])
gbc = gb.count()
gbc.reset_index(['shape_id', 'route_short', 'direction_id','stop_sequence', 'StopID'], inplace=True)
transit_shapeID_stopID = gbc.drop(['trip_id','stop_id','route_id','trip_headsign'], axis=1)

In [6]:
# Now we have trip_id to its corresponding StopID and sequence
transit_shapeID_stopID

Unnamed: 0,shape_id,route_short,direction_id,stop_sequence,StopID
0,0-1-y12-1.1.O,0001,0,1,0226
1,0-1-y12-1.1.O,0001,0,2,0228
2,0-1-y12-1.1.O,0001,0,3,0229
3,0-1-y12-1.1.O,0001,0,4,0227
4,0-1-y12-1.1.O,0001,0,5,0230
5,0-1-y12-1.1.O,0001,0,6,0231
6,0-1-y12-1.1.O,0001,0,7,1641
7,0-1-y12-1.1.O,0001,0,8,1642
8,0-1-y12-1.1.O,0001,0,9,0213
9,0-1-y12-1.1.O,0001,0,10,0214


#### 2. Get the dataframe that has JourneyPatternID to its possible StopID from the data clean from second stage

In [7]:
# Get JourneyPatternID and its StopID
temp = res[res.AtStop == True]
gb = temp.groupby(['JourneyPatternID', 'StopID'])
gc = gb.count()

In [8]:
# Clean up
gbc = gc[~gc.Timestamp.isnull()] # Doing this here is cause the feather will include all column, so clean up a little here
gbc.reset_index(['JourneyPatternID', 'StopID'], inplace=True)
gps_JPID_StopID = gbc.loc[:,['JourneyPatternID', 'StopID', 'AtStop']]
gps_JPID_StopID = gps_JPID_StopID.rename(columns = {'AtStop':'Count'})

In [9]:
# Separate JourneyPatternID to route and direction
gps_JPID_StopID['route_short'] = gps_JPID_StopID['JourneyPatternID'].str[:4]
gps_JPID_StopID['Direction'] = gps_JPID_StopID['JourneyPatternID'].str[4:]


# Now we have JourneyPatternID to its possible StopID
gps_JPID_StopID

Unnamed: 0,JourneyPatternID,StopID,Count,route_short,Direction
0,00010001,0119,688.0,0001,0001
1,00010001,1641,352.0,0001,0001
2,00010001,1642,648.0,0001,0001
3,00010001,0213,1734.0,0001,0001
4,00010001,0214,266.0,0001,0001
5,00010001,0226,23931.0,0001,0001
6,00010001,0227,716.0,0001,0001
7,00010001,0228,203.0,0001,0001
8,00010001,0229,689.0,0001,0001
9,00010001,0230,258.0,0001,0001


0720 add
### Check if the number of trip_id is match with JourneyPatternID
In this section, we try to find out if the number of trip_id under each route in googletransit2012 is the same number of JourneyPatternID in gps data. For example, for route 007B has trip_id "0-7B-y12-1.176.O" and "0-7B-y12-1.177.I"; JourneyPatternID "007B0001" and "007B1001". So we know at least there is one-to-one mapping. Other scenario see document "compare_route_googletransit_vs_gps.xlsx"

In [10]:
gb_tss = transit_shapeID_stopID.groupby(['route_short','shape_id'])
gb_tss_c = gb_tss.count()
gb_tss_c = gb_tss_c.reset_index(['route_short','shape_id'])
gb_tss_c.drop(['direction_id', 'stop_sequence', 'StopID'], axis=1, inplace=True)
gb_tss_c

Unnamed: 0,route_short,shape_id
0,0001,0-1-y12-1.1.O
1,0001,0-1-y12-1.2.O
2,0001,0-1-y12-1.3.O
3,0001,0-1-y12-1.4.I
4,0001,0-1-y12-1.5.I
5,0004,0-4-y12-1.6.O
6,0004,0-4-y12-1.7.O
7,0004,0-4-y12-1.8.I
8,0004,0-4-y12-1.9.I
9,0007,0-7-y12-1.167.O


In [11]:
gb_gjs = gps_JPID_StopID.groupby(['route_short','JourneyPatternID'])
gb_gjs_c = gb_gjs.count()
gb_gjs_c = gb_gjs_c.reset_index(['route_short','JourneyPatternID'])
gb_gjs_c.drop(['StopID', 'Count', 'Direction'], axis=1, inplace=True)
gb_gjs_c

Unnamed: 0,route_short,JourneyPatternID
0,0001,00010001
1,0001,00010002
2,0001,00010003
3,0001,00011001
4,0001,00011002
5,0004,00040001
6,0004,00040002
7,0004,00041001
8,0004,00041002
9,0007,00070001


In [12]:
# Route exist both in GPS and google transit
inter = set(gb_tss_c.route_short.unique()) & set(gb_gjs_c.route_short.unique())

tss_l = []
gjs_l = []

for r in inter:
    tss_l.append(gb_tss_c[gb_tss_c.route_short == r])
    gjs_l.append(gb_gjs_c[gb_gjs_c.route_short == r])

#     
tss_l[0] = tss_l[0].reset_index()
gjs_l[0] = gjs_l[0].reset_index()
df = pd.concat([tss_l[0], gjs_l[0]], axis=1)


for i in range(1, len(tss_l)):
    tss_l[i] = tss_l[i].reset_index()
    gjs_l[i] = gjs_l[i].reset_index()
    temp = pd.concat([tss_l[i], gjs_l[i]], axis=1)
    df = pd.concat([df, temp], axis=0)


In [13]:
# Route not in GPS data
diff = set(gb_tss_c.route_short.unique()) - inter

df2 = pd.DataFrame(columns = df.columns)
for r in diff:
    x = gb_tss_c[gb_tss_c.route_short == r]
    x = x.reset_index()
    y = gb_gjs_c[gb_gjs_c.route_short == r]
    y = y.reset_index()
    temp = pd.concat([x, y], axis=1)
    df2 = pd.concat([df2, temp], axis=0)

# Route not in google transit data

diff = set(gb_gjs_c.route_short.unique()) - inter

#df2 = pd.DataFrame(columns = df.columns)
for r in diff:
    x = gb_tss_c[gb_tss_c.route_short == r]
    x = x.reset_index()
    y = gb_gjs_c[gb_gjs_c.route_short == r]
    y = y.reset_index()
    temp = pd.concat([x, y], axis=1)
    df2 = pd.concat([df2, temp], axis=0)


In [14]:
out = pd.concat([df,df2], axis=0)
out.drop(['index'], axis=1, inplace=True)
out.to_csv('compare_route_googletransit_vs_gps.csv', index=False)

### Step2. Mapping the possible trip_id to JourneyPatternID
In order to find the possible trip_id to JourneyPatternID, here transfer the sequence of stop_id in each trip_id to set. The same as StopID of each JourneyPatternID.
The reason to transfer to a set is because we can use the set intersection method to find out the common StopID between two sets. Example, JourneyPatternID 00010001 has stopID {1,2,3,4}, 00010002 has {1,3,5} in gps data and trip_id 0-1-y12-1.1.O has stop_id {1,2,3,4}. So we can get the length of intersection from 00010001 is 4 and 00010002 is 2. Which means 00010001 is more likely is trip_id 0-1-y12-1.1.O

0720 Revised
#### Remain only those route is both exist in googletransit and gps data
Due to the observation above, we found out there are few route not exist in googletransit. So, we will delete it at this moment.

In [15]:
# In diff set are those route not exist in googletransit
inter = set(gb_tss_c.route_short.unique()) & set(gb_gjs_c.route_short.unique())
diff = set(gb_gjs_c.route_short.unique()) - inter

res['Route'] = res['JourneyPatternID'].str[:4]
res = res[~res.Route.isin(list(diff))]

#### 1. Get StopID set of googletransit data

In [16]:
# Sort by shape_id and stopID
transit_shapeID_stopID.sort_values(['shape_id', 'StopID'], inplace=True)

# Store each StopID of each shape into set
shapeID_stopID_set = {}
shape_unique = transit_shapeID_stopID.shape_id.unique()

for s in shape_unique:
    temp = []
    for stopid in transit_shapeID_stopID[transit_shapeID_stopID.shape_id == s]['StopID']:
        temp.append(stopid)
        temp.sort()
    shapeID_stopID_set[s] = [set(temp), temp]

In [17]:
# Here transform to dataframe so can merge later

# Get the shape_id to its stopID list
shapeID_stopID_df = pd.DataFrame(shapeID_stopID_set, index=['StopID_set', 'StopID_list'])
shapeID_stopID_df = shapeID_stopID_df.T
shapeID_stopID_df = shapeID_stopID_df.reset_index()
shapeID_stopID_df = shapeID_stopID_df.rename(columns = {'index':'shape_id'})

In [18]:
# Separate shape_id to route
shapeID_stopID_df['route_short'] = shapeID_stopID_df['shape_id'].apply(lambda x: x[x.index('-')+1:][:x[x.index('-')+1:].index('-')])
shapeID_stopID_df['route_short'] = shapeID_stopID_df['route_short'].apply(lambda x: str(x).zfill(4))

In [19]:
# Now we get the shape_id to its StopID set
shapeID_stopID_df

Unnamed: 0,shape_id,StopID_set,StopID_list,route_short
0,0-1-y12-1.1.O,"{0231, 0374, 0044, 0375, 0230, 0390, 0378, 035...","[0044, 0045, 0046, 0047, 0048, 0049, 0050, 005...",0001
1,0-1-y12-1.2.O,"{0374, 0375, 0390, 0378, 0354, 0380, 0340, 037...","[0265, 0271, 0340, 0350, 0351, 0352, 0353, 035...",0001
2,0-1-y12-1.3.O,"{0231, 0044, 0230, 0048, 1641, 0119, 0227, 021...","[0044, 0045, 0046, 0047, 0048, 0049, 0050, 005...",0001
3,0-1-y12-1.4.I,"{0017, 0014, 0220, 0223, 0019, 0382, 0636, 037...","[0010, 0012, 0014, 0015, 0017, 0018, 0019, 002...",0001
4,0-1-y12-1.5.I,"{0382, 0636, 0371, 0395, 0391, 0637, 0387, 039...","[0010, 0278, 0319, 0371, 0381, 0382, 0383, 038...",0001
5,0-102-y12-1.85.O,"{3572, 4381, 0905, 6015, 0913, 3620, 3618, 093...","[0733, 0788, 0815, 0816, 0905, 0913, 0914, 093...",0102
6,0-102-y12-1.86.I,"{3708, 3090, 3606, 4465, 1073, 6055, 4381, 360...","[0928, 0944, 0945, 0947, 0948, 0949, 0950, 095...",0102
7,0-102-y12-1.87.I,"{3606, 4465, 1073, 4381, 3608, 0949, 3598, 095...","[0928, 0944, 0945, 0947, 0948, 0949, 0950, 095...",0102
8,0-104-y12-1.88.O,"{1774, 4597, 0662, 0220, 0223, 1650, 0225, 165...","[0218, 0219, 0220, 0221, 0222, 0223, 0224, 022...",0104
9,0-104-y12-1.89.I,"{0231, 1747, 0242, 1749, 0230, 0243, 0233, 176...","[0226, 0227, 0228, 0229, 0230, 0231, 0232, 023...",0104


#### 2. StopID set of gps data

In [20]:
# The same operation here for gps data
# Sort by JourneyPatternID and stopID
gps_JPID_StopID.sort_values(['JourneyPatternID', 'StopID'], inplace=True)

# Store each StopID of each JourneyPatternID into list
JPID_stopID_set = {}
JPID_unique = gps_JPID_StopID.JourneyPatternID.unique()

for s in JPID_unique:
    temp = []
    for stopid in gps_JPID_StopID[gps_JPID_StopID.JourneyPatternID == s]['StopID']:
        temp.append(stopid)
        temp.sort()
    JPID_stopID_set[s] = [set(temp), temp]

In [21]:
# Get the JourneyPatternID to its stopID list
JPID_stopID_df = pd.DataFrame(JPID_stopID_set, index=['StopID_set', 'StopID_list'])
JPID_stopID_df = JPID_stopID_df.T
JPID_stopID_df = JPID_stopID_df.reset_index()
JPID_stopID_df = JPID_stopID_df.rename(columns = {'index':'JourneyPatternID'})
JPID_stopID_df

Unnamed: 0,JourneyPatternID,StopID_set,StopID_list
0,00010001,"{0231, 0374, 0044, 0375, 0230, 0390, 0378, 035...","[0044, 0045, 0046, 0047, 0048, 0049, 0050, 005..."
1,00010002,"{0374, 0375, 0390, 0378, 0354, 0380, 0340, 037...","[0265, 0271, 0340, 0350, 0351, 0352, 0353, 035..."
2,00010003,"{0231, 0048, 0265, 1641, 0049, 0229, 0046, 005...","[0045, 0046, 0047, 0048, 0049, 0050, 0051, 005..."
3,00011001,"{0017, 0014, 0220, 0223, 0019, 0382, 0371, 039...","[0010, 0012, 0014, 0015, 0017, 0018, 0019, 002..."
4,00011002,"{0382, 0371, 0395, 0391, 0387, 0393, 0397, 039...","[0010, 0278, 0319, 0371, 0381, 0382, 0383, 038..."
5,00040001,"{0113, 0037, 0419, 7481, 7483, 0408, 2026, 032...","[0037, 0038, 0039, 0040, 0112, 0113, 0114, 011..."
6,00040002,"{0113, 0037, 0324, 0190, 0188, 0112, 0184, 014...","[0037, 0038, 0040, 0112, 0113, 0115, 0127, 014..."
7,00041001,"{0092, 0091, 0483, 6182, 0197, 0473, 0028, 048...","[0006, 0027, 0028, 0029, 0090, 0091, 0092, 009..."
8,00041002,"{0483, 7330, 0473, 0491, 0281, 3162, 0400, 047...","[0281, 0400, 0469, 0471, 0473, 0475, 0478, 047..."
9,00070001,"{3219, 0419, 2035, 2041, 2042, 3220, 0408, 204...","[0273, 0402, 0405, 0408, 0409, 0410, 0411, 041..."


In [22]:
# Get the route from JourneyPatternID
JPID_stopID_df['route_short'] = JPID_stopID_df['JourneyPatternID'].str[:4]

In [23]:
# Now we also have JourneyPatternID to its StopID list
JPID_stopID_df

Unnamed: 0,JourneyPatternID,StopID_set,StopID_list,route_short
0,00010001,"{0231, 0374, 0044, 0375, 0230, 0390, 0378, 035...","[0044, 0045, 0046, 0047, 0048, 0049, 0050, 005...",0001
1,00010002,"{0374, 0375, 0390, 0378, 0354, 0380, 0340, 037...","[0265, 0271, 0340, 0350, 0351, 0352, 0353, 035...",0001
2,00010003,"{0231, 0048, 0265, 1641, 0049, 0229, 0046, 005...","[0045, 0046, 0047, 0048, 0049, 0050, 0051, 005...",0001
3,00011001,"{0017, 0014, 0220, 0223, 0019, 0382, 0371, 039...","[0010, 0012, 0014, 0015, 0017, 0018, 0019, 002...",0001
4,00011002,"{0382, 0371, 0395, 0391, 0387, 0393, 0397, 039...","[0010, 0278, 0319, 0371, 0381, 0382, 0383, 038...",0001
5,00040001,"{0113, 0037, 0419, 7481, 7483, 0408, 2026, 032...","[0037, 0038, 0039, 0040, 0112, 0113, 0114, 011...",0004
6,00040002,"{0113, 0037, 0324, 0190, 0188, 0112, 0184, 014...","[0037, 0038, 0040, 0112, 0113, 0115, 0127, 014...",0004
7,00041001,"{0092, 0091, 0483, 6182, 0197, 0473, 0028, 048...","[0006, 0027, 0028, 0029, 0090, 0091, 0092, 009...",0004
8,00041002,"{0483, 7330, 0473, 0491, 0281, 3162, 0400, 047...","[0281, 0400, 0469, 0471, 0473, 0475, 0478, 047...",0004
9,00070001,"{3219, 0419, 2035, 2041, 2042, 3220, 0408, 204...","[0273, 0402, 0405, 0408, 0409, 0410, 0411, 041...",0007


#### 3. Now merge and find the length of common StopID between two set

In [24]:
# Merge two datafram base on the route_short (cause for now we still don't know which JourneyPatternID is which trip_id)
merge = pd.merge(shapeID_stopID_df, JPID_stopID_df, on='route_short', how='inner')

In [26]:
# Find the common StopID of googletransport and gps
# (x[0] & x[1]) is for python set structure to find the common elements between two sets
merge['common_len'] = merge.loc[:,['StopID_set_x', 'StopID_set_y']].apply(lambda x: len(x[0] & x[1]), axis=1)
merge['shape_id_len'] = merge['StopID_list_x'].apply(lambda x: len(x))
merge['JourneyPatternID_len'] = merge['StopID_list_y'].apply(lambda x: len(x))

merge['JPID_len-shapid_len'] = merge['JourneyPatternID_len'] - merge['shape_id_len']
merge = merge[merge['JPID_len-shapid_len'] == 0]

In [27]:
# Get the maxmun common len in each shape_id
merge_max = merge[merge.groupby(['shape_id'])['common_len'].transform('max') == merge['common_len']]
merge_max

Unnamed: 0,shape_id,StopID_set_x,StopID_list_x,route_short,JourneyPatternID,StopID_set_y,StopID_list_y,common_len,shape_id_len,JourneyPatternID_len,JPID_len-shapid_len
0,0-1-y12-1.1.O,"{0231, 0374, 0044, 0375, 0230, 0390, 0378, 035...","[0044, 0045, 0046, 0047, 0048, 0049, 0050, 005...",0001,00010001,"{0231, 0374, 0044, 0375, 0230, 0390, 0378, 035...","[0044, 0045, 0046, 0047, 0048, 0049, 0050, 005...",42,42,42,0
6,0-1-y12-1.2.O,"{0374, 0375, 0390, 0378, 0354, 0380, 0340, 037...","[0265, 0271, 0340, 0350, 0351, 0352, 0353, 035...",0001,00010002,"{0374, 0375, 0390, 0378, 0354, 0380, 0340, 037...","[0265, 0271, 0340, 0350, 0351, 0352, 0353, 035...",21,21,21,0
25,0-102-y12-1.85.O,"{3572, 4381, 0905, 6015, 0913, 3620, 3618, 093...","[0733, 0788, 0815, 0816, 0905, 0913, 0914, 093...",0102,01020001,"{3572, 4381, 0905, 6015, 0913, 3620, 3618, 093...","[0733, 0788, 0815, 0816, 0905, 0913, 0914, 093...",52,52,52,0
29,0-102-y12-1.86.I,"{3708, 3090, 3606, 4465, 1073, 6055, 4381, 360...","[0928, 0944, 0945, 0947, 0948, 0949, 0950, 095...",0102,01021001,"{3708, 3090, 3606, 4465, 1073, 6055, 4381, 360...","[0928, 0944, 0945, 0947, 0948, 0949, 0950, 095...",55,55,55,0
34,0-104-y12-1.88.O,"{1774, 4597, 0662, 0220, 0223, 1650, 0225, 165...","[0218, 0219, 0220, 0221, 0222, 0223, 0224, 022...",0104,01040001,"{1774, 4597, 0662, 0220, 0223, 1650, 0225, 165...","[0218, 0219, 0220, 0221, 0222, 0223, 0224, 022...",32,32,32,0
37,0-104-y12-1.89.I,"{0231, 1747, 0242, 1749, 0230, 0243, 0233, 176...","[0226, 0227, 0228, 0229, 0230, 0231, 0232, 023...",0104,01041001,"{0231, 1747, 0242, 1749, 0230, 0243, 0233, 176...","[0226, 0227, 0228, 0229, 0230, 0231, 0232, 023...",33,33,33,0
38,0-11-y12-1.14.O,"{0855, 0037, 0857, 0441, 0044, 0747, 0449, 026...","[0035, 0036, 0037, 0038, 0039, 0040, 0041, 004...",0011,00110001,"{0855, 0037, 0857, 0441, 0044, 0747, 0449, 026...","[0035, 0036, 0037, 0038, 0039, 0040, 0041, 004...",62,62,62,0
48,0-11-y12-1.16.I,"{0017, 0014, 0878, 0887, 0019, 3028, 0449, 302...","[0010, 0012, 0014, 0015, 0017, 0018, 0019, 002...",0011,00111001,"{0017, 0014, 0878, 0887, 7220, 0019, 3028, 044...","[0010, 0012, 0014, 0015, 0017, 0018, 0019, 002...",63,64,64,0
58,0-114-y12-1.92.O,"{3085, 3184, 0449, 6007, 3181, 3491, 3532, 045...","[0449, 0450, 3085, 3181, 3184, 3187, 3188, 318...",0114,01140001,"{3085, 3184, 0449, 6007, 3181, 3491, 3532, 045...","[0449, 0450, 3085, 3181, 3184, 3187, 3188, 318...",28,28,28,0
61,0-114-y12-1.93.I,"{3085, 2833, 3166, 3173, 3175, 3168, 3176, 317...","[0448, 2833, 3082, 3085, 3165, 3166, 3167, 316...",0114,01141001,"{3085, 2833, 3166, 3173, 3175, 3168, 3176, 317...","[0448, 2833, 3082, 3085, 3165, 3166, 3167, 316...",29,29,29,0


In [28]:
# Now we get the JourneyPatternID to its possible shape_id
# Here could have duplicate JPID but it's the same StopID_list so doesn't matter map to which shape_id, they are the same
map_JPId_shapeID = merge_max.loc[:, ['route_short', 'JourneyPatternID', 'shape_id', 'StopID_list_x']]
map_JPId_shapeID

Unnamed: 0,route_short,JourneyPatternID,shape_id,StopID_list_x
0,0001,00010001,0-1-y12-1.1.O,"[0044, 0045, 0046, 0047, 0048, 0049, 0050, 005..."
6,0001,00010002,0-1-y12-1.2.O,"[0265, 0271, 0340, 0350, 0351, 0352, 0353, 035..."
25,0102,01020001,0-102-y12-1.85.O,"[0733, 0788, 0815, 0816, 0905, 0913, 0914, 093..."
29,0102,01021001,0-102-y12-1.86.I,"[0928, 0944, 0945, 0947, 0948, 0949, 0950, 095..."
34,0104,01040001,0-104-y12-1.88.O,"[0218, 0219, 0220, 0221, 0222, 0223, 0224, 022..."
37,0104,01041001,0-104-y12-1.89.I,"[0226, 0227, 0228, 0229, 0230, 0231, 0232, 023..."
38,0011,00110001,0-11-y12-1.14.O,"[0035, 0036, 0037, 0038, 0039, 0040, 0041, 004..."
48,0011,00111001,0-11-y12-1.16.I,"[0010, 0012, 0014, 0015, 0017, 0018, 0019, 002..."
58,0114,01140001,0-114-y12-1.92.O,"[0449, 0450, 3085, 3181, 3184, 3187, 3188, 318..."
61,0114,01141001,0-114-y12-1.93.I,"[0448, 2833, 3082, 3085, 3165, 3166, 3167, 316..."


In [29]:
# Export to csv
map_JPId_shapeID.to_csv('map_JPID_to_shape_id.csv', index=False)

In [31]:
res.shape

(70434885, 13)

#### Step 3. Now delete those rows that StopID should not in that JourneyPatternID

In [32]:
# Base on the mapping made before we can know what StopID should on that routes
# Before we already get the shape_id to its StopID sequence, so we use this as basic
transit_shapeID_stopID
# Add JourneyPatternID column
map_JPID_to_stopID = pd.merge(transit_shapeID_stopID, map_JPId_shapeID, on='shape_id', how='inner')
map_JPID_to_stopID = map_JPID_to_stopID.loc[:,['JourneyPatternID', 'StopID']]
map_JPID_to_stopID

Unnamed: 0,JourneyPatternID,StopID
0,00010001,0044
1,00010001,0045
2,00010001,0046
3,00010001,0047
4,00010001,0048
5,00010001,0049
6,00010001,0050
7,00010001,0051
8,00010001,0052
9,00010001,0119


#### 0720 revised
Split into 2 situation. For those JourneyPatternID certainly can map to certain trip_id, we delete those StopID should not on that JouurneyPatternID. Others, we still use percentage to judge if need to be remove.

#### First sub-set: JourneyPatternID can map to certain trip_id

In [33]:
# Get those JPID can map to trip_id
can_map = map_JPID_to_stopID['JourneyPatternID'].unique()
res_sub1 = res[res.JourneyPatternID.isin(can_map)]
res_sub1.shape

(56489727, 13)

In [34]:
# Now start to delete
# merge res and map_JPID_to_stopID

res_sub1 = pd.merge(res_sub1, map_JPID_to_stopID, on=['JourneyPatternID','StopID'], how='inner')
res_sub1

Unnamed: 0,index,Timestamp,JourneyPatternID,TimeFrame,VehicleJourneyID,Lon,Lat,Delay,VehicleID,StopID,AtStop,Day,Route
0,135,2013-01-31 00:00:03,00071003,2013-01-30,6069,-6.230700,53.317432,0,43003,0008,True,Wednesday,0007
1,135,2013-01-31 00:00:03,00071003,2013-01-30,6069,-6.230700,53.317432,0,43003,0008,True,Wednesday,0007
2,136,2013-01-31 00:00:23,00071003,2013-01-30,6069,-6.230683,53.317417,0,43003,0008,True,Wednesday,0007
3,136,2013-01-31 00:00:23,00071003,2013-01-30,6069,-6.230683,53.317417,0,43003,0008,True,Wednesday,0007
4,137,2013-01-31 00:00:42,00071003,2013-01-30,6069,-6.230717,53.317432,0,43003,0008,True,Wednesday,0007
5,137,2013-01-31 00:00:42,00071003,2013-01-30,6069,-6.230717,53.317432,0,43003,0008,True,Wednesday,0007
6,138,2013-01-31 00:00:48,00071003,2013-01-30,6069,-6.230717,53.317432,0,43003,0008,True,Wednesday,0007
7,138,2013-01-31 00:00:48,00071003,2013-01-30,6069,-6.230717,53.317432,0,43003,0008,True,Wednesday,0007
8,139,2013-01-31 00:01:02,00071003,2013-01-30,6069,-6.230717,53.317451,0,43003,0008,True,Wednesday,0007
9,139,2013-01-31 00:01:02,00071003,2013-01-30,6069,-6.230717,53.317451,0,43003,0008,True,Wednesday,0007


In [37]:
res_sub1.shape

(60069484, 14)

In [None]:
# Store as feather
#res_sub1 = res_sub1.reset_index()
res_sub1.to_feather('res_sub1.feather')

#### Second sub-set: If cannot map to any JournayPatternID
In version 3, use the outlier mean - 1.5*IQR to delete the outlier. In this version, for those JourneyPatternID not sure map to which trip_id we use "mean - 1.5*IQR" to delete the outlier.

In [None]:
res_sub2 = res[~res.JourneyPatternID.isin(can_map)]
res_sub2.shape

In [None]:
# Count how many times that StopID occurs in the certain JourneyPatternID

gb = res_sub2.groupby(['JourneyPatternID', 'StopID'])
idx = pd.DataFrame(gb['AtStop'].count())

In [None]:
# Set index back to data

idx.reset_index(['JourneyPatternID', 'StopID'], inplace=True)

# Rename column

idx = idx.rename(columns = {'AtStop':'AtStopCount'})

In [None]:
# In order to find the outlier, here calculate the mean and IQR of that JourneyPatternID + StopID

gb2 = idx.groupby(['JourneyPatternID'])
gb2count =  gb2['AtStopCount'].mean()
gb2q =  gb2['AtStopCount'].quantile(.75) - gb2['AtStopCount'].quantile(.25)
idx2 = pd.concat([pd.DataFrame(gb2count),pd.DataFrame(gb2q)], axis=1)

# In order to merge with idx, here set JourneyPatternID back to data

idx2.reset_index(['JourneyPatternID'], inplace=True)
idx2.columns = ['JourneyPatternID', 'AtStopCountMean', 'AtStopCountIQR']

In [None]:
# Calculate mean - 2 * IQR

idx2['AtStopCountLowerBound'] = idx2['AtStopCountMean'] - 2* idx2['AtStopCountIQR']

In [None]:
idx2

In [39]:
# Compare the time that StopID occurs of its JourneyPatternID with the lowerbound

merge = pd.merge(idx, idx2, on=['JourneyPatternID'], how='outer')

# merge will only keep those JourneyPatternID + StopID is outlier

merge = merge[merge['AtStopCount'] < merge['AtStopCountLowerBound']]

# Clean up..

merge.drop(['AtStopCount', 'AtStopCountMean', 'AtStopCountIQR'], axis=1, inplace=True)

In [40]:
# Merge with res, then those AtStopCountLowerBound is null should be kept

merge2 = pd.merge(res_sub2, merge, on=['JourneyPatternID', 'StopID'], how='outer',suffixes='OA')
res_sub2 = merge2[merge2['AtStopCountLowerBound'].isnull()]

# Clean up

res_sub2.drop('AtStopCountLowerBound', axis=1, inplace=True)
del merge, merge2

In [41]:
res_sub2

Unnamed: 0,index,Timestamp,JourneyPatternID,TimeFrame,VehicleJourneyID,Lon,Lat,Delay,VehicleID,StopID,AtStop,Day,Route,AtStopCountLowerBound
0,0,2013-01-31 00:00:01,00131005,2013-01-30,3406,-6.273923,53.343307,-235,33608,1998,False,Wednesday,0013,
1,132,2013-01-30 23:59:04,00131005,2013-01-30,3406,-6.278856,53.342804,-218,33608,1998,False,Wednesday,0013,
2,133,2013-01-30 23:59:22,00131005,2013-01-30,3406,-6.277600,53.342831,-218,33608,1998,False,Wednesday,0013,
3,134,2013-01-30 23:59:43,00131005,2013-01-30,3406,-6.276123,53.343075,-218,33608,1998,False,Wednesday,0013,
4,526703,2013-01-29 23:58:10,00131005,2013-01-29,3406,-6.277630,53.342827,-290,33609,1998,False,Tuesday,0013,
5,526704,2013-01-29 23:58:28,00131005,2013-01-29,3406,-6.275703,53.343204,-290,33609,1998,False,Tuesday,0013,
6,526705,2013-01-29 23:58:49,00131005,2013-01-29,3406,-6.273649,53.343307,-313,33609,1998,False,Tuesday,0013,
7,1321942,2013-01-28 23:57:48,00131005,2013-01-28,3406,-6.279424,53.342854,-284,33189,1998,False,Monday,0013,
8,1321943,2013-01-28 23:58:06,00131005,2013-01-28,3406,-6.275852,53.343159,-319,33189,1998,False,Monday,0013,
9,1321944,2013-01-28 23:58:26,00131005,2013-01-28,3406,-6.274416,53.343353,-319,33189,1998,False,Monday,0013,


In [43]:
# Concat two files together

res = pd.concat([res_sub1, res_sub2], axis=0)
res

Unnamed: 0,AtStop,AtStopCountLowerBound,Day,Delay,JourneyPatternID,Lat,Lon,Route,StopID,TimeFrame,Timestamp,VehicleID,VehicleJourneyID,index,level_0
0,True,,Wednesday,0,00071003,53.317432,-6.230700,0007,0008,2013-01-30,2013-01-31 00:00:03,43003,6069,135,135.0
1,True,,Wednesday,0,00071003,53.317417,-6.230683,0007,0008,2013-01-30,2013-01-31 00:00:23,43003,6069,136,136.0
2,True,,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:00:42,43003,6069,137,137.0
3,True,,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:00:48,43003,6069,138,138.0
4,True,,Wednesday,0,00071003,53.317451,-6.230717,0007,0008,2013-01-30,2013-01-31 00:01:02,43003,6069,139,139.0
5,True,,Wednesday,0,00071003,53.317451,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:21,43003,6069,140,140.0
6,True,,Wednesday,0,00071003,53.317432,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:41,43003,6069,141,141.0
7,True,,Wednesday,0,00071003,53.317432,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:50,43003,6069,142,142.0
8,True,,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:02:02,43003,6069,143,143.0
9,True,,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:02:21,43003,6069,144,144.0


In [44]:
res.drop(['AtStopCountLowerBound', 'level_0', 'index'], axis=1, inplace=True)

Unnamed: 0,AtStop,Day,Delay,JourneyPatternID,Lat,Lon,Route,StopID,TimeFrame,Timestamp,VehicleID,VehicleJourneyID
0,True,Wednesday,0,00071003,53.317432,-6.230700,0007,0008,2013-01-30,2013-01-31 00:00:03,43003,6069
1,True,Wednesday,0,00071003,53.317417,-6.230683,0007,0008,2013-01-30,2013-01-31 00:00:23,43003,6069
2,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:00:42,43003,6069
3,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:00:48,43003,6069
4,True,Wednesday,0,00071003,53.317451,-6.230717,0007,0008,2013-01-30,2013-01-31 00:01:02,43003,6069
5,True,Wednesday,0,00071003,53.317451,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:21,43003,6069
6,True,Wednesday,0,00071003,53.317432,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:41,43003,6069
7,True,Wednesday,0,00071003,53.317432,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:50,43003,6069
8,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:02:02,43003,6069
9,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:02:21,43003,6069


In [45]:
# Store as feather
res = res.reset_index()
res.to_feather('DBus_stage_three_clean_v1_step3.feather')

### Part III. Add Weather Information

In [3]:
res = pd.read_feather('DBus_stage_three_clean_v1_step3.feather')
res

Unnamed: 0,index,AtStop,Day,Delay,JourneyPatternID,Lat,Lon,Route,StopID,TimeFrame,Timestamp,VehicleID,VehicleJourneyID
0,0,True,Wednesday,0,00071003,53.317432,-6.230700,0007,0008,2013-01-30,2013-01-31 00:00:03,43003,6069
1,1,True,Wednesday,0,00071003,53.317417,-6.230683,0007,0008,2013-01-30,2013-01-31 00:00:23,43003,6069
2,2,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:00:42,43003,6069
3,3,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:00:48,43003,6069
4,4,True,Wednesday,0,00071003,53.317451,-6.230717,0007,0008,2013-01-30,2013-01-31 00:01:02,43003,6069
5,5,True,Wednesday,0,00071003,53.317451,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:21,43003,6069
6,6,True,Wednesday,0,00071003,53.317432,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:41,43003,6069
7,7,True,Wednesday,0,00071003,53.317432,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:50,43003,6069
8,8,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:02:02,43003,6069
9,9,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:02:21,43003,6069


In [4]:
# Read weather data
ch = pd.read_csv('Casement_Hourly_Nov12Jan13_WindRainOnly.csv')
dh = pd.read_csv('DubAirport_Hourly_Nov12Jan13_WindRainOnly.csv')
ph = pd.read_csv('PhoenixPark_Hourly_Nov12Jan13_RainOnly.csv')

In [5]:
ph

Unnamed: 0,date,rain
0,6/11/2012 0:00,0
1,6/11/2012 1:00,0
2,6/11/2012 2:00,0
3,6/11/2012 3:00,0
4,6/11/2012 4:00,0
5,6/11/2012 5:00,0
6,6/11/2012 6:00,0
7,6/11/2012 7:00,0.1
8,6/11/2012 8:00,0
9,6/11/2012 9:00,0


In [6]:
dh.dtypes

date     object
rain    float64
wdsp      int64
dtype: object

In [7]:
from datetime import datetime
ch['DateTime'] = ch['date'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y %H:%M'))
ch['DateTime'] = ch['DateTime'].apply(lambda x: datetime.strftime(x, '%Y-%m-%d %H'))

dh['DateTime'] = dh['date'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y %H:%M'))
dh['DateTime'] = dh['DateTime'].apply(lambda x: datetime.strftime(x, '%Y-%m-%d %H'))


ph['DateTime'] = ph['date'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y %H:%M'))
ph['DateTime'] = ph['DateTime'].apply(lambda x: datetime.strftime(x, '%Y-%m-%d %H'))

In [8]:
ch.drop('date', axis=1, inplace=True)
dh.drop('date', axis=1, inplace=True)
ph.drop('date', axis=1, inplace=True)

In [9]:
# Merge
merge = pd.merge(ch, dh, on='DateTime', how='outer')
merge = pd.merge(merge, ph, on='DateTime', how='outer')
merge

Unnamed: 0,rain_x,wdsp_x,DateTime,rain_y,wdsp_y,rain
0,0.0,8,2012-11-06 00,0.0,10,0
1,0.0,8,2012-11-06 01,0.0,10,0
2,0.0,10,2012-11-06 02,0.0,10,0
3,0.0,11,2012-11-06 03,0.0,13,0
4,0.0,10,2012-11-06 04,0.0,12,0
5,0.0,11,2012-11-06 05,0.0,12,0
6,0.0,11,2012-11-06 06,0.0,13,0
7,0.0,13,2012-11-06 07,0.0,16,0.1
8,0.0,13,2012-11-06 08,0.0,17,0
9,0.0,15,2012-11-06 09,0.0,16,0


In [10]:
merge.isnull().sum()

rain_x      0
wdsp_x      0
DateTime    0
rain_y      0
wdsp_y      0
rain        0
dtype: int64

In [11]:
# Convert to float
merge['rain'] = merge['rain'].convert_objects(convert_numeric=True)
merge.dtypes

  


rain_x      float64
wdsp_x        int64
DateTime     object
rain_y      float64
wdsp_y        int64
rain        float64
dtype: object

In [12]:
# Calculate average
merge['Rain_Avg'] = merge.loc[:,['rain_x', 'rain_y', 'rain']].mean(axis=1)
merge['Wind_Speed_Avg'] = merge.loc[:,['wdsp_x', 'wdsp_y']].mean(axis=1)
weather = merge.loc[:,['DateTime', 'Rain_Avg', 'Wind_Speed_Avg']]
weather

Unnamed: 0,DateTime,Rain_Avg,Wind_Speed_Avg
0,2012-11-06 00,0.000000,9.0
1,2012-11-06 01,0.000000,9.0
2,2012-11-06 02,0.000000,10.0
3,2012-11-06 03,0.000000,12.0
4,2012-11-06 04,0.000000,11.0
5,2012-11-06 05,0.000000,11.5
6,2012-11-06 06,0.000000,12.0
7,2012-11-06 07,0.033333,14.5
8,2012-11-06 08,0.000000,15.0
9,2012-11-06 09,0.000000,15.5


In [13]:
# Merge with gps data
res['DateTime'] = res['Timestamp'].apply(lambda x: datetime.strftime(x, '%Y-%m-%d %H'))
res_2 = pd.merge(res, weather, on='DateTime', how='left')
res_2

Unnamed: 0,index,AtStop,Day,Delay,JourneyPatternID,Lat,Lon,Route,StopID,TimeFrame,Timestamp,VehicleID,VehicleJourneyID,DateTime,Rain_Avg,Wind_Speed_Avg
0,0,True,Wednesday,0,00071003,53.317432,-6.230700,0007,0008,2013-01-30,2013-01-31 00:00:03,43003,6069,2013-01-31 00,0.0,19.5
1,1,True,Wednesday,0,00071003,53.317417,-6.230683,0007,0008,2013-01-30,2013-01-31 00:00:23,43003,6069,2013-01-31 00,0.0,19.5
2,2,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:00:42,43003,6069,2013-01-31 00,0.0,19.5
3,3,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:00:48,43003,6069,2013-01-31 00,0.0,19.5
4,4,True,Wednesday,0,00071003,53.317451,-6.230717,0007,0008,2013-01-30,2013-01-31 00:01:02,43003,6069,2013-01-31 00,0.0,19.5
5,5,True,Wednesday,0,00071003,53.317451,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:21,43003,6069,2013-01-31 00,0.0,19.5
6,6,True,Wednesday,0,00071003,53.317432,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:41,43003,6069,2013-01-31 00,0.0,19.5
7,7,True,Wednesday,0,00071003,53.317432,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:50,43003,6069,2013-01-31 00,0.0,19.5
8,8,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:02:02,43003,6069,2013-01-31 00,0.0,19.5
9,9,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:02:21,43003,6069,2013-01-31 00,0.0,19.5


In [14]:
res_2 = res_2.reset_index()
res_2.to_feather('res_2.feather')

### Part III. Add School Holiday

In [15]:
res_2 = pd.read_feather('res_2.feather')
res_2.shape

(70422379, 17)

In [22]:
school_holiday = ['2013-01-01','2013-01-02', '2013-01-03','2013-01-04','2013-01-05','2013-01-06']
school_holiday_days = ['Saturday','Sunday']
res_2['SchoolHoliday'] = np.zeros(res_2.shape[0], dtype=bool)

Unnamed: 0,level_0,index,AtStop,Day,Delay,JourneyPatternID,Lat,Lon,Route,StopID,TimeFrame,Timestamp,VehicleID,VehicleJourneyID,DateTime,Rain_Avg,Wind_Speed_Avg,SchoolHoliday
0,0,0,True,Wednesday,0,00071003,53.317432,-6.230700,0007,0008,2013-01-30,2013-01-31 00:00:03,43003,6069,2013-01-31 00,0.0,19.5,False
1,1,1,True,Wednesday,0,00071003,53.317417,-6.230683,0007,0008,2013-01-30,2013-01-31 00:00:23,43003,6069,2013-01-31 00,0.0,19.5,False
2,2,2,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:00:42,43003,6069,2013-01-31 00,0.0,19.5,False
3,3,3,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:00:48,43003,6069,2013-01-31 00,0.0,19.5,False
4,4,4,True,Wednesday,0,00071003,53.317451,-6.230717,0007,0008,2013-01-30,2013-01-31 00:01:02,43003,6069,2013-01-31 00,0.0,19.5,False
5,5,5,True,Wednesday,0,00071003,53.317451,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:21,43003,6069,2013-01-31 00,0.0,19.5,False
6,6,6,True,Wednesday,0,00071003,53.317432,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:41,43003,6069,2013-01-31 00,0.0,19.5,False
7,7,7,True,Wednesday,0,00071003,53.317432,-6.230733,0007,0008,2013-01-30,2013-01-31 00:01:50,43003,6069,2013-01-31 00,0.0,19.5,False
8,8,8,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:02:02,43003,6069,2013-01-31 00,0.0,19.5,False
9,9,9,True,Wednesday,0,00071003,53.317432,-6.230717,0007,0008,2013-01-30,2013-01-31 00:02:21,43003,6069,2013-01-31 00,0.0,19.5,False


In [28]:
res_2.loc[res_2.TimeFrame.isin(school_holiday), 'SchoolHoliday'] = True
res_2.loc[res_2.Day.isin(school_holiday_days), 'SchoolHoliday'] = True

In [29]:
res_2.to_feather('DBus_stage_two_clean_v4.feather')

In [27]:
res_2.shape

(70422379, 18)