# Prep work

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import altair as alt
import datetime as dt

In [None]:
# Read the four input tables (file names follow the GTFS routes / stop_times /
# trips / stops layout). Each name on the left is paired positionally with the
# path in the same position, and the files are read in the order listed.
routes, stop_times, trips, stops = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/routes.txt.csv',
        'data/stop_times.txt.csv',
        'data/trips.txt.csv',
        'data/stops.txt.csv',
    )
)

# Create Bus Trips DF

In [None]:
# Routes with route_type 3 (treated as the bus routes here), reduced to the
# identifying columns.
route_columns = ['route_id','route_long_name','route_short_name']
idx = routes.index[routes['route_type'].eq(3)]
bus_routes = routes.loc[idx, route_columns].copy()

# Attach trips to those routes. how='right' keeps every row of bus_routes, so a
# route without any matching trip still appears, with its trip fields missing.
trip_columns = ['trip_id','original_trip_id','route_id','direction_id','trip_headsign']
bus_trips = trips[trip_columns].merge(bus_routes, how='right', on='route_id')

# Merge trip start and end times

In [None]:
# get last stop
# (kept in its own cell because it scans the whole stop_times table)
# 'stop_sequence' is selected *before* aggregating so that only this one column
# is reduced per trip; taking the max of every column and discarding the rest
# afterwards does the same work many times over for an identical result.
max_stop_sequence = (
    stop_times.groupby('trip_id')['stop_sequence']
    .max()
    .reset_index()
    # index=str is retained from the original so the row labels stay strings
    .rename(index=str, columns={'stop_sequence': 'stop_count'})
)

In [None]:
# start time: departure from the row of each trip whose stop_sequence is 1
# NOTE(review): a trip whose numbering does not start at 1 gets no start row and
# therefore drops out of the inner merge below - confirm that every trip in this
# feed starts at 1.
first_stop_cols = ['trip_id','departure_time']
first_stop_idx = stop_times.index[stop_times['stop_sequence']==1]
route_start_times = stop_times.loc[first_stop_idx, first_stop_cols].copy()

# end time: arrival at the row whose stop_sequence equals the trip's stop_count
last_stop_cols = ['trip_id','stop_sequence','arrival_time']
route_end_times = pd.merge(max_stop_sequence,
                           stop_times.loc[:, last_stop_cols],
                           how='left',
                           left_on=['trip_id','stop_count'],
                           right_on=['trip_id','stop_sequence'])

# calculate duration
# (default inner merge: only trips that have both a start and an end row remain)
route_endpoints = pd.merge(route_start_times,
                           route_end_times,
                           on='trip_id')

# Number of whole days carried by the hour field of each time string.
# The hour is everything before the first ':' rather than the first two
# characters, so the single-digit 'H:MM:SS' form is read correctly, and the
# carry is hour // 24 so that it always matches an hour reduced modulo 24
# (0 for 00-23 and 1 for 24-47, exactly as before; 2 for 48-71, and so on).
for endpoint in ('departure', 'arrival'):
    hour_text = route_endpoints[endpoint + '_time'].str.split(':').str[0].str.strip()
    route_endpoints[endpoint + '_add_day'] = pd.to_numeric(hour_text) // 24

def clean_time_string(s):
    """Return a time string with its hour field wrapped into the range 00-23.

    The hour is the text before the first ':'; it is reduced modulo 24 and
    zero-padded to two digits, and the rest of the string is kept as is.
    Both 'HH:MM:SS' and the single-digit 'H:MM:SS' form are accepted, e.g.
    '25:10:00' -> '01:10:00' and '8:05:00' -> '08:05:00'.

    Parameters
    ----------
    s : str
        Time string. Any non-string value (for example a NaN standing in for a
        missing time) is returned unchanged.

    Returns
    -------
    str
        The normalised time string, or ``s`` itself when it is not a string.

    Raises
    ------
    ValueError
        If the hour field is not an integer.
    """
    if not isinstance(s, str):
        # missing value: hand it back untouched for the caller to deal with
        return s
    hour, colon, remainder = s.strip().partition(':')
    return ('%02d' % (int(hour) % 24)) + colon + remainder

# Turn both endpoint columns into datetimes. Hours are first wrapped into 00-23
# by clean_time_string; anything that still fails to parse becomes NaT
# (errors='coerce'). Only a time of day is parsed, so every value lands on the
# same default date.
for time_col in ('arrival_time', 'departure_time'):
    route_endpoints[time_col] = pd.to_datetime(
        route_endpoints[time_col].apply(clean_time_string),
        format='%H:%M:%S',
        errors='coerce',
    )

# Duration in minutes: clock difference plus 1440 minutes for each day by which
# the arrival's day flag exceeds the departure's.
clock_minutes = (route_endpoints['arrival_time'] - route_endpoints['departure_time']).dt.total_seconds() / 60
day_shift_minutes = 1440 * (route_endpoints['arrival_add_day'] - route_endpoints['departure_add_day'])
route_endpoints['duration'] = clock_minutes + day_shift_minutes

# merge
# (left merge: every row of bus_trips is kept, with or without endpoint data)
endpoint_cols = ['trip_id','departure_time','arrival_time','stop_count','duration']
trips_w_duration = bus_trips.merge(route_endpoints[endpoint_cols],
                                   how='left',
                                   on='trip_id')

# Explore

In [None]:
# Per-route minimum and maximum of stop_count and duration, side by side
# (columns suffixed _min / _max).
cols = ['stop_count', 'duration']
per_route = trips_w_duration.groupby('route_short_name')[cols]
per_route.min().reset_index().merge(
    per_route.max().reset_index(),
    on='route_short_name',
    suffixes=['_min','_max'],
)

In [None]:
# Trip duration against stop_count, one point per trip.
# Axis labels and a title are set so the figure can be read on its own.
plt.scatter(trips_w_duration['stop_count'],
            trips_w_duration['duration'])
plt.xlabel('stop_count')
plt.ylabel('duration (minutes)')
plt.title('Trip duration vs. stop_count')
plt.show()

In [None]:
# Disabled cell: the Altair boxplot specification below is wrapped in a
# triple-quoted string, so running the cell only evaluates (and echoes) that
# string; no chart is built.
'''alt.Chart(trips_w_duration).mark_boxplot(extent='min-max').encode(
    x='route_short_name:O',
    y='duration:Q'
)'''

In [None]:
# all trips: per-route summary statistics of duration
all_trip_stats = trips_w_duration.groupby('route_short_name')['duration'].describe()

# histogram, across routes, of the 75th-percentile duration
all_trip_stats['75%'].hist()
plt.show()

# put the rounded summary table on the system clipboard
all_trip_stats.round(2).to_clipboard()

In [None]:
# off peak (before 8:00 am or after 8:00 pm), restricted to stop_count >= 20
# Both comparisons are strict, so a departure at exactly 08:00:00 or 20:00:00
# is not selected here.
morning_cutoff = dt.datetime(1900,1,1,8,0,0)
evening_cutoff = dt.datetime(1900,1,1,20,0,0)
departs = trips_w_duration['departure_time']
is_off_peak = (departs < morning_cutoff) | (departs > evening_cutoff)
has_many_stops = trips_w_duration['stop_count'] >= 20
idx = trips_w_duration.index[has_many_stops & is_off_peak]

# per-route duration summary for the selection; histogram of its 75th percentile
off_peak_stats = trips_w_duration.loc[idx].groupby('route_short_name')['duration'].describe()
off_peak_stats['75%'].hist()
plt.show()

# put the rounded summary table on the system clipboard
off_peak_stats.round(2).to_clipboard()

In [None]:
# peak (8:00 am through 8:00 pm, both ends included), restricted to stop_count >= 20
# The off-peak selection uses strict "< 08:00" / "> 20:00" comparisons, so the
# bounds must be inclusive here; with strict bounds on both sides, a trip
# departing at exactly 08:00:00 or 20:00:00 would be counted in neither group.
peak_start = dt.datetime(1900,1,1,8,0,0)
peak_end = dt.datetime(1900,1,1,20,0,0)
idx = trips_w_duration.index[(trips_w_duration['stop_count'] >= 20)
                             & (trips_w_duration['departure_time'] >= peak_start)
                             & (trips_w_duration['departure_time'] <= peak_end)]

# per-route duration summary for the selection, computed once and reused
peak_stats = trips_w_duration.loc[idx].groupby('route_short_name')['duration'].describe()
peak_stats['75%'].hist()
plt.show()

# put the rounded summary table on the system clipboard
peak_stats.round(2).to_clipboard()

In [None]:
# Bare list literal: the cell only echoes it; it is not assigned to a name.
# NOTE(review): the entries look like route_short_name values - confirm what
# this selection was meant for.
['1','14','14R','14X','19','1AX','1BX','23','24','28','28R','29','31','31AX','31BX','33','38','38AX','38BX','38R','43','44','48','49','5','5R','6','7','714','7X','8','8AX','8BX','9','90','91','9R']

In [None]:
# stop counts for short trips (duration of 30 or less)
is_short = trips_w_duration['duration'].le(30)
idx = trips_w_duration.index[is_short]

# histogram of stop_count over those trips
trips_w_duration.loc[idx, 'stop_count'].hist()
plt.show()

# per-route duration summary for the same selection (left disabled)
# trips_w_duration.loc[idx].groupby('route_short_name')['duration'].describe().round(2).to_clipboard()


In [None]:
# trips with a stop_count of 20 or more but a duration of 10 or less
is_very_short = trips_w_duration['duration'] <= 10
has_many_stops = trips_w_duration['stop_count'] >= 20
idx = trips_w_duration.index[is_very_short & has_many_stops]
flagged_trips = trips_w_duration.loc[idx]

# NOTE: the next two expressions are evaluated but their values are discarded;
# only the final expression of a cell is displayed.
flagged_trips.groupby('route_short_name')['stop_count'].max()
flagged_trips

# number of flagged trips per departure hour (hour as a '%H' string)
(
    flagged_trips['departure_time']
    .dt.strftime('%H')
    .reset_index()
    .groupby('departure_time')
    .count()
)

In [None]:
# trips with a duration of 30 or more and a stop_count of 20 or more
is_long_enough = trips_w_duration['duration'] >= 30
has_many_stops = trips_w_duration['stop_count'] >= 20
idx = trips_w_duration.index[is_long_enough & has_many_stops]

# working copy carrying the departure hour as a two-digit string
df = trips_w_duration.assign(
    departure_hour=trips_w_duration['departure_time'].dt.strftime('%H')
)

# mean stop_count per departure hour (rows) and route (columns);
# combinations with no trips are shown as 0
df.loc[idx].pivot_table(
    index='departure_hour',
    columns='route_short_name',
    values='stop_count',
    aggfunc='mean',
).fillna(0).round(1)

In [None]:
# draw one trip_id at random from the rows selected by idx
# (no random_state is given, so the pick changes from run to run)
df.loc[idx, 'trip_id'].sample(n=1)

In [None]:
# trip to inspect
trip_id = '8627478_merged_8749233'

# route details for that trip, printed transposed (one field per row)
# NOTE(review): this selection needs a 'direction' column in routes - confirm
# the routes file provides it.
trip_route = trips.loc[trips['trip_id'].eq(trip_id), ['trip_id','route_id']]
route_details = routes[['route_id','route_long_name','route_short_name','direction']]
print(trip_route.merge(route_details, how='left', on='route_id').transpose())

# the trip's stop_times rows with the stop name attached to each stop_id
idx = stop_times.index[stop_times['trip_id'].eq(trip_id)]
stop_times.loc[idx, ['stop_id','stop_sequence','arrival_time','departure_time']].merge(
    stops[['stop_id','stop_name']],
    how='left',
    on='stop_id',
)