In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import datetime as dt

In [None]:
# Load the four input tables from the local `data/` directory.
# NOTE(review): the file names look like a GTFS feed export (routes/stop_times/trips/stops) —
# confirm against the data source.
# routes: later read for route_id, route_long_name, route_short_name, route_type.
routes = pd.read_csv('data/routes.txt.csv')
# stop_times: one row per stop visit; later read for trip_id, stop_sequence, arrival_time, stop_id.
stop_times = pd.read_csv('data/stop_times.txt.csv')
# trips: joined to stop_times on trip_id; also supplies route_id.
trips = pd.read_csv('data/trips.txt.csv')
# stops: later read for stop_id, stop_lat, stop_lon.
stops = pd.read_csv('data/stops.txt.csv')

In [None]:
# Attach each stop-time row to the attributes of its trip. The left join keeps
# every stop_times row even when `trips` has no matching trip_id.
stops_w_route = stop_times.merge(trips, how='left', on='trip_id')

In [None]:
# Highest stop_sequence seen for each trip, returned as a two-column frame
# (trip_id, stop_sequence).
last_stops = (
    stops_w_route
    .groupby('trip_id', as_index=False)['stop_sequence']
    .max()
)

In [None]:
# Rename the per-trip maximum to `last_stop` so it stays distinguishable from
# the per-row `stop_sequence` once it is merged back on trip_id.
# Only the column is renamed: the previous `index=str` argument additionally
# converted every row label to a string, which nothing downstream needs.
last_stops = last_stops.rename(columns={'stop_sequence': 'last_stop'})
last_stops.head()

In [None]:
# Give every stop row its trip's `last_stop` value and add two boolean flag
# columns, both initialised to False; later cells switch them on.
stops_first_last = (
    stops_w_route
    .merge(last_stops, how='left', on='trip_id')
    .assign(First=False, Last=False)
)


In [None]:
# Preview the merged frame; First and Last are still False on every row at this point.
stops_first_last.head()

In [None]:
# Flag the first stop of every trip: the row carrying the trip's smallest
# stop_sequence. GTFS only requires sequence values to increase along a trip,
# so a trip is not guaranteed to start at 1 and testing `== 1` can miss it.
first_seq = stops_first_last.groupby('trip_id')['stop_sequence'].transform('min')
first_idx = stops_first_last.index[stops_first_last['stop_sequence'] == first_seq]
stops_first_last.loc[first_idx, 'First'] = True

In [None]:
# A row is its trip's final stop when its sequence number equals the per-trip
# maximum that was merged in as `last_stop`.
is_last = stops_first_last['stop_sequence'].eq(stops_first_last['last_stop'])
last_idx = stops_first_last.index[is_last]
stops_first_last.loc[last_idx, 'Last'] = True

In [None]:
# Preview the frame again now that the First/Last flags have been set.
stops_first_last.head()

In [None]:
# Index labels of intermediate stops, i.e. rows flagged neither First nor Last.
is_intermediate = ~stops_first_last['First'] & ~stops_first_last['Last']
drop_idx = stops_first_last.index[is_intermediate]

In [None]:
# Keep only each trip's first and last stop rows.
# Filtering on the flags (instead of dropping `drop_idx` in place) keeps this
# cell re-runnable: dropping the same labels a second time raises KeyError
# because they are no longer in the frame.
stops_first_last = stops_first_last[
    stops_first_last['First'] | stops_first_last['Last']
].copy()
stops_first_last.head()

In [None]:
# Number of non-null arrival_time values per trip_headsign (first five groups shown).
(
    stops_first_last
    .groupby('trip_headsign')['arrival_time']
    .count()
    .reset_index()
    .head()
)

In [None]:
# Earliest and latest arrival_time per trip, each as a (trip_id, arrival_time) frame.
# The column is selected *before* aggregating so pandas does not compute min/max
# for every other column only to discard them (wasted work, and a potential
# failure on columns whose values are not mutually comparable).
# NOTE(review): arrival_time is not parsed until a later cell, so this presumably
# compares raw strings; that ordering is only chronological if the times are
# zero-padded HH:MM:SS — confirm against the feed.
arrivals_by_trip = stops_first_last.groupby('trip_id')['arrival_time']
min_arrivals = arrivals_by_trip.min().reset_index()
max_arrivals = arrivals_by_trip.max().reset_index()

In [None]:
# Sanity check: (rows, columns) of the per-trip earliest arrivals.
min_arrivals.shape

In [None]:
# Sanity check: (rows, columns) of the per-trip latest arrivals; both frames are
# grouped by trip_id, so this should match min_arrivals.shape.
max_arrivals.shape

In [None]:
# One row per trip holding both extremes; the shared `arrival_time` column is
# disambiguated into `arrival_time_min` / `arrival_time_max` by the suffixes.
trip_times = min_arrivals.merge(
    max_arrivals,
    how='inner',
    on='trip_id',
    suffixes=['_min', '_max'],
)


In [None]:
def _clock_to_datetime(times):
    """Convert 'H:MM:SS' clock strings to datetimes anchored on 1900-01-01.

    The strings are parsed as elapsed time since midnight rather than with a
    '%H:%M:%S' datetime format, because GTFS expresses stops after midnight of
    the service day as hours >= 24 (e.g. '25:10:00'). A datetime format rejects
    those, and with errors='coerce' they silently became NaT. Here they land on
    1900-01-02 instead, so durations for late-night trips are preserved.

    Values that cannot be parsed still become NaT. A column that is already
    datetime-typed is returned unchanged so the cell can be re-run safely.
    """
    if pd.api.types.is_datetime64_any_dtype(times):
        return times
    return pd.to_timedelta(times, errors='coerce') + pd.Timestamp('1900-01-01')


# The 1900-01-01 anchor matches what '%H:%M:%S' parsing produced, so later
# comparisons against dt.datetime(1900, 1, 1, ...) keep working.
trip_times['arrival_time_max'] = _clock_to_datetime(trip_times['arrival_time_max'])
trip_times['arrival_time_min'] = _clock_to_datetime(trip_times['arrival_time_min'])

In [None]:
# Trip duration in minutes: time elapsed between the earliest and the latest
# arrival kept for the trip.
elapsed = trip_times['arrival_time_max'] - trip_times['arrival_time_min']
trip_times['duration'] = elapsed.dt.total_seconds() / 60

In [None]:
# Preview the per-trip times, now including the `duration` column (minutes).
trip_times.head()

In [None]:
# Label every trip with its route: first look up the trip's route_id, then the
# route's names and type. Left joins keep trips even when a lookup finds no match.
trip_route_ids = trips[['trip_id', 'route_id']]
route_labels = routes[['route_id', 'route_long_name', 'route_short_name', 'route_type']]
trips_w_duration = (
    trip_times
    .merge(trip_route_ids, how='left', on='trip_id')
    .merge(route_labels, how='left', on='route_id')
)
trips_w_duration.head()

In [None]:
# Remove every trip whose route_type is not 3.
# NOTE(review): 3 is presumably the GTFS route_type code for bus — confirm.
# Rows with a missing route_type also compare unequal to 3 and are removed.
is_not_bus = trips_w_duration['route_type'].ne(3)
not_bus_idx = trips_w_duration.index[is_not_bus]
trips_w_duration.drop(index=not_bus_idx, inplace=True)
print(trips_w_duration.shape)
trips_w_duration.head()

# Get first/last stop lat/long pairs per route

In [None]:
# Preview the routes table as loaded from data/routes.txt.csv.
routes.head()

In [None]:
# Preview the stops table; its stop_lat / stop_lon columns are joined in below.
stops.head()

In [None]:
# Preview the stop rows that remain after intermediate stops were dropped.
stops_first_last.head()

In [None]:
# One representative first stop and one representative last stop per route:
# the smallest stop_id among the rows flagged First (respectively Last).
# NOTE(review): the two minima are taken independently, so the chosen first and
# last stop of a route are not guaranteed to belong to the same trip.
# This also rebinds `last_stops`, which earlier held the per-trip maximum
# stop_sequence.
first_idx = stops_first_last.index[stops_first_last['First']]
last_idx = stops_first_last.index[stops_first_last['Last']]
route_stop_cols = ['route_id', 'stop_id']
first_stops = (
    stops_first_last.loc[first_idx, route_stop_cols]
    .groupby('route_id', as_index=False)['stop_id']
    .min()
)
last_stops = (
    stops_first_last.loc[last_idx, route_stop_cols]
    .groupby('route_id', as_index=False)['stop_id']
    .min()
)
first_stops.shape

In [None]:
# Look up coordinates for each route's first and last stop, then place the two
# side by side per route. Columns shared by both sides (stop_id, stop_lat,
# stop_lon) receive the `_first` / `_last` suffixes. All joins are inner joins.
stop_coords = stops[['stop_id', 'stop_lat', 'stop_lon']]
first_with_coords = first_stops.merge(stop_coords, on='stop_id')
last_with_coords = last_stops.merge(stop_coords, on='stop_id')
latlongs = first_with_coords.merge(
    last_with_coords,
    on='route_id',
    suffixes=['_first', '_last'],
)

In [None]:
# Write the per-route coordinate pairs to the working directory. The frame's
# index is written as the first column because `index=False` is not passed.
latlongs.to_csv('lat_longs.csv')

# Aggregate the durations by route

In [None]:
# Preview the bus-only trips before aggregating their durations.
trips_w_duration.head()

In [None]:
# Median trip duration (minutes) per route_short_name.
# `duration` is selected before aggregating: taking the median of the whole
# frame also tries the string columns (trip_id, route names), which raises a
# TypeError on pandas >= 2.0 where non-numeric columns are no longer dropped
# silently.
medians_by_route = (
    trips_w_duration
    .groupby('route_short_name')['duration']
    .median()
    .reset_index()
)

In [None]:
# Copy the per-route medians to the system clipboard (for pasting elsewhere);
# nothing is written to disk.
medians_by_route.to_clipboard()

# Isolate morning rush-hour and owl (early-morning) trips

In [None]:
# Preview the trips again before splitting them by time of day.
trips_w_duration.head()

In [None]:
# Morning rush: keep trips whose latest arrival falls between 07:00 and 10:00
# (inclusive) by dropping everything earlier or later. The 1900-01-01 date is
# the anchor date carried by the parsed arrival times.
rush_start = dt.datetime(1900, 1, 1, 7, 0, 0)
rush_end = dt.datetime(1900, 1, 1, 10, 0, 0)
drop_idx = trips_w_duration.index[
    (trips_w_duration['arrival_time_max'] < rush_start)
    | (trips_w_duration['arrival_time_max'] > rush_end)
]
morning_rush = trips_w_duration.drop(drop_idx)
# Select `duration` before taking the median: a whole-frame median also tries
# the string columns and raises a TypeError on pandas >= 2.0.
# The result replaces whatever is currently on the clipboard.
(
    morning_rush
    .groupby('route_short_name')['duration']
    .median()
    .reset_index()
    .to_clipboard()
)

In [None]:
# Owl service: keep trips whose latest arrival is at or before 06:00 by
# dropping everything that ends later.
owl_end = dt.datetime(1900, 1, 1, 6, 0, 0)
drop_idx = trips_w_duration.index[trips_w_duration['arrival_time_max'] > owl_end]
owl = trips_w_duration.drop(drop_idx)
# Select `duration` before taking the median: a whole-frame median also tries
# the string columns and raises a TypeError on pandas >= 2.0.
# The result replaces whatever is currently on the clipboard.
(
    owl
    .groupby('route_short_name')['duration']
    .median()
    .reset_index()
    .to_clipboard()
)


In [None]:
# Final size check; the rush-hour and owl subsets were built with non in-place
# drops, so trips_w_duration itself is unchanged by those cells.
trips_w_duration.shape