In [1]:
import os
import sys

import psycopg2
from textwrap import dedent

import pandas as pd
import geopandas as gpd
import numpy as np

import plotly
import plotly.express as px
import plotly.graph_objects as go

import dash
import dash_core_components as dcc
import dash_html_components as html

from utils import connect_and_query, get_days, get_precip, get_colors

In [2]:
# Read the Mapbox access token from the environment so it is never stored in
# the notebook itself.
mapbox_token = os.environ.get('MAPBOX_TOKEN')
# Reject an unset variable (None) as well as one set to an empty or blank
# string; the previous `is not None` check let MAPBOX_TOKEN="" through even
# though the message says 'empty token'.
assert mapbox_token is not None and mapbox_token.strip(), 'empty token'
px.set_mapbox_access_token(mapbox_token)

In [4]:
# Station metadata from the `station_info` table. The quoted column aliases
# (STNCODE, STNNAME, LON, LAT, "Northbound Mile", "Southbound Mile") are the
# names later cells use to index geo_info.
geo_info_query = dedent(
    """
    SELECT
        station_code AS "STNCODE",
        amtrak_station_name as "STNNAME",
        longitude as "LON",
        latitude as "LAT",
        nb_mile AS "Northbound Mile",
        sb_mile AS "Southbound Mile"
    FROM
        station_info;
    """
)
# connect_and_query is imported from the project's `utils` module; later cells
# treat its result as a pandas DataFrame (.head(), .loc, column indexing).
geo_info = connect_and_query(geo_info_query)
# Last expression of the cell: preview the first rows.
geo_info.head()

Unnamed: 0,STNCODE,STNNAME,LON,LAT,Northbound Mile,Southbound Mile
0,BOS,"Boston (South Station), Massachusetts",-71.055305,42.35231,457.0,0.0
1,BBY,"Boston (Back Bay), Massachusetts",-71.07583,42.347317,456.0,1.0
2,RTE,"Westwood (Route 128), Massachusetts",-71.147896,42.210243,446.0,11.0
3,PVD,"Providence, Rhode Island",-71.413475,41.82949,414.0,43.0
4,KIN,"West Kingston, Rhode Island",-71.5606,41.48396,387.0,70.0


In [5]:
# Station codes and display names as plain Python lists, in geo_info row order.
amtrak_stations, location_names = (
    list(geo_info[column]) for column in ('STNCODE', 'STNNAME')
)

In [6]:
# Route geometry from the `regional_route` table: one row per coordinate, with
# path_group cast to INTEGER and exposed as "Group", plus the station-pair and
# per-direction station-group labels used when drawing the route.
geo_route_query = dedent(
    """
    SELECT 
        longitude AS "Longitude",
        latitude AS "Latitude",
        CAST(path_group AS INTEGER) as "Group",
        connecting_path AS "Station Pair",
        nb_station_group AS "NB Station Group", 
        sb_station_group AS "SB Station Group"
    FROM 
        regional_route;
    """
)
# connect_and_query is imported from the project's `utils` module.
geo_route = connect_and_query(geo_route_query)
# Last expression of the cell: preview the first rows.
geo_route.head()

Unnamed: 0,Longitude,Latitude,Group,Station Pair,NB Station Group,SB Station Group
0,-71.055115,42.351437,0,BBY-BOS,BOS,BBY
1,-71.05515,42.351093,0,BBY-BOS,BOS,BBY
2,-71.05526,42.350636,0,BBY-BOS,BOS,BBY
3,-71.05552,42.350147,0,BBY-BOS,BOS,BBY
4,-71.05627,42.348732,0,BBY-BOS,BOS,BBY


# Mapbox with Heatmap Route

In [7]:
# Display settings for the Mapbox route figure.
map_style = 'outdoors'            # value passed as mapbox_style to px.line_mapbox
config = {'scrollZoom': False}    # plotly display config: scroll-wheel zoom off

In [16]:
# Default heatmap query: per station, direction and arrival/departure, the
# integer-cast average of timedelta_from_sched and the record count for
# Southbound stops, ordered by southbound mile.
# NOTE(review): the weekday IN-list names all seven days, so it only drops rows
# whose weekday is NULL or spelled differently — presumably a placeholder for a
# user-selected subset of days; confirm.
default_query = dedent(
            """
            SELECT
                t.direction AS "Direction",
                t.station_code AS "Station",
                t.sb_mile,
                t.arrival_or_departure AS "Arrival or Departure",
                CAST(AVG(t.timedelta_from_sched) AS INTEGER) AS "Average Delay",
                COUNT(*) AS "Num Records"
            FROM
                stops_joined t
            WHERE
                t.direction = 'Southbound' AND
                t.sched_arr_dep_week_day IN
                    ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday')
            GROUP BY t.station_code, t.direction, t.sb_mile, t.arrival_or_departure
            ORDER BY t.sb_mile ASC;
            """
)
default_query_df = connect_and_query(default_query)
# get_colors is imported from the project's `utils` module. The route figure
# uses its three results as: the color_discrete_map, the customdata shown as
# "Avg. Delay" on the station markers, and the name of the geo_route column
# that colors the line segments.
colors_dict, delays, color_group_key = get_colors(geo_route, default_query_df)

In [17]:
# Route layer: one line per station pair, colored via colors_dict using the
# geo_route column named by color_group_key.
route = px.line_mapbox(
    geo_route,
    lat=geo_route['Latitude'],
    lon=geo_route['Longitude'],
    line_group=geo_route['Station Pair'],
    color=geo_route[color_group_key],
    color_discrete_map=colors_dict,
    hover_data={color_group_key: False, 'Group': False},
    mapbox_style=map_style,
    zoom=5.75,
)
# Widen the route lines before the marker trace is added, so only the line
# traces are affected.
route.update_traces(line={'width': 3})

# Station layer: navy markers whose hover text combines the station name with
# the matching entry of `delays`.
station_markers = go.Scattermapbox(
    lat=geo_info['LAT'].round(decimals=5),
    lon=geo_info['LON'].round(decimals=5),
    name='Amtrak Stations',
    hoverinfo='text',
    customdata=delays,
    hovertext=geo_info['STNNAME'],
    hovertemplate="%{hovertext} (Avg. Delay: %{customdata} mins)<extra></extra>",
    mode='markers',
    marker=dict(size=6, color='Navy'),
    fill='none',
)
route.add_trace(station_markers)

# White background and explicit margins around the map.
route.update_layout(
    paper_bgcolor="white",
    plot_bgcolor="white",
    margin={'t': 35, 'l': 80, 'b': 0, 'r': 0},
)
route.update_yaxes(automargin=True)

# Render with scroll-wheel zoom turned off.
config = {'scrollZoom': False}
route.show(config=config)

# Tabs with different plots

In [None]:
# NOTE(review): scratch code left commented out (a per-station histogram of
# 'Average Delay'). It reads tabs_df, which is only created in a later cell,
# so it would not run at this position on a fresh top-to-bottom execution.
#figs_NB = []
#figs_SB = []
#for station in tabs_df['Station']:
#data_station = tabs_df.loc[tabs_df['Station'] == 'PVD']
#fig = go.Figure(data = go.Histogram(name = 'NYP', x = data_station['Average Delay']))
#fig.show()

In [18]:
# Average delay and sample count for every (train, direction, station, year)
# combination in stops_joined, ordered from the worst average delay down.
# Wrapped in dedent() like the other queries in this notebook, and with the
# trailing whitespace removed, so the SQL text carries no notebook indentation.
command = dedent(
    """
    SELECT
        t.train_num AS "Train Number",
        t.direction AS "Direction",
        t.station_code AS "Station",
        t.origin_year AS "Year",
        ROUND(AVG(t.timedelta_from_sched),2) AS "Average Delay",
        COUNT(t.timedelta_from_sched) AS "Num Instances"
    FROM stops_joined t
    GROUP BY t.train_num, t.direction, t.station_code, t.origin_year
    ORDER BY AVG(t.timedelta_from_sched) DESC;
    """
)

In [19]:
# Run the per-train / station / year aggregate query held in `command`; the
# cells below filter the result by station code.
trains_all = connect_and_query(command)

In [20]:
# Row count of the aggregated result (one row per train/direction/station/year group).
len(trains_all)

9026

In [21]:
# Keep only the rows for Providence (station code 'PVD').
pvd_yearly = trains_all[trains_all['Station'] == 'PVD']

In [22]:
# Number of Providence rows.
len(pvd_yearly)

419

## Providence Northbound & Southbound Stacked Bar Plot by Train Number

In [23]:
# Split the Providence rows by travel direction.
is_northbound = pvd_yearly['Direction'] == 'Northbound'
is_southbound = pvd_yearly['Direction'] == 'Southbound'
pvd_yearly_nb = pvd_yearly[is_northbound]
pvd_yearly_sb = pvd_yearly[is_southbound]

In [24]:
# Northbound Providence: average delay per year, one colored segment per
# train number, stacked within each year.
pvd_nb_bar_plt = px.bar(
    pvd_yearly_nb,
    x='Year',
    y='Average Delay',
    color='Train Number',
    hover_data=['Num Instances'],
    title='Providence Northbound Train Delays by Year',
)
# Last expression of the cell, so the updated figure is displayed.
pvd_nb_bar_plt.update_layout(
    barmode='stack',
    xaxis=dict(categoryorder='category ascending'),
)

In [25]:
# Southbound Providence: average delay per year, one colored segment per
# train number, stacked within each year. The title mirrors the Northbound
# figure so the two charts can be told apart when viewed on their own.
pvd_sb_bar_plt = px.bar(
    pvd_yearly_sb,
    title='Providence Southbound Train Delays by Year',
    x='Year',
    y='Average Delay',
    color='Train Number',
    hover_data=['Num Instances'],
)
# Last expression of the cell, so the updated figure is displayed.
pvd_sb_bar_plt.update_layout(barmode='stack', xaxis={'categoryorder': 'category ascending'})

# Stacked Bar Plot by Year (split by Direction)

In [26]:
# Look up the station_info row for New York Penn Station.
geo_info[geo_info['STNCODE'] == 'NYP']

Unnamed: 0,STNCODE,STNNAME,LON,LAT,Northbound Mile,Southbound Mile
13,NYP,"New York (Penn Station), New York",-73.99446,40.750328,226.0,231.0


In [27]:
# Half of the full route length, taken from the northbound mile marker of BOS.
geo_info.loc[geo_info['STNCODE'] == 'BOS', 'Northbound Mile'] / 2

0    228.5
Name: Northbound Mile, dtype: float64

In [28]:
# Keep only the rows for New York Penn Station (station code 'NYP').
nyp_yearly = trains_all[trains_all['Station'] == 'NYP']

In [29]:
# Number of Penn Station rows.
len(nyp_yearly)

420

In [30]:
# Penn Station: average delay per year, colored by direction, with train
# number and sample count available on hover.
# NOTE(review): the title says "Additive" and the markdown below says
# "Stacked", but barmode is 'group' — confirm that bars sharing a year and
# direction are meant to be drawn this way rather than stacked.
nyp_bar_kwargs = dict(
    x='Year',
    y='Average Delay',
    color='Direction',
    hover_data=['Train Number', 'Num Instances'],
    title='NY-Penn Additive Average Delays by Direction',
)
nyp_yearly_bar_plt = px.bar(nyp_yearly, **nyp_bar_kwargs)
# Last expression of the cell, so the updated figure is displayed.
nyp_yearly_bar_plt.update_layout(
    barmode='group',
    xaxis=dict(categoryorder='category ascending'),
)

## NYP - Stacked Bar by Year and Direction
Penn Station is at almost exactly the halfway point of the route for both Northbound and Southbound trains, so we would expect trains in both directions to have approximately equal delays. It is interesting to note that the Southbound trains show a more uniform distribution of delays across most trains, while for Northbound there are some trains that are significantly worse than the other trains in their year/direction group.

It is also very cool to see the huge reduction in delay times in the 2020 and 2021 data. My hypothesis is that COVID and the reduced ridership on public transportation could have had an impact on the delays.


In [31]:
# Render the Penn Station bar chart again, directly under the commentary above.
nyp_yearly_bar_plt.show()

In [None]:
# Average departure delay per direction, station, origin year and
# precipitation type. Each departure is matched to the hourly weather record
# for its station (via station_info.weather_loc) at the hour of the actual
# departure; only 'Rain', 'Snow' and 'No Precipitation' hours are kept.
#
# Fix: the previous text declared `stops_joined t` but used an undeclared
# alias `d` in the aggregate, the join condition and GROUP BY, and selected
# t.precip_type while grouping by wh.precip_type, so it could not run.
# All references now use one alias, and precip_type comes from the weather
# subquery in both SELECT and GROUP BY.
# NOTE(review): `departures` is assumed to be the intended table because
# depart_diff is read from it elsewhere in this notebook; confirm that it
# also exposes direction and full_act_dep_datetime.
tabs_query = dedent(
    """
    SELECT
        d.direction AS "Direction",
        d.station_code AS "Station",
        d.origin_year AS "Origin Year",
        wh.precip_type AS "Precipitation",
        ROUND(AVG(d.depart_diff),2) AS "Average Delay"
    FROM
        departures d
        INNER JOIN (
            SELECT
                precip_type,
                date_time,
                location,
                si.station_code AS station_code
            FROM
                weather_hourly wh
                INNER JOIN (
                    SELECT
                        station_code,
                        weather_loc
                    FROM
                        station_info
                ) si ON wh.location = si.weather_loc
            WHERE
                wh.precip_type IN ('Rain', 'Snow', 'No Precipitation')
        ) wh ON wh.station_code = d.station_code AND
        DATE_TRUNC('hour', d.full_act_dep_datetime) = wh.date_time
    GROUP BY d.direction, d.origin_year, wh.precip_type, d.station_code;
    """
)

In [None]:
# Load the precipitation/delay aggregate and store the year as text
# (presumably so plots treat it as a category rather than a number — confirm).
tabs_df = connect_and_query(tabs_query).astype({'Origin Year': str})

In [None]:
# Horizontal violins of average delay, one per precipitation type, drawn on
# top of each other and made translucent so every distribution stays visible.
violin_by_precip = px.violin(
    tabs_df,
    x='Average Delay',
    color='Precipitation',
    orientation='h',
)
# Violin traces are positioned by layout.violinmode; the previous
# barmode='overlay' only applies to bar traces and left the violins grouped.
violin_by_precip.update_layout(violinmode='overlay')
violin_by_precip.update_traces(opacity=0.7)
violin_by_precip.show()

## Testing COVID Ridership Theory

In [None]:
# Average departure delay and sample count per train, station and origin year,
# with each train's origin departure time, restricted to trains whose
# train_info row is flagged as operating on both Monday and Thursday; worst
# average delay first.
#
# The earlier pass-through subquery (SELECT ... FROM departures d) AS d is
# replaced by a direct join on departures: it applied no filter, so the rows
# are the same, and the alias `d` is no longer declared twice. The
# reg_operates_on_* columns are qualified with `ti`; they could only resolve
# to train_info before, because the subquery did not expose them.
# NOTE(review): this reuses the name `command` from the stops_joined query
# earlier in the notebook, so re-running connect_and_query(command) from that
# earlier cell after this one would load different data.
command = dedent(
    """
    SELECT
        d.train_num,
        d.station_code,
        d.origin_year,
        ti.depart_origin_time,
        AVG(d.depart_diff),
        COUNT(d.depart_diff)
    FROM train_info ti
    INNER JOIN departures d
        ON ti.train_num = d.train_num
    WHERE ti.reg_operates_on_mon = 't' AND ti.reg_operates_on_thurs = 't'
    GROUP BY d.train_num, ti.depart_origin_time, d.origin_year, d.station_code
    ORDER BY AVG(d.depart_diff) DESC;
    """
)