In [40]:
import os
from time import strftime
from datetime import date
from collections import defaultdict
from six import string_types
from copy import deepcopy

import pandas as pd

import duckdb

import plotly
import plotly.express as px
from dash import Dash, html, dcc

import sqlalchemy
import jinja2
from jinjasql import JinjaSql
import dash_bootstrap_components as dbc

from dotenv import load_dotenv

# Import ipython-sql Jupyter extension to create SQL cells
# %load_ext sql
# directly output data to Pandas and to simplify the output that is printed to the notebook.
# %config SqlMagic.autopandas = True
# %config SqlMagic.feedback = False
# %config SqlMagic.displaycon = False

pd.options.display.max_rows=100

# Connect ipython-sql to DuckDB using a SQLAlchemy-style connection string. You may either connect to an in memory DuckDB, or a file backed db.
# %sql duckdb:///mta.db

# import qgrid
# from qgrid import show_grid

print(f"pandas               {pd.__version__:<20}")
print(f"plotly               {plotly.__version__:<20}")
print(f"sqlalchemy           {sqlalchemy.__version__:<20}")
print(f"duckdb               {duckdb.__version__:<20}")
print(f"jinja2               {jinja2.__version__:<20}")



pandas               1.5.3               
plotly               5.13.0              
sqlalchemy           1.4.46              
duckdb               0.6.1               
jinja2               3.1.2               


In [29]:
# put up text and markdown in 1, 2, 3 columns https://medium.com/a-r-g-o/using-plotlys-dash-to-deliver-public-sector-decision-support-dashboards-ac863fa829fb

# run the overall filter, have sum of 2019, pandemic, current period
# print this
# put it in the dashboard

# 2019    pandemic   2022 (filter period)
# avg          avg                    avg
#        ch v 2019        chg vs pandemic
#                             chg vs 2019

# put in dashboard:
# time series - by dow - by tod

# put in dashboard - table with sorts

# put filter widgets in daashboard

# map - 3 column

# station table - 3 col
# pretty name , entries, ch from 2019, ch from pandemic

# filter hourly and daily using
# startdate/end_date
# day of week
# hour of day
# cbd

# show startdate to enddate
# markdown 3 big text boxes: 2019, pandemic (%), selected (% vs 2019, %v pandemic)

# entries by date - selected timeperiod
# entries by dow - selected timeperiod
# entries by hour of day - selected timeperiod

# map across full dashboard

# station table
# by date startdate to enddate
# 2019

In [41]:
load_dotenv()

mapbox_token = os.getenv('MAPBOX_TOKEN')


In [42]:
connection_string = "duckdb:////Users/drucev/projects/MTA/mta.db"
con = sqlalchemy.create_engine(connection_string, connect_args={'read_only': True})

# query = "describe station_hourly "
# con.execute(query)
# con.fetchall()



In [43]:
filters = defaultdict(str)
filters['dow'] = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',]
filters['tod'] = [4,8,12,16,20,24]
filters['cbd'] = ['Y', 'N']
filters['startdate'] = '2022-01-01'
filters['enddate'] = '2023-01-01'

filters['pandemic_start'] = '2020-03-01'
filters['pandemic_end'] = '2021-03-01'


In [44]:
query_template = """
select
    DATE,
    STATION,
    pretty_name,
    Latitude,
    Longitude,
    CBD,
    dow,
    is_weekend,
    hour,
    entries,
    exits
from
    station_hourly
where true
{% if startdate %} and date >= {{startdate}} {% endif %}
{% if enddate %} and date < {{enddate}} {% endif %}
{% if dow %} and dow in {{ dow | inclause }}  {% endif %}
{% if tod %} and hour in {{ tod | inclause }} {% endif %}
{% if cbd %} and cbd in {{ cbd | inclause }} {% endif %}
limit 10
"""


In [45]:
def quote_sql_string(value):
    '''
    If `value` is a string type, escapes single quotes in the string
    and returns the string enclosed in single quotes.
    else if value is a list, map quote_sql_string to each item in list
    else return value unchanged
    '''
    if isinstance(value, string_types):
        new_value = str(value)
        new_value = new_value.replace("'", "''")
        return "'{}'".format(new_value)
    elif isinstance(value, list):
        return [quote_sql_string(v) for v in value]
    return value


def get_sql_from_template(con, query, bind_params=None, verbose=False):
    """
    Run Jinja template query against con, substituting bind_params
    """
    if not bind_params:
        if verbose:
            print(query)
        return pd.read_sql(query, con)
    
    if verbose:
        # copy and escape params for legibility
        params = deepcopy(bind_params)
        for key, val in params.items():
            params[key] = quote_sql_string(val)
        query_str, query_vals = JinjaSql().prepare_query(query, params)
        print(query_str % tuple(query_vals))
        
    # process params using ? style, run query, return dataframe 
    query_str, query_vals = JinjaSql(param_style='qmark').prepare_query(query, bind_params)
    return pd.read_sql(query_str, con, params=query_vals)


get_sql_from_template(con, query_template, filters, verbose=True)




select
    DATE,
    STATION,
    pretty_name,
    Latitude,
    Longitude,
    CBD,
    dow,
    is_weekend,
    hour,
    entries,
    exits
from
    station_hourly
where true
 and date >= '2022-01-01' 
 and date < '2023-01-01' 
 and dow in ('Monday','Tuesday','Wednesday','Thursday','Friday')  
 and hour in (4,8,12,16,20,24) 
 and cbd in ('Y','N') 
limit 10


Unnamed: 0,DATE,STATION,pretty_name,Latitude,Longitude,CBD,dow,is_weekend,hour,entries,exits
0,2022-10-18,GRD CNTRL-42 ST-4567S,Grand Central-42 St-4 5 6 7 S (M),40.751776,-73.976848,Y,Tuesday,False,20,18077,22976
1,2022-10-18,GRD CNTRL-42 ST-4567S,Grand Central-42 St-4 5 6 7 S (M),40.751776,-73.976848,Y,Tuesday,False,8,6336,22235
2,2022-10-19,GRD CNTRL-42 ST-4567S,Grand Central-42 St-4 5 6 7 S (M),40.751776,-73.976848,Y,Wednesday,False,24,4008,5188
3,2022-11-01,14 ST-123FLM,14 St-F M (M),40.738228,-73.996209,Y,Tuesday,False,8,776,2815
4,2022-10-18,CHAMBERS ST-ACE23,Chambers St-A C 2 3 (M),40.714111,-74.008585,Y,Tuesday,False,24,636,526
5,2022-10-18,BLEECKER ST-6DF,Bleecker St-6 (M),40.725915,-73.994659,Y,Tuesday,False,16,1138,930
6,2022-10-18,CHAMBERS ST-123,Chambers St-1 2 3 (M),40.715478,-74.009266,Y,Tuesday,False,24,628,522
7,2022-10-21,59 ST COLUMBUS-1ABCD,59 St-Columbus Circle-1 A B C D (M),40.768296,-73.981736,Y,Friday,False,24,4515,3238
8,2022-10-18,34 ST-PENN STA-123,34 St-Penn Station-1 2 3 (M),40.750373,-73.991057,Y,Tuesday,False,24,1759,2301
9,2022-10-18,WTC-CORTLANDT-1,WTC Cortlandt-1 (M),40.711835,-74.012188,Y,Tuesday,False,8,970,1887


In [46]:
def day_count(con, filters, verbose=False):
    """return number of days in the filter"""

    query = """

    select count(*) as days from
    (select
        date, count(*) as n
    from
        station_hourly
    where TRUE
        {% if startdate %} and date >= {{startdate}} {% endif %}
        {% if enddate %} and date < {{enddate}} {% endif %}
        {% if dow %} and dow in {{ dow | inclause }}  {% endif %}
        {% if tod %} and hour in {{ tod | inclause }} {% endif %}
        {% if cbd %} and cbd in {{ cbd | inclause }} {% endif %}
    group by
        "date"
    )
    """

    return get_sql_from_template(con, query, filters, verbose)

df_day_count = day_count(con, filters)
day_count = df_day_count.iloc[0][0]
day_count



260

In [47]:
def day_count_2019(con, filters, verbose=False):
    """return number of days in the filter"""

    query = """

    select count(*) as days from
    (select
        date, count(*) as n
    from
        station_hourly
    where
        date_part('year', DATE)=2019
        {% if dow %} and dow in {{ dow | inclause }}  {% endif %}
        {% if tod %} and hour in {{ tod | inclause }} {% endif %}
        {% if cbd %} and cbd in {{ cbd | inclause }} {% endif %}
    group by
        "date"
    )
    """

    return get_sql_from_template(con, query, filters, verbose)

df_day_count_2019 = day_count_2019(con, filters)
day_count_2019 = df_day_count_2019.iloc[0][0]
day_count_2019



261

In [48]:
def day_count_pandemic(con, filters, verbose=False):
    """return number of days in the filter"""

    query = """

    select count(*) as days from
    (select
        date, count(*) as n
    from
        station_hourly
    where
        date >= {{pandemic_start}} and date < {{pandemic_end}}
        {% if dow %} and dow in {{ dow | inclause }}  {% endif %}
        {% if tod %} and hour in {{ tod | inclause }} {% endif %}
        {% if cbd %} and cbd in {{ cbd | inclause }} {% endif %}
    group by
        "date"
    )
    """

    return get_sql_from_template(con, query, filters, verbose)

df_day_count_pandemic = day_count_pandemic(con, filters)
day_count_pandemic = df_day_count_pandemic.iloc[0][0]
day_count_pandemic


260

In [49]:
def entries_by_date(con, filters, verbose=False):
    """return dataframe of all entries by date, subject to filters"""

    query = """
    select date, sum(entries) entries
    from station_hourly
    where TRUE
    {% if startdate %} and date >= {{startdate}} {% endif %}
    {% if enddate %} and date < {{enddate}} {% endif %}
    {% if dow %} and dow in {{ dow | inclause }}  {% endif %}
    {% if tod %} and hour in {{ tod | inclause }} {% endif %}
    {% if cbd %} and cbd in {{ cbd | inclause }} {% endif %}
    group by date
    order by date
    """

    return get_sql_from_template(con, query, filters, verbose)

df_entries_by_date = entries_by_date(con, filters)

df_entries_by_date



Unnamed: 0,DATE,entries
0,2022-01-03,1710796
1,2022-01-04,1810000
2,2022-01-05,1797963
3,2022-01-06,1880303
4,2022-01-07,1628217
...,...,...
255,2022-12-26,1152490
256,2022-12-27,1779358
257,2022-12-28,1913341
258,2022-12-29,1957983


In [50]:
fig1 = px.line(df_entries_by_date, x="DATE", y="entries", height=360)
fig1.update_traces(line=dict(width=2))
fig1.update_layout(
    paper_bgcolor="LightSteelBlue",
    showlegend=False,
    plot_bgcolor="white",
    xaxis_title="Date",
    yaxis_title="Entries",
    legend_title="Legend Title",

    xaxis={
        'ticks': 'inside',
        'showgrid': True,            # thin lines in the background
        'zeroline': False,           # thick line at x=0
        'visible': True,             # numbers below
        'showline': True,            # Show X-Axis
        'linecolor': 'black',        # Color of X-axis
        'tickfont_color': 'black',   # Color of ticks
        'showticklabels': True,      # Show X labels
        'mirror': True,              # draw right axis
    },
    yaxis={
        'ticks': 'inside',
        'showgrid': True,            # thin lines in the background
        'zeroline': False,           # thick line at x=0
        'visible': True,             # numbers below
        'showline': True,            # Show X-Axis
        'linecolor': 'black',        # Color of X-axis
        'tickfont_color': 'black',   # Color of ticks
        'showticklabels': True,      # Show X labels
        'side': 'left',
        'mirror': True,
    },
    #     font=dict(
    #         family="Courier New, monospace",
    #         size=18,
    #         color="RebeccaPurple"
    #     )
 )



In [66]:
def entries_by_dow(con, filters, verbose=False):
    """return dataframe of all entries by day of week, subject to filters"""

    query = """
    with sh as
        (select
            date,
            dow,
            sum(entries) as entries,
        from
            station_hourly
        where TRUE
        group by date, dow),
    sh1 as
        (select
            dow,
            sum(entries) as entries,
            count(*) as n,
            sum(entries)/n as entries_per_day
        from
            sh
        group by
            dow),
    sh2019 as
        (select
            dow,
            sum(entries) as entries_2019,
            count(*) as n_2019,
        from
            sh
        where date_part('year', DATE)=2019
        group by
            dow),
    sh_pandemic as
        (select
            dow,
            sum(entries) as entries_pandemic,
            count(*) as n_pandemic,
        from
            sh
        where date>='2020-04-01' and date <'2021-04-01'
        group by
            dow)
    select
        sh1.dow,
        sh1.entries_per_day,
        sh_pandemic.entries_pandemic/sh_pandemic.n_pandemic as avg_pandemic,
        sh2019.entries_2019/sh2019.n_2019 as avg_2019
    from
        sh1 join sh2019 on sh1.dow = sh2019.dow
        join sh_pandemic on sh1.dow=sh_pandemic.dow
    """

    return get_sql_from_template(con, query, filters, verbose)


df_entries_by_dow = entries_by_dow(con, filters, verbose=True)
df_entries_by_dow





    with sh as
        (select
            date,
            dow,
            sum(entries) as entries,
        from
            station_hourly
        where TRUE
        group by date, dow),
    sh1 as
        (select
            dow,
            sum(entries) as entries,
            count(*) as n,
            sum(entries)/n as entries_per_day
        from
            sh
        group by
            dow),
    sh2019 as
        (select
            dow,
            sum(entries) as entries_2019,
            count(*) as n_2019,
        from
            sh
        where date_part('year', DATE)=2019
        group by
            dow),
    sh_pandemic as
        (select
            dow,
            sum(entries) as entries_pandemic,
            count(*) as n_pandemic,
        from
            sh
        where date>='2020-04-01' and date <'2021-04-01'
        group by
            dow)
    select
        sh1.dow,
        sh1.entries_per_day,
        sh_pandemic.entries_pandemic/sh_pandemic.n_pand

Unnamed: 0,dow,entries_per_day,avg_pandemic,avg_2019
0,Thursday,2915979,1248762,5514089
1,Saturday,1728137,816391,3197802
2,Wednesday,2946000,1291611,5562314
3,Monday,2630426,1182892,5009493
4,Tuesday,2888182,1279562,5400089
5,Sunday,1377130,626980,2541992
6,Friday,2849426,1231698,5441720


In [70]:
dow_map = {
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3, 
    'Friday': 4,
    'Saturday': 5,
    'Sunday': 6
}
df_entries_by_dow['sort_order'] = df_entries_by_dow['dow'].apply(lambda d: dow_map[d])
df_entries_by_dow = df_entries_by_dow.sort_values('sort_order').reset_index(drop=True)
df_entries_by_dow.columns=['dow', 'selection', 'pandemic', '2019', 'sort_order']
df_entries_by_dow = pd.melt(df_entries_by_dow[['dow', 'selection', 'pandemic', '2019']],
                            id_vars='dow', var_name='when', value_name='entries')
df_entries_by_dow

Unnamed: 0,dow,when,entries
0,Monday,selection,2630426
1,Tuesday,selection,2888182
2,Wednesday,selection,2946000
3,Thursday,selection,2915979
4,Friday,selection,2849426
5,Saturday,selection,1728137
6,Sunday,selection,1377130
7,Monday,pandemic,1182892
8,Tuesday,pandemic,1279562
9,Wednesday,pandemic,1291611


In [71]:
fig2 = px.bar(df_entries_by_dow,
              x="dow", y="entries", color='when', barmode="group", height=360)
fig2.update_layout(
    paper_bgcolor="LightSteelBlue",
    showlegend=True,
    plot_bgcolor="white",
    xaxis_title="Date",
    yaxis_title="Entries",
    legend_title=None,
    xaxis={
        'tickmode': 'linear',
        'tick0': 0,
        'dtick': 1,
        'title': None,
        'ticks': 'inside',
        'showgrid': True,            # thin lines in the background
        'zeroline': False,           # thick line at x=0
        'visible': True,             # numbers below
        'showline': True,            # Show X-Axis
        'linecolor': 'black',        # Color of X-axis
        'tickfont_color': 'black',   # Color of ticks
        'showticklabels': True,      # Show X labels
        'mirror': True,              # draw right axis
    },
    yaxis={
        'ticks': 'inside',
        'showgrid': True,            # thin lines in the background
        'zeroline': False,           # thick line at x=0
        'visible': True,             # numbers below
        'showline': True,            # Show X-Axis
        'linecolor': 'black',        # Color of X-axis
        'tickfont_color': 'black',   # Color of ticks
        'showticklabels': True,      # Show X labels
        'side': 'left',
        'mirror': True,
    },
 )


In [75]:

def entries_by_tod(con, filters, verbose=False):
    """return dataframe of all entries by time of day, subject to filters"""

    query = """
        with sh as
            (select
                date,
                hour,
                sum(entries) as entries,
            from
                station_hourly
            where TRUE
                {% if dow %} and dow in {{ dow | inclause }}  {% endif %}
                {% if tod %} and hour in {{ tod | inclause }} {% endif %}
                {% if cbd %} and cbd in {{ cbd | inclause }} {% endif %}
            group by date, hour),
        sh1 as
            (select
                hour,
                sum(entries) as entries,
                count(*) as n,
                sum(entries)/n as entries_per_day
            from
                sh
            where TRUE
                {% if enddate %} and date < {{enddate}} {% endif %}
                {% if startdate %} and date >= {{startdate}} {% endif %}
            group by
                hour),
        sh2019 as
            (select
                hour,
                sum(entries) as entries_2019,
                count(*) as n_2019,
            from
                sh
            where date_part('year', DATE)=2019
            group by
                hour),
        sh_pandemic as
            (select
                hour,
                sum(entries) as entries_pandemic,
                count(*) as n_pandemic,
            from
                sh
            where date>='2020-04-01' and date <'2021-04-01'
            group by
                hour)
        select
            sh1.hour,
            sh1.entries_per_day,
            sh_pandemic.entries_pandemic/sh_pandemic.n_pandemic as avg_pandemic,
            sh2019.entries_2019/sh2019.n_2019 as avg_2019
        from
            sh1 join sh2019 on sh1.hour = sh2019.hour
            join sh_pandemic on sh1.hour=sh_pandemic.hour
    """

    return get_sql_from_template(con, query, filters, verbose)

df_entries_by_tod = entries_by_tod(con, filters)
df_entries_by_tod.columns=['tod', 'selection', 'pandemic', '2019']
df_entries_by_tod = pd.melt(df_entries_by_tod,
                            id_vars='tod', var_name='when', value_name='entries')
df_entries_by_tod


Unnamed: 0,tod,when,entries
0,4,selection,59203
1,8,selection,445115
2,12,selection,439064
3,16,selection,579082
4,20,selection,516940
5,24,selection,140324
6,4,pandemic,38278
7,8,pandemic,267603
8,12,pandemic,250083
9,16,pandemic,345707


In [77]:
fig = px.bar(df_entries_by_tod,
              x="tod", y="entries", color='when', barmode="group", height=360)
fig.update_layout(
    xaxis = dict(
        tickmode= 'linear',
        tick0=0,
        dtick=4,
        title='Time of day'
    )
)

fig.show()

In [19]:
def entries_by_station(con, filters, verbose=False):
    """
    query from hourly subject to filters, then sum by station
    include comparison to 2019 and pandemic (also subject to filters)
    """

    query = """
    with sd as
        (SELECT
        pretty_name,
        latitude,
        longitude,
        sum(entries) as entries
        FROM station_hourly
        where
            TRUE
            {% if startdate %} and date >= {{startdate}} {% endif %}
            {% if enddate %} and date < {{enddate}} {% endif %}
            {% if dow %} and dow in {{ dow | inclause }}  {% endif %}
            {% if tod %} and hour in {{ tod | inclause }} {% endif %}
            {% if cbd %} and cbd in {{ cbd | inclause }} {% endif %}
        GROUP BY
        pretty_name,
        latitude,
        longitude
        ORDER BY
        pretty_name
        )
    select
    sd.pretty_name,
    latitude,
    longitude,
    sd.entries,
    sd.entries::float/vs2019.entries_2019-1 as pct_v_2019,
    sd.entries::float/vspandemic.entries_pandemic-1 as pct_v_pandemic,
    vs2019.entries_2019,
    vspandemic.entries_pandemic
    FROM
    sd
    LEFT OUTER JOIN (
        SELECT pretty_name, sum(entries) entries_2019
            FROM station_hourly
            WHERE date_part('year', DATE)=2019
            {% if dow %} and dow in {{ dow | inclause }}  {% endif %}
            {% if tod %} and hour in {{ tod | inclause }} {% endif %}
            {% if cbd %} and cbd in {{ cbd | inclause }} {% endif %}
            GROUP BY pretty_name
            ORDER BY pretty_name
    ) vs2019 on vs2019.pretty_name=sd.pretty_name
    LEFT OUTER JOIN (
        SELECT pretty_name, sum(entries) entries_pandemic
            FROM station_hourly
            WHERE
            date >= '2020-03-01'
            and date < '2021-03-01'
            {% if dow %} and dow in {{ dow | inclause }}  {% endif %}
            {% if tod %} and hour in {{ tod | inclause }} {% endif %}
            {% if cbd %} and cbd in {{ cbd | inclause }} {% endif %}
            GROUP BY pretty_name
            ORDER BY pretty_name
    ) vspandemic on vspandemic.pretty_name=sd.pretty_name
    """

    return get_sql_from_template(con, query, filters, verbose)


df_entries_by_station = entries_by_station(con, filters, verbose=True)
df_entries_by_station




    with sd as
        (SELECT
        pretty_name,
        latitude,
        longitude,
        sum(entries) as entries
        FROM station_hourly
        where
            TRUE
             and date >= '2022-01-01' 
             and date < '2023-01-01' 
             and dow in ('Monday','Tuesday','Wednesday','Thursday','Friday')  
             and hour in (4,8,12,16,20,24) 
             and cbd in ('Y','N') 
        GROUP BY
        pretty_name,
        latitude,
        longitude
        ORDER BY
        pretty_name
        )
    select
    sd.pretty_name,
    latitude,
    longitude,
    sd.entries,
    sd.entries::float/vs2019.entries_2019-1 as pct_v_2019,
    sd.entries::float/vspandemic.entries_pandemic-1 as pct_v_pandemic,
    vs2019.entries_2019,
    vspandemic.entries_pandemic
    FROM
    sd
    LEFT OUTER JOIN (
        SELECT pretty_name, sum(entries) entries_2019
            FROM station_hourly
            WHERE date_part('year', DATE)=2019
             and dow in ('Mon

Unnamed: 0,pretty_name,Latitude,Longitude,entries,pct_v_2019,pct_v_pandemic,entries_2019,entries_pandemic
0,1 Av-L (M),40.730953,-73.981628,2032429,-0.560891,0.583979,4628531,1283116
1,103 St-1 (M),40.799446,-73.968379,1362191,-0.558222,0.822868,3083432,747279
2,103 St-6 (M),40.790600,-73.947478,1541481,-0.541349,0.378569,3360899,1118175
3,103 St-B C (M),40.796092,-73.961454,489120,-0.598364,0.585966,1217820,308405
4,103 St-Corona Plaza-7 (Q),40.749865,-73.862700,3017965,-0.414081,0.408958,5150822,2141983
...,...,...,...,...,...,...,...,...
438,181 St-A (M),40.851695,-73.937969,957119,-0.648296,0.365799,2721378,700776
439,215 St-1 (M),40.869444,-73.915279,263394,-0.432180,1.035754,463869,129384
440,75 Av-E F (Q),40.718331,-73.837324,325092,-0.611140,0.627413,836013,199760
441,Astoria-Ditmars Blvd-N W (Q),40.775036,-73.912034,1667190,-0.617347,0.527052,4356925,1091770


In [20]:
subquery_2019 = """
SELECT pretty_name, avg(entries) entries_2019
    FROM station_hourly
    WHERE date_part('year', DATE)=2019
    {% if dow %} and dow in {{ dow | inclause }}  {% endif %}
    {% if tod %} and hour in {{ tod | inclause }} {% endif %}
    {% if cbd %} and cbd in {{ cbd | inclause }} {% endif %}
    GROUP BY pretty_name
    ORDER BY pretty_name
""" 
get_sql_from_template(con, subquery_2019, filters, verbose=True)



SELECT pretty_name, avg(entries) entries_2019
    FROM station_hourly
    WHERE date_part('year', DATE)=2019
     and dow in ('Monday','Tuesday','Wednesday','Thursday','Friday')  
     and hour in (4,8,12,16,20,24) 
     and cbd in ('Y','N') 
    GROUP BY pretty_name
    ORDER BY pretty_name


Unnamed: 0,pretty_name,entries_2019
0,1 Av-L (M),2963.208067
1,103 St-1 (M),1972.765195
2,103 St-6 (M),2155.804362
3,103 St-B C (M),779.654289
4,103 St-Corona Plaza-7 (Q),3308.170841
...,...,...
438,Woodlawn-4 (Bx),1120.619231
439,Woodside-61 St-7 (Q),2529.391726
440,World Trade Center-E (M),3098.229193
441,York St-F (Bk),1683.748399


In [21]:
subquery_pandemic = """
SELECT pretty_name, avg(entries) entries_pandemic
    FROM station_hourly
    WHERE 
    date >= '2020-03-01'
    and date < '2021-03-01'
    {% if dow %} and dow in {{ dow | inclause }}  {% endif %}
    {% if tod %} and hour in {{ tod | inclause }} {% endif %}
    {% if cbd %} and cbd in {{ cbd | inclause }} {% endif %}
    GROUP BY pretty_name
    ORDER BY pretty_name
"""
get_sql_from_template(con, subquery_pandemic, filters, verbose=True)



SELECT pretty_name, avg(entries) entries_pandemic
    FROM station_hourly
    WHERE 
    date >= '2020-03-01'
    and date < '2021-03-01'
     and dow in ('Monday','Tuesday','Wednesday','Thursday','Friday')  
     and hour in (4,8,12,16,20,24) 
     and cbd in ('Y','N') 
    GROUP BY pretty_name
    ORDER BY pretty_name


Unnamed: 0,pretty_name,entries_pandemic
0,1 Av-L (M),827.816774
1,103 St-1 (M),479.639923
2,103 St-6 (M),716.778846
3,103 St-B C (M),198.586607
4,103 St-Corona Plaza-7 (Q),1373.946761
...,...,...
438,Woodlawn-4 (Bx),407.541774
439,Woodside-61 St-7 (Q),845.137909
440,World Trade Center-E (M),426.815496
441,York St-F (Bk),371.523748


In [22]:
query = """
SELECT
  pretty_name, 
  latitude, 
  longitude, 
  avg(entries)
FROM station_hourly
where 
    TRUE
    {% if startdate %} and date >= {{startdate}} {% endif %}
    {% if enddate %} and date < {{enddate}} {% endif %}    
    {% if dow %} and dow in {{ dow | inclause }}  {% endif %}
    {% if tod %} and hour in {{ tod | inclause }} {% endif %}
    {% if cbd %} and cbd in {{ cbd | inclause }} {% endif %}
GROUP BY
  pretty_name,
  latitude, 
  longitude
ORDER BY
  pretty_name
"""
get_sql_from_template(con, query, filters, verbose=True)



SELECT
  pretty_name, 
  latitude, 
  longitude, 
  avg(entries)
FROM station_hourly
where 
    TRUE
     and date >= '2022-01-01' 
     and date < '2023-01-01'     
     and dow in ('Monday','Tuesday','Wednesday','Thursday','Friday')  
     and hour in (4,8,12,16,20,24) 
     and cbd in ('Y','N') 
GROUP BY
  pretty_name,
  latitude, 
  longitude
ORDER BY
  pretty_name


Unnamed: 0,pretty_name,Latitude,Longitude,avg(entries)
0,1 Av-L (M),40.730953,-73.981628,1307.028296
1,103 St-1 (M),40.799446,-73.968379,877.135222
2,103 St-6 (M),40.790600,-73.947478,990.032755
3,103 St-B C (M),40.796092,-73.961454,315.154639
4,103 St-Corona Plaza-7 (Q),40.749865,-73.862700,1939.566195
...,...,...,...,...
438,Woodlawn-4 (Bx),40.886037,-73.878751,548.097561
439,Woodside-61 St-7 (Q),40.745630,-73.902984,1375.981900
440,World Trade Center-E (M),40.712582,-74.009781,1013.944264
441,York St-F (Bk),40.701397,-73.986751,690.234576


In [23]:
query = """
with sd as 
(SELECT
  pretty_name, 
  latitude, 
  longitude, 
  avg(entries) as entries
FROM station_hourly
where 
    TRUE
    {%% if startdate %%} and date >= {{startdate}} {%% endif %%}
    {%% if enddate %%} and date < {{enddate}} {%% endif %%}    
    {%% if dow %%} and dow in {{ dow | inclause }}  {%% endif %%}
    {%% if tod %%} and hour in {{ tod | inclause }} {%% endif %%}
    {%% if cbd %%} and cbd in {{ cbd | inclause }} {%% endif %%}
GROUP BY
  pretty_name, 
  latitude, 
  longitude
ORDER BY
  pretty_name

)
select 
  sd.pretty_name,
  latitude,
  longitude,
  entries,
  entries/entries_2019-1 as pct_v_2019,
  entries/entries_pandemic-1 as pct_v_pandemic
FROM
  sd
LEFT OUTER JOIN (
    %s
) vs2019 on vs2019.pretty_name=sd.pretty_name
LEFT OUTER JOIN (
    %s
) vspandemic on vspandemic.pretty_name=sd.pretty_name
""" % (subquery_2019, subquery_pandemic)


df = get_sql_from_template(con, query, filters, verbose=True)
df


with sd as 
(SELECT
  pretty_name, 
  latitude, 
  longitude, 
  avg(entries) as entries
FROM station_hourly
where 
    TRUE
     and date >= '2022-01-01' 
     and date < '2023-01-01'     
     and dow in ('Monday','Tuesday','Wednesday','Thursday','Friday')  
     and hour in (4,8,12,16,20,24) 
     and cbd in ('Y','N') 
GROUP BY
  pretty_name, 
  latitude, 
  longitude
ORDER BY
  pretty_name

)
select 
  sd.pretty_name,
  latitude,
  longitude,
  entries,
  entries/entries_2019-1 as pct_v_2019,
  entries/entries_pandemic-1 as pct_v_pandemic
FROM
  sd
LEFT OUTER JOIN (
    
SELECT pretty_name, avg(entries) entries_2019
    FROM station_hourly
    WHERE date_part('year', DATE)=2019
     and dow in ('Monday','Tuesday','Wednesday','Thursday','Friday')  
     and hour in (4,8,12,16,20,24) 
     and cbd in ('Y','N') 
    GROUP BY pretty_name
    ORDER BY pretty_name

) vs2019 on vs2019.pretty_name=sd.pretty_name
LEFT OUTER JOIN (
    
SELECT pretty_name, avg(entries) entries_pandemic
    

Unnamed: 0,pretty_name,Latitude,Longitude,entries,pct_v_2019,pct_v_pandemic
0,1 Av-L (M),40.730953,-73.981628,1307.028296,-0.558914,0.578886
1,103 St-1 (M),40.799446,-73.968379,877.135222,-0.555378,0.828737
2,103 St-6 (M),40.790600,-73.947478,990.032755,-0.540759,0.381225
3,103 St-B C (M),40.796092,-73.961454,315.154639,-0.595776,0.586988
4,103 St-Corona Plaza-7 (Q),40.749865,-73.862700,1939.566195,-0.413704,0.411675
...,...,...,...,...,...,...
438,181 St-A (M),40.851695,-73.937969,615.510611,-0.646487,0.366677
439,215 St-1 (M),40.869444,-73.915279,171.592182,-0.421823,0.990662
440,75 Av-E F (Q),40.718331,-73.837324,209.062379,-0.609890,0.631599
441,Astoria-Ditmars Blvd-N W (Q),40.775036,-73.912034,1071.458869,-0.618577,0.529015


In [24]:
df['Avg Daily Entries'] = df['entries'].apply(lambda f: "%.1fk" % (f/1000))
df['%Ch vs. 2019'] = df['pct_v_2019'].apply(lambda f: "%.1f%%" % (f * 100))



In [25]:
df

Unnamed: 0,pretty_name,Latitude,Longitude,entries,pct_v_2019,pct_v_pandemic,Avg Daily Entries,%Ch vs. 2019
0,1 Av-L (M),40.730953,-73.981628,1307.028296,-0.558914,0.578886,1.3k,-55.9%
1,103 St-1 (M),40.799446,-73.968379,877.135222,-0.555378,0.828737,0.9k,-55.5%
2,103 St-6 (M),40.790600,-73.947478,990.032755,-0.540759,0.381225,1.0k,-54.1%
3,103 St-B C (M),40.796092,-73.961454,315.154639,-0.595776,0.586988,0.3k,-59.6%
4,103 St-Corona Plaza-7 (Q),40.749865,-73.862700,1939.566195,-0.413704,0.411675,1.9k,-41.4%
...,...,...,...,...,...,...,...,...
438,181 St-A (M),40.851695,-73.937969,615.510611,-0.646487,0.366677,0.6k,-64.6%
439,215 St-1 (M),40.869444,-73.915279,171.592182,-0.421823,0.990662,0.2k,-42.2%
440,75 Av-E F (Q),40.718331,-73.837324,209.062379,-0.609890,0.631599,0.2k,-61.0%
441,Astoria-Ditmars Blvd-N W (Q),40.775036,-73.912034,1071.458869,-0.618577,0.529015,1.1k,-61.9%


In [26]:
fig = px.scatter_mapbox(df, 
                        lat="Latitude", lon="Longitude", 
                        hover_name="pretty_name", hover_data={"Avg Daily Entries": True, "%Ch vs. 2019": True, "entries": False, "Latitude": False, "Longitude": False, "pct_v_2019": False, "pct_v_pandemic": False}, 
                        size="entries", size_max=20,
                        color_continuous_scale=px.colors.sequential.Jet, color="pct_v_2019",
                        zoom=10, height=480, width=880)
fig.update_layout(mapbox_style="carto-darkmatter", mapbox_accesstoken=mapbox_token)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0}, showlegend=False)
fig.show()


In [27]:
fig.write_html("/Users/drucev/projects/druce.github.io/_includes/MtA/MTA_map.html")