## Prepare Dataset for Clustering Stations

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import shutil
import sys
import warnings
from glob import glob
from io import BytesIO
from itertools import product
from typing import Dict, List, Union
from zipfile import ZipFile

import duckdb
import geopandas as gpd
import pandas as pd
import requests
from tqdm.contrib import concurrent as concurrent_tq
from contexttimer import Timer
from watermark import watermark

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
PROJ_ROOT = os.path.join(os.pardir)
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path.append(src_dir)

In [4]:
%aimport file_utils
import file_utils as flut

%aimport pandas_utils
import pandas_utils as pu

## About

### Objective

The processed bike share ridership and other datasets are now combined in order in order to prepare a combined station summary dataset that can be used to cluster bike share stations.

### Implementation Overview

This station summary dataset is prepared as follows

1. Periodic ridership data, such as yearly or quarterly ridership, is aggregated per station. This provides user behavioural data.
2. The distance between points of interest station attributes. This gives station attributes.

In order to segment (cluster) bike share stations as effectively as possibly, both station metadata and user ridership trends are extracted. After segments have been extracted, both attributes and behavioural data can be used to create as rich a profile of each cluster as possible.

### Data

The following previously-created datasets are used in this aggregation step

1. list of downtown neighbourhoods
2. station info (includes geodata)
3. public library branch locations
4. cultural hotspots
5. places of interest
6. cycle path entrances
7. public transit (bus) stops
8. public transit (train) stations
9. processed bike share ridership

### Assumptions

1. Same as in data retrieval and processing steps.

### Outputs

1. (1 file) Performance for all currently active stations, with a filename of the format `station_attributes_behavioural_data__YYYYmmdd_HHMMSS.parquet.gzip`.

## User Inputs

In [5]:
# ridership
years_proc_trips = {
    2018: [f'Q{k}' for k in range(1, 4+1)],
    2019: [f'Q{k}' for k in range(1, 4+1)],
    2020: [f'{str(k).zfill(2)}' for k in range(1, 12+1)],
    2021: [f'{str(k).zfill(2)}' for k in range(1, 12+1)],
    2022: [f'{str(k).zfill(2)}' for k in range(1, 12+1)],
    2023: [f'{str(k).zfill(2)}' for k in range(1, 3+1)],
}

# top-performing stations
last_full_year = 2022
# # selection threshold to get top-performers
top_perform_frac = 0.16

# Toronto open datasets
library_cols = [
    '_id',
    'BranchName',
    'Address',
    'Lat',
    'Long',
    'PhysicalBranch',
    'WardName',
]
cycle_network_cols = [
    '_id', 'OBJECTID', 'FROM_STREET', 'TO_STREET', 'STREET_NAME', 'geometry'
]

# geodata
crs = 4326
epsg = 4536  # gives distance in metres

sid_cols = [
    'station_id',
    'name',
    'physical_configuration',
    'lat',
    'lon',
    'is_charging_station',
    'capacity',
]

max_rank = 5
max_rank_train = 4

# export
my_timezone = 'US/Eastern'

In [6]:
data_dir = os.path.join(PROJ_ROOT, 'data')
raw_data_dir = os.path.join(data_dir, 'raw', 'systems', 'toronto')
processed_data_dir = os.path.join(data_dir, 'processed')

# processed trips
fpaths_proc = {
    y: [
        f
        for p in periods
        for f in sorted(
            glob(
                os.path.join(
                    processed_data_dir,
                    f'processed__trips_{y}_{p}*.parquet.gzip',
                )
            )
        )
    ]
    for y, periods in years_proc_trips.items()
}
fpaths_proc_all = [f for _, v in fpaths_proc.items() for f in v]
fpaths_proc_2018_2022 = [f for y in range(2018, 2022+1) for f in fpaths_proc[y]]

# station info for currently active stations
fpath_stations_info = glob(
    os.path.join(raw_data_dir, 'stations_info__*.parquet.gzip')
)[0]

# downtown neighbourhoods
fpath_downtown_neighs = glob(
    os.path.join(raw_data_dir, 'downtown_neighbourhoods__*.parquet.gzip')
)[0]

# public transit (train) lines
fpath_transit_lines = glob(
    os.path.join(
        raw_data_dir, 'ttc-subway-shapefile-wgs84', '*.shp'
    )
)[0]

# public transit (bus) stops
fpath_transit_stops = os.path.join(
    raw_data_dir, 'opendata_ttc_schedules', 'stops.txt'
)

# public library branch locations
fpath_libraries = glob(
    os.path.join(raw_data_dir, 'libraries__*parquet.gzip')
)[0]

# cultural hotspots
fpath_ch = glob(
    os.path.join(raw_data_dir, 'cultural_hotspots__*parquet.gzip')
)[0]

# places of interest
fpath_poi = glob(
    os.path.join(raw_data_dir, 'places_of_interest__*parquet.gzip')
)[0]

# cycling network
fpath_cycle = glob(
    os.path.join(raw_data_dir, 'cycle_paths__*parquet.gzip')
)[0]

In [7]:
def run_sql_query(query: str, verbose: bool=False) -> pd.DataFrame:
    """Run SQL query using DuckDB."""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        df_query = duckdb.sql(query).df()
    if verbose:
        print(f"Query returned {len(df_query):,} rows")
    return df_query


def extract_coords_from_geometry(
    gdf: gpd.GeoDataFrame, lat_col_name: str, lon_col_name: str
) -> gpd.GeoDataFrame:
    """."""
    gdf[lat_col_name] = gdf['geometry'].explode(index_parts=False).x
    gdf[lon_col_name] = gdf['geometry'].explode(index_parts=False).y
    return gdf


def run_parallel(
    inputs_product: product,
    fn,
    chunk_size: int=100,
) -> pd.DataFrame:
    """Run function against multiple inputs in parallel."""
    iterables = list(inputs_product)
    outputs = list(
        concurrent_tq.process_map(
            fn,
            *zip(*iterables),
            max_workers=12,
            chunksize=chunk_size,
        )
    )
    return outputs

In [8]:
def get_top_n_distances(
    df_stations: pd.DataFrame,
    df_points: Union[pd.DataFrame, gpd.GeoDataFrame],
    sid: str,
    points_lat_col: str,
    points_lon_col: str,
    points_name_col: str,
    distance_type: str,
    epsg: int,
    sid_cols: List[str],
    max_rank: int=5,
    reassign_target_geo: bool=True,
) -> pd.DataFrame:
    """."""
    df_stations_single = df_stations.query(f"station_id.isin(['{sid}'])")
    gdf_stations = gpd.GeoDataFrame(
        df_stations_single,
        geometry=gpd.points_from_xy(
            df_stations_single['lon'], df_stations_single['lat']
        ),
        crs=crs
    )
    if reassign_target_geo:
        gdf_points = (
            gpd.GeoDataFrame(
                df_points,
                geometry=gpd.points_from_xy(
                    df_points[points_lon_col], df_points[points_lat_col]
                ),
                crs=crs,
            )
            .to_crs(epsg=epsg)
        )
    else:
        gdf_points = df_points
    df_station_info_new = (
        gpd.sjoin_nearest(
            gdf_stations.to_crs(epsg=epsg),
            gdf_points,
            distance_col='distance',
            how="right",
        )
        .sort_values(by=sid_cols+['distance'])
    )
    # assign ranks
    df_station_info_new = (
        df_station_info_new
        .assign(
            rank=lambda df: (
                df
                .groupby(sid_cols)
                ['distance']
                .rank(method='dense')
            )
        )
    )
    # drop duplicates (tied ranks)
    df_station_info_new = (
        df_station_info_new
        .drop_duplicates(subset=sid_cols+['distance', 'rank'])
    )
    # filter to select closest
    max_comps = min(len(df_points), max_rank)
    df_station_info_new = (
        df_station_info_new
        .query(f"rank <= {max_comps}")
        .drop_duplicates(subset=sid_cols+[points_name_col]+['distance', 'rank'])
        .reset_index(drop=True)
        # reorder columns
        [sid_cols + [points_name_col]+ ['distance']]
    )
    return df_station_info_new


def reshape_untidy(
    df: pd.DataFrame,
    points_name_col: str,
    distance_type: str,
    max_rank: int=5,
    verbose: bool=False,
) -> pd.DataFrame:
    """."""
    new_col_names = [f"distance_{distance_type}_{k}" for k in range(1, max_rank+1)]

    #
    df_grid = pd.concat(
        [
            (
                pd.DataFrame([df.head(1).squeeze()])
                .drop(columns=[points_name_col, 'distance'])
            )
            for _ in range(1, max_rank+1)
        ],
        ignore_index=True,
    )
    df_res = (
        df_grid
        .merge(
            df.reset_index()[[points_name_col, 'distance']],
            how='left',
            left_index=True,
            right_index=True,
        )
        .assign(c=lambda df: df[points_name_col].fillna(99).astype(int))
        .drop(columns=[points_name_col])
        .rename(columns={"c": points_name_col})
    )

    try:
        assert len(df_res) == max_rank
        if verbose:
            print(f"Found {max_rank} closest locations")
    except AssertionError as e:
        print(
            f"Found {len(df_res)} less than {max_rank} closest locations"
        )
    return df_res


def select_top_n_closest_untidy(
    df: pd.DataFrame,
    points_name_col: str='_id',
    distance_type: str='ch',
    max_comps:int=5,
) -> pd.DataFrame:
    """."""
    df = (
        df
        .groupby(sid_cols)
        .apply(
            reshape_untidy,
            points_name_col=points_name_col,
            distance_type=distance_type,
            max_rank=max_comps,
            include_groups=False,
        )
        .reset_index()
    )
    new_col_names = [
        f"distance_{distance_type}_{k}" for k in range(1, max_comps+1)
    ]
    df = (
        df
        .assign(c=lambda df: (df['level_7'].fillna(98)+1).astype(int))
        .drop(columns={points_name_col})
        .rename(columns={'c': points_name_col})
        .pivot(index=sid_cols, columns=points_name_col, values='distance')
        .dropna(axis=1, how='all')
        .set_axis(new_col_names, axis='columns')
        .reset_index()
    )
    return df

## Extract

### Downtown Neighbourhoods

Show previously retrieved neighbourhoods within downtown Toronto

In [9]:
df_downtown_neighs = pd.read_parquet(fpath_downtown_neighs)
df_downtown_neighs

Unnamed: 0,Neighbourhood,Location,is_downtown
0,University,Downtown,True
1,Kensington-Chinatown,Downtown,True
2,Wellington Place,Downtown,True
3,Harbourfront-CityPlace,Downtown,True
4,Bay-Cloverhill,Downtown,True
5,Yonge-Bay Corridor,Downtown,True
6,St Lawrence-East Bayfront-The Islands,Downtown,True
7,Church-Wellesley,Downtown,True
8,Downtown Yonge East,Downtown,True
9,North St.James Town,Downtown,True


### Bike Share Station Info (MetaData)

Show the stations info data (metadata) that was retrieved previously, containing station name and its associated neighbourhood name among other non-unique valued columns

In [10]:
%%time
query = f"""
        SELECT station_id,
               name,
               physical_configuration,
               capacity,
               lat,
               lon,
               is_charging_station,
               rental_methods LIKE '%CREDITCARD%' AS credit,
               Neighbourhood,
               COALESCE(Location, NULL, 'Others') AS Location,
               COALESCE(is_downtown, NULL, False) AS is_downtown,
               census_tract_id
        FROM read_parquet({[fpath_stations_info]})
        LEFT JOIN df_downtown_neighs USING (Neighbourhood)
        -- WHERE physical_configuration <> 'VAULT'
        ORDER BY station_id, name
        """
df_stations = run_sql_query(query).convert_dtypes()
with pd.option_context('display.max_columns', None):
    display(df_stations)

Unnamed: 0,station_id,name,physical_configuration,capacity,lat,lon,is_charging_station,credit,Neighbourhood,Location,is_downtown,census_tract_id
0,7000,Fort York Blvd / Capreol Ct,REGULAR,35,43.639832,-79.395954,False,True,Harbourfront-CityPlace,Downtown,True,5350012.01
1,7001,Wellesley Station Green P,ELECTRICBIKESTATION,23,43.664964,-79.38355,True,True,Church-Wellesley,Downtown,True,5350063.06
2,7002,St. George St / Bloor St W,REGULAR,19,43.667333,-79.399429,False,True,University,Downtown,True,5350061.00
3,7003,Madison Ave / Bloor St W,REGULAR,15,43.667158,-79.402761,False,True,Annex,Others,False,5350091.01
4,7005,King St W / York St,REGULAR,23,43.648001,-79.383177,False,True,Yonge-Bay Corridor,Downtown,True,5350014.00
...,...,...,...,...,...,...,...,...,...,...,...,...
785,7926,McRae Dr / Laird Dr - SMART,SMARTMAPFRAME,24,43.709793,-79.363892,False,False,Leaside-Bennington,Others,False,5350195.02
786,7927,Strachan Ave / East Liberty St - SMART,SMARTMAPFRAME,24,43.639065,-79.41081,False,False,Fort York-Liberty Village,West of Downtown,False,5350008.01
787,7928,Simcoe St / Pullan Pl,REGULAR,31,43.651053,-79.387649,False,True,Kensington-Chinatown,Downtown,True,5350036.00
788,7929,Spadina Ave / Bulwer St- SMART,SMARTMAPFRAME,12,43.649354,-79.396757,False,False,Kensington-Chinatown,Downtown,True,5350039.00


CPU times: user 9.65 ms, sys: 7.32 ms, total: 17 ms
Wall time: 16 ms


### Public Transit Stops

Show previously retrieved public transit bus stop locations across the city

In [11]:
%%time
df_public_transit_stops = pd.read_csv(
    fpath_transit_stops,
    usecols=['stop_id', 'stop_code', 'stop_name', 'stop_lat', 'stop_lon'],
).convert_dtypes()
with pd.option_context('display.max_columns', None):
    pu.show_df(df_public_transit_stops)

column,stop_id,stop_code,stop_name,stop_lat,stop_lon
dtype,Int64,Int64,string[python],Float64,Float64
nunique,9418,9418,7609,9183,9287
missing,0,0,0,0,0
0,262,662,Danforth Rd at Kennedy Rd,43.714379,-79.260939
1,263,929,Davenport Rd at Bedford Rd,43.674448,-79.399659
2,264,940,Davenport Rd at Dupont St,43.675511,-79.401938
3,265,1871,Davisville Ave at Cleveland St,43.702088,-79.378112
4,266,11700,Disco Rd at Attwell Dr,43.701362,-79.594843
...,...,...,...,...,...
9413,24782,16482,Bloor St West at Acorn Ave,43.640547,-79.541534
9414,24783,16483,Pape Station,43.679781,-79.344912
9415,24784,16484,Beecroft Rd at Sheppard Ave West North Side,43.761367,-79.413167
9416,24785,16485,Don Mills Rd at Van Horne Ave South Side,43.787194,-79.353094


CPU times: user 20.7 ms, sys: 4.06 ms, total: 24.7 ms
Wall time: 24.7 ms


### Libraries

Show previously retrieved public library branch locations across the city

In [12]:
%%time
df_lib = (
    pd.read_parquet(
        fpath_libraries,
        columns=library_cols,
        filters=[('PhysicalBranch', '=', 1)]
    )
)
with pd.option_context('display.max_columns', None):
    pu.show_df(df_lib)

column,_id,BranchName,Address,Lat,Long,PhysicalBranch,WardName
dtype,Int64,string[python],string[python],Float64,Float64,Int64,string[python]
nunique,100,100,100,100,100,1,25
missing,0,0,0,0,0,0,0
0,1,Albion,"1515 Albion Road, Toronto, ON, M9V 1B2",43.739826,-79.584096,1,Etobicoke North
1,2,Albert Campbell,"496 Birchmount Road, Toronto, ON, M1K 1N8",43.708019,-79.269252,1,Scarborough Southwest
2,3,Alderwood,"2 Orianna Drive, Toronto, ON, M8W 4Y1",43.601944,-79.547252,1,Etobicoke-Lakeshore
3,4,Agincourt,"155 Bonis Avenue, Toronto, ON, M1T 3W6",43.785167,-79.29343,1,Scarborough-Agincourt
4,5,Armour Heights,"2140 Avenue Road, Toronto, ON, M5M 4M7",43.739337,-79.421889,1,Eglinton-Lawrence
...,...,...,...,...,...,...,...
95,108,Woodview Park,"16 Bradstock Road, Toronto, ON, M9M 1M8",43.739722,-79.538941,1,Humber River-Black Creek
96,109,Woodside Square,"Woodside Square Mall, 1571 Sandhurst Circle, T...",43.809463,-79.269548,1,Scarborough North
97,110,Wychwood,"1431 Bathurst Street, Toronto, ON, M5R 3J2",43.682181,-79.417548,1,Toronto-St. Paul's
98,111,Yorkville,"22 Yorkville Avenue, Toronto, ON, M4W 1L4",43.671826,-79.388644,1,University-Rosedale


CPU times: user 16.9 ms, sys: 33 µs, total: 16.9 ms
Wall time: 14.7 ms


### Cultural Hotspots

Show previously retrieved public cultural hotspot locations across the city

In [13]:
%%time
gdf_ch = (
    gpd.read_parquet(fpath_ch)
    .pipe(extract_coords_from_geometry, 'ch_lat', 'ch_lon')
)
print(gdf_ch.crs.to_epsg())
with pd.option_context('display.max_columns', None):
    pu.show_df(
        gdf_ch[
            [
                '_id',
                'SiteName',
                'LoopTourName',
                'Interests',
                'ch_lat',
                'ch_lon',
                'geometry',
            ]
        ].head(2)
    )

4326


column,_id,SiteName,LoopTourName,Interests,ch_lat,ch_lon,geometry
dtype,Int64,string[python],string[python],string[python],float64,float64,geometry
nunique,2,2,1,2,2,2,2
missing,0,0,0,0,0,0,0
0,1,Up To This Moment (2019),The Future in Mount Dennis,Art,-79.48458,43.68838,MULTIPOINT ((-79.48458 43.68838))
1,2,Untitled (2019),The Future in Mount Dennis,"Art, History",-79.48487,43.68835,MULTIPOINT ((-79.48487 43.68835))


CPU times: user 37.7 ms, sys: 5.2 ms, total: 42.9 ms
Wall time: 35.9 ms


### Places of Interest

Show previously retrieved places of interest across the city

In [14]:
%%time
gdf_poi = (
    gpd.read_parquet(fpath_poi)
    .pipe(extract_coords_from_geometry, 'poi_lat', 'poi_lon')
)
print(gdf_poi.crs.to_epsg())
with pd.option_context('display.max_columns', None):
    pu.show_df(
        gdf_poi[
            [
                '_id',
                'NAME',
                'ADDRESS_FULL',
                'CATEGORY',
                'poi_lat',
                'poi_lon',
                'geometry',
            ]
        ].head(2)
    )

4326


column,_id,NAME,ADDRESS_FULL,CATEGORY,poi_lat,poi_lon,geometry
dtype,Int64,string[python],string[python],string[python],float64,float64,geometry
nunique,2,2,2,2,2,2,2
missing,0,0,0,0,0,0,0
0,1,BMO Field,170 Princes' Blvd,Sports / Entertainment Venue,-79.418416,43.634663,MULTIPOINT ((-79.41842 43.63466))
1,2,Aga Khan Museum,77 Wynford Dr,Museum,-79.331804,43.727331,MULTIPOINT ((-79.33180 43.72733))


CPU times: user 27.5 ms, sys: 3.5 ms, total: 31 ms
Wall time: 27.6 ms


### Cycle Paths

Show previously retrieved cycle path entrances that are part of the cycling network across the city

In [15]:
%%time
gdf_cycle = gpd.read_parquet(fpath_cycle, columns=cycle_network_cols)
print(gdf_cycle.crs.to_epsg())
with pd.option_context('display.max_columns', None):
    pu.show_df(gdf_cycle)

4326


column,_id,OBJECTID,FROM_STREET,TO_STREET,STREET_NAME,geometry
dtype,Int64,Int64,string[python],string[python],string[python],geometry
nunique,1445,1445,894,880,673,1445
missing,0,0,0,0,0,0
0,1,1,Highbourne Rd,Oriole Pkwy,Kilbarry Rd,"MULTILINESTRING ((-79.40351 43.69526, -79.4030..."
1,2,2,Lake Shore Blvd W,Martin Goodman Trl Branch,Martin Goodman / Waterfront Trl,"MULTILINESTRING ((-79.40364 43.63499, -79.4035..."
2,3,3,Givendale Rd,Kennedy Rd,Gatineau Hydro Corridor Trl,"MULTILINESTRING ((-79.27522 43.74158, -79.2751..."
3,4,4,Dufferin St,G Ross Lord Park Trl Branch,G Ross Lord Park Trl,"MULTILINESTRING ((-79.46772 43.77208, -79.4675..."
4,5,5,Etobicoke Creek Trl Branch,Forty Third St,Etobicoke Creek Trl,"MULTILINESTRING ((-79.54812 43.59027, -79.5480..."
...,...,...,...,...,...,...
1440,1441,1441,Princes' Blvd,Princes' Blvd,Ontario Dr,"MULTILINESTRING ((-79.41952 43.63201, -79.4198..."
1441,1442,1442,Eastbourne Cres,Lake Shore Blvd W,Royal York Rd,"MULTILINESTRING ((-79.49350 43.60451, -79.4934..."
1442,1443,1443,Hamstead Ave,Lumsden Ave,Main St,"MULTILINESTRING ((-79.30584 43.69705, -79.3057..."
1443,1444,1444,Bay St,Yonge St,Temperance St,"MULTILINESTRING ((-79.38107 43.65057, -79.3803..."


CPU times: user 37 ms, sys: 3.68 ms, total: 40.7 ms
Wall time: 39.2 ms


### Public Transit (Train) Routes in Toronto

Show previously retrieved public transit train station locations across the city

In [16]:
%%time
gdf_train = gpd.read_file(fpath_transit_lines)
print(gdf_train.crs)
with pd.option_context('display.max_columns', None):
    pu.show_df(gdf_train)

EPSG:4326


ERROR 1: PROJ: proj_create_from_database: Open of /opt/conda/envs/get-data/share/proj failed


column,OBJECTID,ROUTE_NAME,RID,geometry
dtype,float64,object,int64,geometry
nunique,4,4,4,4
missing,0,0,0,0
0,53420.0,LINE 1 (YONGE-UNIVERSITY),1,"LINESTRING (-79.52813 43.79677, -79.52689 43.7..."
1,53421.0,LINE 2 (BLOOR - DANFORTH),2,"LINESTRING (-79.53540 43.63781, -79.53386 43.6..."
2,53422.0,LINE 3 (SCARBOROUGH),3,"LINESTRING (-79.26332 43.73266, -79.26332 43.7..."
3,53423.0,LINE 4 (SHEPPARD),4,"LINESTRING (-79.41113 43.76145, -79.40981 43.7..."


CPU times: user 77.4 ms, sys: 3.8 ms, total: 81.2 ms
Wall time: 80.8 ms


As shown below, the following datasets from above can also be retrieved from raw online soures

1. Bike Share Station Info (MetaData)
2. Public Transit Stops
3. Libraries
4. Cultural Hotspots
5. Places of Interest
6. Cycle Paths
7. Public Transit (Train) Routes in Toronto

```python
# User Inputs
base_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca"
params_bike = {"id": "bike-share-toronto"}
params_pub_trans_stops = {"id": "ttc-routes-and-schedules"}
params_library = {'id': "library-branch-general-information"}
params_poi = {"id": "places-of-interest-and-toronto-attractions"}
params_ch = {"id": "cultural-hotspot-points-of-interest"}
params_cycle = {"id": "cycling-network"}

# 1. Bike Share Station Info (MetaData)
df_stations_geo = get_open_data_package_resources(base_url, params_bike)
df_stations_geo = df_stations_geo.query(
    "(name == 'bike-share-json') & (format == 'JSON')"
)
url_file = df_stations_geo['url'].squeeze()
r = requests.get(url_file)
url_station_info_endpoint = r.json()['data']['en']['feeds'][2]['url']
r = requests.get(url_station_info_endpoint)
df_stations = pd.DataFrame.from_records(r.json()['data']['stations'])

# 2. Public Transit Stops
df_stops_geo = get_open_data_package_resources(base_url, params_pub_trans_stops)
url_file = df_stops_geo['url'].squeeze()
zip_filepath = download_zip_file(raw_data_dir, url_file)
df_public_transit_stops = pd.read_csv(
    fpath_transit_stops,
    usecols=['stop_id', 'stop_code', 'stop_name', 'stop_lat', 'stop_lon'],
).convert_dtypes()

# 3. Libraries
df_libraries_geo = get_open_data_package_resources(base_url, params_library)
df_libraries_geo = df_libraries_geo.query(
    "(datastore_active == False) & (format == 'CSV')"
)
url_file = df_libraries_geo['url'].iloc[0]
df_lib = (
    pd.read_csv(url_file, usecols=library_cols).query("PhysicalBranch == 1")
)

# 4. Cultural Hotspots
df_ch_geo = get_open_data_package_resources(base_url, params_ch)
df_ch_geo = df_ch_geo.query(
    "(datastore_active == False) & (name.str.endswith('4326.geojson'))"
)
url_geo_file = df_ch_geo['url'].iloc[0]
gdf_ch = (
    gpd.read_parquet(url_geo_file)
    .pipe(extract_coords_from_geometry, 'ch_lat', 'ch_lon')
)
print(gdf_ch.crs)

# 5. Places of Interest
df_poi_geo = get_open_data_package_resources(base_url, params_poi)
df_poi_geo = df_poi_geo.query(
    "(datastore_active == False) & (name.str.endswith('4326.geojson'))"
)
url_geo_file = df_poi_geo['url'].iloc[0]
gdf_poi = (
    gpd.read_file(url_geo_file)
    .pipe(extract_coords_from_geometry, 'poi_lat', 'poi_lon')
)
print(gdf_poi.crs)

# 6. Cycle Paths
df_params_cycle = get_open_data_package_resources(base_url, params_cycle)
df_params_cycle = df_params_cycle.query(
    "(datastore_active == False) & (name.str.endswith('4326.geojson'))"
)
url_geo_file = df_params_cycle['url'].iloc[0]
gdf_cycle = gpd.read_file(url_geo_file)[cycle_network_cols]
print(gdf_cycle.crs)
```

For

1. Downtown Neighbourhoods
   - the dataset was previously generated and so only the processed data should be used
2. Public Transit (Train) Routes in Toronto
   - retrieval of the raw dataset version is not shown above

## Transform

### Get Distances Between Bike Share Stations and Points of Interest Across City

For each bike share station, get the distance to the five closest

1. public library branches
2. cultural hotspots
3. places of interest
4. bikeways (cycle paths)
5. public transit (train) stations
6. public transit (bus) stops

In [17]:
sids_wanted = df_stations['station_id'].tolist()

#### Libraries

Get the five closest distances between each bike share station and the public library branches across the city

In [18]:
%%time
with Timer() as t:
    dfs_station_info_new = run_parallel(
        product(
            [df_stations],
            [df_lib],
            df_stations['station_id'].tolist(),
            ['Lat'],
            ['Long'],
            ['_id'],
            ['library'],
            [epsg],
            [sid_cols],
            [max_rank],
        ),
        get_top_n_distances,
        chunk_size=12,
    )
print(f"Found top {max_rank} closest distances in {t.elapsed:.2f}s")
with Timer() as t:
    df_station_info_new = select_top_n_closest_untidy(
        pd.concat(dfs_station_info_new, ignore_index=True),
        points_name_col='_id',
        distance_type='library',
        max_comps=max_rank,
    )
print(f"Selected top {max_rank} closest per station in {t.elapsed:.2f}s")
with pd.option_context('display.max_columns', 1000):
    pu.show_df(df_station_info_new)

100%|██████████| 790/790 [00:04<00:00, 185.00it/s]


Found top 5 closest distances in 4.40s
Selected top 5 closest per station in 3.35s


column,station_id,name,physical_configuration,lat,lon,is_charging_station,capacity,distance_library_1,distance_library_2,distance_library_3,distance_library_4,distance_library_5
dtype,string[python],string[python],string[python],Float64,Float64,boolean,Int64,float64,float64,float64,float64,float64
nunique,790,790,6,787,789,2,40,790,790,790,790,790
missing,0,0,0,0,0,0,0,0,0,0,0,0
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,False,35,368.720182,1613.293131,1842.141138,2102.331938,2562.887483
1,7001,Wellesley Station Green P,ELECTRICBIKESTATION,43.664964,-79.38355,True,23,799.891075,820.496833,892.833627,1328.922487,1358.865436
2,7002,St. George St / Bloor St W,REGULAR,43.667333,-79.399429,False,19,411.969201,1033.893679,1061.009740,1167.688187,1242.615050
3,7003,Madison Ave / Bloor St W,REGULAR,43.667158,-79.402761,False,15,134.445765,968.130891,1099.051467,1289.785481,1429.808810
4,7005,King St W / York St,REGULAR,43.648001,-79.383177,False,23,618.419203,1242.038388,1714.984575,1739.128562,1916.225732
...,...,...,...,...,...,...,...,...,...,...,...,...
785,7926,McRae Dr / Laird Dr - SMART,SMARTMAPFRAME,43.709793,-79.363892,False,24,485.725715,1933.912449,2123.809222,2407.543019,2823.776774
786,7927,Strachan Ave / East Liberty St - SMART,SMARTMAPFRAME,43.639065,-79.41081,False,24,872.272977,1554.910022,1833.393787,1999.122039,2412.278626
787,7928,Simcoe St / Pullan Pl,REGULAR,43.651053,-79.387649,False,31,403.418534,1204.557648,1490.903091,1597.832589,1708.419683
788,7929,Spadina Ave / Bulwer St- SMART,SMARTMAPFRAME,43.649354,-79.396757,False,12,793.003871,1011.005220,1156.836636,1187.175583,2052.880666


CPU times: user 3.79 s, sys: 86.4 ms, total: 3.87 s
Wall time: 7.76 s


#### Cultural Hotspots

Get the five closest distances between each bike share station and the city's cultural hotspots, and merge with the above

In [19]:
%%time
with Timer() as t:
    dfs_station_info_new_ch = run_parallel(
        product(
            [df_stations[sid_cols]],
            [gdf_ch.drop(columns=['geometry'])],
            df_stations['station_id'].tolist(),
            ['ch_lat'],
            ['ch_lon'],
            ['_id'],
            ['ch'],
            [epsg],
            [sid_cols],
            [max_rank],
        ),
        get_top_n_distances,
        chunk_size=12,
    )
print(f"Found top {max_rank} closest distances in {t.elapsed:.2f}s")
with Timer() as t:
    df_station_info_new_ch = (
        select_top_n_closest_untidy(
            pd.concat(dfs_station_info_new_ch, ignore_index=True),
            points_name_col='_id',
            distance_type='ch',
            max_comps=max_rank,
        )
        .merge(df_station_info_new, on=sid_cols, how='left')
    )
print(f"Selected top {max_rank} closest per station in {t.elapsed:.2f}s")
with pd.option_context('display.max_columns', 1000):
    pu.show_df(df_station_info_new_ch)

100%|██████████| 790/790 [00:08<00:00, 90.91it/s] 


Found top 5 closest distances in 8.80s
Selected top 5 closest per station in 3.30s


column,station_id,name,physical_configuration,lat,lon,is_charging_station,capacity,distance_ch_1,distance_ch_2,distance_ch_3,distance_ch_4,distance_ch_5,distance_library_1,distance_library_2,distance_library_3,distance_library_4,distance_library_5
dtype,string[python],string[python],string[python],Float64,Float64,boolean,Int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
nunique,790,790,6,787,789,2,40,790,790,790,790,790,790,790,790,790,790
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,False,35,2.402484e+07,2.402629e+07,2.402723e+07,2.402835e+07,2.402905e+07,368.720182,1613.293131,1842.141138,2102.331938,2562.887483
1,7001,Wellesley Station Green P,ELECTRICBIKESTATION,43.664964,-79.38355,True,23,2.402181e+07,2.402326e+07,2.402421e+07,2.402532e+07,2.402603e+07,799.891075,820.496833,892.833627,1328.922487,1358.865436
2,7002,St. George St / Bloor St W,REGULAR,43.667333,-79.399429,False,19,2.402181e+07,2.402326e+07,2.402421e+07,2.402533e+07,2.402603e+07,411.969201,1033.893679,1061.009740,1167.688187,1242.615050
3,7003,Madison Ave / Bloor St W,REGULAR,43.667158,-79.402761,False,15,2.402189e+07,2.402334e+07,2.402429e+07,2.402540e+07,2.402611e+07,134.445765,968.130891,1099.051467,1289.785481,1429.808810
4,7005,King St W / York St,REGULAR,43.648001,-79.383177,False,23,2.402370e+07,2.402515e+07,2.402610e+07,2.402722e+07,2.402792e+07,618.419203,1242.038388,1714.984575,1739.128562,1916.225732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,7926,McRae Dr / Laird Dr - SMART,SMARTMAPFRAME,43.709793,-79.363892,False,24,2.401645e+07,2.401790e+07,2.401884e+07,2.401996e+07,2.402066e+07,485.725715,1933.912449,2123.809222,2407.543019,2823.776774
786,7927,Strachan Ave / East Liberty St - SMART,SMARTMAPFRAME,43.639065,-79.41081,False,24,2.402517e+07,2.402662e+07,2.402757e+07,2.402869e+07,2.402939e+07,872.272977,1554.910022,1833.393787,1999.122039,2412.278626
787,7928,Simcoe St / Pullan Pl,REGULAR,43.651053,-79.387649,False,31,2.402344e+07,2.402489e+07,2.402583e+07,2.402695e+07,2.402765e+07,403.418534,1204.557648,1490.903091,1597.832589,1708.419683
788,7929,Spadina Ave / Bulwer St- SMART,SMARTMAPFRAME,43.649354,-79.396757,False,12,2.402378e+07,2.402523e+07,2.402618e+07,2.402730e+07,2.402800e+07,793.003871,1011.005220,1156.836636,1187.175583,2052.880666


CPU times: user 4.27 s, sys: 257 ms, total: 4.53 s
Wall time: 12.1 s


#### Places of Interest

Get the five closest distances between each bike share station and the city's places of interest, and merge with the above

In [20]:
%%time
with Timer() as t:
    dfs_station_info_new_poi = run_parallel(
        product(
            [df_stations[sid_cols]],
            [gdf_poi.drop(columns=['geometry'])],
            df_stations['station_id'].tolist(),
            ['poi_lat'],
            ['poi_lon'],
            ['_id'],
            ['poi'],
            [epsg],
            [sid_cols],
            [max_rank]
        ),
        get_top_n_distances,
        chunk_size=12,
    )
print(f"Found top {max_rank} closest distances in {t.elapsed:.2f}s")
with Timer() as t:
    df_station_info_new_poi = (
        select_top_n_closest_untidy(
            pd.concat(dfs_station_info_new_poi, ignore_index=True),
            points_name_col='_id',
            distance_type='poi',
            max_comps=max_rank,
        )
        .merge(df_station_info_new_ch, on=sid_cols, how='left')
    )
print(f"Selected top {max_rank} closest per station in {t.elapsed:.2f}s")
with pd.option_context('display.max_columns', 1000):
    pu.show_df(df_station_info_new_poi)

100%|██████████| 790/790 [00:06<00:00, 130.21it/s]


Found top 5 closest distances in 6.16s
Selected top 5 closest per station in 3.46s


column,station_id,name,physical_configuration,lat,lon,is_charging_station,capacity,distance_poi_1,distance_poi_2,distance_poi_3,distance_poi_4,distance_poi_5,distance_ch_1,distance_ch_2,distance_ch_3,distance_ch_4,distance_ch_5,distance_library_1,distance_library_2,distance_library_3,distance_library_4,distance_library_5
dtype,string[python],string[python],string[python],Float64,Float64,boolean,Int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
nunique,790,790,6,787,789,2,40,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,False,35,2.402448e+07,2.402580e+07,2.402883e+07,2.402936e+07,2.403151e+07,2.402484e+07,2.402629e+07,2.402723e+07,2.402835e+07,2.402905e+07,368.720182,1613.293131,1842.141138,2102.331938,2562.887483
1,7001,Wellesley Station Green P,ELECTRICBIKESTATION,43.664964,-79.38355,True,23,2.402146e+07,2.402277e+07,2.402580e+07,2.402633e+07,2.402848e+07,2.402181e+07,2.402326e+07,2.402421e+07,2.402532e+07,2.402603e+07,799.891075,820.496833,892.833627,1328.922487,1358.865436
2,7002,St. George St / Bloor St W,REGULAR,43.667333,-79.399429,False,19,2.402146e+07,2.402278e+07,2.402580e+07,2.402634e+07,2.402848e+07,2.402181e+07,2.402326e+07,2.402421e+07,2.402533e+07,2.402603e+07,411.969201,1033.893679,1061.009740,1167.688187,1242.615050
3,7003,Madison Ave / Bloor St W,REGULAR,43.667158,-79.402761,False,15,2.402154e+07,2.402285e+07,2.402588e+07,2.402641e+07,2.402856e+07,2.402189e+07,2.402334e+07,2.402429e+07,2.402540e+07,2.402611e+07,134.445765,968.130891,1099.051467,1289.785481,1429.808810
4,7005,King St W / York St,REGULAR,43.648001,-79.383177,False,23,2.402335e+07,2.402467e+07,2.402770e+07,2.402823e+07,2.403037e+07,2.402370e+07,2.402515e+07,2.402610e+07,2.402722e+07,2.402792e+07,618.419203,1242.038388,1714.984575,1739.128562,1916.225732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,7926,McRae Dr / Laird Dr - SMART,SMARTMAPFRAME,43.709793,-79.363892,False,24,2.401610e+07,2.401741e+07,2.402044e+07,2.402097e+07,2.402312e+07,2.401645e+07,2.401790e+07,2.401884e+07,2.401996e+07,2.402066e+07,485.725715,1933.912449,2123.809222,2407.543019,2823.776774
786,7927,Strachan Ave / East Liberty St - SMART,SMARTMAPFRAME,43.639065,-79.41081,False,24,2.402482e+07,2.402614e+07,2.402917e+07,2.402970e+07,2.403184e+07,2.402517e+07,2.402662e+07,2.402757e+07,2.402869e+07,2.402939e+07,872.272977,1554.910022,1833.393787,1999.122039,2412.278626
787,7928,Simcoe St / Pullan Pl,REGULAR,43.651053,-79.387649,False,31,2.402309e+07,2.402440e+07,2.402743e+07,2.402796e+07,2.403011e+07,2.402344e+07,2.402489e+07,2.402583e+07,2.402695e+07,2.402765e+07,403.418534,1204.557648,1490.903091,1597.832589,1708.419683
788,7929,Spadina Ave / Bulwer St- SMART,SMARTMAPFRAME,43.649354,-79.396757,False,12,2.402343e+07,2.402475e+07,2.402777e+07,2.402831e+07,2.403045e+07,2.402378e+07,2.402523e+07,2.402618e+07,2.402730e+07,2.402800e+07,793.003871,1011.005220,1156.836636,1187.175583,2052.880666


CPU times: user 3.93 s, sys: 122 ms, total: 4.06 s
Wall time: 9.64 s


#### Cycle Paths

Get the five closest distances between each bike share station and the city's cycling network (cycle path entrances), and merge with the above

In [21]:
%%time
with Timer() as t:
    dfs_station_info_new_cycle = run_parallel(
        product(
            [df_stations[sid_cols]],
            [gdf_cycle.to_crs(epsg=epsg)],
            sids_wanted,
            [''],
            [''],
            ['_id'],
            ['cycle'],
            [epsg],
            [sid_cols],
            [max_rank],
            [False],
        ),
        get_top_n_distances,
        chunk_size=12,
    )
print(f"Found top {max_rank} closest distances in {t.elapsed:.2f}s")
with Timer() as t:
    df_station_info_new_cycle = (
        select_top_n_closest_untidy(
            pd.concat(dfs_station_info_new_cycle, ignore_index=True),
            points_name_col='_id',
            distance_type='cycle',
            max_comps=max_rank,
        )
        .merge(df_station_info_new_poi, on=sid_cols, how='left')
    )
print(f"Selected top {max_rank} closest per station in {t.elapsed:.2f}s")
with pd.option_context('display.max_columns', 1000):
    pu.show_df(df_station_info_new_cycle)

100%|██████████| 790/790 [00:04<00:00, 174.75it/s]


Found top 5 closest distances in 4.69s
Selected top 5 closest per station in 3.47s


column,station_id,name,physical_configuration,lat,lon,is_charging_station,capacity,distance_cycle_1,distance_cycle_2,distance_cycle_3,distance_cycle_4,distance_cycle_5,distance_poi_1,distance_poi_2,distance_poi_3,distance_poi_4,distance_poi_5,distance_ch_1,distance_ch_2,distance_ch_3,distance_ch_4,distance_ch_5,distance_library_1,distance_library_2,distance_library_3,distance_library_4,distance_library_5
dtype,string[python],string[python],string[python],Float64,Float64,boolean,Int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
nunique,790,790,6,787,789,2,40,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,False,35,8.903554,128.002329,138.804963,167.307523,168.987297,2.402448e+07,2.402580e+07,2.402883e+07,2.402936e+07,2.403151e+07,2.402484e+07,2.402629e+07,2.402723e+07,2.402835e+07,2.402905e+07,368.720182,1613.293131,1842.141138,2102.331938,2562.887483
1,7001,Wellesley Station Green P,ELECTRICBIKESTATION,43.664964,-79.38355,True,23,20.686722,86.997269,309.151938,471.048028,512.381973,2.402146e+07,2.402277e+07,2.402580e+07,2.402633e+07,2.402848e+07,2.402181e+07,2.402326e+07,2.402421e+07,2.402532e+07,2.402603e+07,799.891075,820.496833,892.833627,1328.922487,1358.865436
2,7002,St. George St / Bloor St W,REGULAR,43.667333,-79.399429,False,19,26.461799,29.777171,175.378961,177.536445,247.120553,2.402146e+07,2.402278e+07,2.402580e+07,2.402634e+07,2.402848e+07,2.402181e+07,2.402326e+07,2.402421e+07,2.402533e+07,2.402603e+07,411.969201,1033.893679,1061.009740,1167.688187,1242.615050
3,7003,Madison Ave / Bloor St W,REGULAR,43.667158,-79.402761,False,15,28.984745,108.640357,153.897768,245.074174,246.525358,2.402154e+07,2.402285e+07,2.402588e+07,2.402641e+07,2.402856e+07,2.402189e+07,2.402334e+07,2.402429e+07,2.402540e+07,2.402611e+07,134.445765,968.130891,1099.051467,1289.785481,1429.808810
4,7005,King St W / York St,REGULAR,43.648001,-79.383177,False,23,140.230424,152.468977,221.502560,253.665951,253.683454,2.402335e+07,2.402467e+07,2.402770e+07,2.402823e+07,2.403037e+07,2.402370e+07,2.402515e+07,2.402610e+07,2.402722e+07,2.402792e+07,618.419203,1242.038388,1714.984575,1739.128562,1916.225732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,7926,McRae Dr / Laird Dr - SMART,SMARTMAPFRAME,43.709793,-79.363892,False,24,752.107545,999.580682,1087.061018,1153.353263,1161.952369,2.401610e+07,2.401741e+07,2.402044e+07,2.402097e+07,2.402312e+07,2.401645e+07,2.401790e+07,2.401884e+07,2.401996e+07,2.402066e+07,485.725715,1933.912449,2123.809222,2407.543019,2823.776774
786,7927,Strachan Ave / East Liberty St - SMART,SMARTMAPFRAME,43.639065,-79.41081,False,24,3.312816,201.511211,201.593236,204.457912,233.564209,2.402482e+07,2.402614e+07,2.402917e+07,2.402970e+07,2.403184e+07,2.402517e+07,2.402662e+07,2.402757e+07,2.402869e+07,2.402939e+07,872.272977,1554.910022,1833.393787,1999.122039,2412.278626
787,7928,Simcoe St / Pullan Pl,REGULAR,43.651053,-79.387649,False,31,44.911126,74.439465,138.024097,138.049690,259.058288,2.402309e+07,2.402440e+07,2.402743e+07,2.402796e+07,2.403011e+07,2.402344e+07,2.402489e+07,2.402583e+07,2.402695e+07,2.402765e+07,403.418534,1204.557648,1490.903091,1597.832589,1708.419683
788,7929,Spadina Ave / Bulwer St- SMART,SMARTMAPFRAME,43.649354,-79.396757,False,12,170.467856,254.080720,266.085477,267.089500,267.090332,2.402343e+07,2.402475e+07,2.402777e+07,2.402831e+07,2.403045e+07,2.402378e+07,2.402523e+07,2.402618e+07,2.402730e+07,2.402800e+07,793.003871,1011.005220,1156.836636,1187.175583,2052.880666


CPU times: user 4.63 s, sys: 171 ms, total: 4.8 s
Wall time: 8.2 s


#### Public Transit (Train) Stations

Get the five closest distances between each bike share station and the city's public transit train stations, and merge with the above

In [22]:
%%time
with Timer() as t:
    dfs_station_info_new_train = run_parallel(
        product(
            [df_stations[sid_cols]],
            [gdf_train.to_crs(epsg=epsg)],
            sids_wanted,
            [''],
            [''],
            ['RID'],
            ['train'],
            [epsg],
            [sid_cols],
            [max_rank_train],
            [False],
        ),
        get_top_n_distances,
        chunk_size=12,
    )
print(f"Found top {max_rank} closest distances in {t.elapsed:.2f}s")
with Timer() as t:
    df_station_info_new_train = (
        select_top_n_closest_untidy(
            pd.concat(dfs_station_info_new_train, ignore_index=True),
            points_name_col='RID',
            distance_type='train',
            max_comps=max_rank_train,
        )
        .merge(df_station_info_new_cycle, on=sid_cols, how='left')
    )
print(f"Selected top {max_rank} closest per station in {t.elapsed:.2f}s")
with pd.option_context('display.max_columns', 1000):
    pu.show_df(df_station_info_new_train)

100%|██████████| 790/790 [00:03<00:00, 258.26it/s]


Found top 5 closest distances in 3.19s
Selected top 5 closest per station in 2.42s


column,station_id,name,physical_configuration,lat,lon,is_charging_station,capacity,distance_train_1,distance_train_2,distance_train_3,distance_train_4,distance_cycle_1,distance_cycle_2,distance_cycle_3,distance_cycle_4,distance_cycle_5,distance_poi_1,distance_poi_2,distance_poi_3,distance_poi_4,distance_poi_5,distance_ch_1,distance_ch_2,distance_ch_3,distance_ch_4,distance_ch_5,distance_library_1,distance_library_2,distance_library_3,distance_library_4,distance_library_5
dtype,string[python],string[python],string[python],Float64,Float64,boolean,Int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
nunique,790,790,6,787,789,2,40,790,790,790,786,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790
missing,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,False,35,1233.803951,3206.125512,13987.008148,15213.129327,8.903554,128.002329,138.804963,167.307523,168.987297,2.402448e+07,2.402580e+07,2.402883e+07,2.402936e+07,2.403151e+07,2.402484e+07,2.402629e+07,2.402723e+07,2.402835e+07,2.402905e+07,368.720182,1613.293131,1842.141138,2102.331938,2562.887483
1,7001,Wellesley Station Green P,ELECTRICBIKESTATION,43.664964,-79.38355,True,23,7.961900,697.546932,11286.686757,12540.364469,20.686722,86.997269,309.151938,471.048028,512.381973,2.402146e+07,2.402277e+07,2.402580e+07,2.402633e+07,2.402848e+07,2.402181e+07,2.402326e+07,2.402421e+07,2.402532e+07,2.402603e+07,799.891075,820.496833,892.833627,1328.922487,1358.865436
2,7002,St. George St / Bloor St W,REGULAR,43.667333,-79.399429,False,19,100.485103,10823.760739,13453.311844,,26.461799,29.777171,175.378961,177.536445,247.120553,2.402146e+07,2.402278e+07,2.402580e+07,2.402634e+07,2.402848e+07,2.402181e+07,2.402326e+07,2.402421e+07,2.402533e+07,2.402603e+07,411.969201,1033.893679,1061.009740,1167.688187,1242.615050
3,7003,Madison Ave / Bloor St W,REGULAR,43.667158,-79.402761,False,15,41.657403,48.057295,10822.356282,13695.814346,28.984745,108.640357,153.897768,245.074174,246.525358,2.402154e+07,2.402285e+07,2.402588e+07,2.402641e+07,2.402856e+07,2.402189e+07,2.402334e+07,2.402429e+07,2.402540e+07,2.402611e+07,134.445765,968.130891,1099.051467,1289.785481,1429.808810
4,7005,King St W / York St,REGULAR,43.648001,-79.383177,False,23,158.050655,2598.919447,13200.617035,13798.949661,140.230424,152.468977,221.502560,253.665951,253.683454,2.402335e+07,2.402467e+07,2.402770e+07,2.402823e+07,2.403037e+07,2.402370e+07,2.402515e+07,2.402610e+07,2.402722e+07,2.402792e+07,618.419203,1242.038388,1714.984575,1739.128562,1916.225732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,7926,McRae Dr / Laird Dr - SMART,SMARTMAPFRAME,43.709793,-79.363892,False,24,2903.667551,3730.419520,6804.737403,8646.677802,752.107545,999.580682,1087.061018,1153.353263,1161.952369,2.401610e+07,2.401741e+07,2.402044e+07,2.402097e+07,2.402312e+07,2.401645e+07,2.401790e+07,2.401884e+07,2.401996e+07,2.402066e+07,485.725715,1933.912449,2123.809222,2407.543019,2823.776774
786,7927,Strachan Ave / East Liberty St - SMART,SMARTMAPFRAME,43.639065,-79.41081,False,24,2357.705060,2923.567344,14017.599119,16180.591032,3.312816,201.511211,201.593236,204.457912,233.564209,2.402482e+07,2.402614e+07,2.402917e+07,2.402970e+07,2.403184e+07,2.402517e+07,2.402662e+07,2.402757e+07,2.402869e+07,2.402939e+07,872.272977,1554.910022,1833.393787,1999.122039,2412.278626
787,7928,Simcoe St / Pullan Pl,REGULAR,43.651053,-79.387649,False,31,57.288196,2163.721887,12794.601358,13829.251675,44.911126,74.439465,138.024097,138.049690,259.058288,2.402309e+07,2.402440e+07,2.402743e+07,2.402796e+07,2.403011e+07,2.402344e+07,2.402489e+07,2.402583e+07,2.402695e+07,2.402765e+07,403.418534,1204.557648,1490.903091,1597.832589,1708.419683
788,7929,Spadina Ave / Bulwer St- SMART,SMARTMAPFRAME,43.649354,-79.396757,False,12,838.806496,2138.934126,12894.736025,14524.936195,170.467856,254.080720,266.085477,267.089500,267.090332,2.402343e+07,2.402475e+07,2.402777e+07,2.402831e+07,2.403045e+07,2.402378e+07,2.402523e+07,2.402618e+07,2.402730e+07,2.402800e+07,793.003871,1011.005220,1156.836636,1187.175583,2052.880666


CPU times: user 2.87 s, sys: 158 ms, total: 3.03 s
Wall time: 5.63 s


#### Public Transit (Bus) Stops

Get the five closest distances between each bike share station and the city's public transit bus stops, and merge with the above

In [23]:
%%time
with Timer() as t:
    dfs_station_info_new_stops = run_parallel(
        product(
            [df_stations[sid_cols]],
            [df_public_transit_stops],
            df_stations['station_id'].tolist(),
            ['stop_lat'],
            ['stop_lon'],
            ['stop_id'],
            ['pt'],
            [epsg],
            [sid_cols],
        ),
        get_top_n_distances,
        chunk_size=12,
    )
print(f"Found top {max_rank} closest distances in {t.elapsed:.2f}s")
with Timer() as t:
    df_station_info_new_stops = (
        select_top_n_closest_untidy(
            pd.concat(dfs_station_info_new_stops, ignore_index=True),
            points_name_col='stop_id',
            distance_type='pt',
            max_comps=max_rank,
        )
        .merge(df_station_info_new_train, on=sid_cols, how='left')
    )
print(f"Selected top {max_rank} closest per station in {t.elapsed:.2f}s")
with pd.option_context('display.max_columns', None):
    pu.show_df(df_station_info_new_stops)

100%|██████████| 790/790 [00:12<00:00, 65.48it/s] 


Found top 5 closest distances in 12.19s
Selected top 5 closest per station in 3.14s


column,station_id,name,physical_configuration,lat,lon,is_charging_station,capacity,distance_pt_1,distance_pt_2,distance_pt_3,distance_pt_4,distance_pt_5,distance_train_1,distance_train_2,distance_train_3,distance_train_4,distance_cycle_1,distance_cycle_2,distance_cycle_3,distance_cycle_4,distance_cycle_5,distance_poi_1,distance_poi_2,distance_poi_3,distance_poi_4,distance_poi_5,distance_ch_1,distance_ch_2,distance_ch_3,distance_ch_4,distance_ch_5,distance_library_1,distance_library_2,distance_library_3,distance_library_4,distance_library_5
dtype,string[python],string[python],string[python],Float64,Float64,boolean,Int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
nunique,790,790,6,787,789,2,40,790,790,790,790,790,790,790,790,786,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,False,35,257.313495,280.053224,347.999709,378.580565,381.006053,1233.803951,3206.125512,13987.008148,15213.129327,8.903554,128.002329,138.804963,167.307523,168.987297,2.402448e+07,2.402580e+07,2.402883e+07,2.402936e+07,2.403151e+07,2.402484e+07,2.402629e+07,2.402723e+07,2.402835e+07,2.402905e+07,368.720182,1613.293131,1842.141138,2102.331938,2562.887483
1,7001,Wellesley Station Green P,ELECTRICBIKESTATION,43.664964,-79.38355,True,23,5.464962,39.419506,55.592633,69.547506,78.534780,7.961900,697.546932,11286.686757,12540.364469,20.686722,86.997269,309.151938,471.048028,512.381973,2.402146e+07,2.402277e+07,2.402580e+07,2.402633e+07,2.402848e+07,2.402181e+07,2.402326e+07,2.402421e+07,2.402532e+07,2.402603e+07,799.891075,820.496833,892.833627,1328.922487,1358.865436
2,7002,St. George St / Bloor St W,REGULAR,43.667333,-79.399429,False,19,38.404918,46.664997,107.122527,157.324525,168.292159,100.485103,10823.760739,13453.311844,,26.461799,29.777171,175.378961,177.536445,247.120553,2.402146e+07,2.402278e+07,2.402580e+07,2.402634e+07,2.402848e+07,2.402181e+07,2.402326e+07,2.402421e+07,2.402533e+07,2.402603e+07,411.969201,1033.893679,1061.009740,1167.688187,1242.615050
3,7003,Madison Ave / Bloor St W,REGULAR,43.667158,-79.402761,False,15,75.508845,86.036586,87.246597,94.365131,123.696337,41.657403,48.057295,10822.356282,13695.814346,28.984745,108.640357,153.897768,245.074174,246.525358,2.402154e+07,2.402285e+07,2.402588e+07,2.402641e+07,2.402856e+07,2.402189e+07,2.402334e+07,2.402429e+07,2.402540e+07,2.402611e+07,134.445765,968.130891,1099.051467,1289.785481,1429.808810
4,7005,King St W / York St,REGULAR,43.648001,-79.383177,False,23,27.800265,88.820571,154.201611,161.475834,224.353595,158.050655,2598.919447,13200.617035,13798.949661,140.230424,152.468977,221.502560,253.665951,253.683454,2.402335e+07,2.402467e+07,2.402770e+07,2.402823e+07,2.403037e+07,2.402370e+07,2.402515e+07,2.402610e+07,2.402722e+07,2.402792e+07,618.419203,1242.038388,1714.984575,1739.128562,1916.225732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,7926,McRae Dr / Laird Dr - SMART,SMARTMAPFRAME,43.709793,-79.363892,False,24,25.663783,29.351904,68.617520,86.008284,91.520208,2903.667551,3730.419520,6804.737403,8646.677802,752.107545,999.580682,1087.061018,1153.353263,1161.952369,2.401610e+07,2.401741e+07,2.402044e+07,2.402097e+07,2.402312e+07,2.401645e+07,2.401790e+07,2.401884e+07,2.401996e+07,2.402066e+07,485.725715,1933.912449,2123.809222,2407.543019,2823.776774
786,7927,Strachan Ave / East Liberty St - SMART,SMARTMAPFRAME,43.639065,-79.41081,False,24,48.322976,238.775531,278.044564,296.848477,298.206290,2357.705060,2923.567344,14017.599119,16180.591032,3.312816,201.511211,201.593236,204.457912,233.564209,2.402482e+07,2.402614e+07,2.402917e+07,2.402970e+07,2.403184e+07,2.402517e+07,2.402662e+07,2.402757e+07,2.402869e+07,2.402939e+07,872.272977,1554.910022,1833.393787,1999.122039,2412.278626
787,7928,Simcoe St / Pullan Pl,REGULAR,43.651053,-79.387649,False,31,73.252190,95.402613,101.825204,116.201597,118.814063,57.288196,2163.721887,12794.601358,13829.251675,44.911126,74.439465,138.024097,138.049690,259.058288,2.402309e+07,2.402440e+07,2.402743e+07,2.402796e+07,2.403011e+07,2.402344e+07,2.402489e+07,2.402583e+07,2.402695e+07,2.402765e+07,403.418534,1204.557648,1490.903091,1597.832589,1708.419683
788,7929,Spadina Ave / Bulwer St- SMART,SMARTMAPFRAME,43.649354,-79.396757,False,12,29.731335,78.881022,87.389597,137.330466,196.906737,838.806496,2138.934126,12894.736025,14524.936195,170.467856,254.080720,266.085477,267.089500,267.090332,2.402343e+07,2.402475e+07,2.402777e+07,2.402831e+07,2.403045e+07,2.402378e+07,2.402523e+07,2.402618e+07,2.402730e+07,2.402800e+07,793.003871,1011.005220,1156.836636,1187.175583,2052.880666


CPU times: user 3.95 s, sys: 145 ms, total: 4.09 s
Wall time: 15.4 s


### Combine Distances with Station Metadata, To Get Station Attributes

Merge all distance outputs with original station metadata

In [24]:
df_merged = (
    df_stations
    .merge(df_station_info_new_stops, on=sid_cols, how='left')
)
with pd.option_context('display.max_columns', 1000):
    pu.show_df(df_merged)

column,station_id,name,physical_configuration,capacity,lat,lon,is_charging_station,credit,Neighbourhood,Location,is_downtown,census_tract_id,distance_pt_1,distance_pt_2,distance_pt_3,distance_pt_4,distance_pt_5,distance_train_1,distance_train_2,distance_train_3,distance_train_4,distance_cycle_1,distance_cycle_2,distance_cycle_3,distance_cycle_4,distance_cycle_5,distance_poi_1,distance_poi_2,distance_poi_3,distance_poi_4,distance_poi_5,distance_ch_1,distance_ch_2,distance_ch_3,distance_ch_4,distance_ch_5,distance_library_1,distance_library_2,distance_library_3,distance_library_4,distance_library_5
dtype,string[python],string[python],string[python],Int64,Float64,Float64,boolean,boolean,string[python],string[python],boolean,string[python],float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
nunique,790,790,6,40,787,789,2,2,107,4,2,272,790,790,790,790,790,790,790,790,786,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790,790
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7000,Fort York Blvd / Capreol Ct,REGULAR,35,43.639832,-79.395954,False,True,Harbourfront-CityPlace,Downtown,True,5350012.01,257.313495,280.053224,347.999709,378.580565,381.006053,1233.803951,3206.125512,13987.008148,15213.129327,8.903554,128.002329,138.804963,167.307523,168.987297,2.402448e+07,2.402580e+07,2.402883e+07,2.402936e+07,2.403151e+07,2.402484e+07,2.402629e+07,2.402723e+07,2.402835e+07,2.402905e+07,368.720182,1613.293131,1842.141138,2102.331938,2562.887483
1,7001,Wellesley Station Green P,ELECTRICBIKESTATION,23,43.664964,-79.38355,True,True,Church-Wellesley,Downtown,True,5350063.06,5.464962,39.419506,55.592633,69.547506,78.534780,7.961900,697.546932,11286.686757,12540.364469,20.686722,86.997269,309.151938,471.048028,512.381973,2.402146e+07,2.402277e+07,2.402580e+07,2.402633e+07,2.402848e+07,2.402181e+07,2.402326e+07,2.402421e+07,2.402532e+07,2.402603e+07,799.891075,820.496833,892.833627,1328.922487,1358.865436
2,7002,St. George St / Bloor St W,REGULAR,19,43.667333,-79.399429,False,True,University,Downtown,True,5350061.00,38.404918,46.664997,107.122527,157.324525,168.292159,100.485103,10823.760739,13453.311844,,26.461799,29.777171,175.378961,177.536445,247.120553,2.402146e+07,2.402278e+07,2.402580e+07,2.402634e+07,2.402848e+07,2.402181e+07,2.402326e+07,2.402421e+07,2.402533e+07,2.402603e+07,411.969201,1033.893679,1061.009740,1167.688187,1242.615050
3,7003,Madison Ave / Bloor St W,REGULAR,15,43.667158,-79.402761,False,True,Annex,Others,False,5350091.01,75.508845,86.036586,87.246597,94.365131,123.696337,41.657403,48.057295,10822.356282,13695.814346,28.984745,108.640357,153.897768,245.074174,246.525358,2.402154e+07,2.402285e+07,2.402588e+07,2.402641e+07,2.402856e+07,2.402189e+07,2.402334e+07,2.402429e+07,2.402540e+07,2.402611e+07,134.445765,968.130891,1099.051467,1289.785481,1429.808810
4,7005,King St W / York St,REGULAR,23,43.648001,-79.383177,False,True,Yonge-Bay Corridor,Downtown,True,5350014.00,27.800265,88.820571,154.201611,161.475834,224.353595,158.050655,2598.919447,13200.617035,13798.949661,140.230424,152.468977,221.502560,253.665951,253.683454,2.402335e+07,2.402467e+07,2.402770e+07,2.402823e+07,2.403037e+07,2.402370e+07,2.402515e+07,2.402610e+07,2.402722e+07,2.402792e+07,618.419203,1242.038388,1714.984575,1739.128562,1916.225732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,7926,McRae Dr / Laird Dr - SMART,SMARTMAPFRAME,24,43.709793,-79.363892,False,False,Leaside-Bennington,Others,False,5350195.02,25.663783,29.351904,68.617520,86.008284,91.520208,2903.667551,3730.419520,6804.737403,8646.677802,752.107545,999.580682,1087.061018,1153.353263,1161.952369,2.401610e+07,2.401741e+07,2.402044e+07,2.402097e+07,2.402312e+07,2.401645e+07,2.401790e+07,2.401884e+07,2.401996e+07,2.402066e+07,485.725715,1933.912449,2123.809222,2407.543019,2823.776774
786,7927,Strachan Ave / East Liberty St - SMART,SMARTMAPFRAME,24,43.639065,-79.41081,False,False,Fort York-Liberty Village,West of Downtown,False,5350008.01,48.322976,238.775531,278.044564,296.848477,298.206290,2357.705060,2923.567344,14017.599119,16180.591032,3.312816,201.511211,201.593236,204.457912,233.564209,2.402482e+07,2.402614e+07,2.402917e+07,2.402970e+07,2.403184e+07,2.402517e+07,2.402662e+07,2.402757e+07,2.402869e+07,2.402939e+07,872.272977,1554.910022,1833.393787,1999.122039,2412.278626
787,7928,Simcoe St / Pullan Pl,REGULAR,31,43.651053,-79.387649,False,True,Kensington-Chinatown,Downtown,True,5350036.00,73.252190,95.402613,101.825204,116.201597,118.814063,57.288196,2163.721887,12794.601358,13829.251675,44.911126,74.439465,138.024097,138.049690,259.058288,2.402309e+07,2.402440e+07,2.402743e+07,2.402796e+07,2.403011e+07,2.402344e+07,2.402489e+07,2.402583e+07,2.402695e+07,2.402765e+07,403.418534,1204.557648,1490.903091,1597.832589,1708.419683
788,7929,Spadina Ave / Bulwer St- SMART,SMARTMAPFRAME,12,43.649354,-79.396757,False,False,Kensington-Chinatown,Downtown,True,5350039.00,29.731335,78.881022,87.389597,137.330466,196.906737,838.806496,2138.934126,12894.736025,14524.936195,170.467856,254.080720,266.085477,267.089500,267.090332,2.402343e+07,2.402475e+07,2.402777e+07,2.402831e+07,2.403045e+07,2.402378e+07,2.402523e+07,2.402618e+07,2.402730e+07,2.402800e+07,793.003871,1011.005220,1156.836636,1187.175583,2052.880666


**Notes**

1. This combined dataset consisting of metadata and distances is considered as the station attributes.

Perform sanity checks after merging, except for train station locations

In [25]:
%%time
filters = '|'.join(
    [
        cs
        for c in ['library', 'ch', 'poi', 'pt', 'cycle']
        for cs in [f"distance_{c}_{r}.isna()" for r in range(1, 5+1)]
    ]
)
try:
    assert df_merged.query(filters).empty
    print(
        "Found no stations with missing distances, excluding train stops, as "
        "expected"
    )
except AssertionError as e:
    print(
        f"{str(e)}Unexpectedly found some stations with missing distances, "
        "excluding train stops"
    )

Found no stations with missing distances, excluding train stops, as expected
CPU times: user 11.8 ms, sys: 0 ns, total: 11.8 ms
Wall time: 11.4 ms


Perform sanity checks after merging, for train station locations

In [26]:
%%time
filters = '|'.join(
    [
        cs
        for c in ['train']
        for cs in [f"distance_{c}_{r}.isna()" for r in range(1, 4+1)]
    ]
)
try:
    assert not df_merged.query(filters).empty
    print(
        "Found some stations with missing distances, as expected. Found "
        f"{len(df_merged.query(filters)):,} such stations."
    )
except AssertionError as e:
    print(
        f"{str(e)}Unexpectedly Found no stations with missing distances to "
        "train stops"
    )

Found some stations with missing distances, as expected. Found 4 such stations.
CPU times: user 13.6 ms, sys: 73 µs, total: 13.6 ms
Wall time: 13.3 ms


### Get Periodic Ridership Trends (Behavioural Data) at Bike Share Stations

#### Get Top `N` Stations Based on Specified Threshold

Get the number of wanted top stations to be separated from all other stations based on their performance in each of the following metrics

1. total departures during the last **full** year (2022)
2. total arrivals during the last **full** year (2022)
3. total departures over the last `X` **full** years (2018 to 2022)
4. total arrivals over the last `X` **full** years (2018 to 2022)

In [27]:
num_top_stations = int(top_perform_frac*len(df_stations))
print(
    f"Selected stations that rank in the top {num_top_stations:,} "
    f"in four metrics, out of {len(df_stations):,}"
)

Selected stations that rank in the top 126 in four metrics, out of 790


**Notes**

1. Here
   - `X` will be chosen as 5, which corresponds to all historical **full** years since 2018 inclusive
   - based on the assumptions earlier, the last **full** year is 2022

#### Get Quarterly and Yearly Station Ridership Totals

Get the top-performing stations with all the processed bike share ridership data

In [28]:
%%time
query = f"""
        -- 1. get yearly departures per station
        WITH t1 AS (
            SELECT start_station_id AS station_id,
                   -- CAST(started_at_year AS VARCHAR) AS year,
                   CONCAT(CAST(started_at_year AS VARCHAR), '_q', datepart('quarter', started_at)) AS year_quarter,
                   COUNT(DISTINCT(trip_id)) AS trips,
                   'departures' AS type
            FROM read_parquet({fpaths_proc_all})
            WHERE started_at_year <= 2022
            GROUP BY all
        ),
        -- 2. get yearly arrivals per station
        t2 AS (
            SELECT end_station_id AS station_id,
                   -- CAST(ended_at_year AS VARCHAR) AS year,
                   CONCAT(CAST(ended_at_year AS VARCHAR), '_q', datepart('quarter', ended_at)) AS year_quarter,
                   COUNT(DISTINCT(trip_id)) AS trips,
                   'arrivals' AS type
            FROM read_parquet({fpaths_proc_all})
            WHERE ended_at_year <= 2022
            GROUP BY all
        ),
        -- 3. get total departures and number of stations from N most-recent full years (2018 to 2022)
        t3 AS (
            SELECT start_station_id AS station_id,
                   'all' AS year_quarter,
                   -- 'all' AS year,
                   COUNT(DISTINCT(trip_id)) AS trips,
                   'departures' AS type
            FROM read_parquet({fpaths_proc_2018_2022})
            GROUP BY all
        ),
        -- 4. get total arrivals and number of stations from N most-recent full years (2018 to 2022)
        t4 AS (
            SELECT end_station_id AS station_id,
                   'all' AS year_quarter,
                   -- 'all' AS year,
                   COUNT(DISTINCT(trip_id)) AS trips,
                   'arrivals' AS type
            FROM read_parquet({fpaths_proc_2018_2022})
            GROUP BY all
        ),
        -- 5. get useful station attributes (excludes any single-value attributes since these
        -- are not useful for extracting insights)
        t5 AS (
            SELECT *,
                   True as is_active
            -- FROM read_parquet({[fpath_stations_info]})
            FROM df_merged
            -- LEFT JOIN df_downtown_neighs USING (Neighbourhood)
        ),
        -- 6. combine all
        t6 AS (
            SELECT *
            FROM t1
            UNION
            SELECT *
            FROM t2
            UNION
            SELECT *
            FROM t3
            UNION
            SELECT *
            FROM t4
        ),
        -- 7. reshape into untidy data
        t7 AS (
            PIVOT t6
            ON type || '_' || year_quarter
            -- ON type || '_' || year
            USING sum(trips)
        ),
        -- 8. filter to get active stations, fill NULLs and assign ranks based on departures & arrivals
        t8 AS (
            SELECT * EXCLUDE (
                       is_downtown,
                       departures_all,
                       arrivals_all,
                       departures_2022_q1,
                       departures_2022_q2,
                       departures_2022_q3,
                       departures_2022_q4,
                       arrivals_2022_q1,
                       arrivals_2022_q2,
                       arrivals_2022_q3,
                       arrivals_2022_q4,
                       departures_2021_q1,
                       departures_2021_q2,
                       departures_2021_q3,
                       departures_2021_q4,
                       arrivals_2021_q1,
                       arrivals_2021_q2,
                       arrivals_2021_q3,
                       arrivals_2021_q4,
                       departures_2020_q1,
                       departures_2020_q2,
                       departures_2020_q3,
                       departures_2020_q4,
                       arrivals_2020_q1,
                       arrivals_2020_q2,
                       arrivals_2020_q3,
                       arrivals_2020_q4,
                       departures_2019_q1,
                       departures_2019_q2,
                       departures_2019_q3,
                       departures_2019_q4,
                       arrivals_2019_q1,
                       arrivals_2019_q2,
                       arrivals_2019_q3,
                       arrivals_2019_q4,
                       departures_2018_q1,
                       departures_2018_q2,
                       departures_2018_q3,
                       departures_2018_q4,
                       arrivals_2018_q1,
                       arrivals_2018_q2,
                       arrivals_2018_q3,
                       arrivals_2018_q4
                   ),
                   COALESCE(departures_2022_q1, NULL, 0) AS departures_2022_q1,
                   COALESCE(departures_2022_q2, NULL, 0) AS departures_2022_q2,
                   COALESCE(departures_2022_q3, NULL, 0) AS departures_2022_q3,
                   COALESCE(departures_2022_q4, NULL, 0) AS departures_2022_q4,
                   COALESCE(arrivals_2022_q1, NULL, 0) AS arrivals_2022_q1,
                   COALESCE(arrivals_2022_q2, NULL, 0) AS arrivals_2022_q2,
                   COALESCE(arrivals_2022_q3, NULL, 0) AS arrivals_2022_q3,
                   COALESCE(arrivals_2022_q4, NULL, 0) AS arrivals_2022_q4,
                   COALESCE(departures_2021_q1, NULL, 0) AS departures_2021_q1,
                   COALESCE(departures_2021_q2, NULL, 0) AS departures_2021_q2,
                   COALESCE(departures_2021_q3, NULL, 0) AS departures_2021_q3,
                   COALESCE(departures_2021_q4, NULL, 0) AS departures_2021_q4,
                   COALESCE(arrivals_2021_q1, NULL, 0) AS arrivals_2021_q1,
                   COALESCE(arrivals_2021_q2, NULL, 0) AS arrivals_2021_q2,
                   COALESCE(arrivals_2021_q3, NULL, 0) AS arrivals_2021_q3,
                   COALESCE(arrivals_2021_q4, NULL, 0) AS arrivals_2021_q4,
                   COALESCE(departures_2020_q1, NULL, 0) AS departures_2020_q1,
                   COALESCE(departures_2020_q2, NULL, 0) AS departures_2020_q2,
                   COALESCE(departures_2020_q3, NULL, 0) AS departures_2020_q3,
                   COALESCE(departures_2020_q4, NULL, 0) AS departures_2020_q4,
                   COALESCE(arrivals_2020_q1, NULL, 0) AS arrivals_2020_q1,
                   COALESCE(arrivals_2020_q2, NULL, 0) AS arrivals_2020_q2,
                   COALESCE(arrivals_2020_q3, NULL, 0) AS arrivals_2020_q3,
                   COALESCE(arrivals_2020_q4, NULL, 0) AS arrivals_2020_q4,
                   COALESCE(departures_2019_q1, NULL, 0) AS departures_2019_q1,
                   COALESCE(departures_2019_q2, NULL, 0) AS departures_2019_q2,
                   COALESCE(departures_2019_q3, NULL, 0) AS departures_2019_q3,
                   COALESCE(departures_2019_q4, NULL, 0) AS departures_2019_q4,
                   COALESCE(arrivals_2019_q1, NULL, 0) AS arrivals_2019_q1,
                   COALESCE(arrivals_2019_q2, NULL, 0) AS arrivals_2019_q2,
                   COALESCE(arrivals_2019_q3, NULL, 0) AS arrivals_2019_q3,
                   COALESCE(arrivals_2019_q4, NULL, 0) AS arrivals_2019_q4,
                   COALESCE(departures_2018_q1, NULL, 0) AS departures_2018_q1,
                   COALESCE(departures_2018_q2, NULL, 0) AS departures_2018_q2,
                   COALESCE(departures_2018_q3, NULL, 0) AS departures_2018_q3,
                   COALESCE(departures_2018_q4, NULL, 0) AS departures_2018_q4,
                   COALESCE(arrivals_2018_q1, NULL, 0) AS arrivals_2018_q1,
                   COALESCE(arrivals_2018_q2, NULL, 0) AS arrivals_2018_q2,
                   COALESCE(arrivals_2018_q3, NULL, 0) AS arrivals_2018_q3,
                   COALESCE(arrivals_2018_q4, NULL, 0) AS arrivals_2018_q4,
                   COALESCE(departures_all, NULL, 0) AS departures_all,
                   COALESCE(arrivals_all, NULL, 0) AS arrivals_all,
                   COALESCE(is_downtown, NULL, False) AS is_downtown
            FROM t7
            INNER JOIN t5 USING (station_id)
        ),
        t9 AS (
            SELECT *,
                   (arrivals_2018_q1+arrivals_2018_q2+arrivals_2018_q3+arrivals_2018_q4) AS arrivals_2018,
                   (arrivals_2019_q1+arrivals_2019_q2+arrivals_2019_q3+arrivals_2019_q4) AS arrivals_2019,
                   (arrivals_2020_q1+arrivals_2020_q2+arrivals_2020_q3+arrivals_2020_q4) AS arrivals_2020,
                   (arrivals_2021_q1+arrivals_2021_q2+arrivals_2021_q3+arrivals_2021_q4) AS arrivals_2021,
                   (arrivals_2022_q1+arrivals_2022_q2+arrivals_2022_q3+arrivals_2022_q4) AS arrivals_2022,
                   (departures_2018_q1+departures_2018_q2+departures_2018_q3+departures_2018_q4) AS departures_2018,
                   (departures_2019_q1+departures_2019_q2+departures_2019_q3+departures_2019_q4) AS departures_2019,
                   (departures_2020_q1+departures_2020_q2+departures_2020_q3+departures_2020_q4) AS departures_2020,
                   (departures_2021_q1+departures_2021_q2+departures_2021_q3+departures_2021_q4) AS departures_2021,
                   (departures_2022_q1+departures_2022_q2+departures_2022_q3+departures_2022_q4) AS departures_2022
            FROM t8
        )
        SELECT *
        FROM t9
        """
df_s = run_sql_query(query).convert_dtypes()
with pd.option_context('display.max_columns', None):
    pu.show_df(df_s)

column,station_id,name,physical_configuration,capacity,lat,lon,is_charging_station,credit,Neighbourhood,Location,census_tract_id,distance_pt_1,distance_pt_2,distance_pt_3,distance_pt_4,distance_pt_5,distance_train_1,distance_train_2,distance_train_3,distance_train_4,distance_cycle_1,distance_cycle_2,distance_cycle_3,distance_cycle_4,distance_cycle_5,distance_poi_1,distance_poi_2,distance_poi_3,distance_poi_4,distance_poi_5,distance_ch_1,distance_ch_2,distance_ch_3,distance_ch_4,distance_ch_5,distance_library_1,distance_library_2,distance_library_3,distance_library_4,distance_library_5,is_active,departures_2022_q1,departures_2022_q2,departures_2022_q3,departures_2022_q4,arrivals_2022_q1,arrivals_2022_q2,arrivals_2022_q3,arrivals_2022_q4,departures_2021_q1,departures_2021_q2,departures_2021_q3,departures_2021_q4,arrivals_2021_q1,arrivals_2021_q2,arrivals_2021_q3,arrivals_2021_q4,departures_2020_q1,departures_2020_q2,departures_2020_q3,departures_2020_q4,arrivals_2020_q1,arrivals_2020_q2,arrivals_2020_q3,arrivals_2020_q4,departures_2019_q1,departures_2019_q2,departures_2019_q3,departures_2019_q4,arrivals_2019_q1,arrivals_2019_q2,arrivals_2019_q3,arrivals_2019_q4,departures_2018_q1,departures_2018_q2,departures_2018_q3,departures_2018_q4,arrivals_2018_q1,arrivals_2018_q2,arrivals_2018_q3,arrivals_2018_q4,departures_all,arrivals_all,is_downtown,arrivals_2018,arrivals_2019,arrivals_2020,arrivals_2021,arrivals_2022,departures_2018,departures_2019,departures_2020,departures_2021,departures_2022
dtype,string[python],string[python],string[python],Int64,Float64,Float64,boolean,boolean,string[python],string[python],string[python],Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,boolean,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,boolean,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
nunique,629,629,6,39,626,628,2,2,83,4,214,629,629,629,629,629,629,629,629,626,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,1,426,550,560,565,425,536,569,559,429,508,537,492,433,511,521,506,359,454,502,487,360,446,515,481,280,330,406,389,288,322,403,383,235,263,326,311,235,262,324,309,618,620,2,326,409,529,554,606,327,413,538,548,611
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7007,College St / Huron St,REGULAR,19,43.658148,-79.398167,False,True,Kensington-Chinatown,Downtown,5350037.00,122.806317,156.429313,167.062761,175.042704,191.410353,651.152173,1140.559602,11880.996033,13979.522795,13.133159,180.705866,198.594566,503.063997,505.708291,24022468.633229,24023784.152988,24026812.078905,24027345.149739,24029490.348577,24022819.618799,24024269.373743,24025217.991448,24026335.582614,24027037.799985,20.14115,925.258039,1154.122533,1299.781243,1541.911906,True,2074,6392,9107,5804,2068,6414,9069,5765,1200,3362,5477,4715,1150,3137,5423,4663,2423,1807,3622,2154,2324,1778,3595,2057,1403,3138,5099,3400,1352,2933,5016,3366,1707,3362,3643,2538,1511,3224,3577,2401,72427,70823,True,10713,12667,9754,14373,23316,11250,13040,10006,14754,23377
1,7262,Wychwood Ave / Benson Ave - SMART,SMARTMAPFRAME,16,43.680916,-79.422968,False,False,Wychwood,Others,5350116.00,133.332635,159.814329,206.330734,207.50717,220.234754,690.876892,1947.706021,9276.068618,14419.227477,193.017368,311.812013,567.406762,647.206503,698.974158,24020337.041984,24021652.428999,24024680.095669,24025213.112413,24027358.361015,24020687.990145,24022137.68962,24023086.216353,24024203.720512,24024905.898604,473.250474,861.837877,1413.99278,1679.009132,1958.025974,True,132,624,640,458,88,375,434,283,150,348,542,375,108,247,299,193,175,247,602,270,109,134,351,180,123,342,474,226,69,131,200,133,206,572,526,271,108,305,318,134,7303,4199,False,865,533,774,847,1180,1575,1165,1294,1415,1854
2,7000,Fort York Blvd / Capreol Ct,REGULAR,35,43.639832,-79.395954,False,True,Harbourfront-CityPlace,Downtown,5350012.01,257.313495,280.053224,347.999709,378.580565,381.006053,1233.803951,3206.125512,13987.008148,15213.129327,8.903554,128.002329,138.804963,167.307523,168.987297,24024484.745346,24025800.296774,24028828.284528,24029361.368263,24031506.555228,24024835.739841,24026285.508017,24027234.147425,24028351.759344,24029053.986084,368.720182,1613.293131,1842.141138,2102.331938,2562.887483,True,1499,6282,9920,4624,1378,6200,9903,4451,1448,6164,8235,4198,1423,6010,8100,3846,1392,3390,7751,3092,1243,3329,7628,2983,1104,4606,6322,2698,789,4284,6304,2449,1352,4103,5735,2319,1079,4187,5201,1901,86234,82688,True,12368,13826,15183,19379,21932,13509,14730,15625,20045,22325
3,7566,Weston Rd / St Clair Ave W - SMART,REGULAR,19,43.673019,-79.467994,False,True,Junction Area,Others,5350106.00,43.846626,61.878873,76.46446,106.780075,132.027826,2120.722549,3816.449463,11174.736898,18217.966178,444.423963,523.533741,763.590477,875.684848,1015.758096,24021985.907436,24023301.113384,24026328.426389,24026861.369338,24029006.685785,24022336.804572,24023786.428368,24024734.831007,24025852.216472,24026554.340973,632.099587,1114.655603,1490.875705,1550.952901,2130.108766,True,124,501,647,285,96,542,640,312,89,429,552,236,106,495,559,248,0,80,689,148,0,88,716,170,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3780,3972,False,0,0,974,1408,1590,0,0,917,1306,1557
4,7095,Danforth Ave / Ellerbeck St,REGULAR,14,43.677076,-79.35667,False,True,Playter Estates-Danforth,Others,5350085.00,98.390736,117.93654,123.013289,125.867194,127.879383,43.708758,2531.774228,9928.741257,10564.97092,49.495801,208.691724,278.940781,285.267097,312.353467,24019640.441622,24020956.114034,24023984.338112,24024517.471164,24026662.612723,24019991.470211,24021441.28893,24022390.011255,24023507.702483,24024209.965028,1020.274451,1328.814745,1775.068605,1858.312377,1979.168441,True,402,1647,2105,1057,389,1582,1981,1014,582,1511,1883,1061,575,1386,1642,983,408,802,1976,1059,393,801,1875,981,257,783,1103,604,241,770,993,548,470,1210,1409,476,438,1170,1264,404,20805,19430,False,3276,2552,4050,4586,4966,3565,2747,4245,5037,5211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,7536,Palmerston Ave / Dundas St W,REGULAR,23,43.651603,-79.408346,False,True,Trinity-Bellwoods,West of Downtown,5350040.00,185.041248,205.610764,207.239822,215.879437,226.073157,1617.610188,1682.308651,12583.641052,15107.618334,4.412463,24.903278,388.302294,465.113193,483.103441,24023375.408502,24024690.893114,24027718.750378,24028251.806886,24030397.018907,24023726.384168,24025176.124426,24026124.718043,24027242.286168,24027944.493138,241.507269,1059.796679,1112.630251,1556.893977,1616.31593,True,568,2909,5101,3063,511,2922,5100,3006,652,1729,2457,1518,544,1689,2496,1379,0,244,1965,1209,0,236,2038,1163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,21415,21084,False,0,0,3437,6108,11539,0,0,3418,6356,11641
625,7755,Park Lawn Rd / Berry Rd,VAULT,15,43.635606,-79.493462,False,False,Stonegate-Queensway,Others,5350216.00,39.864018,62.639476,76.790294,83.821959,99.267303,1487.591508,8269.435147,15952.807121,22014.902157,559.194207,560.872211,570.979741,571.819771,580.612184,24026611.508334,24027926.651773,24030953.84265,24031486.760112,24033632.099979,24026962.387856,24028411.985522,24029360.345316,24030477.689796,24031179.795789,673.59787,1907.945735,2197.661383,2370.538717,2474.700923,True,0,0,0,6,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,8,False,0,0,0,0,8,0,0,0,0,6
626,7203,Bathurst St/Queens Quay(Billy Bishop Airport),REGULAR,35,43.635492,-79.398253,False,True,Fort York-Liberty Village,West of Downtown,5350008.02,75.436457,77.145614,139.352627,192.245865,218.303041,1702.318252,3632.123115,14466.725195,15698.238351,20.308343,22.483128,228.404823,302.645745,408.066598,24025010.448257,24026325.995227,24029353.974269,24029887.056185,24032032.244824,24025361.441496,24026811.207809,24027759.844161,24028877.453156,24029579.678576,471.287071,1994.370445,2366.271817,2589.604983,2894.769058,True,985,8088,10272,2785,1057,9260,11591,3376,1509,9758,9744,2971,1734,10595,11068,3330,1193,5396,9527,2434,1378,5754,10025,2700,1249,7086,9570,2419,1248,7435,10315,2546,887,4218,8109,2380,958,4488,8474,2454,100580,109788,False,16374,21544,19857,26727,25284,15594,20324,18550,23982,22130
627,7635,Runnymede Rd / Annette St,REGULAR,15,43.65963,-79.479918,False,True,Runnymede-Bloor West Village,Others,5350104.00,36.902592,43.825822,61.779171,65.421436,134.958668,941.20803,5577.794793,12985.493853,19738.430678,6.112482,49.490309,49.929702,701.277175,794.388284,24023688.888445,24025004.060141,24028031.306232,24028564.235218,24030709.564497,24024039.77593,24025489.385408,24026437.764571,24027555.127579,24028257.24194,1023.471936,1166.517188,1206.238495,1349.034025,2255.355752,True,89,403,739,448,85,360,705,404,141,390,700,268,138,338,615,244,0,0,315,234,0,0,309,227,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3727,3425,False,0,0,536,1335,1554,0,0,549,1499,1679


CPU times: user 2min 27s, sys: 4.4 s, total: 2min 32s
Wall time: 13.7 s


#### Get Yearly Station Ridership Totals on Weekdays and Weekends Separately (Excluding Metadata)

Get the station ridership totals on weekdays and weekends using the same approach as above, but using only weekday or weekend trips from the processed bike share ridership data

In [29]:
%%time
df_stations_intra_week = {}
for k, dow in zip(['weekday', 'weekend'], [[0,1,2,3,4], [5,6]]):
    dow_str = ','.join([str(d) for d in dow])
    query = f"""
            -- 1. get total departures per station from most recent full year (2022)
            WITH t1 AS (
                SELECT start_station_id AS station_id,
                       CAST(started_at_year AS VARCHAR) AS year,
                       COUNT(DISTINCT(trip_id)) AS trips,
                       'departures_{k}' As type
                FROM read_parquet({fpaths_proc_all})
                WHERE ISODOW(started_at)-1 IN ({dow_str})
                AND started_at_year <= 2022
                GROUP BY all
            ),
            -- 2. get total arrivals per station from most recent full year (2022)
            t2 AS (
                SELECT end_station_id AS station_id,
                       CAST(ended_at_year AS VARCHAR) AS year,
                       COUNT(DISTINCT(trip_id)) AS trips,
                       'arrivals_{k}' As type
                FROM read_parquet({fpaths_proc_all})
                WHERE ISODOW(ended_at)-1 IN ({dow_str})
                AND ended_at_year <= 2022
                GROUP BY all
            ),
            -- 3. get total departures and number of stations from N most-recent full years (2018 to 2022)
            t3 AS (
                SELECT start_station_id AS station_id,
                       'all' AS year,
                       COUNT(DISTINCT(trip_id)) AS trips,
                       'departures_{k}' AS type
                FROM read_parquet({fpaths_proc_2018_2022})
                WHERE ISODOW(started_at)-1 IN ({dow_str})
                GROUP BY all
            ),
            -- 4. get total arrivals and number of stations from N most-recent full years (2018 to 2022)
            t4 AS (
                SELECT end_station_id AS station_id,
                       'all' AS year,
                       COUNT(DISTINCT(trip_id)) AS trips,
                       'arrivals_{k}' AS type
                FROM read_parquet({fpaths_proc_2018_2022})
                WHERE ISODOW(ended_at)-1 IN ({dow_str})
                GROUP BY all
            ),
            -- 5. get useful station attributes (excludes any single-value attributes since these
            -- are not useful for extracting insights)
            t5 AS (
                SELECT station_id,
                       name
                -- FROM read_parquet({[fpath_stations_info]})
                FROM df_merged
                LEFT JOIN df_downtown_neighs USING (Neighbourhood)
            ),
            -- 6. combine all
            t6 AS (
                SELECT *
                FROM t1
                UNION
                SELECT *
                FROM t2
                UNION
                SELECT *
                FROM t3
                UNION
                SELECT *
                FROM t4
            ),
            -- 7. reshape into untidy data
            t7 AS (
                PIVOT t6
                ON type || '_' || year
                USING sum(trips)
            ),
            -- 8. filter to get active stations, fill NULLs and assign ranks based on departures & arrivals
            t8 AS (
                SELECT * EXCLUDE (
                           departures_{k}_all,
                           arrivals_{k}_all,
                           departures_{k}_2022,
                           arrivals_{k}_2022,
                           departures_{k}_2021,
                           arrivals_{k}_2021,
                           departures_{k}_2020,
                           arrivals_{k}_2020,
                           departures_{k}_2019,
                           arrivals_{k}_2019,
                           departures_{k}_2018,
                           arrivals_{k}_2018
                       ),
                       COALESCE(departures_{k}_2022, NULL, 0) AS departures_{k}_2022,
                       COALESCE(arrivals_{k}_2022, NULL, 0) AS arrivals_{k}_2022,
                       COALESCE(departures_{k}_2021, NULL, 0) AS departures_{k}_2021,
                       COALESCE(arrivals_{k}_2021, NULL, 0) AS arrivals_{k}_2021,
                       COALESCE(departures_{k}_2020, NULL, 0) AS departures_{k}_2020,
                       COALESCE(arrivals_{k}_2020, NULL, 0) AS arrivals_{k}_2020,
                       COALESCE(departures_{k}_2019, NULL, 0) AS departures_{k}_2019,
                       COALESCE(arrivals_{k}_2019, NULL, 0) AS arrivals_{k}_2019,
                       COALESCE(departures_{k}_2019, NULL, 0) AS departures_{k}_2018,
                       COALESCE(arrivals_{k}_2018, NULL, 0) AS arrivals_{k}_2018,
                       COALESCE(departures_{k}_all, NULL, 0) AS departures_{k}_all,
                       COALESCE(arrivals_{k}_all, NULL, 0) AS arrivals_{k}_all
                FROM t7
                INNER JOIN t5 USING (station_id)
            )
            SELECT *
            FROM t8
            """
    df_stations_intra_week[k] = run_sql_query(query).convert_dtypes()
df_s_weekdays, df_s_weekends = list(df_stations_intra_week.values())
with pd.option_context('display.max_columns', None):
    pu.show_df(df_s_weekdays)
    pu.show_df(df_s_weekends)

column,station_id,name,departures_weekday_2022,arrivals_weekday_2022,departures_weekday_2021,arrivals_weekday_2021,departures_weekday_2020,arrivals_weekday_2020,departures_weekday_2019,arrivals_weekday_2019,departures_weekday_2018,arrivals_weekday_2018,departures_weekday_all,arrivals_weekday_all
dtype,string[python],string[python],Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
nunique,628,628,607,599,546,544,532,528,410,407,410,328,616,605
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7037,Bathurst St / Dundas St W,11296,11022,11099,10915,8567,8390,9176,8489,9176,7041,47937,45857
1,7050,Richmond St E / Jarvis St Green P,3733,3035,1905,1704,1751,1709,4059,4072,4059,5709,17104,16229
2,7450,Carlaw Ave / Danforth Ave,3111,3007,3001,2852,2258,2177,925,801,925,0,9295,8837
3,7053,Metro Hall Plaza,6484,7267,3777,3829,3077,3358,6817,7689,6817,8113,27277,30256
4,7323,457 King St W,5676,5995,3431,3884,2650,2777,5691,5913,5691,3906,21275,22475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,7598,Teddington Park Ave,395,359,324,263,240,212,0,0,0,0,959,834
624,7600,Ursula Franklin St / Huron St - SMART,6453,6004,3365,3430,545,466,0,0,0,0,10363,9900
625,7526,Bartlett Parkette,3343,3207,2076,1969,1220,1206,0,0,0,0,6639,6382
626,7641,Annette St / Dundas St W,1048,854,839,680,255,208,0,0,0,0,2142,1742


column,station_id,name,departures_weekend_2022,arrivals_weekend_2022,departures_weekend_2021,arrivals_weekend_2021,departures_weekend_2020,arrivals_weekend_2020,departures_weekend_2019,arrivals_weekend_2019,departures_weekend_2018,arrivals_weekend_2018,departures_weekend_all,arrivals_weekend_all
dtype,string[python],string[python],Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
nunique,629,629,570,574,522,522,493,505,385,386,385,317,603,601
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7078,College St / Major St,8218,8221,5211,5202,3878,3861,3597,3596,3597,3449,24363,24329
1,7334,Simcoe St / Wellington St W North,1644,1694,1040,968,748,720,732,791,732,970,5226,5143
2,7466,Glendonwynne Rd / Bloor St W,1105,1144,942,964,559,633,213,207,213,0,2819,2948
3,7079,McGill St / Church St,4672,4473,4169,3722,2007,1900,1621,1482,1621,1694,14253,13271
4,7208,80 Clinton St,2882,2853,1539,1522,1387,1373,1410,1340,1410,1172,8460,8260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,7335,Bay St / Bloor St W (West Side),2053,1665,1646,1285,1277,941,1585,1649,1585,919,7494,6459
625,7639,135 Queens Wharf Rd - SMART,921,917,652,637,124,139,0,0,0,0,1697,1693
626,7246,Yonge St / Bloor St,4760,4588,3991,3716,3175,2958,1815,1564,1815,288,14061,13114
627,7527,Joseph J Piccininni Community Centre,313,281,568,503,281,225,0,0,0,0,1162,1009


CPU times: user 1min 47s, sys: 3.41 s, total: 1min 50s
Wall time: 9.86 s


#### Get Yearly Station Ridership Totals by Annual and Casual Members Separately (Excluding Metadata)

Get the station ridership totals by Annual and Casual members using the same approach as above, but using only Annual or Casual member trips from the processed bike share ridership data

In [30]:
%%time
df_stations_annual_casual = {}
for k, dow in zip(['annual', 'casual'], ['Annual', 'Casual']):
    query = f"""
            -- 1. get total departures per station from most recent full year (2022)
            WITH t1 AS (
                SELECT start_station_id AS station_id,
                       CAST(started_at_year AS VARCHAR) AS year,
                       COUNT(DISTINCT(trip_id)) AS trips,
                       'departures_{k}' As type
                FROM read_parquet({fpaths_proc_all})
                WHERE user_type LIKE '{dow}%'
                AND started_at_year <= 2022
                GROUP BY all
            ),
            -- 2. get total arrivals per station from most recent full year (2022)
            t2 AS (
                SELECT end_station_id AS station_id,
                       CAST(ended_at_year AS VARCHAR) AS year,
                       COUNT(DISTINCT(trip_id)) AS trips,
                       'arrivals_{k}' As type
                FROM read_parquet({fpaths_proc_all})
                WHERE user_type LIKE '{dow}%'
                AND ended_at_year <= 2022
                GROUP BY all
            ),
            -- 3. get total departures and number of stations from N most-recent full years (2018 to 2022)
            t3 AS (
                SELECT start_station_id AS station_id,
                       'all' AS year,
                       COUNT(DISTINCT(trip_id)) AS trips,
                       'departures_{k}' AS type
                FROM read_parquet({fpaths_proc_2018_2022})
                WHERE user_type LIKE '{dow}%'
                GROUP BY all
            ),
            -- 4. get total arrivals and number of stations from N most-recent full years (2018 to 2022)
            t4 AS (
                SELECT end_station_id AS station_id,
                       'all' AS year,
                       COUNT(DISTINCT(trip_id)) AS trips,
                       'arrivals_{k}' AS type
                FROM read_parquet({fpaths_proc_2018_2022})
                WHERE user_type LIKE '{dow}%'
                GROUP BY all
            ),
            -- 5. get useful station attributes (excludes any single-value attributes since these
            -- are not useful for extracting insights)
            t5 AS (
                SELECT station_id,
                       name
                -- FROM read_parquet({[fpath_stations_info]})
                FROM df_merged
                LEFT JOIN df_downtown_neighs USING (Neighbourhood)
            ),
            -- 6. combine all
            t6 AS (
                SELECT *
                FROM t1
                UNION
                SELECT *
                FROM t2
                UNION
                SELECT *
                FROM t3
                UNION
                SELECT *
                FROM t4
            ),
            -- 7. reshape into untidy data
            t7 AS (
                PIVOT t6
                ON type || '_' || year
                USING sum(trips)
            ),
            -- 8. filter to get active stations, fill NULLs and assign ranks based on departures & arrivals
            t8 AS (
                SELECT * EXCLUDE (
                           departures_{k}_all,
                           arrivals_{k}_all,
                           departures_{k}_2022,
                           arrivals_{k}_2022,
                           departures_{k}_2021,
                           arrivals_{k}_2021,
                           departures_{k}_2020,
                           arrivals_{k}_2020,
                           departures_{k}_2019,
                           arrivals_{k}_2019,
                           departures_{k}_2018,
                           arrivals_{k}_2018
                       ),
                       COALESCE(departures_{k}_2022, NULL, 0) AS departures_{k}_2022,
                       COALESCE(arrivals_{k}_2022, NULL, 0) AS arrivals_{k}_2022,
                       COALESCE(departures_{k}_2021, NULL, 0) AS departures_{k}_2021,
                       COALESCE(arrivals_{k}_2021, NULL, 0) AS arrivals_{k}_2021,
                       COALESCE(departures_{k}_2020, NULL, 0) AS departures_{k}_2020,
                       COALESCE(arrivals_{k}_2020, NULL, 0) AS arrivals_{k}_2020,
                       COALESCE(departures_{k}_2019, NULL, 0) AS departures_{k}_2019,
                       COALESCE(arrivals_{k}_2019, NULL, 0) AS arrivals_{k}_2019,
                       COALESCE(departures_{k}_2019, NULL, 0) AS departures_{k}_2018,
                       COALESCE(arrivals_{k}_2018, NULL, 0) AS arrivals_{k}_2018,
                       COALESCE(departures_{k}_all, NULL, 0) AS departures_{k}_all,
                       COALESCE(arrivals_{k}_all, NULL, 0) AS arrivals_{k}_all
                FROM t7
                INNER JOIN t5 USING (station_id)
            )
            SELECT *
            FROM t8
            """
    df_stations_annual_casual[k] = run_sql_query(query).convert_dtypes()
df_stations_annual, df_stations_casual = list(df_stations_annual_casual.values())
with pd.option_context('display.max_columns', None):
    pu.show_df(df_stations_annual)
    pu.show_df(df_stations_casual)

column,station_id,name,departures_annual_2022,arrivals_annual_2022,departures_annual_2021,arrivals_annual_2021,departures_annual_2020,arrivals_annual_2020,departures_annual_2019,arrivals_annual_2019,departures_annual_2018,arrivals_annual_2018,departures_annual_all,arrivals_annual_all
dtype,string[python],string[python],Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
nunique,625,625,574,577,545,546,523,523,409,406,409,329,611,605
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7237,Ward St / Wallace Ave,1565,1390,2095,1973,2192,2146,1796,1574,1796,1959,9657,9042
1,7407,University Ave / Queen St W,2092,1915,1830,1744,1674,1516,1355,1262,1355,0,6951,6437
2,7019,Temperance St Station,2006,2708,3685,4122,3404,3819,9058,9346,9058,14915,30278,34910
3,7130,Davenport Rd / Bedford Rd,2137,2141,2317,2325,1580,1681,902,770,902,599,7635,7516
4,7022,Simcoe St / Queen St W,9605,10764,8858,10238,9071,10130,12824,13748,12824,13418,53086,58298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,7378,Yonge St / Davisville Ave,1118,781,1760,1328,1762,1362,1392,691,1392,156,6461,4318
621,7457,Queen's Park Cres W / Hoskin Ave,4654,4027,4769,4425,3147,2945,1742,1550,1742,0,14312,12947
622,7070,25 York St – Union Station South,2117,2408,2179,2931,1932,2251,4873,5318,4873,7074,17558,19982
623,7156,Salem Ave / Bloor St W,2218,2301,3075,2993,3126,3057,2846,2742,2846,2765,14125,13858


column,station_id,name,departures_casual_2022,arrivals_casual_2022,departures_casual_2021,arrivals_casual_2021,departures_casual_2020,arrivals_casual_2020,departures_casual_2019,arrivals_casual_2019,departures_casual_2018,arrivals_casual_2018,departures_casual_all,arrivals_casual_all
dtype,string[python],string[python],Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
nunique,629,629,601,589,524,519,486,493,376,381,376,296,616,601
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7040,Euclid Ave / Bloor St W,5966,5881,2669,2501,1758,1610,1021,784,1021,606,12200,11383
1,7650,St Clair West Subway Station - Heath Entrance,1158,708,677,533,182,152,0,0,0,0,2017,1393
2,7191,Central Tech (Harbord St),5454,5373,2090,2146,1212,1117,726,646,726,348,9907,9630
3,7176,Bathurst St / Fort York Blvd,6267,6255,3622,3250,2800,2429,1976,1632,1976,1132,16238,14698
4,7161,Beverley St / College St,6794,6624,1921,1930,898,656,1008,713,1008,506,11531,10429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,7156,Salem Ave / Bloor St W,3146,2929,1303,1248,1138,1053,648,596,648,497,6759,6323
625,7266,Victoria Park Subway Station - SMART,604,472,319,237,282,274,86,104,86,33,1302,1120
626,7516,Superior Ave / Lake Shore Blvd W,1711,1775,1493,1599,1183,1176,0,0,0,0,4387,4550
627,7457,Queen's Park Cres W / Hoskin Ave,7162,6966,3231,2889,1307,1066,402,415,402,0,12102,11336


CPU times: user 1min 40s, sys: 3.51 s, total: 1min 43s
Wall time: 9.43 s


### Combine Ridership Trends (Behavioural Data) with Station Attributes

Merge station attributes with behavioural data, using a `LEFT JOIN` on the `station_id` and `name` columns

In [31]:
%%time
query = f"""
        SELECT *
        FROM df_s
        LEFT JOIN df_s_weekdays USING (station_id, name)
        LEFT JOIN df_s_weekends USING (station_id, name)
        LEFT JOIN df_stations_annual USING (station_id, name)
        LEFT JOIN df_stations_casual USING (station_id, name)
        """
df_merged_combo = run_sql_query(query).convert_dtypes()
with pd.option_context('display.max_columns', None):
    pu.show_df(df_merged_combo)

column,station_id,name,physical_configuration,capacity,lat,lon,is_charging_station,credit,Neighbourhood,Location,census_tract_id,distance_pt_1,distance_pt_2,distance_pt_3,distance_pt_4,distance_pt_5,distance_train_1,distance_train_2,distance_train_3,distance_train_4,distance_cycle_1,distance_cycle_2,distance_cycle_3,distance_cycle_4,distance_cycle_5,distance_poi_1,distance_poi_2,distance_poi_3,distance_poi_4,distance_poi_5,distance_ch_1,distance_ch_2,distance_ch_3,distance_ch_4,distance_ch_5,distance_library_1,distance_library_2,distance_library_3,distance_library_4,distance_library_5,is_active,departures_2022_q1,departures_2022_q2,departures_2022_q3,departures_2022_q4,arrivals_2022_q1,arrivals_2022_q2,arrivals_2022_q3,arrivals_2022_q4,departures_2021_q1,departures_2021_q2,departures_2021_q3,departures_2021_q4,arrivals_2021_q1,arrivals_2021_q2,arrivals_2021_q3,arrivals_2021_q4,departures_2020_q1,departures_2020_q2,departures_2020_q3,departures_2020_q4,arrivals_2020_q1,arrivals_2020_q2,arrivals_2020_q3,arrivals_2020_q4,departures_2019_q1,departures_2019_q2,departures_2019_q3,departures_2019_q4,arrivals_2019_q1,arrivals_2019_q2,arrivals_2019_q3,arrivals_2019_q4,departures_2018_q1,departures_2018_q2,departures_2018_q3,departures_2018_q4,arrivals_2018_q1,arrivals_2018_q2,arrivals_2018_q3,arrivals_2018_q4,departures_all,arrivals_all,is_downtown,arrivals_2018,arrivals_2019,arrivals_2020,arrivals_2021,arrivals_2022,departures_2018,departures_2019,departures_2020,departures_2021,departures_2022,departures_weekday_2022,arrivals_weekday_2022,departures_weekday_2021,arrivals_weekday_2021,departures_weekday_2020,arrivals_weekday_2020,departures_weekday_2019,arrivals_weekday_2019,departures_weekday_2018,arrivals_weekday_2018,departures_weekday_all,arrivals_weekday_all,departures_weekend_2022,arrivals_weekend_2022,departures_weekend_2021,arrivals_weekend_2021,departures_weekend_2020,arrivals_weekend_2020,departures_weekend_2019,arrivals_weekend_2019,departures_weekend_2018,arrivals_weekend_2018,departures_weekend_all,arrivals_weekend_all,departures_annual_2022,arrivals_annual_2022,departures_annual_2021,arrivals_annual_2021,departures_annual_2020,arrivals_annual_2020,departures_annual_2019,arrivals_annual_2019,departures_annual_2018,arrivals_annual_2018,departures_annual_all,arrivals_annual_all,departures_casual_2022,arrivals_casual_2022,departures_casual_2021,arrivals_casual_2021,departures_casual_2020,arrivals_casual_2020,departures_casual_2019,arrivals_casual_2019,departures_casual_2018,arrivals_casual_2018,departures_casual_all,arrivals_casual_all
dtype,string[python],string[python],string[python],Int64,Float64,Float64,boolean,boolean,string[python],string[python],string[python],Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,boolean,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,boolean,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
nunique,629,629,6,39,626,628,2,2,83,4,214,629,629,629,629,629,629,629,629,626,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,1,426,550,560,565,425,536,569,559,429,508,537,492,433,511,521,506,359,454,502,487,360,446,515,481,280,330,406,389,288,322,403,383,235,263,326,311,235,262,324,309,618,620,2,326,409,529,554,606,327,413,538,548,611,607,599,546,544,532,528,410,407,410,328,616,605,570,574,522,522,493,505,385,386,385,317,603,601,574,577,545,546,523,523,409,406,409,329,611,605,601,589,524,519,486,493,376,381,376,296,616,601
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,4,4,4,4,4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,0
0,7262,Wychwood Ave / Benson Ave - SMART,SMARTMAPFRAME,16,43.680916,-79.422968,False,False,Wychwood,Others,5350116.00,133.332635,159.814329,206.330734,207.50717,220.234754,690.876892,1947.706021,9276.068618,14419.227477,193.017368,311.812013,567.406762,647.206503,698.974158,24020337.041984,24021652.428999,24024680.095669,24025213.112413,24027358.361015,24020687.990145,24022137.68962,24023086.216353,24024203.720512,24024905.898604,473.250474,861.837877,1413.99278,1679.009132,1958.025974,True,132,624,640,458,88,375,434,283,150,348,542,375,108,247,299,193,175,247,602,270,109,134,351,180,123,342,474,226,69,131,200,133,206,572,526,271,108,305,318,134,7303,4199,False,865,533,774,847,1180,1575,1165,1294,1415,1854,1202,748,957,533,937,538,816,384,816,699,5114,2902,652,432,458,314,357,236,349,149,349,166,2189,1297,715,444,953,598,966,588,1000,470,1000,784,5127,2884,1139,736,462,249,328,186,165,63,165,81,2176,1315
1,7000,Fort York Blvd / Capreol Ct,REGULAR,35,43.639832,-79.395954,False,True,Harbourfront-CityPlace,Downtown,5350012.01,257.313495,280.053224,347.999709,378.580565,381.006053,1233.803951,3206.125512,13987.008148,15213.129327,8.903554,128.002329,138.804963,167.307523,168.987297,24024484.745346,24025800.296774,24028828.284528,24029361.368263,24031506.555228,24024835.739841,24026285.508017,24027234.147425,24028351.759344,24029053.986084,368.720182,1613.293131,1842.141138,2102.331938,2562.887483,True,1499,6282,9920,4624,1378,6200,9903,4451,1448,6164,8235,4198,1423,6010,8100,3846,1392,3390,7751,3092,1243,3329,7628,2983,1104,4606,6322,2698,789,4284,6304,2449,1352,4103,5735,2319,1079,4187,5201,1901,86234,82688,True,12368,13826,15183,19379,21932,13509,14730,15625,20045,22325,15662,15141,13699,13139,10564,10138,10848,10105,10848,9405,61173,57928,6663,6791,6346,6240,5061,5045,3882,3721,3882,2963,25061,24760,9189,8347,12346,11890,9914,9531,10920,10154,10920,10189,53869,50111,13136,13585,7699,7489,5711,5652,3810,3672,3810,2179,32365,32577
2,7566,Weston Rd / St Clair Ave W - SMART,REGULAR,19,43.673019,-79.467994,False,True,Junction Area,Others,5350106.00,43.846626,61.878873,76.46446,106.780075,132.027826,2120.722549,3816.449463,11174.736898,18217.966178,444.423963,523.533741,763.590477,875.684848,1015.758096,24021985.907436,24023301.113384,24026328.426389,24026861.369338,24029006.685785,24022336.804572,24023786.428368,24024734.831007,24025852.216472,24026554.340973,632.099587,1114.655603,1490.875705,1550.952901,2130.108766,True,124,501,647,285,96,542,640,312,89,429,552,236,106,495,559,248,0,80,689,148,0,88,716,170,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3780,3972,False,0,0,974,1408,1590,0,0,917,1306,1557,974,1035,812,833,559,575,0,0,0,0,2345,2443,583,555,494,575,358,399,0,0,0,0,1435,1529,516,593,783,793,539,579,0,0,0,0,1838,1965,1041,997,523,615,378,395,0,0,0,0,1942,2007
3,7095,Danforth Ave / Ellerbeck St,REGULAR,14,43.677076,-79.35667,False,True,Playter Estates-Danforth,Others,5350085.00,98.390736,117.93654,123.013289,125.867194,127.879383,43.708758,2531.774228,9928.741257,10564.97092,49.495801,208.691724,278.940781,285.267097,312.353467,24019640.441622,24020956.114034,24023984.338112,24024517.471164,24026662.612723,24019991.470211,24021441.28893,24022390.011255,24023507.702483,24024209.965028,1020.274451,1328.814745,1775.068605,1858.312377,1979.168441,True,402,1647,2105,1057,389,1582,1981,1014,582,1511,1883,1061,575,1386,1642,983,408,802,1976,1059,393,801,1875,981,257,783,1103,604,241,770,993,548,470,1210,1409,476,438,1170,1264,404,20805,19430,False,3276,2552,4050,4586,4966,3565,2747,4245,5037,5211,3527,3324,3380,3102,2917,2770,1961,1799,1961,2420,14408,13415,1684,1642,1657,1484,1328,1280,786,753,786,856,6397,6015,1955,1764,3796,3297,3317,3223,2365,2229,2365,2895,14624,13408,3256,3202,1241,1289,928,827,382,323,382,381,6181,6022
4,7368,Lisgar St / Dundas St W - SMART,SMARTMAPFRAME,8,43.649347,-79.42668,False,False,Little Portugal,West of Downtown,5350044.01,19.520522,29.188063,148.963533,177.909426,247.180432,1396.277902,2809.041199,12904.31747,16487.384413,245.679349,263.294504,421.532005,425.067028,434.924629,24023939.567352,24025254.977047,24028282.687984,24028815.713959,24030960.95407,24024290.521906,24025740.230858,24026688.773131,24027806.292148,24028508.476946,861.112595,1059.975499,1351.822662,1787.59567,1914.737238,True,492,2336,3088,1924,481,2372,3107,1913,452,1427,2160,1132,473,1483,2261,1170,455,716,1837,856,459,719,1824,859,300,978,1439,769,276,952,1471,737,0,0,621,560,0,0,598,544,21542,21699,False,1142,3436,3861,5387,7873,1181,3486,3864,5171,7840,5210,5244,3572,3712,2700,2671,2414,2402,2414,802,14714,14831,2630,2629,1599,1675,1164,1190,1072,1034,1072,340,6828,6868,2892,2892,3600,3517,2895,2742,2991,2728,2991,992,13523,12871,4948,4981,1571,1870,969,1119,495,708,495,150,8019,8828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,7150,Dufferin St / Sylvan Av (Dufferin Grove Park),REGULAR,19,43.655556,-79.433611,False,True,Dufferin Grove,West of Downtown,5350054.00,44.394336,53.516288,88.627652,163.3723,201.649269,552.961592,2802.360049,12271.220444,16573.106626,121.917219,132.214252,229.768734,243.981522,251.821647,24023361.05858,24024676.431369,24027704.070228,24028237.081165,24030382.335108,24023712.002733,24025161.696262,24026110.213242,24027227.708069,24027929.881946,496.217235,1123.5408,1649.027559,1743.573205,1959.128968,True,369,1932,3304,1359,404,2115,3333,1407,495,1213,1795,1036,500,1274,1870,1130,385,949,2188,1153,468,1026,2320,1208,212,861,1558,611,245,920,1604,668,188,901,1141,480,220,972,1158,524,22130,23366,False,2874,3437,5022,4774,7259,2710,3242,4675,4539,6964,4637,4855,3109,3245,3061,3307,2210,2308,2210,1942,14857,15657,2327,2404,1430,1529,1614,1715,1032,1129,1032,932,7273,7709,2596,2692,3027,3094,3276,3561,2381,2650,2381,2378,13459,14375,4368,4567,1512,1680,1399,1461,861,787,861,496,8671,8991
625,7704,24 Mountjoy Ave,REGULAR,17,43.680877,-79.330449,False,True,Greenwood-Coxwell,Others,5350075.00,124.655842,145.241562,153.08186,156.719686,169.356698,206.811658,4750.280001,8039.914423,10744.539283,113.471066,134.685049,134.861313,209.989214,318.152918,24018767.406305,24020083.185087,24023111.616917,24023644.793321,24025789.894946,24019118.46487,24020568.328021,24021517.123246,24022634.884195,24023337.178213,813.325887,1186.698118,1203.729854,1288.92979,1742.698646,True,0,205,442,223,0,165,365,147,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,870,677,False,0,0,0,0,677,0,0,0,0,870,620,456,0,0,0,0,0,0,0,0,620,456,250,221,0,0,0,0,0,0,0,0,250,221,251,163,0,0,0,0,0,0,0,0,251,163,619,514,0,0,0,0,0,0,0,0,619,514
626,7311,Sherbourne St / Isabella St,REGULAR,19,43.669576,-79.375961,False,True,North St.James Town,Downtown,5350064.00,25.256906,85.327198,281.654924,287.881642,289.944622,293.760995,757.41514,10921.631326,11717.894072,16.314622,303.438184,326.945822,327.002551,327.274772,24020809.720834,24022125.320711,24025153.403117,24025686.506606,24027831.675386,24021160.728982,24022610.517399,24023559.190013,24024676.833696,24025379.074778,292.627978,931.626708,1085.570642,1119.438374,1945.524768,True,1109,4712,6992,3857,927,3951,6303,3482,1475,4051,4958,3087,1229,3312,4085,2424,1273,2341,4359,2584,986,1989,3931,2198,878,2131,3214,1936,557,1775,3036,1624,0,153,2091,1504,0,135,1946,1092,52705,44982,True,3173,6992,9104,11050,14663,3748,8159,10557,13571,16670,12123,10599,9648,7849,7502,6467,6274,5369,6274,2379,38345,32663,4547,4064,3923,3201,3055,2637,1885,1623,1885,794,14360,12319,7077,6133,10394,8544,8781,7601,7392,6121,7392,2967,37158,31366,9593,8530,3177,2506,1776,1503,767,871,767,206,15547,13616
627,7020,Phoebe St / Spadina Ave,REGULAR,33,43.650033,-79.396555,False,True,Kensington-Chinatown,Downtown,5350036.00,98.061176,130.568102,137.630828,157.404986,165.33478,799.429318,2068.969627,12818.933528,14461.436656,227.809082,231.797576,248.059567,249.790167,315.658423,24023351.141418,24024666.677885,24027694.636426,24028227.714066,24030372.90664,24023702.131697,24025151.893622,24026100.522776,24027218.124891,24027920.347205,781.70269,936.664203,1112.404095,1266.633489,2045.13067,True,877,4067,6503,3485,1041,5357,7857,4610,838,2195,4637,2833,887,2495,5561,3527,1997,1364,4017,1686,2278,1423,4419,1969,1475,4175,6046,2763,1726,4629,6651,3160,1737,4590,6207,2791,1905,4963,6405,3222,64283,74085,True,16495,16166,10089,12470,18865,15325,14459,9064,10503,14932,10741,13590,7546,8943,6498,7304,11725,13054,11725,13571,49004,56462,4191,5275,2957,3527,2566,2785,2734,3112,2734,2924,15279,17623,5546,7487,6835,8108,6653,7710,11612,13557,11612,14426,43223,51288,9386,11378,3668,4362,2411,2379,2847,2609,2847,2069,21060,22797


CPU times: user 154 ms, sys: 3.94 ms, total: 158 ms
Wall time: 151 ms


**Observations**

1. All the performance stats on weekdays have missing values (as an example, see `arrivals_weekday_2020`). Overall and weekend values do not have missing values. This indicates that some stations were only used on weekends (not on weekdays). This is similarly observed for Annual and Casual member ridership per station (as an example, see `arrivals_annual_2018`).

## Load

Export the overall dataset of bike share station attributes and behavioural data, consisting of the following per station

1. station attributes
   - non-unique valued metadata columns
   - distances between bike share stations and the five closest
     - public library branches
     - cultural hotspots
     - places of interest
     - bikeways (cycle paths)
     - public transit (train) stations
     - public transit (bus) stops
2. behavioural data
   - quarterly and yearly bike share ridership trends
   - yearly bike share ridership trends by annual and casual members
   - yearly bike share ridership trends on weekdays and weekends

to disk

In [32]:
%%time
fpath_station_metadata_large = glob(
    os.path.join(
        processed_data_dir,
        'station_attributes_behavioural_data__*.parquet.gzip',
    )
)
if not fpath_station_metadata_large:
    flut.load(
        df_merged,
        processed_data_dir,
        'station_attributes_behavioural_data',
        my_timezone,
        verbose=True,
    )

Exported 790 rows of station_attributes_behavioural_data data to /home/jovyan/data/processed/station_attributes_behavioural_data__20240426_164815.parquet.gzip
CPU times: user 23.1 ms, sys: 7.94 ms, total: 31 ms
Wall time: 30.5 ms


## Discussion

### Conclusion

1. This step has created a dataset consisting of the following per bike share station
   - atttibutes
     - station metadata
     - distances to the five closest points of interest across the city
   - user behavioural data
     - periodic ridership trends

The created station summary dataset can be used for clustering bike share stations.

## Summary of Assumptions

1. Same as in data retrieval and processing step.

## Next Step

The next step will perform temporal exploratory data analysis using the processed bike share ridership data.

## Version Information

In [34]:
packages = [
    'requests',
    'tqdm',
    'contexttimer',
    'duckdb',
    'pandas',
    'geopandas',
    'pyarrow',
]
print(
    watermark(
        updated=True,
        current_date=True,
        current_time=True,
        timezone=True,
        custom_time="%Y-%m-%d %H:%M:%S %Z",
        python=True,
        machine=True,
        packages=','.join(packages),
    )
)

Last updated: 2024-04-26 20:48:35 UTC

Python implementation: CPython
Python version       : 3.12.3
IPython version      : 8.22.2

requests    : 2.31.0
tqdm        : 4.66.2
contexttimer: 0.3.3
duckdb      : 0.10.2
pandas      : 2.2.2
geopandas   : 0.14.3
pyarrow     : 15.0.1

Compiler    : GCC 12.3.0
OS          : Linux
Release     : 6.8.0-76060800daily20240311-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 12
Architecture: 64bit

