# Get Data

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
from glob import glob

import boto3
import geopandas as gpd
import pandas as pd
import pandera as pa
import requests
import snowflake.connector
from dotenv import find_dotenv, load_dotenv
from snowflake.connector.pandas_tools import write_pandas



In [3]:
%aimport src.aggregate_data
import src.aggregate_data as ad

%aimport src.city_neighbourhoods
import src.city_neighbourhoods as cn

%aimport src.city_pub_data
import src.city_pub_data as cpd

%aimport src.process_trips
from src.process_trips import process_trips_data

%aimport src.stations_metadata
from src.stations_metadata import get_stations_metadata, transform_metadata

%aimport src.trips
import src.trips as bt

%aimport src.utils
from src.utils import export_df_to_multiple_csv_files, show_sql_df, summarize_df

## About

Download Toronto Bikeshare trips data, bikeshare stations metadata and supplementary (neighbourhood-specific) datasets.

## User Inputs

In [80]:
# --- Datasets ---
# Open Data Portal endpoint handed to every data-fetching helper in this notebook
url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"

# Ridership: request parameters, plus the periods wanted
# (keys are years; values are presumably month numbers - confirm against src.trips)
trips_params = dict(id="7e876c24-177c-4605-9cef-e50dd74c617f")
years_wanted = {2021: [*range(1, 13)], 2022: [1]}

# Stations metadata: request parameters and the columns of interest
about_params = dict(id="2b44db0d-eea9-442d-b038-79335368ad5a")
stations_cols_wanted = [
    # identity and layout
    "station_id",
    "name",
    "physical_configuration",
    # position
    "lat",
    "lon",
    "altitude",
    "address",
    # size and access options
    "capacity",
    "physicalkey",
    "transitcard",
    "creditcard",
    "phone",
]

# Columns retained from the neighbourhood boundary file
neigh_cols_to_show = [
    "AREA_ID",
    "AREA_SHORT_CODE",
    "AREA_LONG_CODE",
    "AREA_NAME",
    # provided size measurements
    "Shape__Area",
    "Shape__Length",
    # coordinates
    "LATITUDE",
    "AREA_LATITUDE",
    "LONGITUDE",
    "AREA_LONGITUDE",
    "geometry",
]

# Datetime columns of the raw ridership files
date_cols = [f"{edge} Time" for edge in ("Start", "End")]

# Ridership columns whose missing values cause a row to be dropped
# (station ids first, then station names; start before end in each pair)
nan_cols = [
    f"{edge}_STATION_{field}" for field in ("ID", "NAME") for edge in ("START", "END")
]

# Ridership columns used to detect duplicated rows that get dropped
duplicated_cols = [
    "TRIP_ID",
    "START_TIME",
    "END_TIME",
]

# --- Snowflake resource names ---
stations_db_name = "torbikestations"  # database
trips_table_name, station_stats_table_name = "trips", "station_stats"  # tables
trips_stage_name = "bikes_stage"  # stage
trips_file_format_name = "COMMASEP_ONEHEADROW"  # file format

# Boundary columns handed to the neighbourhood lookup alongside each point dataset
geo_cols = ["AREA_NAME", "geometry", "Shape__Area"]

# --- Export to staged CSV files ---
# Columns written to each staged file, in this order
cols_to_export = [
    # hourly trip aggregates
    "STATION_NAME",
    "YEAR",
    "MONTH",
    "DAY",
    "HOUR",
    "USER_TYPE",
    "NUM_TRIPS",
    "DURATION_MEAN",
    # station attributes
    "STATION_TYPE",
    "AREA_NAME",
    "PHYSICAL_CONFIGURATION",
    "CAPACITY",
    "PHYSICALKEY",
    "TRANSITCARD",
    "CREDITCARD",
    "PHONE",
    # per-neighbourhood location counts
    "NEIGH_TRANSIT_STOPS",
    "NEIGH_COLLEGES_UNIVS",
    "NEIGH_CULTURAL_ATTRACTIONS",
    "NEIGH_PLACES_OF_INTEREST",
]
# Chunk size (rows) handed to the export helper
nrows_per_staged_csv_file = 380 * 1_000

# "no" makes the notebook load settings from a local .env file before reading the environment
ci_run = "no"

In [5]:
# Nullable pandas dtypes for the columns of the raw ridership files
# (datetime columns are listed separately in date_cols)
dtypes_dict_trips = {
    column: dtype_cls()
    for column, dtype_cls in (
        ("Trip Id", pd.Int64Dtype),
        ("Trip Duration", pd.Int64Dtype),
        ("Start Station Id", pd.Int64Dtype),
        ("Start Station Name", pd.StringDtype),
        ("End Station Id", pd.Int64Dtype),
        ("End Station Name", pd.StringDtype),
        ("Bike Id", pd.Int64Dtype),
        ("User Type", pd.StringDtype),
    )
}

# Outside of CI, load settings from a local .env file into the process environment
if ci_run == "no":
    load_dotenv(find_dotenv())

trips_db_name = os.getenv("DB_NAME")

# Connection settings without a database or schema.
# NOTE: os.getenv returns None for unset variables, so a missing setting only
# surfaces when a connection is attempted with these dicts.
snowflake_dict_no_db = dict(
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASS"),
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    role="sysadmin",
)
# Same settings, pointed at the trips database and schema.
# dict(base, **extra) builds a new dict, so the three dicts stay independent.
snowflake_dict = dict(
    snowflake_dict_no_db,
    database=trips_db_name,
    schema=os.getenv("SNOWFLAKE_DB_SCHEMA"),
)
# Same settings again, pointed at the stations database instead
snowflake_station_stats_dict = dict(snowflake_dict, database=stations_db_name)

In [6]:
# AWS region comes from the environment; the account id is looked up through STS
aws_region = os.getenv("AWS_REGION")
sts_client = boto3.client("sts", region_name=aws_region)
caller_identity = sts_client.get_caller_identity()
account_id = caller_identity.get("Account")

## Create AWS Python SDK Objects for Creating QuickSight Resources

In [7]:
# Two QuickSight clients: one pinned to us-east-1, one in the configured aws_region.
# NOTE(review): presumably user-management calls must go to us-east-1 while other
# resources live in aws_region - confirm against where these clients are used.
qs_client_user = boto3.client("quicksight", region_name="us-east-1")
qs_client = boto3.client("quicksight", region_name=aws_region)

## Get Bikeshare Trips Data

### Get URLs for Raw Trips Data Files

In [8]:
# %%time
# all_urls = bt.get_file_urls(url, trips_params, years_wanted)

### Download Raw Trips Data Files

In [9]:
# %%time
# cols_dict_list = bt.get_all_data_files(all_urls, dtypes_dict_trips, date_cols, nan_cols, parallel=False)

Perform sanity checks on column names and column order in raw trips data files

In [10]:
# cols_cleaned = {
#     k: [re.sub(r"[^A-Za-z0-9\s]+", "", c) for c in l["columns"]]
#     for f in cols_dict_list
#     for k, l in f.items()
# }
# assert len(cols_cleaned) == len(cols_dict_list)

# cols_equality_checks = {
#     k: True if cols == list(cols_cleaned.values())[0] else False
#     for k, cols in {m: cols_cleaned[m] for m in list(cols_cleaned)[1:]}.items()
# }
# try:
#     assert all(list(cols_equality_checks.values()))
# except AssertionError:
#     print(cols_equality_checks)

## Get Supplementary Datasets

### Stations Metadata

In [11]:
%%time
df_stations = get_stations_metadata(url, about_params)
df_stations = transform_metadata(df_stations, stations_cols_wanted)
display(df_stations.head(2))
summarize_df(df_stations)

Unnamed: 0,station_id,name,physical_configuration,lat,lon,altitude,address,capacity,physicalkey,transitcard,creditcard,phone
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,0.0,Fort York Blvd / Capreol Ct,35,1,1,1,1
1,7001,Wellesley Station Green P,REGULAR,43.664964,-79.38355,0.0,Yonge / Wellesley,17,1,1,1,1


Unnamed: 0,dtype,num_missing,num,nunique,single_non_nan_value
station_id,int64,0,624,624,7383
name,string,0,624,624,12 Harbour St
physical_configuration,string,0,624,5,REGULAR
lat,float64,0,624,621,43.642564
lon,float64,0,624,623,-79.3762
altitude,float64,10,624,2,0.0
address,string,0,624,624,12 Harbour St
capacity,int64,0,624,37,15
physicalkey,int64,0,624,1,1
transitcard,int64,0,624,1,1


CPU times: user 59.9 ms, sys: 3.56 ms, total: 63.4 ms
Wall time: 501 ms


### Process Ridership Data

#### Load Saved Trips Data

In [12]:
%%time
# Load the previously saved raw trips data (the argument looks like a glob over
# the CSV files in data/raw) and summarise every column of the resulting frame
df = bt.load_trips_data("data/raw/*.csv")
summarize_df(df)

Unnamed: 0,dtype,num_missing,num,nunique,single_non_nan_value
TRIP_ID,Int64,0,3621230,3621230,13242008
TRIP__DURATION,int64,0,3621230,19074,1089
START_STATION_ID,Int64,0,3621230,620,7491
START_TIME,datetime64[ns],0,3621230,447292,2021-08-23 17:00:00
START_STATION_NAME,string,0,3621230,735,D'Arcy St / Spadina Ave - SMART
END_STATION_ID,Int64,0,3621230,622,7335
END_TIME,datetime64[ns],0,3621230,446893,2021-08-23 17:18:00
END_STATION_NAME,string,0,3621230,740,Bay St / Bloor St W (West Side)
BIKE_ID,Int64,201,3621230,6501,1451
USER_TYPE,string,0,3621230,2,Casual Member


CPU times: user 14.7 s, sys: 657 ms, total: 15.3 s
Wall time: 15.4 s


#### Summarize Missing Values and Duplicates from Ridership Data

In [13]:
%%time
dups_to_drop = df.dropna(subset=nan_cols)[
    df.dropna(subset=nan_cols).duplicated(subset=duplicated_cols, keep="first")
]
not_missing = len(df.dropna(subset=nan_cols))
d_nan = {
    "all": len(df),
    "non_missing": not_missing,
    "frac_to_drop": ((len(df) - not_missing) / len(df)) * 100,
    "duplicates_to_drop": (len(dups_to_drop) / len(df)) * 100,
}
df_nan = pd.DataFrame.from_dict(d_nan, orient="index").T
summarize_df(df)
df_nan

Unnamed: 0,dtype,num_missing,num,nunique,single_non_nan_value
TRIP_ID,Int64,0,3621230,3621230,12790518
TRIP__DURATION,int64,0,3621230,19074,1247
START_STATION_ID,Int64,0,3621230,620,7153
START_TIME,datetime64[ns],0,3621230,447292,2021-07-30 07:29:00
START_STATION_NAME,string,0,3621230,735,Bloor St W / Christie St
END_STATION_ID,Int64,0,3621230,622,7059
END_TIME,datetime64[ns],0,3621230,446893,2021-07-30 07:49:00
END_STATION_NAME,string,0,3621230,740,Front St W / Blue Jays Way
BIKE_ID,Int64,201,3621230,6501,5537
USER_TYPE,string,0,3621230,2,Annual Member


CPU times: user 5.2 s, sys: 383 ms, total: 5.58 s
Wall time: 5.59 s


Unnamed: 0,all,non_missing,frac_to_drop,duplicates_to_drop
0,3621230.0,3621230.0,0.0,0.0


In [14]:
%%time
# Process the trips data, passing the missing-value columns (nan_cols) and the
# duplicate-detection columns (duplicated_cols) chosen under "User Inputs".
# NOTE(review): `df` is rebound to the processed result, so re-running this cell
# feeds already-processed data back in - confirm process_trips_data tolerates that.
df = process_trips_data(df, nan_cols, duplicated_cols)
summarize_df(df)

Unnamed: 0,dtype,num_missing,num,nunique,single_non_nan_value
TRIP_ID,Int64,0,3621230,3621230,12281161
TRIP__DURATION,int64,0,3621230,19074,564
START_STATION_ID,Int64,0,3621230,620,7035
START_TIME,datetime64[ns],0,3621230,447292,2021-06-30 17:27:00
START_STATION_NAME,string,0,3621230,735,Queen St W / Ossington Ave
END_STATION_ID,Int64,0,3621230,622,7204
END_TIME,datetime64[ns],0,3621230,446893,2021-06-30 17:36:00
END_STATION_NAME,string,0,3621230,740,College St / Crawford St
BIKE_ID,Int64,201,3621230,6501,5995
USER_TYPE,string,0,3621230,2,Annual Member


CPU times: user 9.52 s, sys: 536 ms, total: 10.1 s
Wall time: 10.1 s


### Cultural Hotspots

In [15]:
%%time
# Request parameters for the cultural hotspots dataset (presumably its package id
# on the Open Data Portal), passed to the helper together with the portal URL
ch_params = {"id": "c7be2ee7-d317-4a28-8cbe-bff1ce116b46"}
dfch_essentials = cpd.get_cultural_hotspots(url, ch_params)
# Preview the first two rows
dfch_essentials.head(2)

CPU times: user 89.9 ms, sys: 4.2 ms, total: 94.1 ms
Wall time: 480 ms


Unnamed: 0,ID,NAME,POI_LATITUDE,POI_LONGITUDE
0,1,21 Points in Equilibrium (Sculpture),43.772936,-79.257067
1,2,Crucified Again (Sculpture),43.753806,-79.21617


### Places of Interest

In [16]:
%%time
# Request parameters for the places-of-interest dataset (presumably its package id
# on the Open Data Portal), passed to the helper together with the portal URL
poi_params = {"id": "965247c0-c72e-49b4-bb1a-879cf98e1a32"}
df_poi = cpd.get_poi_data(url, poi_params)

CPU times: user 30.1 ms, sys: 7 µs, total: 30.1 ms
Wall time: 346 ms


Note that duplicate lat-long pairs will be permitted here, as multiple places of interest may share the same physical location or an immediately adjacent area. Some examples of such places of interest with a duplicated latitude and longitude are shown in `0_get_bikeshare_data.ipynb`. So, the duplicate lat-long sites will be retained in this dataset.

### Neighbourhood Boundary and Land Area Data

In [17]:
%%time
neigh_boundary_params = {"id": "4def3f65-2a65-4a4f-83c4-b2a4aed72d46"}
gdf = cpd.get_neighbourhood_boundary_land_area_data(url, neigh_boundary_params, neigh_cols_to_show)
gdf[
    gdf["AREA_NAME"].str.contains(
        "Wychwood|Yonge-Eglinton|Yonge-St.|York Univ|Yorkdale-Glen"
    )
].sort_values(by=["AREA_NAME"])

CPU times: user 163 ms, sys: 4.29 ms, total: 167 ms
Wall time: 1.14 s


Unnamed: 0,AREA_ID,AREA_SHORT_CODE,AREA_LONG_CODE,AREA_NAME,Shape__Area,Shape__Length,LATITUDE,AREA_LATITUDE,LONGITUDE,AREA_LONGITUDE,geometry
133,2480144,94,94,Wychwood (94),3217960.0,7515.779658,,43.67692,,-79.425515,"POLYGON ((-79.43592 43.68015, -79.43492 43.680..."
16,2480057,100,100,Yonge-Eglinton (100),3160334.0,7872.021074,,43.70469,,-79.40359,"POLYGON ((-79.41096 43.70408, -79.40962 43.704..."
134,2480143,97,97,Yonge-St.Clair (97),2222464.0,8130.411276,,43.68786,,-79.397871,"POLYGON ((-79.39119 43.68108, -79.39141 43.680..."
131,2480146,27,27,York University Heights (27),25418210.0,25632.335242,,43.76574,,-79.488883,"POLYGON ((-79.50529 43.75987, -79.50488 43.759..."
69,2480105,31,31,Yorkdale-Glen Park (31),11566690.0,13953.408098,,43.714673,,-79.457108,"POLYGON ((-79.43969 43.70561, -79.44011 43.705..."


In order to use the correct CRS for allowing an area calculation in square km, we'll get the current EPSG ([link](https://epsg.io/4326)) from the geodata

In [18]:
# Show the coordinate reference system attached to the boundary geodata
print(gdf.crs)

epsg:4326


Fix typographic errors in the name of the neighbourhood in this dataset
- [North St. James Town](https://www.toronto.ca/ext/sdfa/Neighbourhood%20Profiles/pdf/2016/pdf1/cpa74.pdf) and [Cabbagetown-South St. James Town](https://www.toronto.com/community-static/4550668-cabbagetown-south-st-james-town/)
  - missing space between ...St. and Ja...
- Weston-Pelham Park
  - incorrectly listed as its old name (from 2011) of Weston-Pellam Park ([link](https://www.toronto.ca/wp-content/uploads/2017/11/900b-91-Weston-Pellam-Park.pdf))
  - replace with [new name from 2016](https://www.toronto.ca/ext/sdfa/Neighbourhood%20Profiles/pdf/2016/pdf1/cpa91.pdf)

In [19]:
# Literal (non-regex) substring replacements applied to the neighbourhood names
d_renaming = {"St.James": "St. James", "Weston-Pellam": "Weston-Pelham"}

# Apply every replacement to a working copy of the names, then write it back once
area_names = gdf["AREA_NAME"]
for wrong, right in d_renaming.items():
    area_names = area_names.str.replace(wrong, right, regex=False)
gdf["AREA_NAME"] = area_names

The incorrect names have been successfully replaced as shown below

In [20]:
# Rows whose neighbourhood name matches any of the corrected names
gdf.loc[gdf["AREA_NAME"].str.contains("James Town|Weston-|Cabbage"), geo_cols]

Unnamed: 0,AREA_NAME,geometry,Shape__Area
24,North St. James Town (74),"POLYGON ((-79.38057 43.67161, -79.37947 43.671...",811303.9
46,Weston-Pelham Park (91),"POLYGON ((-79.46005 43.66723, -79.46092 43.668...",2794057.0
114,Cabbagetown-South St. James Town (71),"POLYGON ((-79.37672 43.66242, -79.37721 43.663...",2711742.0


Compare manually calculated neighbourhood areas to the provided ones (in square metres)
- first, change the geodata projection to a Cartesian system (EPSG = 3857, in units of m) ([1](https://epsg.io/3857))

In [21]:
# Area of every polygon after reprojecting to EPSG:3857, minus the provided Shape__Area
area_manual = gdf["geometry"].to_crs(epsg=3857).area
area_diff = area_manual - gdf["Shape__Area"]
print(area_diff.min(), area_diff.max())

-0.10295796953141689 0.147477675229311


Since these differences are small (less than one square metre; areas calculated in EPSG 3857 are in square metres), we'll use the provided neighbourhood areas from the `Shape__Area` column of the neighbourhood boundary file.

### Public Transit Locations

In [22]:
%%time
pt_params = {"id": "7795b45e-e65a-4465-81fc-c36b9dfff169"}
df_pt_slice = cpd.get_public_transit_locations(url, pt_params)

CPU times: user 665 ms, sys: 157 ms, total: 822 ms
Wall time: 9.31 s


### Colleges and Universities

In [23]:
# Unlike the other datasets, this helper takes no URL or request parameters
df_coll_univ = cpd.get_coll_univ_locations()

### (Aggregations by Neighbourhood) Neighbourhood Profile Data - Population

In [24]:
%%time
# Request parameters for the neighbourhood profile dataset (presumably its package
# id on the Open Data Portal), passed to the helper together with the portal URL
neigh_profile_params = {"id": "6e19a90f-971c-46b3-852c-0c48c436d1fc"}
df_neigh_demog = cn.get_neighbourhood_profile_data(url, neigh_profile_params)

CPU times: user 29.8 ms, sys: 1.1 ms, total: 30.9 ms
Wall time: 589 ms


### (Aggregations by Neighbourhood) Number of Locations Per Neighbourhood

#### Places of Interest

In [25]:
# "ID" is the identifier column handed to the neighbourhood lookup in the next cell.
# Enforce its uniqueness with an assert (as done for the cultural hotspots and the
# stations metadata) instead of relying on a reader comparing two printed numbers.
assert df_poi["ID"].nunique() == len(df_poi)
print(df_poi["ID"].nunique(), len(df_poi))
# Lift the display limits so every column of this wide frame is visible
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_poi.head(2))

174 174


Unnamed: 0,ID,ADDRESS_INFO,NAME,CATEGORY,PHONE,EMAIL,WEBSITE,GEOID,RECEIVED_DATE,ADDRESS_POINT_ID,ADDRESS_NUMBER,LINEAR_NAME_FULL,ADDRESS_FULL,POSTAL_CODE,MUNICIPALITY,CITY,PLACE_NAME,GENERAL_USE_CODE,CENTRELINE,LO_NUM,LO_NUM_SUF,HI_NUM,HI_NUM_SUF,LINEAR_NAME_ID,WARD,WARD_2003,WARD_2018,MI_PRINX,ATTRACTION,MAP_ACCESS,geometry,POI_LONGITUDE,POI_LATITUDE
0,1,,BMO Field,Sports / Entertainment Venue,416-815-5982,,https://www.bmofield.com/,20229243.0,,20229243.0,170,Princes' Blvd,170 Princes' Blvd,M6K 3C3,former Toronto,Toronto,CNE BMO Field,107007.0,20231258.0,170.0,,,,20228.0,Spadina-Fort York,19.0,10.0,4163950.0,BMO Field is home to the Toronto FC (Major Lea...,Y,"{""type"": ""Point"", ""coordinates"": [-79.41841561...",-79.418416,43.634663
1,2,,Aga Khan Museum,Museum,416-646-4677,,https://www.agakhanmuseum.org/,10142948.0,,10142948.0,77,Wynford Dr,77 Wynford Dr,M3C 1K1,North York,Toronto,,107008.0,444094.0,77.0,,,,7128.0,Don Valley East,26.0,16.0,4094277.0,"Dedicated to sharing the artistic, intellectua...",Y,"{""type"": ""Point"", ""coordinates"": [-79.33180392...",-79.331804,43.727331


In [26]:
%%time
df_poi_new = pa.check_io(out=ad.poi_new_schema)(cn.get_data_with_neighbourhood)(
    gdf[geo_cols],
    df_poi.rename(columns={"POI_LATITUDE": "lat", "POI_LONGITUDE": "lon",})[
        ["ID", "NAME", "lat", "lon"]
    ],
    "lat",
    "lon",
    "ID",
)
display(df_poi_new.head(2))
display(df_poi_new.dtypes.to_frame())

Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,ID,NAME,lat,lon,AREA_NAME,Shape__Area
0,1,BMO Field,43.634663,-79.418416,Niagara (82),6192651.0
1,2,Aga Khan Museum,43.727331,-79.331804,Banbury-Don Mills (42),19248970.0


Unnamed: 0,0
ID,int64
NAME,string
lat,float64
lon,float64
AREA_NAME,string
Shape__Area,float64


CPU times: user 30.4 ms, sys: 267 µs, total: 30.7 ms
Wall time: 29 ms


#### Cultural Hotspots

In [27]:
# Every cultural hotspot must have a distinct ID ("ID" is the identifier column
# handed to the neighbourhood lookup in the next cell)
assert dfch_essentials["ID"].nunique() == len(dfch_essentials)
dfch_essentials.head(2)

Unnamed: 0,ID,NAME,POI_LATITUDE,POI_LONGITUDE
0,1,21 Points in Equilibrium (Sculpture),43.772936,-79.257067
1,2,Crucified Again (Sculpture),43.753806,-79.21617


In [28]:
%%time
dfch_essentials_new = pa.check_output(ad.ch_essentials_new_schema)(cn.get_data_with_neighbourhood)(
    gdf[geo_cols],
    dfch_essentials.rename(columns={"POI_LATITUDE": "lat", "POI_LONGITUDE": "lon",})[
        ["ID", "NAME", "lat", "lon"]
    ],
    "lat",
    "lon",
    "ID",
)
display(dfch_essentials_new.head(2))
display(dfch_essentials_new.dtypes.to_frame())

Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,ID,NAME,lat,lon,AREA_NAME,Shape__Area
0,1,21 Points in Equilibrium (Sculpture),43.772936,-79.257067,Bendale (127),14303500.0
1,2,Crucified Again (Sculpture),43.753806,-79.21617,Woburn (137),23664990.0


Unnamed: 0,0
ID,int64
NAME,string
lat,float64
lon,float64
AREA_NAME,string
Shape__Area,float64


CPU times: user 31.4 ms, sys: 375 µs, total: 31.8 ms
Wall time: 29.7 ms


#### Colleges and Universities

In [29]:
# "institution_id" is the identifier column handed to the neighbourhood lookup below.
# Enforce its uniqueness with an assert (as done for the cultural hotspots and the
# stations metadata) instead of relying on a reader comparing two printed numbers.
assert df_coll_univ["institution_id"].nunique() == len(df_coll_univ)
print(df_coll_univ["institution_id"].nunique(), len(df_coll_univ))
df_coll_univ.head(2)

11 11


Unnamed: 0,institution_id,institution_name,lat,lon
0,0,centennial,43.7854,-79.22664
1,1,george-brown,43.6761,-79.4111


### (Aggregations by Neighbourhood) Get Neighbourhood Data for Supplementary Datasets

#### Colleges and Universities

In [30]:
%%time
df_coll_univ_new = pa.check_output(ad.coll_univ_schema_new)(cn.get_data_with_neighbourhood)(
    gdf[geo_cols],
    df_coll_univ,
    "lat",
    "lon",
    "institution_id",
)
display(df_coll_univ_new.head(2))
display(df_coll_univ_new.dtypes.to_frame())

Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,institution_id,institution_name,lat,lon,AREA_NAME,Shape__Area
0,0,centennial,43.7854,-79.22664,Woburn (137),23664990.0
1,1,george-brown,43.6761,-79.4111,Casa Loma (96),3678385.0


Unnamed: 0,0
institution_id,int64
institution_name,string
lat,float64
lon,float64
AREA_NAME,string
Shape__Area,float64


CPU times: user 28.8 ms, sys: 80 µs, total: 28.9 ms
Wall time: 26.9 ms


#### Public Transit Locations

In [31]:
# "stop_id" is the identifier column handed to the neighbourhood lookup in the next cell.
# Enforce its uniqueness with an assert (as done for the cultural hotspots and the
# stations metadata) instead of relying on a reader comparing two printed numbers.
assert df_pt_slice["stop_id"].nunique() == len(df_pt_slice)
print(df_pt_slice["stop_id"].nunique(), len(df_pt_slice))
df_pt_slice.head(2)

9456 9456


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,lat,lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,262,662,Danforth Rd at Kennedy Rd,,43.714379,-79.260939,,,,,,1
1,263,929,Davenport Rd at Bedford Rd,,43.674448,-79.399659,,,,,,1


In [32]:
%%time
df_pt_slice_new = pa.check_output(ad.pub_trans_locations_schema_new)(cn.get_data_with_neighbourhood)(
    gdf[geo_cols],
    df_pt_slice,
    "lat",
    "lon",
    "stop_id",
)
display(df_pt_slice_new.head(2))
display(df_pt_slice_new.dtypes.to_frame())

Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,lat,lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,AREA_NAME,Shape__Area
0,262,662,Danforth Rd at Kennedy Rd,,43.714379,-79.260939,,,,,,1,Clairlea-Birchmount (120),14168540.0
1,263,929,Davenport Rd at Bedford Rd,,43.674448,-79.399659,,,,,,1,Annex (95),5337192.0


Unnamed: 0,0
stop_id,int64
stop_code,int64
stop_name,string
stop_desc,string
lat,float64
lon,float64
zone_id,float64
stop_url,string
location_type,float64
parent_station,float64


CPU times: user 54.7 ms, sys: 289 µs, total: 55 ms
Wall time: 53 ms


### (Aggregations by Neighbourhood) Merge Neighbourhood Aggregations with GeoData and Population Data

In [33]:
%%time
# Combine the boundary geodata, the four point datasets that now carry an
# AREA_NAME column (transit stops, colleges/universities, cultural hotspots,
# places of interest) and the neighbourhood profile data into one frame
df_neigh_stats = ad.combine_neigh_stats(
    gdf,
    df_pt_slice_new,
    df_coll_univ_new,
    dfch_essentials_new,
    df_poi_new,
    df_neigh_demog,
)
# Preview the combined rows and their column dtypes
display(df_neigh_stats.head())
display(df_neigh_stats.dtypes.to_frame())

Unnamed: 0,AREA_NAME,neigh_shape_area,neigh_shape_length,geometry,neigh_area_latitude,neigh_area_longitude,neigh_transit_stops,neigh_colleges_univs,neigh_cultural_attractions,neigh_places_of_interest,neigh_pop_2016,neigh_youth_15_24,neigh_work_age_25_54
0,Casa Loma (96),3678385.0,8214.176485,"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",43.681853,-79.408007,42,1,0,3,10968.0,1080.0,4555.0
1,Annex (95),5337192.0,10513.883143,"POLYGON ((-79.39414 43.66872, -79.39588 43.668...",43.671586,-79.404,98,0,0,9,30526.0,3750.0,15040.0
2,Caledonia-Fairbank (109),2955857.0,6849.911724,"POLYGON ((-79.46021 43.68156, -79.46044 43.681...",43.688569,-79.455212,35,0,0,0,9955.0,1220.0,4570.0
3,Woodbine Corridor (64),3052518.0,7512.966773,"POLYGON ((-79.31485 43.66674, -79.31660 43.666...",43.676774,-79.315408,29,0,0,0,12541.0,1035.0,6165.0
4,Lawrence Park South (103),6211341.0,13530.370002,"POLYGON ((-79.41096 43.70408, -79.41165 43.703...",43.717213,-79.406038,42,0,0,1,15179.0,2095.0,5870.0


Unnamed: 0,0
AREA_NAME,string
neigh_shape_area,float64
neigh_shape_length,float64
geometry,geometry
neigh_area_latitude,float64
neigh_area_longitude,float64
neigh_transit_stops,int64
neigh_colleges_univs,int64
neigh_cultural_attractions,int64
neigh_places_of_interest,int64


CPU times: user 49.7 ms, sys: 138 µs, total: 49.8 ms
Wall time: 46.7 ms


In [34]:
# The combined neighbourhood stats must still be geodata. isinstance() checks the
# actual class (and accepts subclasses) rather than comparing the class name as text.
assert isinstance(df_neigh_stats, gpd.GeoDataFrame)
# Every station must have a distinct station_id ("station_id" is the identifier
# column handed to the neighbourhood lookup below)
assert df_stations["station_id"].nunique() == len(df_stations)
df_stations.head(2)

Unnamed: 0,station_id,name,physical_configuration,lat,lon,altitude,address,capacity,physicalkey,transitcard,creditcard,phone
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,0.0,Fort York Blvd / Capreol Ct,35,1,1,1,1
1,7001,Wellesley Station Green P,REGULAR,43.664964,-79.38355,0.0,Yonge / Wellesley,17,1,1,1,1


### (Aggregations by Neighbourhood) Merge Stations Metadata with Aggregated Neighbourhood Stats

Append the neighbourhood containing each bikeshare station to the station metadata

In [35]:
%%time
df_stations_new = pa.check_output(ad.stations_schema_merged)(cn.get_data_with_neighbourhood)(
    gdf[geo_cols],
    df_stations,
    "lat",
    "lon",
    "station_id",
)
display(df_stations_new.head(2))
display(df_stations_new.dtypes.to_frame())

Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,station_id,name,physical_configuration,lat,lon,altitude,address,capacity,physicalkey,transitcard,creditcard,phone,AREA_NAME,Shape__Area
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,0.0,Fort York Blvd / Capreol Ct,35,1,1,1,1,Waterfront Communities-The Island (77),25629770.0
1,7001,Wellesley Station Green P,REGULAR,43.664964,-79.38355,0.0,Yonge / Wellesley,17,1,1,1,1,Church-Yonge Corridor (75),2609014.0


Unnamed: 0,0
station_id,int64
name,string
physical_configuration,string
lat,float64
lon,float64
altitude,float64
address,string
capacity,int64
physicalkey,int64
transitcard,int64


CPU times: user 34.3 ms, sys: 246 µs, total: 34.5 ms
Wall time: 32.8 ms


Merge the modified stations metadata with the neighbourhood stats

In [36]:
%%time
# Merge the stations metadata (now carrying AREA_NAME) with the neighbourhood stats.
# NOTE(review): `df_stations_new` is rebound to the merged result, so re-running this
# cell passes the already-merged frame back in - re-run the previous cell first.
df_stations_new = ad.combine_stations_metadata_neighbourhood(df_stations_new, df_neigh_stats)
print(df_stations_new.shape)
# Lift the display limits so every column of this wide frame is visible
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_stations_new.head(4))
display(df_stations_new.dtypes.rename("dtype").to_frame())

(624, 26)


Unnamed: 0,AREA_NAME,STATION_ID,NAME,PHYSICAL_CONFIGURATION,LAT,LON,ALTITUDE,ADDRESS,CAPACITY,PHYSICALKEY,TRANSITCARD,CREDITCARD,PHONE,SHAPE_AREA,NEIGH_SHAPE_AREA,NEIGH_SHAPE_LENGTH,GEOMETRY,NEIGH_AREA_LATITUDE,NEIGH_AREA_LONGITUDE,NEIGH_TRANSIT_STOPS,NEIGH_COLLEGES_UNIVS,NEIGH_CULTURAL_ATTRACTIONS,NEIGH_PLACES_OF_INTEREST,NEIGH_POP_2016,NEIGH_YOUTH_15_24,NEIGH_WORK_AGE_25_54
0,Annex (95),7003,Madison Ave / Bloor St W,REGULAR,43.667158,-79.402761,,Madison Ave / Bloor St W,15,1,1,1,1,5337192.0,5337192.0,10513.883143,"POLYGON ((-79.39414 43.66872, -79.39588 43.668...",43.671586,-79.404,98,0,0,9,30526.0,3750.0,15040.0
1,Annex (95),7040,Euclid Ave / Bloor St W,REGULAR,43.664467,-79.414783,0.0,Euclid Ave / Bloor St W,19,1,1,1,1,5337192.0,5337192.0,10513.883143,"POLYGON ((-79.39414 43.66872, -79.39588 43.668...",43.671586,-79.404,98,0,0,9,30526.0,3750.0,15040.0
2,Annex (95),7061,Dalton Rd / Bloor St W,REGULAR,43.666294,-79.406643,0.0,Dalton Rd / Bloor St W,15,1,1,1,1,5337192.0,5337192.0,10513.883143,"POLYGON ((-79.39414 43.66872, -79.39588 43.668...",43.671586,-79.404,98,0,0,9,30526.0,3750.0,15040.0
3,Annex (95),7126,Yonge St / Yorkville Ave,REGULAR,43.671944,-79.387778,0.0,Yonge St / Yorkville Ave,17,1,1,1,1,5337192.0,5337192.0,10513.883143,"POLYGON ((-79.39414 43.66872, -79.39588 43.668...",43.671586,-79.404,98,0,0,9,30526.0,3750.0,15040.0


Unnamed: 0,dtype
AREA_NAME,string
STATION_ID,int64
NAME,string
PHYSICAL_CONFIGURATION,string
LAT,float64
LON,float64
ALTITUDE,float64
ADDRESS,string
CAPACITY,int64
PHYSICALKEY,int64


CPU times: user 31.5 ms, sys: 0 ns, total: 31.5 ms
Wall time: 29.8 ms


### (Aggregations by Neighbourhood) Merge Modified Stations Metadata With Ridership Data

In [37]:
%%time
cols = ["STATION_NAME", "year", "month", "day", "hour"]
df_hour_by_station_merged = ad.combine_hourly_trips_per_station(df, cols, df_stations_new)
display(df_hour_by_station_merged.head(4).append(df_hour_by_station_merged.tail(4)))
display(
    df_hour_by_station_merged.isna().sum().rename("num_missing").to_frame().merge(
        df_hour_by_station_merged.dtypes.rename("dtype").to_frame(), left_index=True, right_index=True
    )
)

Unnamed: 0,STATION_NAME,YEAR,MONTH,DAY,HOUR,USER_TYPE,NUM_TRIPS,DURATION_MIN,DURATION_MEDIAN,DURATION_MEAN,...,GEOMETRY,NEIGH_AREA_LATITUDE,NEIGH_AREA_LONGITUDE,NEIGH_TRANSIT_STOPS,NEIGH_COLLEGES_UNIVS,NEIGH_CULTURAL_ATTRACTIONS,NEIGH_PLACES_OF_INTEREST,NEIGH_POP_2016,NEIGH_YOUTH_15_24,NEIGH_WORK_AGE_25_54
0,Ontario Place Blvd / Lake Shore Blvd W (East),2021,5,30,16,Casual Member,79,300,1320.0,1740.759494,...,"POLYGON ((-79.42778 43.62979, -79.42781 43.629...",43.636682,-79.41242,63,0,0,8,31180.0,2415.0,23320.0
1,Lake Shore Blvd W / Ontario Dr,2021,5,23,19,Casual Member,73,540,1500.0,2688.493151,...,"POLYGON ((-79.42778 43.62979, -79.42781 43.629...",43.636682,-79.41242,63,0,0,8,31180.0,2415.0,23320.0
2,Lake Shore Blvd W / Ontario Dr,2021,5,24,19,Casual Member,72,60,1500.0,1632.5,...,"POLYGON ((-79.42778 43.62979, -79.42781 43.629...",43.636682,-79.41242,63,0,0,8,31180.0,2415.0,23320.0
3,Lake Shore Blvd W / Ontario Dr,2021,5,24,15,Casual Member,70,60,1560.0,1738.285714,...,"POLYGON ((-79.42778 43.62979, -79.42781 43.629...",43.636682,-79.41242,63,0,0,8,31180.0,2415.0,23320.0
3734669,Gould St / Mutual St,2022,1,12,23,Casual Member,1,960,960.0,960.0,...,"POLYGON ((-79.37672 43.66242, -79.37658 43.662...",43.659651,-79.379018,48,1,0,16,31340.0,5060.0,18780.0
3734670,Gould St / Mutual St,2022,1,12,21,Casual Member,1,180,180.0,180.0,...,"POLYGON ((-79.37672 43.66242, -79.37658 43.662...",43.659651,-79.379018,48,1,0,16,31340.0,5060.0,18780.0
3734671,Gould St / Mutual St,2022,1,12,21,Annual Member,1,240,240.0,240.0,...,"POLYGON ((-79.37672 43.66242, -79.37658 43.662...",43.659651,-79.379018,48,1,0,16,31340.0,5060.0,18780.0
3734672,York University Station (South) - SMART,2022,1,31,7,Annual Member,1,300,300.0,300.0,...,"POLYGON ((-79.50529 43.75987, -79.50488 43.759...",43.76574,-79.488883,220,1,0,0,27593.0,4750.0,12290.0


Unnamed: 0,num_missing,dtype
STATION_NAME,0,string
YEAR,0,int64
MONTH,0,int64
DAY,0,int64
HOUR,0,int64
USER_TYPE,0,string
NUM_TRIPS,0,int64
DURATION_MIN,0,int64
DURATION_MEDIAN,0,float64
DURATION_MEAN,0,float64


CPU times: user 11.3 s, sys: 683 ms, total: 12 s
Wall time: 12 s


In [38]:
%%time
# print(gdf.shape)
# display(gdf.describe())
display(df_neigh_demog.describe())
# display(df_poi_new.describe())
# display(dfch_essentials_new.describe())
# display(df_coll_univ_new.describe())
# display(df_pt_slice_new.describe())
with pd.option_context('display.max_columns', 100):
    display(df_neigh_stats.describe())
    display(df_stations_new.describe())
    display(df_hour_by_station_merged.describe())

Characteristic,name,Neighbourhood Number,"Population, 2016",Youth (15-24 years),Working Age (25-54 years),AREA_NAME
count,140,140,140,140,140,140
unique,140,140,140,130,136,140
top,Agincourt North,129,29113,1065,3790,Agincourt North (129)
freq,1,1,1,2,2,1


Unnamed: 0,neigh_shape_area,neigh_shape_length,neigh_area_latitude,neigh_area_longitude,neigh_transit_stops,neigh_colleges_univs,neigh_cultural_attractions,neigh_places_of_interest,neigh_pop_2016,neigh_youth_15_24,neigh_work_age_25_54
count,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0
mean,8794110.0,14030.877618,43.708842,-79.400186,65.078571,0.078571,3.35,1.228571,19511.221429,2430.928571,8783.678571
std,8950625.0,7676.557581,0.051275,0.102044,45.46626,0.270035,8.167249,3.807117,10033.589222,1457.994778,5423.203831
min,811303.9,3559.283853,43.592362,-79.596367,15.0,0.0,0.0,0.0,6577.0,675.0,2750.0
25%,3563607.0,8822.60704,43.67101,-79.479793,34.0,0.0,0.0,0.0,12019.5,1428.75,5465.0
50%,6306846.0,12404.999207,43.702021,-79.403989,55.0,0.0,0.0,0.0,16749.5,2100.0,7475.0
75%,10376120.0,16454.610668,43.747298,-79.331097,80.0,0.0,1.0,1.0,23854.5,3022.5,10588.75
max,72144800.0,59561.024758,43.821208,-79.150844,316.0,1.0,49.0,35.0,65913.0,7840.0,45105.0


Unnamed: 0,STATION_ID,LAT,LON,ALTITUDE,CAPACITY,PHYSICALKEY,TRANSITCARD,CREDITCARD,PHONE,SHAPE_AREA,NEIGH_SHAPE_AREA,NEIGH_SHAPE_LENGTH,NEIGH_AREA_LATITUDE,NEIGH_AREA_LONGITUDE,NEIGH_TRANSIT_STOPS,NEIGH_COLLEGES_UNIVS,NEIGH_CULTURAL_ATTRACTIONS,NEIGH_PLACES_OF_INTEREST,NEIGH_POP_2016,NEIGH_YOUTH_15_24,NEIGH_WORK_AGE_25_54
count,624.0,624.0,624.0,614.0,624.0,624.0,624.0,624.0,624.0,624.0,624.0,624.0,624.0,624.0,624.0,624.0,624.0,624.0,624.0,624.0,624.0
mean,7345.266026,43.664911,-79.393053,0.166254,18.945513,1.0,1.0,0.846154,0.846154,7844189.0,7844189.0,14196.123351,43.663959,-79.393103,64.475962,0.129808,1.208333,7.722756,25593.020833,3230.873397,14586.802885
std,204.767197,0.027901,0.054487,4.119614,6.190158,0.0,0.0,0.361091,0.361091,8742458.0,8742458.0,10984.576914,0.02964,0.053408,32.211421,0.336361,5.649034,10.69338,15612.700929,2234.758381,11444.348907
min,7000.0,43.588077,-79.545827,0.0,7.0,1.0,1.0,0.0,0.0,811303.9,811303.9,3559.283853,43.592362,-79.533345,15.0,0.0,0.0,0.0,7607.0,675.0,3090.0
25%,7163.75,43.648635,-79.42281,0.0,15.0,1.0,1.0,1.0,1.0,2933586.0,2933586.0,8214.176485,43.647536,-79.418409,36.0,0.0,0.0,0.0,14366.0,1485.0,7470.0
50%,7344.5,43.660145,-79.39247,0.0,19.0,1.0,1.0,1.0,1.0,3678385.0,3678385.0,9594.336045,43.659157,-79.385722,60.0,0.0,0.0,3.0,21849.0,2275.0,11615.0
75%,7527.25,43.673985,-79.371628,0.0,22.0,1.0,1.0,1.0,1.0,7167731.0,7167731.0,15209.513429,43.672614,-79.377201,93.0,0.0,0.0,9.0,31180.0,3925.0,17695.0
max,7699.0,43.788319,-79.123505,102.08,56.0,1.0,1.0,1.0,1.0,72144800.0,72144800.0,59561.024758,43.821208,-79.176676,220.0,1.0,40.0,35.0,65913.0,7840.0,45105.0


Unnamed: 0,YEAR,MONTH,DAY,HOUR,NUM_TRIPS,DURATION_MIN,DURATION_MEDIAN,DURATION_MEAN,DURATION_MAX,STATION_ID,LAT,LON,ALTITUDE,CAPACITY,PHYSICALKEY,TRANSITCARD,CREDITCARD,PHONE,SHAPE_AREA,NEIGH_SHAPE_AREA,NEIGH_SHAPE_LENGTH,NEIGH_AREA_LATITUDE,NEIGH_AREA_LONGITUDE,NEIGH_TRANSIT_STOPS,NEIGH_COLLEGES_UNIVS,NEIGH_CULTURAL_ATTRACTIONS,NEIGH_PLACES_OF_INTEREST,NEIGH_POP_2016,NEIGH_YOUTH_15_24,NEIGH_WORK_AGE_25_54
count,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3627868.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0,3734673.0
mean,2021.025,7.14542,15.5543,14.27852,1.850772,788.9565,914.8727,929.0714,1105.539,7265.716,43.65736,-79.39173,0.429607,19.69854,1.0,1.0,0.8613919,0.8613919,8074682.0,8074682.0,14286.91,43.65561,-79.39145,65.60181,0.1652961,0.6528272,11.3235,29092.55,3930.51,17451.24
std,0.1551351,2.966991,8.725659,5.251745,1.795347,1485.406,1580.11,1594.401,2197.53,186.4677,0.01798068,0.03570222,6.608308,7.30203,0.0,0.0,0.3455372,0.3455372,8744330.0,8744330.0,11202.97,0.01975017,0.03588692,29.33942,0.3714477,4.502457,11.7234,17273.85,2419.776,12812.6
min,2021.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,7000.0,43.58808,-79.54583,0.0,7.0,1.0,1.0,0.0,0.0,811303.9,811303.9,3559.284,43.59236,-79.53334,15.0,0.0,0.0,0.0,7607.0,675.0,3090.0
25%,2021.0,5.0,8.0,11.0,1.0,360.0,480.0,480.0,540.0,7100.0,43.64595,-79.4079,0.0,15.0,1.0,1.0,1.0,1.0,2711742.0,2711742.0,7962.833,43.6367,-79.41242,40.0,0.0,0.0,2.0,17945.0,2060.0,8760.0
50%,2021.0,7.0,15.0,15.0,1.0,600.0,720.0,720.0,840.0,7246.0,43.65608,-79.3894,0.0,19.0,1.0,1.0,1.0,1.0,3459075.0,3459075.0,9594.336,43.65751,-79.38572,63.0,0.0,0.0,7.0,25797.0,3010.0,13065.0
75%,2021.0,10.0,23.0,18.0,2.0,960.0,1080.0,1080.0,1260.0,7412.0,43.66553,-79.37621,0.0,23.0,1.0,1.0,1.0,1.0,7125378.0,7125378.0,15209.51,43.66251,-79.3772,96.0,0.0,0.0,18.0,31340.0,6860.0,18780.0
max,2022.0,12.0,31.0,23.0,79.0,86220.0,86220.0,86220.0,86340.0,7681.0,43.78832,-79.1235,102.08,56.0,1.0,1.0,1.0,1.0,72144800.0,72144800.0,59561.02,43.82121,-79.17668,220.0,1.0,40.0,35.0,65913.0,7840.0,45105.0


CPU times: user 17.7 s, sys: 95.5 ms, total: 17.8 s
Wall time: 17.7 s


## Export Processed Data to Multiple CSV Files

In [113]:
%%time
# Wrap the exporter with pandera's check_input so the frame passed in is checked
# against the merged hourly-trips schema before any file is written
validated_export = pa.check_input(ad.hourly_trips_by_station_merged_schema)(
    export_df_to_multiple_csv_files
)
validated_export(
    df_hour_by_station_merged,
    cols_to_export,
    "local_stage",
    nrows_per_staged_csv_file,
)

Exported manual chunk 1 of 9 to local_stage_1.csv.gz (indexes range = 0 - 380,000)
Exported manual chunk 2 of 9 to local_stage_2.csv.gz (indexes range = 380,000 - 760,000)
Exported manual chunk 3 of 9 to local_stage_3.csv.gz (indexes range = 760,000 - 1,140,000)
Exported manual chunk 4 of 9 to local_stage_4.csv.gz (indexes range = 1,140,000 - 1,520,000)
Exported manual chunk 5 of 9 to local_stage_5.csv.gz (indexes range = 1,520,000 - 1,900,000)
Exported manual chunk 6 of 9 to local_stage_6.csv.gz (indexes range = 1,900,000 - 2,280,000)
Exported manual chunk 7 of 9 to local_stage_7.csv.gz (indexes range = 2,280,000 - 2,660,000)
Exported manual chunk 8 of 9 to local_stage_8.csv.gz (indexes range = 2,660,000 - 3,040,000)
Exported manual chunk 9 of 9 to local_stage_9.csv.gz (indexes range = 3,040,000 - 3,734,673)
CPU times: user 23.6 s, sys: 34.2 ms, total: 23.7 s
Wall time: 23.8 s


## Database Administration

### Create bikeshare trips and station metadata databases

In [40]:
# Open a Snowflake session for database administration; `cur` is reused by the cells below.
# NOTE(review): `snowflake_dict_no_db` presumably omits a default database — confirm where it is defined.
conn = snowflake.connector.connect(**snowflake_dict_no_db)
cur = conn.cursor()

In [41]:
%%time
# Drop both databases (if present) so the cells below recreate them from scratch
for database in (trips_db_name, stations_db_name):
    drop_database_sql = f"DROP DATABASE IF EXISTS {database}"
    _ = cur.execute(drop_database_sql)

CPU times: user 6.52 ms, sys: 0 ns, total: 6.52 ms
Wall time: 447 ms


In [42]:
%%time
# Create the trips and stations databases; IF NOT EXISTS makes the statement a no-op
# for a database that is already there
create_database_statements = [
    f"CREATE DATABASE IF NOT EXISTS {database}"
    for database in (trips_db_name, stations_db_name)
]
for statement in create_database_statements:
    _ = cur.execute(statement)

CPU times: user 6.87 ms, sys: 0 ns, total: 6.87 ms
Wall time: 712 ms


In [43]:
%%time
# Look up each database by name to confirm it now exists
for database in (trips_db_name, stations_db_name):
    show_database_sql = f"SHOW DATABASES LIKE '{database}'"
    _ = show_sql_df(show_database_sql, cur, table_output=True)

Unnamed: 0,created_on,name,is_default,is_current,origin,owner,comment,options,retention_time
0,2022-04-10 12:26:56.011000-07:00,TORBIKES,N,N,,SYSADMIN,,,1


Unnamed: 0,created_on,name,is_default,is_current,origin,owner,comment,options,retention_time
0,2022-04-10 12:26:56.363000-07:00,TORBIKESTATIONS,N,Y,,SYSADMIN,,,1


CPU times: user 23.2 ms, sys: 4.18 ms, total: 27.4 ms
Wall time: 217 ms


In [44]:
%%time
# List every database visible to this session
show_all_databases_sql = "SHOW DATABASES"
_ = show_sql_df(show_all_databases_sql, cur, table_output=True)

Unnamed: 0,created_on,name,is_default,is_current,origin,owner,comment,options,retention_time
0,2022-01-27 16:12:39.701000-08:00,DEMO_DB,N,N,,SYSADMIN,,,1
1,2022-01-27 10:58:19.534000-08:00,SNOWFLAKE_SAMPLE_DATA,N,N,SFC_SAMPLES.SAMPLE_DATA,ACCOUNTADMIN,Provided by Snowflake during account provisioning,,1
2,2022-04-10 12:26:56.011000-07:00,TORBIKES,N,N,,SYSADMIN,,,1
3,2022-04-10 12:26:56.363000-07:00,TORBIKESTATIONS,N,Y,,SYSADMIN,,,1
4,2022-01-27 16:12:52.421000-08:00,UTIL_DB,N,N,,SYSADMIN,,,1


CPU times: user 9.65 ms, sys: 89 µs, total: 9.74 ms
Wall time: 101 ms


In [62]:
# Release the cursor first, then the administration session it belongs to
cur.close()
conn.close()

### Create bikeshare trips File Format

In [111]:
# Open a new Snowflake session from `snowflake_dict`; `cur` is reused by the
# file format, stage and trips-table cells below
conn = snowflake.connector.connect(**snowflake_dict)
cur = conn.cursor()

In [119]:
# Remove any file format and stage left over from an earlier run
# (file format first, then stage)
cleanup_statements = (
    f"DROP FILE FORMAT IF EXISTS {trips_file_format_name}",
    f"DROP STAGE IF EXISTS {trips_stage_name}",
)
for statement in cleanup_statements:
    _ = cur.execute(statement)

In [120]:
%%time
# Define the CSV file format used by the trips stage: gzip-compressed,
# comma-delimited files with a single header row to skip.
# The string is a *raw* f-string so the backslashes in '\n' and '\\N' are sent
# to Snowflake exactly as written instead of being interpreted by Python.
query = fr"""
        CREATE OR REPLACE FILE FORMAT {trips_file_format_name}
        TYPE = 'CSV'
        COMPRESSION = 'GZIP'
        FIELD_DELIMITER = ','
        RECORD_DELIMITER = '\n'
        SKIP_HEADER = 1
        TRIM_SPACE = FALSE
        ERROR_ON_COLUMN_COUNT_MISMATCH = TRUE
        ESCAPE = 'NONE'
        DATE_FORMAT = 'AUTO'
        TIMESTAMP_FORMAT = 'AUTO'
        NULL_IF = ('\\N')
        """
_ = cur.execute(query)

CPU times: user 4.45 ms, sys: 16 µs, total: 4.46 ms
Wall time: 104 ms


### Create Internal Stage for bikeshare trips data

In [121]:
%%time
# Create (or replace) the stage and bind it to the trips file format
_ = cur.execute(
    f"""
        CREATE OR REPLACE STAGE {trips_stage_name}
        FILE_FORMAT = {trips_file_format_name}
        """
)

CPU times: user 3.76 ms, sys: 0 ns, total: 3.76 ms
Wall time: 131 ms


In [122]:
%%time
# Display the stages visible to the current session
_ = show_sql_df(
    """
        SHOW STAGES
        """,
    cur,
    True,
)

Unnamed: 0,created_on,name,database_name,schema_name,url,has_credentials,has_encryption_key,owner,comment,region,type,cloud,notification_channel,storage_integration
0,2022-04-10 12:52:18.325000-07:00,BIKES_STAGE,TORBIKES,PUBLIC,,N,N,SYSADMIN,,,INTERNAL,,,


CPU times: user 7.23 ms, sys: 3.98 ms, total: 11.2 ms
Wall time: 107 ms


### Stage Local Processed Trips Data

In [123]:
%%time
# Collect the exported chunks in sorted order: glob() returns matches in
# arbitrary order, so sorting keeps the upload sequence and printed log reproducible
local_stage_files = sorted(glob("data/processed/local_stage_*.csv.gz"))

# Fail loudly when nothing was exported; otherwise the stage would silently stay empty
if not local_stage_files:
    raise FileNotFoundError(
        "No files matching data/processed/local_stage_*.csv.gz were found to stage"
    )

# Upload each chunk to the trips stage, echoing the statement that is run
for file in local_stage_files:
    query = f"""
            PUT file://{file} @{trips_stage_name}
            """
    print(query.strip())
    _ = cur.execute(query)

PUT file://data/processed/local_stage_4.csv.gz @bikes_stage
PUT file://data/processed/local_stage_2.csv.gz @bikes_stage
PUT file://data/processed/local_stage_7.csv.gz @bikes_stage
PUT file://data/processed/local_stage_6.csv.gz @bikes_stage
PUT file://data/processed/local_stage_9.csv.gz @bikes_stage
PUT file://data/processed/local_stage_5.csv.gz @bikes_stage
PUT file://data/processed/local_stage_8.csv.gz @bikes_stage
PUT file://data/processed/local_stage_1.csv.gz @bikes_stage
PUT file://data/processed/local_stage_3.csv.gz @bikes_stage
CPU times: user 265 ms, sys: 23.8 ms, total: 289 ms
Wall time: 8.12 s


In [124]:
# List the files currently held in the trips stage
list_stage_sql = f"""
        LIST @{trips_stage_name}/
        """
_ = show_sql_df(list_stage_sql, cur, True)

Unnamed: 0,name,size,md5,last_modified
0,bikes_stage/local_stage_1.csv.gz,3362112,1c941428575b2544c9ff5be480a42f00,"Sun, 10 Apr 2022 19:52:36 GMT"
1,bikes_stage/local_stage_2.csv.gz,2843120,027476a53c0bc3e4dc30b3c2bfa9ff40,"Sun, 10 Apr 2022 19:52:31 GMT"
2,bikes_stage/local_stage_3.csv.gz,2067536,f53bb1c8401b51970174d1e21dba0638,"Sun, 10 Apr 2022 19:52:37 GMT"
3,bikes_stage/local_stage_4.csv.gz,1890000,34ccbeae142e8ec4b5a0a990eae98142,"Sun, 10 Apr 2022 19:52:30 GMT"
4,bikes_stage/local_stage_5.csv.gz,2046816,0942edfe0f29a1b0f99fa98ef9cbf414,"Sun, 10 Apr 2022 19:52:35 GMT"
5,bikes_stage/local_stage_6.csv.gz,3346400,b832a74c210c1f47cfa7b4996e210f39,"Sun, 10 Apr 2022 19:52:32 GMT"
6,bikes_stage/local_stage_7.csv.gz,2689456,d707874554a783f1e5176d9479860410,"Sun, 10 Apr 2022 19:52:32 GMT"
7,bikes_stage/local_stage_8.csv.gz,2046512,a40ecc3e6bc7f31b63af968aae45e6c0,"Sun, 10 Apr 2022 19:52:35 GMT"
8,bikes_stage/local_stage_9.csv.gz,3460864,6e9a0b8025f09040ce7e4fc37c2bd9c1,"Sun, 10 Apr 2022 19:52:34 GMT"


### Create bikeshare trips Table

In [125]:
%%time
# DDL for the trips table: one column per field, in the order listed below
create_trips_table_sql = f"""
    CREATE OR REPLACE TABLE {trips_table_name} (
        station_name VARCHAR(100),
        year INT,
        month INT,
        day INT,
        hour INT,
        user_type VARCHAR(20),
        num_trips INT,
        duration_mean FLOAT,
        station_type VARCHAR(10),
        area_name TEXT,
        physical_configuration TEXT,
        capacity INT,
        physicalkey INT,
        transitcard INT,
        creditcard INT,
        phone INT,
        neigh_transit_stops INT,
        neigh_colleges_univs INT,
        neigh_cultural_attractions INT,
        neigh_places_of_interest INT
    )
    """
_ = cur.execute(create_trips_table_sql)

CPU times: user 3.62 ms, sys: 0 ns, total: 3.62 ms
Wall time: 241 ms


In [126]:
# Confirm the trips table exists (pattern match on its name)
show_trips_table_sql = f"SHOW TABLES LIKE '%{trips_table_name}%'"
_ = show_sql_df(show_trips_table_sql, cur, True)

Unnamed: 0,created_on,name,database_name,schema_name,kind,comment,cluster_by,rows,bytes,owner,retention_time,automatic_clustering,change_tracking,search_optimization,search_optimization_progress,search_optimization_bytes,is_external
0,2022-04-10 12:52:36.958000-07:00,TRIPS,TORBIKES,PUBLIC,TABLE,,,0,0,SYSADMIN,1,OFF,OFF,OFF,,,N


In [127]:
# Inspect the column definitions of the trips table
show_trips_columns_sql = f"SHOW COLUMNS IN TABLE {trips_table_name}"
_ = show_sql_df(show_trips_columns_sql, cur, True)

Unnamed: 0,table_name,schema_name,column_name,data_type,null?,default,kind,expression,comment,database_name,autoincrement
0,TRIPS,PUBLIC,STATION_NAME,"{""type"":""TEXT"",""length"":100,""byteLength"":400,""...",True,,COLUMN,,,TORBIKES,
1,TRIPS,PUBLIC,YEAR,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKES,
2,TRIPS,PUBLIC,MONTH,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKES,
3,TRIPS,PUBLIC,DAY,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKES,
4,TRIPS,PUBLIC,HOUR,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKES,
5,TRIPS,PUBLIC,USER_TYPE,"{""type"":""TEXT"",""length"":20,""byteLength"":80,""nu...",True,,COLUMN,,,TORBIKES,
6,TRIPS,PUBLIC,NUM_TRIPS,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKES,
7,TRIPS,PUBLIC,DURATION_MEAN,"{""type"":""REAL"",""nullable"":true}",True,,COLUMN,,,TORBIKES,
8,TRIPS,PUBLIC,STATION_TYPE,"{""type"":""TEXT"",""length"":10,""byteLength"":40,""nu...",True,,COLUMN,,,TORBIKES,
9,TRIPS,PUBLIC,AREA_NAME,"{""type"":""TEXT"",""length"":16777216,""byteLength"":...",True,,COLUMN,,,TORBIKES,


### Add Staged Trips Data to Trips Table

In [128]:
%%time
# Load every staged file into the trips table
copy_into_sql = f"""
        COPY INTO {trips_table_name} from @{trips_stage_name}
        """
_ = cur.execute(copy_into_sql)

CPU times: user 4.19 ms, sys: 7 µs, total: 4.2 ms
Wall time: 3.92 s


In [129]:
%%time
# Preview a handful of loaded rows
_ = show_sql_df(
    f"""
        SELECT *
        FROM {trips_table_name}
        LIMIT 5
        """,
    cur,
    True,
)

Unnamed: 0,station_name,year,month,day,hour,user_type,num_trips,duration_mean,station_type,area_name,physical_configuration,capacity,physicalkey,transitcard,creditcard,phone,neigh_transit_stops,neigh_colleges_univs,neigh_cultural_attractions,neigh_places_of_interest
0,Tommy Thompson Park (Leslie Street Spit),2021,4,4,13,Casual Member,8,2445.0,END,South Riverdale (70),REGULAR,15,1,1,1,1,101,0,0,3
1,St. George St / Russell St - SMART,2021,11,2,8,Casual Member,8,607.5,END,University (79),SMARTMAPFRAME,16,1,1,0,0,34,0,0,4
2,Humber Bay Shores Park West,2021,8,2,12,Casual Member,8,2062.5,END,Mimico (includes Humber Bay Shores) (17),REGULAR,31,1,1,1,1,101,0,38,2
3,High Park - West Rd,2021,8,20,19,Casual Member,8,1957.5,END,High Park-Swansea (87),REGULAR,23,1,1,1,1,79,0,1,2
4,G Ross Lord Park,2021,5,9,19,Casual Member,8,1515.0,END,Bathurst Manor (34),REGULAR,31,1,1,1,1,59,0,0,0


CPU times: user 11.5 ms, sys: 123 µs, total: 11.6 ms
Wall time: 496 ms


In [130]:
%%time
# Count the rows now in the trips table; the result is kept for the check below
count_trips_sql = f"""
        SELECT COUNT(*) AS num_rows
        FROM {trips_table_name}
        """
df_query_nrows_trips = show_sql_df(count_trips_sql, cur, True)

Unnamed: 0,num_rows
0,3734673


CPU times: user 6.49 ms, sys: 89 µs, total: 6.58 ms
Wall time: 297 ms


In [None]:
# Expected total: the sum of the "nrows" entry of every value in every dict of cols_dict_list
# NOTE(review): this cell has no execution count — confirm that the "nrows" values
# describe the same rows that were loaded into the table before relying on this check
nrows_expected = sum(
    file_stats["nrows"]
    for file_dict in cols_dict_list
    for file_stats in file_dict.values()
)
nrows_loaded = df_query_nrows_trips.loc[0, "num_rows"]

# Report both numbers on failure so a mismatch can be diagnosed from the message alone
assert (
    nrows_loaded == nrows_expected
), f"Trips table holds {nrows_loaded} rows but {nrows_expected} were expected"

In [132]:
# Release the cursor first, then the session it belongs to
cur.close()
conn.close()

### Create Stations Metadata Table

In [134]:
# Open a Snowflake session from `snowflake_station_stats_dict`; `conn` and `cur`
# are reused by the stations-table cells below
conn = snowflake.connector.connect(**snowflake_station_stats_dict)
cur = conn.cursor()

In [135]:
%%time
# DDL for the station stats table: station attributes followed by the
# neighbourhood-level (neigh_*) columns
create_station_stats_table_sql = f"""
    CREATE OR REPLACE TABLE {station_stats_table_name} (
        area_name string,
        station_id integer,
        name string,
        physical_configuration string,
        lat float,
        lon float,
        altitude float,
        address string,
        capacity integer,
        physicalkey integer,
        transitcard integer,
        creditcard integer,
        phone integer,
        shape_area float,
        neigh_shape_area float,
        neigh_shape_length float,
        neigh_area_latitude float,
        neigh_area_longitude float,
        neigh_transit_stops integer,
        neigh_colleges_univs integer,
        neigh_cultural_attractions integer,
        neigh_places_of_interest integer,
        neigh_pop_2016 float,
        neigh_youth_15_24 float,
        neigh_work_age_25_54 float
    )
    """
_ = cur.execute(create_station_stats_table_sql)

CPU times: user 3.79 ms, sys: 0 ns, total: 3.79 ms
Wall time: 198 ms


In [136]:
# Confirm the station stats table exists (pattern match on its name)
show_stations_table_sql = f"SHOW TABLES LIKE '%{station_stats_table_name}%'"
_ = show_sql_df(show_stations_table_sql, cur, True)

Unnamed: 0,created_on,name,database_name,schema_name,kind,comment,cluster_by,rows,bytes,owner,retention_time,automatic_clustering,change_tracking,search_optimization,search_optimization_progress,search_optimization_bytes,is_external
0,2022-04-10 12:55:00.368000-07:00,STATION_STATS,TORBIKESTATIONS,PUBLIC,TABLE,,,0,0,SYSADMIN,1,OFF,OFF,OFF,,,N


In [137]:
# Fetch the table's column definitions; kept for the consistency checks below
show_stations_columns_sql = f"SHOW COLUMNS IN TABLE {station_stats_table_name}"
df_cols_stations_table = show_sql_df(show_stations_columns_sql, cur, True)

Unnamed: 0,table_name,schema_name,column_name,data_type,null?,default,kind,expression,comment,database_name,autoincrement
0,STATION_STATS,PUBLIC,AREA_NAME,"{""type"":""TEXT"",""length"":16777216,""byteLength"":...",True,,COLUMN,,,TORBIKESTATIONS,
1,STATION_STATS,PUBLIC,STATION_ID,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKESTATIONS,
2,STATION_STATS,PUBLIC,NAME,"{""type"":""TEXT"",""length"":16777216,""byteLength"":...",True,,COLUMN,,,TORBIKESTATIONS,
3,STATION_STATS,PUBLIC,PHYSICAL_CONFIGURATION,"{""type"":""TEXT"",""length"":16777216,""byteLength"":...",True,,COLUMN,,,TORBIKESTATIONS,
4,STATION_STATS,PUBLIC,LAT,"{""type"":""REAL"",""nullable"":true}",True,,COLUMN,,,TORBIKESTATIONS,
5,STATION_STATS,PUBLIC,LON,"{""type"":""REAL"",""nullable"":true}",True,,COLUMN,,,TORBIKESTATIONS,
6,STATION_STATS,PUBLIC,ALTITUDE,"{""type"":""REAL"",""nullable"":true}",True,,COLUMN,,,TORBIKESTATIONS,
7,STATION_STATS,PUBLIC,ADDRESS,"{""type"":""TEXT"",""length"":16777216,""byteLength"":...",True,,COLUMN,,,TORBIKESTATIONS,
8,STATION_STATS,PUBLIC,CAPACITY,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKESTATIONS,
9,STATION_STATS,PUBLIC,PHYSICALKEY,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKESTATIONS,


In [138]:
# Column labels of the stations frame once GEOMETRY is dropped
stations_upload_cols = df_stations_new.drop(columns=["GEOMETRY"]).columns

# Same number of columns in the frame as in the Snowflake table...
assert len(df_cols_stations_table) == len(stations_upload_cols)
# ...and the same names in the same order
assert pd.Series(stations_upload_cols.rename("column_name")).equals(
    df_cols_stations_table["column_name"]
)

### Add Stations Metadata to Table

In [140]:
# Show the pandas dtype of every column except GEOMETRY
stations_dtypes = df_stations_new.drop(columns=["GEOMETRY"]).dtypes
stations_dtypes.to_frame(name="dtype")

Unnamed: 0,dtype
AREA_NAME,string
STATION_ID,int64
NAME,string
PHYSICAL_CONFIGURATION,string
LAT,float64
LON,float64
ALTITUDE,float64
ADDRESS,string
CAPACITY,int64
PHYSICALKEY,int64


In [141]:
%%time
# Upload every column except GEOMETRY to the stations table (name upper-cased)
stations_upload = df_stations_new.drop(columns=["GEOMETRY"])
stations_table_upper = station_stats_table_name.upper()
success, nchunks, nrows, _ = write_pandas(conn, stations_upload, stations_table_upper)

CPU times: user 165 ms, sys: 4.32 ms, total: 169 ms
Wall time: 2.11 s


In [142]:
%%time
# Count the rows now in the stations table; the result is kept for the check below
count_stations_sql = f"""
        SELECT COUNT(*) AS num_rows
        FROM {station_stats_table_name}
        """
df_query_nrows_stations = show_sql_df(count_stations_sql, cur, True)

Unnamed: 0,num_rows
0,624


CPU times: user 5.37 ms, sys: 193 µs, total: 5.56 ms
Wall time: 94.7 ms


In [143]:
# The upload itself must have succeeded
assert success

# Compare the row counts with a plain conditional rather than assert/except:
# asserts are stripped under `python -O`, which would always report success.
nrows_expected = len(df_stations_new)
nrows_in_table = int(df_query_nrows_stations.loc[0, "num_rows"])
if nrows == nrows_expected and nrows_in_table == nrows_expected:
    print(f"Exported: {nrows_expected:,} rows, as expected")
else:
    # Report both observed counts so it is clear which one disagrees
    print(
        f"Expected: {nrows_expected:,} rows\n"
        f"Actual: {nrows:,} rows written, {nrows_in_table:,} rows in table"
    )

Exported: 624 rows, as expected


In [144]:
# Release the cursor first, then the session it belongs to
cur.close()
conn.close()

## Query Data From Databases

In [145]:
# Open a new Snowflake session from `snowflake_dict` for the example queries below
conn = snowflake.connector.connect(**snowflake_dict)
cur = conn.cursor()

In [148]:
%%time
# First ten rows when ordered by year, month, day and hour
_ = show_sql_df(
    f"""
        SELECT *
        FROM {trips_db_name}.public.{trips_table_name}
        ORDER BY year,month,day,hour
        LIMIT 10
        """,
    cur,
    True,
)

Unnamed: 0,station_name,year,month,day,hour,user_type,num_trips,duration_mean,station_type,area_name,physical_configuration,capacity,physicalkey,transitcard,creditcard,phone,neigh_transit_stops,neigh_colleges_univs,neigh_cultural_attractions,neigh_places_of_interest
0,McGill St / Church St,2021,1,1,0,Annual Member,1,720.0,START,Church-Yonge Corridor (75),REGULAR,19,1,1,1,1,48,1,0,16
1,Lower Simcoe St / Bremner Blvd,2021,1,1,0,Annual Member,1,720.0,START,Waterfront Communities-The Island (77),REGULAR,14,1,1,1,1,96,0,0,35
2,Metro Hall Plaza,2021,1,1,0,Casual Member,1,1080.0,START,Waterfront Communities-The Island (77),REGULAR,27,1,1,1,1,96,0,0,35
3,Essex St / Christie St - SMART,2021,1,1,0,Casual Member,1,1320.0,START,Annex (95),REGULAR,19,1,1,1,1,98,0,0,9
4,Lower Simcoe St / Bremner Blvd,2021,1,1,0,Casual Member,1,360.0,START,Waterfront Communities-The Island (77),REGULAR,14,1,1,1,1,96,0,0,35
5,Madison Ave / Bloor St W,2021,1,1,0,Casual Member,1,720.0,START,Annex (95),REGULAR,15,1,1,1,1,98,0,0,9
6,John St / Mercer St - SMART,2021,1,1,0,Annual Member,1,1620.0,START,Waterfront Communities-The Island (77),SMARTMAPFRAME,12,1,1,0,0,96,0,0,35
7,High Park Subway Station,2021,1,1,0,Annual Member,1,960.0,START,High Park North (88),REGULAR,19,1,1,1,1,51,0,0,3
8,Lonsdale Rd / Spadina Rd,2021,1,1,0,Annual Member,1,1920.0,START,Forest Hill South (101),REGULAR,15,1,1,1,1,40,0,0,0
9,Jarvis St / Isabella St,2021,1,1,0,Annual Member,1,120.0,START,North St. James Town (74),REGULAR,23,1,1,1,1,15,0,0,0


CPU times: user 14.2 ms, sys: 0 ns, total: 14.2 ms
Wall time: 446 ms


In [149]:
%%time
# Ten rows showing the mean trip duration per station, hour and user type
trip_durations_sql = f"""
        SELECT station_name,
               station_type,
               duration_mean,
               year,
               month,
               day,
               hour,
               user_type
        FROM {trips_db_name}.public.{trips_table_name}
        LIMIT 10
        """
_ = show_sql_df(trip_durations_sql, cur, True)

Unnamed: 0,station_name,station_type,duration_mean,year,month,day,hour,user_type
0,Spadina Ave / Blue Jays Way,START,420.0,2021,12,9,9,Annual Member
1,Dundas St W / St. Patrick St,START,420.0,2021,3,31,20,Annual Member
2,Spadina Ave / Adelaide St W,START,510.0,2021,6,21,10,Annual Member
3,Spadina Ave / Blue Jays Way,START,630.0,2021,12,9,8,Annual Member
4,Queen St E / George St (Moss Park),START,300.0,2021,7,25,19,Annual Member
5,Dundonald St / Church St,START,510.0,2021,10,7,16,Casual Member
6,Spadina Ave / Adelaide St W,START,1620.0,2021,6,21,15,Casual Member
7,Yonge St / Alexander St - SMART,START,480.0,2021,5,31,6,Annual Member
8,Dundonald St / Church St,START,450.0,2021,10,6,16,Annual Member
9,Dundonald St / Church St,START,180.0,2021,10,7,13,Casual Member


CPU times: user 5.59 ms, sys: 3.92 ms, total: 9.5 ms
Wall time: 250 ms


In [155]:
%%time
# Up to 10,000 rows of trip counts; the station_type filter is left commented
# out inside the SQL, so both START and END rows are returned
trip_counts_sql = f"""
        SELECT station_name,
               station_type,
               year,
               month,
               day,
               hour,
               user_type,
               num_trips
        FROM {trips_db_name}.public.{trips_table_name}
        -- WHERE station_type = 'START'
        LIMIT 10000
        """
_ = show_sql_df(trip_counts_sql, cur, True)

Unnamed: 0,station_name,station_type,year,month,day,hour,user_type,num_trips
0,Spadina Ave / Blue Jays Way,START,2021,12,9,9,Annual Member,2
1,Dundas St W / St. Patrick St,START,2021,3,31,20,Annual Member,2
2,Spadina Ave / Adelaide St W,START,2021,6,21,10,Annual Member,2
3,Spadina Ave / Blue Jays Way,START,2021,12,9,8,Annual Member,2
4,Queen St E / George St (Moss Park),START,2021,7,25,19,Annual Member,2
...,...,...,...,...,...,...,...,...
9995,Bay St / Albert St,END,2021,2,11,17,Annual Member,2
9996,Bay St / Albert St,END,2021,2,12,8,Annual Member,2
9997,Dundas St W / Yonge St,END,2021,7,29,11,Casual Member,2
9998,Dundas St W / Yonge St,END,2021,7,29,9,Annual Member,2


CPU times: user 319 ms, sys: 4.54 ms, total: 324 ms
Wall time: 840 ms


In [156]:
# Release the cursor first, then the session it belongs to
cur.close()
conn.close()

## AWS QuickSight Data Source

In [None]:
# user_arn = [
#     u["Arn"]
#     for u in qs_client_user.list_users(AwsAccountId=account_id, Namespace="default")[
#         "UserList"
#     ]
#     if u["UserName"].startswith("els")
# ][0]
# user_arn

In [None]:
# %%time
# dso_creation_response = qs_client.create_data_source(
#     AwsAccountId=account_id,
#     DataSourceId=f"snowflake-{trips_db_name}",
#     Name=trips_db_name,
#     Type='SNOWFLAKE',
#     DataSourceParameters={
#         'SnowflakeParameters': {
#             'Host': os.getenv("SNOWFLAKE_ACCOUNT")+".snowflakecomputing.com",
#             'Database': trips_db_name.upper(),
#             'Warehouse': os.getenv("SNOWFLAKE_WAREHOUSE"),
#         },
#     },
#     Credentials={
#         'CredentialPair': {
#             'Username': os.getenv("SNOWFLAKE_USER"),
#             'Password': os.getenv("SNOWFLAKE_PASS"),
#         },
#     },
#     Permissions= [
#       {
#         'Principal': user_arn,
#         'Actions': [
#           'quicksight:DescribeDataSource',
#           'quicksight:DescribeDataSourcePermissions',
#           'quicksight:UpdateDataSource',
#           'quicksight:UpdateDataSourcePermissions',
#           'quicksight:DeleteDataSource',
#           'quicksight:PassDataSource'
#         ]
#       }
#     ],
# )
# dso_creation_response