# Get Data

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
from glob import glob
from io import BytesIO
from multiprocessing import cpu_count
from typing import Dict, List, Union
from urllib.request import urlopen
from zipfile import ZipFile

import geopandas as gpd
import pandas as pd
import requests
import snowflake.connector
from dotenv import find_dotenv, load_dotenv
from joblib import Parallel, delayed
from snowflake.connector.pandas_tools import write_pandas



## About

Download Toronto Bikeshare trips data, bikeshare stations metadata and supplementary (neighbourhood-specific) datasets.

## User Inputs

In [3]:
# Datasets
# # Open Data Portal
url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"
# # Ridership
params = {"id": "7e876c24-177c-4605-9cef-e50dd74c617f"}
years_wanted = {2021: list(range(1, 12 + 1)), 2022: list(range(1, 1 + 1))}
# # Stations Metadata
about_params = {"id": "2b44db0d-eea9-442d-b038-79335368ad5a"}
stations_cols_wanted = [
    "station_id",
    "name",
    "physical_configuration",
    "lat",
    "lon",
    "altitude",
    "address",
    "capacity",
    "physicalkey",
    "transitcard",
    "creditcard",
    "phone",
]

# Ridership datetime columns
date_cols = ["Start Time", "End Time"]

# Ridership columns in which to drop missing values
nan_cols = [
    "Start Station Id",
    "End Station Id",
    "Start Station Name",
    "End Station Name",
]

# Snowflake resources
# # Database
stations_db_name = "torbikestations"
# # Tables
trips_table_name = "trips"
station_stats_table_name = "station_stats"
# # Stage
trips_stage_name = "bikes_stage"
# # File Format
trips_file_format_name = "COMMASEP_ONEHEADROW"

ci_run = "no"

In [4]:
# Ridership dtypes dict
dtypes_dict = {
    "Trip Id": pd.Int64Dtype(),
    "Trip Duration": pd.Int64Dtype(),
    "Start Station Id": pd.Int64Dtype(),
    "Start Station Name": pd.StringDtype(),
    "Start Station Id": pd.Int64Dtype(),
    "Start Station Name": pd.StringDtype(),
    "Bike Id": pd.Float64Dtype(),
    "User Type": pd.StringDtype(),
}

if ci_run == "no":
    load_dotenv(find_dotenv())

trips_db_name = os.getenv("DB_NAME")
snowflake_dict_no_db = dict(
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASS"),
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    role="sysadmin",
)
snowflake_dict = dict(
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASS"),
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    database=trips_db_name,
    schema=os.getenv("SNOWFLAKE_DB_SCHEMA"),
    role="sysadmin",
)
snowflake_station_stats_dict = dict(
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASS"),
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    database=stations_db_name,
    schema=os.getenv("SNOWFLAKE_DB_SCHEMA"),
    role="sysadmin",
)

In [5]:
def get_file_urls(
    main_dataset_url: str, dataset_params: Dict, years_wanted: Dict[int, List]
) -> List:
    package = requests.get(main_dataset_url, params=dataset_params).json()
    resources = package["result"]["resources"]
    df = pd.DataFrame.from_records(resources)
    year_month_wanted = [
        f"{y}-{str(m).zfill(2)}" for y, ms in years_wanted.items() for m in ms
    ]
    year_month_wanted_str = "|".join(year_month_wanted)
    urls_list = df.query("name.str.contains(@year_month_wanted_str)")["url"].tolist()
    return urls_list


def read_data(
    url: str, dtypes_dict: Dict, date_cols: List[str], nan_cols: List[str]
) -> Dict[str, Union[List[str], int]]:
    df = pd.read_csv(
        url,
        encoding="cp1252",
        parse_dates=date_cols,
        dtype=dtypes_dict,
    ).dropna(subset=nan_cols)
    # df.columns = [re.sub("[^A-Za-z0-9\s]+", "", c) for c in list(df)]
    df.columns = [
        re.sub("[^A-Za-z0-9\s]+", "", c).replace(" ", "_").upper() for c in list(df)
    ]
    df.columns = df.columns.str.replace(" ", "_").str.upper()
    fpath = f"data/raw/{os.path.basename(url).replace('.csv', '')}.csv"
    if not os.path.exists(fpath):
        df.to_csv(fpath, index=False)
    return {os.path.basename(url): {"columns": list(df), "nrows": len(df)}}


def get_single_ridership_data_file(
    url: str, dtypes_dict: Dict, date_cols: List[str], nan_cols: List[str]
) -> Dict[str, List[str]]:
    fname = os.path.basename(url)
    print(f"Loading data from {fname}...", end="")
    cols_dict = read_data(url, dtypes_dict, date_cols, nan_cols)
    print("Done.")
    return cols_dict


def get_all_data_files(
    urls_list: List, dtypes_dict: Dict, date_cols: List[str], nan_cols: List[str]
) -> Dict[str, List[str]]:
    executor = Parallel(n_jobs=cpu_count(), backend="multiprocessing")
    tasks = (
        delayed(get_single_ridership_data_file)(url, dtypes_dict, date_cols, nan_cols)
        for url in urls_list
    )
    cols_dicts = executor(tasks)
    # cols_dicts = [
    #     get_single_ridership_data_file(url, dtypes_dict, date_cols, nan_cols)
    #     for url in urls_list
    # ]
    return cols_dicts


def get_stations_metadata(stations_url: str, stations_params: Dict) -> pd.DataFrame:
    package = requests.get(stations_url, params=about_params).json()
    resources = package["result"]["resources"]
    df_about = pd.DataFrame.from_records(resources)
    r = requests.get(df_about["url"].tolist()[0]).json()
    url_stations = r["data"]["en"]["feeds"][2]["url"]
    df_stations = pd.DataFrame.from_records(
        requests.get(url_stations).json()["data"]["stations"]
    )
    return df_stations


def transform_metadata(
    df: pd.DataFrame, stations_cols_wanted: List[str]
) -> pd.DataFrame:
    df["station_id"] = df["station_id"].astype(int)
    dfa = pd.DataFrame(
        df.set_index("station_id")["rental_methods"].tolist(),
        columns=["key", "transitcard", "creditcard", "phone"],
    )
    for c in ["KEY", "TRANSITCARD", "CREDITCARD", "PHONE"]:
        dfa[c.lower()] = dfa[c.lower()].map({c: 1}).fillna(0).astype(int)
    df = pd.concat(
        [
            df.drop(columns=["groups", "rental_methods"]),
            dfa,
        ],
        axis=1,
    ).rename(columns={"key": "physicalkey"})[stations_cols_wanted]
    return df


def get_toronto_open_data(url, params, col_rename_dict={}):
    package = requests.get(url, params=params).json()
    datastore_url = (
        "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/"
        "action/datastore_search"
    )
    for _, resource in enumerate(package["result"]["resources"]):
        if resource["datastore_active"]:
            url = datastore_url
            p = {"id": resource["id"]}
            data = requests.get(url, params=p).json()
            df = pd.DataFrame(data["result"]["records"])
            break
    if col_rename_dict:
        df = df.rename(columns=col_rename_dict)
    return df


def get_lat_long(row):
    return row["coordinates"]


def get_poi_data(url: str, params: Dict) -> pd.DataFrame:
    poi_cols = [
        "ID",
        "NAME",
        "PLACE_NAME",
        "ADDRESS_FULL",
        "POSTAL_CODE",
        "ATTRACTION_DESC",
        "POI_LATITUDE",
        "POI_LONGITUDE",
    ]
    package = requests.get(url, params=poi_params).json()
    poi_url = package["result"]["resources"][0]["url"]
    df = pd.read_csv(poi_url)
    assert len(df) == 175
    df[["POI_LONGITUDE", "POI_LATITUDE"]] = pd.DataFrame(
        df["geometry"].apply(eval).apply(get_lat_long).tolist()
    )
    # Verify no duplicates (by name) are in the data
    assert df[df.duplicated(subset=["NAME"], keep=False)].empty
    return df


def get_cultural_hotspots(url: str, params: Dict) -> pd.DataFrame:
    package = requests.get(url, params=params).json()
    ch_locations = package["result"]["resources"][0]["url"]
    ch_locs_dir_path = "data/raw/cultural-hotspot-points-of-interest-wgs84"
    with urlopen(ch_locations) as zipresp:
        with ZipFile(BytesIO(zipresp.read())) as zfile:
            zfile.extractall(ch_locs_dir_path)
    df = gpd.read_file(f"{ch_locs_dir_path}/CULTURAL_HOTSPOT_WGS84.shp")
    df = (
        df.drop_duplicates(
            subset=["PNT_OF_INT", "LATITUDE", "LONGITUDE"],
            keep="first",
        )
        .reset_index(drop=True)
        .copy()
    )
    df = (
        df.drop_duplicates(
            subset=["PNT_OF_INT"],
            keep="first",
        )
        .reset_index(drop=True)
        .copy()
    )
    assert df[df.duplicated(subset=["PNT_OF_INT"], keep=False)].empty
    df_essentials = df[["RID", "PNT_OF_INT", "LATITUDE", "LONGITUDE"]].rename(
        columns={
            "RID": "ID",
            "PNT_OF_INT": "NAME",
            "LATITUDE": "POI_LATITUDE",
            "LONGITUDE": "POI_LONGITUDE",
        }
    )
    return df_essentials


def get_neighbourhood_boundary_land_area_data(url: str, params: Dict) -> pd.DataFrame:
    package = requests.get(url, params=params).json()
    n_url = (
        package["result"]["resources"][0]["url"].replace(
            "datastore/dump", "download_resource"
        )
        + "?format=geojson&projection=4326"
    )
    gdf = gpd.read_file(n_url)
    gdf["centroid"] = gdf["geometry"].to_crs(epsg=3395).centroid.to_crs(epsg=4326)
    gdf["AREA_LATITUDE"] = gdf["centroid"].y
    gdf["AREA_LONGITUDE"] = gdf["centroid"].x
    assert len(gdf) == 140
    neigh_cols_to_show = [
        "AREA_ID",
        "AREA_SHORT_CODE",
        "AREA_LONG_CODE",
        "AREA_NAME",
        "Shape__Area",
        "LATITUDE",
        "AREA_LATITUDE",
        "LONGITUDE",
        "AREA_LONGITUDE",
    ]
    return gdf


def get_public_transit_locations(url: str, params: Dict) -> pd.DataFrame:
    package = requests.get(url, params=params).json()
    pt_locations = package["result"]["resources"][0]["url"]
    pt_locs_dir_path = "data/raw/opendata_ttc_schedules"
    with urlopen(pt_locations) as zipresp:
        with ZipFile(BytesIO(zipresp.read())) as zfile:
            zfile.extractall(pt_locs_dir_path)
    df_pt = pd.read_csv(f"{pt_locs_dir_path}/stops.txt")
    display(df_pt.head())
    df_pt = df_pt.rename(columns={"stop_lat": "lat", "stop_lon": "lon"})
    return df_pt


def get_coll_univ_locations() -> pd.DataFrame:
    coll_univ_locations = {
        "centennial": {"lat": 43.7854, "lon": -79.22664},
        "george-brown": {"lat": 43.6761, "lon": -79.4111},
        "humber": {"lat": 43.7290, "lon": -79.6074},
        "ocad": {"lat": 43.6530, "lon": -79.3912},
        "ryerson": {"lat": 43.6577, "lon": -79.3788},
        "seneca": {"lat": 43.7955, "lon": -79.3496},
        "tynedale": {"lat": 43.7970, "lon": -79.3945},
        "uoft-scarborough": {"lat": 43.7844, "lon": -79.1851},
        "uoft": {"lat": 43.6629, "lon": -79.5019},
        "yorku": {"lat": 43.7735, "lon": -79.5019},
        "yorku-glendon": {"lat": 43.7279, "lon": -79.3780},
    }
    df_coll_univ = (
        pd.DataFrame.from_dict(coll_univ_locations, orient="index")
        .reset_index()
        .rename(columns={"index": "institution_name"})
        .reset_index()
        .rename(columns={"index": "institution_id"})
    )
    return df_coll_univ


def get_neighbourhood_profile_data(url: str, params: Dict) -> pd.DataFrame:
    df_neigh_demog = get_toronto_open_data(url, params)
    df_neigh_demog = (
        df_neigh_demog[
            df_neigh_demog["Characteristic"].isin(
                [
                    "Neighbourhood Number",
                    "Youth (15-24 years)",
                    "Working Age (25-54 years)",
                    "Population, 2016",
                ]
            )
        ]
        .iloc[:, slice(4, None)]
        .set_index("Characteristic")
        .T.reset_index()
        .iloc[1:]
        .reset_index(drop=True)
        .rename(columns={"index": "name"})
    )
    assert len(df_neigh_demog) == 140
    df_neigh_demog["AREA_NAME"] = (
        df_neigh_demog["name"] + " (" + df_neigh_demog["Neighbourhood Number"] + ")"
    )
    return df_neigh_demog


def get_neighbourhood_containing_point(
    gdf: gpd.GeoDataFrame,
    df: pd.DataFrame,
    lat: str = "Latitude",
    lon: str = "Longitude",
    crs: int = 4326,
) -> gpd.GeoDataFrame:
    cols_order = list(df) + list(gdf)
    polygons_contains = (
        gpd.sjoin(
            gdf,
            gpd.GeoDataFrame(
                df, geometry=gpd.points_from_xy(df[lon], df[lat]), crs=crs
            ),
            predicate="contains",
        )
        .reset_index(drop=True)
        .drop(columns=["index_right"])[cols_order]
    )
    # print(polygons_contains)
    return polygons_contains


def get_data_with_neighbourhood(
    gdf: gpd.GeoDataFrame,
    df: pd.DataFrame,
    lat: int,
    lon: int,
    col_to_join: str,
    crs: int = 4326,
) -> gpd.GeoDataFrame:
    cols_to_keep = [col_to_join, "AREA_NAME", "geometry", "Shape__Area"]
    df_check = get_neighbourhood_containing_point(gdf, df, lat, lon, crs)[cols_to_keep]
    display(df_check.head(2))
    df = df.merge(df_check.drop(columns=["geometry"]), on=col_to_join, how="left").drop(
        columns=["geometry"]
    )
    print(
        f"Dropped {len(df[['AREA_NAME']].isna().sum())} rows with a missing AREA_NAME"
    )
    df = df.dropna(subset=["AREA_NAME"])
    return df


def summarize_df(df: pd.DataFrame) -> None:
    """Show properties of a DataFrame."""
    display(
        df.dtypes.rename("dtype")
        .to_frame()
        .merge(
            df.isna().sum().rename("num_missing").to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .assign(num=len(df))
        .merge(
            df.nunique().rename("nunique").to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .merge(
            df.dropna(how="any")
            .sample(1)
            .squeeze()
            .rename("single_non_nan_value")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
    )


def show_sql_df(
    query: str,
    cursor,
    table_output: bool = False,
) -> Union[None, pd.DataFrame]:
    cursor.execute(query)
    if table_output:
        colnames = [cdesc[0].lower() for cdesc in cursor.description]
        cur_fetched = cursor.fetchall()
        if cur_fetched:
            df_query_output = pd.DataFrame.from_records(cur_fetched, columns=colnames)
            display(df_query_output)
            return df_query_output
    return None

## Get Bikeshare Trips Data

### Get URLs for Raw Trips Data Files

In [6]:
%%time
all_urls = get_file_urls(url, params, years_wanted)

CPU times: user 25.8 ms, sys: 1.64 ms, total: 27.4 ms
Wall time: 248 ms


### Download Raw Trips Data Files

In [34]:
%%time
cols_dict_list = get_all_data_files(all_urls, dtypes_dict, date_cols, nan_cols)

Loading data from 2021-01.csv...Loading data from bike-share-ridership-2021-02.csv...Loading data from bike-share-ridership-2021-03.csv...Loading data from bike-share-ridership-2021-04.csv...Loading data from bike-share-ridership-2021-05-.csv...Loading data from bike-share-ridership-2021-06.csv...Loading data from bike-share-ridership-2021-08.csv...Loading data from bike-share-ridership-2021-09.csv...Loading data from bike-share-ridership-2021-07.csv...Loading data from bike-share-ridership-2021-12.csv...Loading data from bike-share-ridership-2021-10.csv...Loading data from bike-share-ridership-2021-11.csv...Done.
Loading data from bike-share-ridership-2022-01.csv...Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
CPU times: user 66.6 ms, sys: 48.9 ms, total: 116 ms
Wall time: 3min 26s


Perform sanity checks on column names and column order in raw trips data files

In [8]:
cols_cleaned = {
    k: [re.sub("[^A-Za-z0-9\s]+", "", c) for c in l["columns"]]
    for f in cols_dict_list
    for k, l in f.items()
}
assert len(cols_cleaned) == len(cols_dict_list)

cols_equality_checks = {
    k: True if cols == list(cols_cleaned.values())[0] else False
    for k, cols in {m: cols_cleaned[m] for m in list(cols_cleaned)[1:]}.items()
}
try:
    assert all(list(cols_equality_checks.values()))
except AssertionError:
    print(cols_equality_checks)

## Get Supplementary Datasets

### Stations Metadata

In [9]:
%%time
df_stations = get_stations_metadata(url, about_params)
df_stations = transform_metadata(df_stations, stations_cols_wanted)
display(df_stations.head(2))
summarize_df(df_stations)

Unnamed: 0,station_id,name,physical_configuration,lat,lon,altitude,address,capacity,physicalkey,transitcard,creditcard,phone
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,0.0,Fort York Blvd / Capreol Ct,35,1,1,1,1
1,7001,Wellesley Station Green P,REGULAR,43.664964,-79.38355,0.0,Yonge / Wellesley,17,1,1,1,1


Unnamed: 0,dtype,num_missing,num,nunique,single_non_nan_value
station_id,int64,0,616,616,7181
name,object,0,616,616,Lansdowne Ave / Whytock Ave
physical_configuration,object,0,616,5,REGULAR
lat,float64,0,616,613,43.653889
lon,float64,0,616,615,-79.441389
altitude,float64,10,616,2,0.0
address,object,0,616,616,Lansdowne Ave / Whytock Ave
capacity,int64,0,616,36,15
physicalkey,int64,0,616,1,1
transitcard,int64,0,616,1,1


CPU times: user 91.7 ms, sys: 0 ns, total: 91.7 ms
Wall time: 777 ms


### Cultural Hotspots

In [10]:
%%time
params = {"id": "c7be2ee7-d317-4a28-8cbe-bff1ce116b46"}
dfch_essentials = get_cultural_hotspots(url, params)
dfch_essentials.head(2)

CPU times: user 60.8 ms, sys: 6.97 ms, total: 67.8 ms
Wall time: 441 ms


Unnamed: 0,ID,NAME,POI_LATITUDE,POI_LONGITUDE
0,1,21 Points in Equilibrium (Sculpture),43.772936,-79.257067
1,2,Crucified Again (Sculpture),43.753806,-79.21617


### Places of Interest

In [11]:
%%time
poi_params = {"id": "965247c0-c72e-49b4-bb1a-879cf98e1a32"}
df_poi = get_poi_data(url, poi_params)

CPU times: user 26.2 ms, sys: 0 ns, total: 26.2 ms
Wall time: 749 ms


Note that duplicate lat-long will be permitted here as multiple places of interest may share the same physical location, or immediately adjacent area. Such places of interest with a duplicated latitude and longitde are shown below

In [12]:
display(
    df_poi[df_poi.duplicated(subset=["POI_LATITUDE", "POI_LONGITUDE"], keep=False)][
        ["ID", "NAME", "POI_LATITUDE", "POI_LONGITUDE"]
    ]
    .sort_values(by=["POI_LATITUDE", "POI_LONGITUDE"])
    .style.set_caption("Duplicates of Latitude-Longitude")
)

Unnamed: 0,ID,NAME,POI_LATITUDE,POI_LONGITUDE
39,40,Enercare Centre (formely known as Direct Energy Centre),43.63453,-79.412552
41,42,Exhibition Place,43.63453,-79.412552
57,57,Harbourfront Centre,43.639232,-79.383105
171,171,York Quay Centre,43.639232,-79.383105
66,66,INFOTOGO Tourist Info Trailer - Ferry Terminal,43.640441,-79.375091
70,70,Jack Layton Ferry Terminal,43.640441,-79.375091
68,68,INFOTOGO Tourist Info Trailer - Roundhouse Park,43.641692,-79.385567
124,124,Steam Whistle Brewery,43.641692,-79.385567
153,154,Toronto Railway Museum,43.641692,-79.385567
23,24,CBC Museum,43.644418,-79.387703


These duplicated lat-long locations are shown below to be different points of interest based at the same site
- `ID`=40, `ID`=42
  - Enercare Centre and Exhibition Place are at the same site
- 57, 171
  - York Quay Centre [is at](https://www.museumsontario.ca/museum/York-Quay-Centre-at-Harbourfro) the HarborFront Centre
- 66, 70
  - both places are based at the the Ferry Terminal, so can correctly have the same lat-long
- 68, 124, 154
  - the Brewery and the Toronto Railway Museum are based at Roundhouse Park
- 24, 54
  - [Glenn Gould Studio](https://www.cbc.ca/glenngouldstudio/) is based at the CBC Museum
- 157, 160, 162
  - the [Tourist Information Centre](https://www.toronto.ca/explore-enjoy/visitor-services/tourist-information-centres/) is at the same site as the [Traveller's Aid Society](http://travellersaid.ca/contact.html) and [Union Station](https://torontounion.ca/contact/)
- 67, 145
  - a tourist information centre that is also baed at Nathan Phillips Square
- 8, 167
  - [Ashbridges Bay Park](https://www.toronto.ca/data/parks/prd/facilities/complex/1/index.html) is along [Woodbine Beach](https://www.toronto.ca/data/parks/prd/facilities/complex/311/index.html)
- 75, 111
  - Koerner Hall is at the Royal Observatory of Music
- 73, 74
  - [Kew Balmy Beach](https://www.tripadvisor.ca/Attraction_Review-g155019-d14788092-Reviews-Kew_Balmy_Beach-Toronto_Ontario.html#MAPVIEW-14788092) is at the same site as [Kew Gardens Park](https://www.toronto.ca/data/parks/prd/facilities/complex/107/index.html)
- 93, 141
  - both locations are at Todmorden Mills Park
- 9, 21
  - the Canadian Museum of Cultural Heritage of Indo-Canadians is based at the site of BAPS Shri Swaminarayan Mandir ([link](https://www.baps.org/cultureandheritage/ExperienceIndia/Exhibitions/CanadianMuseumofCulturalHeritageofIndo-Canadians.aspx))

So, the duplicate lat-long sites will be retained in this dataset.

### Neighbourhood Boundary and Land Area Data

In [13]:
%%time
neigh_params = {"id": "4def3f65-2a65-4a4f-83c4-b2a4aed72d46"}
gdf = get_neighbourhood_boundary_land_area_data(url, neigh_params)
neigh_cols_to_show = [
    "AREA_ID",
    "AREA_SHORT_CODE",
    "AREA_LONG_CODE",
    "AREA_NAME",
    "Shape__Area",
    "LATITUDE",
    "AREA_LATITUDE",
    "LONGITUDE",
    "AREA_LONGITUDE",
]
gdf[
    gdf["AREA_NAME"].str.contains(
        "Wychwood|Yonge-Eglinton|Yonge-St.|York Univ|Yorkdale-Glen"
    )
][neigh_cols_to_show].sort_values(by=["AREA_NAME"])

CPU times: user 150 ms, sys: 23 ms, total: 173 ms
Wall time: 3.22 s


Unnamed: 0,AREA_ID,AREA_SHORT_CODE,AREA_LONG_CODE,AREA_NAME,Shape__Area,LATITUDE,AREA_LATITUDE,LONGITUDE,AREA_LONGITUDE
133,2480144,94,94,Wychwood (94),3217960.0,,43.67692,,-79.425515
16,2480057,100,100,Yonge-Eglinton (100),3160334.0,,43.70469,,-79.40359
134,2480143,97,97,Yonge-St.Clair (97),2222464.0,,43.68786,,-79.397871
131,2480146,27,27,York University Heights (27),25418210.0,,43.76574,,-79.488883
69,2480105,31,31,Yorkdale-Glen Park (31),11566690.0,,43.714673,,-79.457108


In order to use the correct CRS for allowing an area calculation in square km, we'll get the current EPSG ([link](https://epsg.io/4326)) from the geodata

In [14]:
print(gdf.crs)

epsg:4326


Fix typographic errors in the name of the neighbourhood in this dataset
- [North St. James Town](https://www.toronto.ca/ext/sdfa/Neighbourhood%20Profiles/pdf/2016/pdf1/cpa74.pdf) and [Cabbagetown-South St. James Town](https://www.toronto.com/community-static/4550668-cabbagetown-south-st-james-town/)
  - missing space between ...St. and Ja...
- Weston-Pelham Park
  - incorrectly listed as its old name (from 2011) of Weston-Pellam Park ([link](https://www.toronto.ca/wp-content/uploads/2017/11/900b-91-Weston-Pellam-Park.pdf))
  - replace with [new name from 2016](https://www.toronto.ca/ext/sdfa/Neighbourhood%20Profiles/pdf/2016/pdf1/cpa91.pdf)

In [15]:
d_renaming = {
    "St.James": "St. James",
    "Weston-Pellam": "Weston-Pelham",
}
for k, v in d_renaming.items():
    gdf["AREA_NAME"] = gdf["AREA_NAME"].str.replace(k, v, regex=False)

The incorrect names have been successfully replaced as shown below

In [16]:
# Neighbourhood GeoData columns to use
geo_cols = ["AREA_NAME", "geometry", "Shape__Area"]

In [17]:
gdf.query("AREA_NAME.str.contains('James Town|Weston-|Cabbage')")[geo_cols]

Unnamed: 0,AREA_NAME,geometry,Shape__Area
18,North St. James Town (74),"POLYGON ((-79.38057 43.67161, -79.37947 43.671...",811303.9
40,Weston-Pelham Park (91),"POLYGON ((-79.46005 43.66723, -79.46092 43.668...",2794057.0
114,Cabbagetown-South St. James Town (71),"POLYGON ((-79.37672 43.66242, -79.37721 43.663...",2711742.0


Compare manual to provided neighbourhood areas (in square km)
- first, changes geodata projection to a cartesian system (EPSG = 3857, in units of m) ([1](https://epsg.io/3857))

In [18]:
area_diff = (gdf["geometry"].to_crs(epsg=3857).area) - gdf["Shape__Area"]
print(area_diff.min(), area_diff.max())

-0.10295796953141689 0.147477675229311


Since these are small differences (in units of square km), we'll use the provided neighbourhood areas from the `Shape__Area` column of the neighbourhood boundary file.

### Public Transit Locations

In [19]:
%%time
params = {"id": "7795b45e-e65a-4465-81fc-c36b9dfff169"}
df_pt_slice = get_public_transit_locations(url, params)

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,262,662,Danforth Rd at Kennedy Rd,,43.714379,-79.260939,,,,,,1
1,263,929,Davenport Rd at Bedford Rd,,43.674448,-79.399659,,,,,,1
2,264,940,Davenport Rd at Dupont St,,43.675511,-79.401938,,,,,,2
3,265,1871,Davisville Ave at Cleveland St,,43.702088,-79.378112,,,,,,1
4,266,11700,Disco Rd at Attwell Dr,,43.701362,-79.594843,,,,,,1


CPU times: user 780 ms, sys: 363 ms, total: 1.14 s
Wall time: 13 s


### Colleges and Universities

In [20]:
df_coll_univ = get_coll_univ_locations()

### Neighbourhood Profile Data - Population

In [21]:
%%time
neigh_profile_params = {"id": "6e19a90f-971c-46b3-852c-0c48c436d1fc"}
df_neigh_demog = get_neighbourhood_profile_data(url, neigh_profile_params)
df_neigh_demog.head(6)

CPU times: user 52.4 ms, sys: 2.22 ms, total: 54.6 ms
Wall time: 971 ms


Characteristic,name,Neighbourhood Number,"Population, 2016",Youth (15-24 years),Working Age (25-54 years),AREA_NAME
0,Agincourt North,129,29113,3705,11305,Agincourt North (129)
1,Agincourt South-Malvern West,128,23757,3360,9965,Agincourt South-Malvern West (128)
2,Alderwood,20,12054,1235,5220,Alderwood (20)
3,Annex,95,30526,3750,15040,Annex (95)
4,Banbury-Don Mills,42,27695,2730,10810,Banbury-Don Mills (42)
5,Bathurst Manor,34,15873,1940,6655,Bathurst Manor (34)


### Number of Locations Per Neighbourhood

#### Places of Interest

In [22]:
print(df_poi["ID"].nunique(), len(df_poi))
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_poi.head(2))

175 175


Unnamed: 0,_id,ID,ADDRESS_INFO,NAME,CATEGORY,PHONE,EMAIL,WEBSITE,GEOID,RECEIVED_DATA_DATE,ADDRESS_POINT_ID,ADDRESS_NUMBER,LINEAR_NAME_FULL,ADDRESS_FULL,POSTAL_CODE,MUNICIPALITY,CITY,PLACE_NAME,GENERAL_USE_CODE,CENTRELINE_ID,LO_NUM,LO_NUM_SUF,HI_NUM,HI_NUM_SUF,LINEAR_NAME_ID,X,Y,LONGITUDE,LATITUDE,OBJECTID,MI_PRINX,ATTRACTION_LEVEL,ATTRACTION_DESC,IMAGE_NAME,MAP_ACCESS,geometry,POI_LONGITUDE,POI_LATITUDE
0,1,16,,BMO Field,Sports / Entertainment Venue,416-815-5982,,www.bmofield.com,20229243.0,,20229243.0,170.0,Princes' Blvd,170 Princes' Blvd,M6K 3C3,former TORONTO,Toronto,CNE BMO Field,107007.0,20231258.0,170.0,,,,20228.0,,,,,16,4163950.0,2,BMO Field is home to the Toronto FC (Major Lea...,BMOField.jpg,Y,"{""type"": ""Point"", ""coordinates"": [-79.41861429...",-79.418614,43.632664
1,2,1,,Aga Khan Museum,Museum,416-646-4677,,www.agakhanmuseum.org,10142948.0,,10142948.0,77.0,Wynford Dr,77 Wynford Dr,M3C 1K1,NORTH YORK,Toronto,,107008.0,444094.0,77.0,,,,7128.0,,,,,1,4094277.0,1,"Dedicated to sharing the artistic, intellectua...",AgaKhan.jpg,Y,"{""type"": ""Point"", ""coordinates"": [-79.33233113...",-79.332331,43.725386


In [23]:
%%time
df_poi_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_poi.rename(columns={"POI_LATITUDE": "lat", "POI_LONGITUDE": "lon",})[
        ["ID", "NAME", "lat", "lon"]
    ],
    "lat",
    "lon",
    "ID",
)
display(df_poi_new.head(2))

Unnamed: 0,ID,AREA_NAME,geometry,Shape__Area
0,23,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0
1,120,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0


Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,ID,NAME,lat,lon,AREA_NAME,Shape__Area
0,16,BMO Field,43.632664,-79.418614,Niagara (82),6192651.0
1,1,Aga Khan Museum,43.725386,-79.332331,Banbury-Don Mills (42),19248970.0


CPU times: user 33.9 ms, sys: 824 µs, total: 34.7 ms
Wall time: 32.8 ms


#### Cultural Hotspots

In [24]:
assert dfch_essentials["ID"].nunique() == len(dfch_essentials)
dfch_essentials.head(2)

Unnamed: 0,ID,NAME,POI_LATITUDE,POI_LONGITUDE
0,1,21 Points in Equilibrium (Sculpture),43.772936,-79.257067
1,2,Crucified Again (Sculpture),43.753806,-79.21617


In [25]:
%%time
dfch_essentials_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    dfch_essentials.rename(columns={"POI_LATITUDE": "lat", "POI_LONGITUDE": "lon",})[
        ["ID", "NAME", "lat", "lon"]
    ],
    "lat",
    "lon",
    "ID",
)
display(dfch_essentials_new.head(2))

Unnamed: 0,ID,AREA_NAME,geometry,Shape__Area
0,284,Downsview-Roding-CFB (26),"POLYGON ((-79.50783 43.71776, -79.50854 43.717...",28736800.0
1,45,Kennedy Park (124),"POLYGON ((-79.24549 43.73060, -79.24555 43.730...",6861056.0


Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,ID,NAME,lat,lon,AREA_NAME,Shape__Area
0,1,21 Points in Equilibrium (Sculpture),43.772936,-79.257067,Bendale (127),14303500.0
1,2,Crucified Again (Sculpture),43.753806,-79.21617,Woburn (137),23664990.0


CPU times: user 33.6 ms, sys: 610 µs, total: 34.2 ms
Wall time: 32.4 ms


#### Colleges and Universities

In [26]:
print(df_coll_univ["institution_id"].nunique(), len(df_coll_univ))
df_coll_univ.head(2)

11 11


Unnamed: 0,institution_id,institution_name,lat,lon
0,0,centennial,43.7854,-79.22664
1,1,george-brown,43.6761,-79.4111


### Get Neighbourhood Data for Supplementary Datasets

#### Colleges and Universities

In [27]:
%%time
df_coll_univ_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_coll_univ,
    "lat",
    "lon",
    "institution_id",
)
display(df_coll_univ_new.head(2))

Unnamed: 0,institution_id,AREA_NAME,geometry,Shape__Area
0,1,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0
1,7,Highland Creek (134),"POLYGON ((-79.17527 43.78021, -79.17535 43.780...",10077020.0


Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,institution_id,institution_name,lat,lon,AREA_NAME,Shape__Area
0,0,centennial,43.7854,-79.22664,Woburn (137),23664990.0
1,1,george-brown,43.6761,-79.4111,Casa Loma (96),3678385.0


CPU times: user 30.1 ms, sys: 883 µs, total: 31 ms
Wall time: 28.6 ms


#### Public Transit Locations

In [28]:
print(df_pt_slice["stop_id"].nunique(), len(df_pt_slice))
df_pt_slice.head(2)

9455 9455


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,lat,lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,262,662,Danforth Rd at Kennedy Rd,,43.714379,-79.260939,,,,,,1
1,263,929,Davenport Rd at Bedford Rd,,43.674448,-79.399659,,,,,,1


In [29]:
%%time
df_pt_slice_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_pt_slice,
    "lat",
    "lon",
    "stop_id",
)
display(df_pt_slice_new.head(2))

Unnamed: 0,stop_id,AREA_NAME,geometry,Shape__Area
0,1857,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0
1,4858,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0


Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,lat,lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,AREA_NAME,Shape__Area
0,262,662,Danforth Rd at Kennedy Rd,,43.714379,-79.260939,,,,,,1,Clairlea-Birchmount (120),14168540.0
1,263,929,Davenport Rd at Bedford Rd,,43.674448,-79.399659,,,,,,1,Annex (95),5337192.0


CPU times: user 50.6 ms, sys: 0 ns, total: 50.6 ms
Wall time: 49.1 ms


### Merge Neighbourhood Aggregations with GeoData and Population Data

In [30]:
df_neigh_stats = (
    (
        gdf.set_index("AREA_NAME")[
            [
                "Shape__Area",
                "Shape__Length",
                "geometry",
                # "CLASSIFICATION",
                # "CLASSIFICATION_CODE",
                "AREA_LATITUDE",
                "AREA_LONGITUDE",
            ]
        ]
        .merge(
            df_pt_slice_new.groupby("AREA_NAME")["stop_id"]
            .count()
            .rename("transit_stops")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .merge(
            df_coll_univ_new.groupby("AREA_NAME")["institution_id"]
            .count()
            .rename("colleges_univs")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .merge(
            dfch_essentials_new.groupby("AREA_NAME")["ID"]
            .count()
            .rename("cultural_attractions")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .merge(
            df_poi_new.groupby("AREA_NAME")["ID"]
            .count()
            .rename("places_of_interest")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .fillna(0)
        .astype(
            {
                k: int
                for k in [
                    "transit_stops",
                    "colleges_univs",
                    "cultural_attractions",
                    "places_of_interest",
                ]
            }
        )
        .merge(
            df_neigh_demog.set_index("AREA_NAME")[
                ["Population, 2016", "Youth (15-24 years)", "Working Age (25-54 years)"]
            ].rename(
                columns={
                    "Population, 2016": "pop_2016",
                    "Youth (15-24 years)": "youth_15_24",
                    "Working Age (25-54 years)": "work_age_25_54",
                }
            ),
            left_index=True,
            right_index=True,
            how="left",
        )
    )
    .add_prefix("neigh_")
    .rename(columns={"neigh_geometry": "geometry"})
)
df_neigh_stats.columns = df_neigh_stats.columns.str.lower().str.replace("__", "_")
df_neigh_stats = df_neigh_stats.reset_index()
for c in ["neigh_pop_2016", "neigh_youth_15_24", "neigh_work_age_25_54"]:
    df_neigh_stats[c] = df_neigh_stats[c].str.replace(",", "").astype(float)
df_neigh_stats.head()

Unnamed: 0,AREA_NAME,neigh_shape_area,neigh_shape_length,geometry,neigh_area_latitude,neigh_area_longitude,neigh_transit_stops,neigh_colleges_univs,neigh_cultural_attractions,neigh_places_of_interest,neigh_pop_2016,neigh_youth_15_24,neigh_work_age_25_54
0,Casa Loma (96),3678385.0,8214.176485,"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",43.681853,-79.408007,42,1,0,3,10968.0,1080.0,4555.0
1,Annex (95),5337192.0,10513.883143,"POLYGON ((-79.39414 43.66872, -79.39588 43.668...",43.671586,-79.404,98,0,0,1,30526.0,3750.0,15040.0
2,Caledonia-Fairbank (109),2955857.0,6849.911724,"POLYGON ((-79.46021 43.68156, -79.46044 43.681...",43.688569,-79.455212,35,0,0,0,9955.0,1220.0,4570.0
3,Woodbine Corridor (64),3052518.0,7512.966773,"POLYGON ((-79.31485 43.66674, -79.31660 43.666...",43.676774,-79.315408,29,0,0,0,12541.0,1035.0,6165.0
4,Lawrence Park South (103),6211341.0,13530.370002,"POLYGON ((-79.41096 43.70408, -79.41165 43.703...",43.717213,-79.406038,42,0,0,1,15179.0,2095.0,5870.0


In [31]:
assert type(df_neigh_stats).__name__ == "GeoDataFrame"
assert df_stations["station_id"].nunique() == len(df_stations)
df_stations.head(2)

Unnamed: 0,station_id,name,physical_configuration,lat,lon,altitude,address,capacity,physicalkey,transitcard,creditcard,phone
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,0.0,Fort York Blvd / Capreol Ct,35,1,1,1,1
1,7001,Wellesley Station Green P,REGULAR,43.664964,-79.38355,0.0,Yonge / Wellesley,17,1,1,1,1


### Merge Stations Metadata with Aggregated Neighbourhood Stats

Append the neighbourhood containing each bikeshare station to the station metadata

In [32]:
%%time
df_stations_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_stations,
    "lat",
    "lon",
    "station_id",
)
display(df_stations_new.head(2))

Unnamed: 0,station_id,AREA_NAME,geometry,Shape__Area
0,7142,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0
1,7141,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0


Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,station_id,name,physical_configuration,lat,lon,altitude,address,capacity,physicalkey,transitcard,creditcard,phone,AREA_NAME,Shape__Area
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,0.0,Fort York Blvd / Capreol Ct,35,1,1,1,1,Waterfront Communities-The Island (77),25629770.0
1,7001,Wellesley Station Green P,REGULAR,43.664964,-79.38355,0.0,Yonge / Wellesley,17,1,1,1,1,Church-Yonge Corridor (75),2609014.0


CPU times: user 33.4 ms, sys: 393 µs, total: 33.8 ms
Wall time: 32 ms


Merge the modified stations metadata with the neighbourhood stats

In [33]:
df_stations_new = (
    df_stations_new.set_index("AREA_NAME")
    .merge(
        df_neigh_stats.set_index("AREA_NAME"),
        left_index=True,
        right_index=True,
        how="left",
    )
    .reset_index()
    .rename(columns={"Shape__Area": "Shape_Area"})
)
df_stations_new.columns = df_stations_new.columns.str.upper()
print(df_stations_new.shape)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_stations_new.head(4))
display(df_stations_new.dtypes.rename("dtype").to_frame())

(616, 26)


Unnamed: 0,AREA_NAME,STATION_ID,NAME,PHYSICAL_CONFIGURATION,LAT,LON,ALTITUDE,ADDRESS,CAPACITY,PHYSICALKEY,TRANSITCARD,CREDITCARD,PHONE,SHAPE_AREA,NEIGH_SHAPE_AREA,NEIGH_SHAPE_LENGTH,GEOMETRY,NEIGH_AREA_LATITUDE,NEIGH_AREA_LONGITUDE,NEIGH_TRANSIT_STOPS,NEIGH_COLLEGES_UNIVS,NEIGH_CULTURAL_ATTRACTIONS,NEIGH_PLACES_OF_INTEREST,NEIGH_POP_2016,NEIGH_YOUTH_15_24,NEIGH_WORK_AGE_25_54
0,Annex (95),7003,Madison Ave / Bloor St W,REGULAR,43.667158,-79.402761,,Madison Ave / Bloor St W,15,1,1,1,1,5337192.0,5337192.0,10513.883143,"POLYGON ((-79.39414 43.66872, -79.39588 43.668...",43.671586,-79.404,98,0,0,1,30526.0,3750.0,15040.0
1,Annex (95),7040,Euclid Ave / Bloor St W,REGULAR,43.664467,-79.414783,0.0,Euclid Ave / Bloor St W,19,1,1,1,1,5337192.0,5337192.0,10513.883143,"POLYGON ((-79.39414 43.66872, -79.39588 43.668...",43.671586,-79.404,98,0,0,1,30526.0,3750.0,15040.0
2,Annex (95),7061,Dalton Rd / Bloor St W,REGULAR,43.666294,-79.406643,0.0,Dalton Rd / Bloor St W,15,1,1,1,1,5337192.0,5337192.0,10513.883143,"POLYGON ((-79.39414 43.66872, -79.39588 43.668...",43.671586,-79.404,98,0,0,1,30526.0,3750.0,15040.0
3,Annex (95),7126,Yonge St / Yorkville Ave,REGULAR,43.671944,-79.387778,0.0,Yonge St / Yorkville Ave,17,1,1,1,1,5337192.0,5337192.0,10513.883143,"POLYGON ((-79.39414 43.66872, -79.39588 43.668...",43.671586,-79.404,98,0,0,1,30526.0,3750.0,15040.0


Unnamed: 0,dtype
AREA_NAME,object
STATION_ID,int64
NAME,object
PHYSICAL_CONFIGURATION,object
LAT,float64
LON,float64
ALTITUDE,float64
ADDRESS,object
CAPACITY,int64
PHYSICALKEY,int64


## Database Administration

### Create bikeshare trips and station metadata databases

In [35]:
conn = snowflake.connector.connect(**snowflake_dict_no_db)
cur = conn.cursor()

In [37]:
%%time
for db_name in [trips_db_name, stations_db_name]:
    _ = cur.execute(f"DROP DATABASE IF EXISTS {db_name}")

CPU times: user 13.8 ms, sys: 0 ns, total: 13.8 ms
Wall time: 210 ms


In [38]:
%%time
for db_name in [trips_db_name, stations_db_name]:
    _ = cur.execute(f"CREATE DATABASE IF NOT EXISTS {db_name}")

CPU times: user 13.6 ms, sys: 672 µs, total: 14.3 ms
Wall time: 557 ms


In [39]:
%%time
for db_name in [trips_db_name, stations_db_name]:
    _ = show_sql_df(f"SHOW DATABASES LIKE '{db_name}'", cur, table_output=True)

Unnamed: 0,created_on,name,is_default,is_current,origin,owner,comment,options,retention_time
0,2022-03-03 12:24:00.276000-08:00,TORBIKES,N,N,,SYSADMIN,,,1


Unnamed: 0,created_on,name,is_default,is_current,origin,owner,comment,options,retention_time
0,2022-03-03 12:24:00.627000-08:00,TORBIKESTATIONS,N,Y,,SYSADMIN,,,1


CPU times: user 61.5 ms, sys: 12.3 ms, total: 73.7 ms
Wall time: 288 ms


In [40]:
cur.close()
conn.close()

### Create bikeshare trips File Format

In [41]:
conn = snowflake.connector.connect(**snowflake_dict)
cur = conn.cursor()

In [43]:
%%time
query = fr"""
        CREATE OR REPLACE FILE FORMAT {trips_file_format_name}
        TYPE = 'CSV'
        COMPRESSION = 'AUTO'
        FIELD_DELIMITER = ','
        RECORD_DELIMITER = '\n'
        SKIP_HEADER = 1
        TRIM_SPACE = FALSE
        ERROR_ON_COLUMN_COUNT_MISMATCH = TRUE
        ESCAPE = 'NONE'
        DATE_FORMAT = 'AUTO'
        TIMESTAMP_FORMAT = 'AUTO'
        NULL_IF = ('\\N')
        """
_ = cur.execute(query)

CPU times: user 3.61 ms, sys: 1.12 ms, total: 4.73 ms
Wall time: 119 ms


### Create Internal Stage for bikeshare trips data

In [44]:
%%time
query = f"""
        CREATE OR REPLACE STAGE {trips_stage_name}
        FILE_FORMAT = {trips_file_format_name}
        """
_ = cur.execute(query)

CPU times: user 4.68 ms, sys: 258 µs, total: 4.93 ms
Wall time: 553 ms


In [45]:
%%time
query = """
        SHOW STAGES
        """
_ = show_sql_df(query, cur, True)

Unnamed: 0,created_on,name,database_name,schema_name,url,has_credentials,has_encryption_key,owner,comment,region,type,cloud,notification_channel,storage_integration
0,2022-03-03 12:24:19.918000-08:00,BIKES_STAGE,TORBIKES,PUBLIC,,N,N,SYSADMIN,,,INTERNAL,,,


CPU times: user 31.3 ms, sys: 1.14 ms, total: 32.4 ms
Wall time: 427 ms


### Stage Local Raw Trips Data

In [46]:
%%time
for file in glob("data/raw/*.csv"):
    query = f"""
            PUT file://{file} @{trips_stage_name}
            """
    print(query.strip())
    _ = cur.execute(query)

PUT file://data/raw/bike-share-ridership-2021-12.csv @bikes_stage
PUT file://data/raw/bike-share-ridership-2021-03.csv @bikes_stage
PUT file://data/raw/bike-share-ridership-2021-04.csv @bikes_stage
PUT file://data/raw/2021-01.csv @bikes_stage
PUT file://data/raw/bike-share-ridership-2022-01.csv @bikes_stage
PUT file://data/raw/bike-share-ridership-2021-09.csv @bikes_stage
PUT file://data/raw/bike-share-ridership-2021-05-.csv @bikes_stage
PUT file://data/raw/bike-share-ridership-2021-10.csv @bikes_stage
PUT file://data/raw/bike-share-ridership-2021-08.csv @bikes_stage
PUT file://data/raw/bike-share-ridership-2021-07.csv @bikes_stage
PUT file://data/raw/bike-share-ridership-2021-02.csv @bikes_stage
PUT file://data/raw/bike-share-ridership-2021-06.csv @bikes_stage
PUT file://data/raw/bike-share-ridership-2021-11.csv @bikes_stage
CPU times: user 30.2 s, sys: 213 ms, total: 30.4 s
Wall time: 53.1 s


In [47]:
query = f"""
        LIST @{trips_stage_name}/
        """
_ = show_sql_df(query, cur, True)

Unnamed: 0,name,size,md5,last_modified
0,bikes_stage/2021-01.csv.gz,2250736,e7a5028781606941f6fca5afdb2ab3e8,"Thu, 3 Mar 2022 20:24:39 GMT"
1,bikes_stage/bike-share-ridership-2021-02.csv.gz,1554896,1976e16dbfa8fe8907de298f4bd8210d,"Thu, 3 Mar 2022 20:25:12 GMT"
2,bikes_stage/bike-share-ridership-2021-03.csv.gz,4076560,fb82cead590f2190d512793cbfd3b2db,"Thu, 3 Mar 2022 20:24:34 GMT"
3,bikes_stage/bike-share-ridership-2021-04.csv.gz,5682560,fad531cbcf3e7b603e1ac8e9d133e1fb,"Thu, 3 Mar 2022 20:24:38 GMT"
4,bikes_stage/bike-share-ridership-2021-05-.csv.gz,10451056,f37d07c61a9f0e9626c33a502d8f9ed8,"Thu, 3 Mar 2022 20:24:53 GMT"
5,bikes_stage/bike-share-ridership-2021-06.csv.gz,11325552,b68a7b6fa0588c83080f1dd3e131fa60,"Thu, 3 Mar 2022 20:25:17 GMT"
6,bikes_stage/bike-share-ridership-2021-07.csv.gz,11663312,7f72f085091c90613745f6388602266d,"Thu, 3 Mar 2022 20:25:10 GMT"
7,bikes_stage/bike-share-ridership-2021-08.csv.gz,12494544,b04b2fe77aa992303cbe47a4454e49c6,"Thu, 3 Mar 2022 20:25:04 GMT"
8,bikes_stage/bike-share-ridership-2021-09.csv.gz,11489808,1979bd2f15f32d0a97685ff26cd76d11,"Thu, 3 Mar 2022 20:24:47 GMT"
9,bikes_stage/bike-share-ridership-2021-10.csv.gz,9347248,0dad527baf95e48d2fafd996071201fd,"Thu, 3 Mar 2022 20:24:58 GMT"


### Create bikeshare trips Table

In [48]:
%%time
_ = cur.execute(
    f"""
    CREATE OR REPLACE TABLE {trips_table_name} (
        trip_id integer,
        trip_duration integer,
        start_station_id integer,
        start_time timestamp,
        start_station_name string,
        end_station_id integer,
        end_time timestamp,
        end_station_name string,
        bike_id integer,
        user_type string
    )
    """
)

CPU times: user 5.99 ms, sys: 0 ns, total: 5.99 ms
Wall time: 358 ms


In [49]:
_ = show_sql_df(f"SHOW TABLES LIKE '%{trips_table_name}%'", cur, True)

Unnamed: 0,created_on,name,database_name,schema_name,kind,comment,cluster_by,rows,bytes,owner,retention_time,automatic_clustering,change_tracking,search_optimization,search_optimization_progress,search_optimization_bytes,is_external
0,2022-03-03 12:25:31.357000-08:00,TRIPS,TORBIKES,PUBLIC,TABLE,,,0,0,SYSADMIN,1,OFF,OFF,OFF,,,N


In [50]:
_ = show_sql_df(f"SHOW COLUMNS IN TABLE {trips_table_name}", cur, True)

Unnamed: 0,table_name,schema_name,column_name,data_type,null?,default,kind,expression,comment,database_name,autoincrement
0,TRIPS,PUBLIC,TRIP_ID,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKES,
1,TRIPS,PUBLIC,TRIP_DURATION,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKES,
2,TRIPS,PUBLIC,START_STATION_ID,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKES,
3,TRIPS,PUBLIC,START_TIME,"{""type"":""TIMESTAMP_NTZ"",""precision"":0,""scale"":...",True,,COLUMN,,,TORBIKES,
4,TRIPS,PUBLIC,START_STATION_NAME,"{""type"":""TEXT"",""length"":16777216,""byteLength"":...",True,,COLUMN,,,TORBIKES,
5,TRIPS,PUBLIC,END_STATION_ID,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKES,
6,TRIPS,PUBLIC,END_TIME,"{""type"":""TIMESTAMP_NTZ"",""precision"":0,""scale"":...",True,,COLUMN,,,TORBIKES,
7,TRIPS,PUBLIC,END_STATION_NAME,"{""type"":""TEXT"",""length"":16777216,""byteLength"":...",True,,COLUMN,,,TORBIKES,
8,TRIPS,PUBLIC,BIKE_ID,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKES,
9,TRIPS,PUBLIC,USER_TYPE,"{""type"":""TEXT"",""length"":16777216,""byteLength"":...",True,,COLUMN,,,TORBIKES,


### Add Staged Trips Data to Trips Table

In [51]:
%%time
query = f"""
        COPY INTO {trips_table_name} from @{trips_stage_name}
        """
_ = cur.execute(query)

CPU times: user 5.49 ms, sys: 0 ns, total: 5.49 ms
Wall time: 5.03 s


In [52]:
%%time
query = f"""
        SELECT *
        FROM {trips_table_name}
        LIMIT 5
        """
_ = show_sql_df(query, cur, True)

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
0,11015571,195,7032,2021-04-01 00:01:00,Augusta Ave / Dundas St W,7049,2021-04-01 00:04:00,Queen St W / Portland St,656,Annual Member
1,11015572,938,7168,2021-04-01 00:01:00,Queens Quay / Yonge St,7508,2021-04-01 00:17:00,Berkeley St / Dundas St E - SMART,5272,Annual Member
2,11015573,1145,7012,2021-04-01 00:03:00,Elizabeth St / Edward St (Bus Terminal),7012,2021-04-01 00:23:00,Elizabeth St / Edward St (Bus Terminal),3253,Annual Member
3,11015574,1061,7037,2021-04-01 00:04:00,Bathurst St / Dundas St W,7079,2021-04-01 00:22:00,McGill St / Church St,3233,Annual Member
4,11015575,460,7198,2021-04-01 00:07:00,Queen St W / Cowan Ave,7662,2021-04-01 00:15:00,Beaty Ave / Queen St W,1381,Annual Member


CPU times: user 13.2 ms, sys: 0 ns, total: 13.2 ms
Wall time: 717 ms


In [53]:
%%time
query = f"""
        SELECT COUNT(*) AS num_rows
        FROM {trips_table_name}
        """
df_query_nrows_trips = show_sql_df(query, cur, True)

Unnamed: 0,num_rows
0,3621230


CPU times: user 8.03 ms, sys: 0 ns, total: 8.03 ms
Wall time: 100 ms


In [54]:
assert df_query_nrows_trips.loc[0, "num_rows"] == sum(
    [l["nrows"] for f in cols_dict_list for k, l in f.items()]
)

In [55]:
cur.close()
conn.close()

### Create Stations Metadata to Table

In [56]:
conn = snowflake.connector.connect(**snowflake_station_stats_dict)
cur = conn.cursor()

In [57]:
%%time
_ = cur.execute(
    f"""
    CREATE OR REPLACE TABLE {station_stats_table_name} (
        area_name string,
        station_id integer,
        name string,
        physical_configuration string,
        lat float,
        lon float,
        altitude float,
        address string,
        capacity integer,
        physicalkey integer,
        transitcard integer,
        creditcard integer,
        phone integer,
        shape_area float,
        neigh_shape_area float,
        neigh_shape_length float,
        neigh_area_latitude float,
        neigh_area_longitude float,
        neigh_transit_stops integer,
        neigh_colleges_univs integer,
        neigh_cultural_attractions integer,
        neigh_places_of_interest integer,
        neigh_pop_2016 float,
        neigh_youth_15_24 float,
        neigh_work_age_25_54 float
    )
    """
)

CPU times: user 4.88 ms, sys: 50 µs, total: 4.93 ms
Wall time: 367 ms


In [58]:
_ = show_sql_df(f"SHOW TABLES LIKE '%{station_stats_table_name}%'", cur, True)

Unnamed: 0,created_on,name,database_name,schema_name,kind,comment,cluster_by,rows,bytes,owner,retention_time,automatic_clustering,change_tracking,search_optimization,search_optimization_progress,search_optimization_bytes,is_external
0,2022-03-03 12:25:51.874000-08:00,STATION_STATS,TORBIKESTATIONS,PUBLIC,TABLE,,,0,0,SYSADMIN,1,OFF,OFF,OFF,,,N


In [59]:
df_cols_stations_table = show_sql_df(
    f"SHOW COLUMNS IN TABLE {station_stats_table_name}", cur, True
)

Unnamed: 0,table_name,schema_name,column_name,data_type,null?,default,kind,expression,comment,database_name,autoincrement
0,STATION_STATS,PUBLIC,AREA_NAME,"{""type"":""TEXT"",""length"":16777216,""byteLength"":...",True,,COLUMN,,,TORBIKESTATIONS,
1,STATION_STATS,PUBLIC,STATION_ID,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKESTATIONS,
2,STATION_STATS,PUBLIC,NAME,"{""type"":""TEXT"",""length"":16777216,""byteLength"":...",True,,COLUMN,,,TORBIKESTATIONS,
3,STATION_STATS,PUBLIC,PHYSICAL_CONFIGURATION,"{""type"":""TEXT"",""length"":16777216,""byteLength"":...",True,,COLUMN,,,TORBIKESTATIONS,
4,STATION_STATS,PUBLIC,LAT,"{""type"":""REAL"",""nullable"":true}",True,,COLUMN,,,TORBIKESTATIONS,
5,STATION_STATS,PUBLIC,LON,"{""type"":""REAL"",""nullable"":true}",True,,COLUMN,,,TORBIKESTATIONS,
6,STATION_STATS,PUBLIC,ALTITUDE,"{""type"":""REAL"",""nullable"":true}",True,,COLUMN,,,TORBIKESTATIONS,
7,STATION_STATS,PUBLIC,ADDRESS,"{""type"":""TEXT"",""length"":16777216,""byteLength"":...",True,,COLUMN,,,TORBIKESTATIONS,
8,STATION_STATS,PUBLIC,CAPACITY,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKESTATIONS,
9,STATION_STATS,PUBLIC,PHYSICALKEY,"{""type"":""FIXED"",""precision"":38,""scale"":0,""null...",True,,COLUMN,,,TORBIKESTATIONS,


In [60]:
assert (
    len(df_cols_stations_table) == df_stations_new.drop(columns=["GEOMETRY"]).shape[1]
)
assert pd.Series(
    df_stations_new.drop(columns=["GEOMETRY"]).columns.rename("column_name")
).equals(df_cols_stations_table["column_name"])

### Add Stations Metadata to Table

In [61]:
df_stations_new.drop(columns=["GEOMETRY"]).dtypes.rename("dtype").to_frame()

Unnamed: 0,dtype
AREA_NAME,object
STATION_ID,int64
NAME,object
PHYSICAL_CONFIGURATION,object
LAT,float64
LON,float64
ALTITUDE,float64
ADDRESS,object
CAPACITY,int64
PHYSICALKEY,int64


In [62]:
%%time
success, nchunks, nrows, _ = write_pandas(
    conn, df_stations_new.drop(columns=['GEOMETRY']), station_stats_table_name.upper()
)

CPU times: user 202 ms, sys: 3.85 ms, total: 206 ms
Wall time: 3.4 s


In [63]:
%%time
query = f"""
        SELECT COUNT(*) AS num_rows
        FROM {station_stats_table_name}
        """
df_query_nrows_stations = show_sql_df(query, cur, True)

Unnamed: 0,num_rows
0,616


CPU times: user 8.89 ms, sys: 49 µs, total: 8.94 ms
Wall time: 366 ms


In [64]:
assert success
try:
    assert nrows == len(df_stations_new)
    assert df_query_nrows_stations.loc[0, "num_rows"] == len(df_stations_new)
    print(f"Exported: {len(df_stations_new):,} rows, as expected")
except AssertionError:
    print(f"Expected: {len(df_stations_new):,} rows\nActual: {nrows:,} rows")

Exported: 616 rows, as expected


In [65]:
cur.close()
conn.close()

## Query Data From Databases

In [66]:
conn = snowflake.connector.connect(**snowflake_dict)
cur = conn.cursor()

In [67]:
%%time
query = f"""
        SELECT *
        FROM {trips_db_name}.public.{trips_table_name}
        ORDER BY start_time
        LIMIT 10
        """
_ = show_sql_df(query, cur, True)

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
0,10644218,1315,7021,2021-01-01 00:04:00,Bay St / Albert St,7164,2021-01-01 00:26:00,Gould St / Yonge St (Ryerson University),6795,Annual Member
1,10644220,396,7534,2021-01-01 00:07:00,Walnut Ave / Queen St W,7524,2021-01-01 00:13:00,Lisgar Park,4176,Casual Member
2,10644221,86,7162,2021-01-01 00:10:00,Hayter St / Laplante Ave,7006,2021-01-01 00:11:00,Bay St / College St (East Side),1814,Annual Member
3,10644222,741,7003,2021-01-01 00:10:00,Madison Ave / Bloor St W,7272,2021-01-01 00:22:00,Yonge St / Dundonald St - SMART,198,Casual Member
4,10644223,2073,7562,2021-01-01 00:11:00,Priscilla Ave / Dundas St W - SMART,7562,2021-01-01 00:45:00,Priscilla Ave / Dundas St W - SMART,6688,Casual Member
5,10644224,1924,7442,2021-01-01 00:11:00,Lonsdale Rd / Spadina Rd,7256,2021-01-01 00:43:00,Vanauley St / Queen St W - SMART,196,Annual Member
6,10644225,473,7006,2021-01-01 00:12:00,Bay St / College St (East Side),7025,2021-01-01 00:19:00,Ted Rogers Way / Bloor St E,6882,Annual Member
7,10644226,265,7014,2021-01-01 00:13:00,Sherbourne St / Carlton St (Allan Gardens),7508,2021-01-01 00:17:00,Berkeley St / Dundas St E - SMART,6485,Annual Member
8,10644227,1079,7053,2021-01-01 00:13:00,Metro Hall Plaza,7079,2021-01-01 00:31:00,McGill St / Church St,5791,Casual Member
9,10644228,981,7528,2021-01-01 00:15:00,Spadina Rd / Austin Terrace - SMART,7481,2021-01-01 00:31:00,Westmount Ave / St Clair Ave W - SMART,2951,Casual Member


CPU times: user 14.1 ms, sys: 161 µs, total: 14.3 ms
Wall time: 1.02 s


In [68]:
%%time
query = f"""
        SELECT trip_id,
               trip_duration,
               start_time,
               start_station_name,
               user_type
        FROM {trips_db_name}.public.{trips_table_name}
        LIMIT 10
        """
_ = show_sql_df(query, cur, True)

Unnamed: 0,trip_id,trip_duration,start_time,start_station_name,user_type
0,10748540,1240,2021-02-01 00:01:00,Queen St W / Roncesvalles Ave,Annual Member
1,10748541,444,2021-02-01 00:28:00,Kendal Ave / Bernard Ave,Annual Member
2,10748542,533,2021-02-01 00:40:00,Bay St / St. Joseph St,Annual Member
3,10748543,316,2021-02-01 00:58:00,Queen's Park Cres E / Grosvenor St - SMART,Annual Member
4,10748544,104,2021-02-01 00:59:00,Yonge St / Alexander St - SMART,Annual Member
5,10748545,204,2021-02-01 01:02:00,Church St / Alexander St,Annual Member
6,10748547,1466,2021-02-01 01:19:00,Spadina Ave / Adelaide St W,Annual Member
7,10748548,524,2021-02-01 01:31:00,Dundas St W / Yonge St,Annual Member
8,10748549,393,2021-02-01 01:46:00,Danforth Ave / Ellerbeck St,Annual Member
9,10748551,500,2021-02-01 02:38:00,Ross St / Cecil St - SMART,Annual Member


CPU times: user 15.5 ms, sys: 0 ns, total: 15.5 ms
Wall time: 397 ms


In [69]:
%%time
query = f"""
        SELECT start_station_name AS station_name,
               DATE_PART(year, start_time) AS year,
               DATE_PART(month, start_time) AS month,
               DATE_PART(day, start_time) AS day,
               DATE_PART(hour, start_time) AS hour,
               user_type,
               COUNT(DISTINCT trip_id) AS num_trips
        FROM {trips_db_name}.public.{trips_table_name}
        GROUP BY 1,2,3,4,5,6
        LIMIT 10000
        """
_ = show_sql_df(query, cur, True)

Unnamed: 0,station_name,year,month,day,hour,user_type,num_trips
0,Sherbourne St / Carlton St (Allan Gardens),2021,9,1,0,Casual Member,1
1,Wellington St W / Portland St,2021,9,1,9,Annual Member,3
2,Queen St W / Gladstone Ave,2021,9,1,6,Annual Member,1
3,439 Sherbourne St,2021,9,1,12,Annual Member,1
4,Navy Wharf Ct. / Bremner Blvd.,2021,9,1,6,Annual Member,2
...,...,...,...,...,...,...,...
9995,Fort York Blvd / Capreol Ct,2021,2,9,15,Annual Member,1
9996,Danforth Ave / Aldridge Ave,2021,2,9,17,Annual Member,1
9997,College Park- Gerrard Entrance,2021,2,9,18,Casual Member,1
9998,Walton St / Elizabeth St - SMART,2021,2,10,16,Annual Member,3


CPU times: user 304 ms, sys: 159 µs, total: 304 ms
Wall time: 3.06 s


In [70]:
cur.close()
conn.close()