# Get Data

In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import os
import re
from glob import glob
from io import BytesIO
from multiprocessing import cpu_count
from typing import Dict, List, Union
from urllib.request import urlopen
from zipfile import ZipFile

import boto3
import geopandas as gpd
import pandas as pd
import requests
import snowflake.connector
from dotenv import find_dotenv, load_dotenv
from joblib import Parallel, delayed
from snowflake.connector.pandas_tools import write_pandas

## About

Download Toronto Bikeshare trips data, bikeshare stations metadata and supplementary (neighbourhood-specific) datasets.

## User Inputs

In [None]:
# Datasets
# # Open Data Portal (CKAN `package_show` endpoint queried for every dataset)
url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"
# # Ridership (package id sent as the `id` query parameter)
params = {"id": "7e876c24-177c-4605-9cef-e50dd74c617f"}
# # {year: [month numbers]} to download: all of 2021 and January 2022
years_wanted = {2021: list(range(1, 12 + 1)), 2022: list(range(1, 1 + 1))}
# # Stations Metadata (package id and the columns kept after transformation)
about_params = {"id": "2b44db0d-eea9-442d-b038-79335368ad5a"}
stations_cols_wanted = [
    "station_id",
    "name",
    "physical_configuration",
    "lat",
    "lon",
    "altitude",
    "address",
    "capacity",
    "physicalkey",
    "transitcard",
    "creditcard",
    "phone",
]

# Ridership datetime columns (parsed as dates when the raw CSVs are read)
date_cols = ["Start Time", "End Time"]

# Ridership columns in which to drop missing values
nan_cols = [
    "Start Station Id",
    "End Station Id",
    "Start Station Name",
    "End Station Name",
]

# Snowflake resources
# # Database (stations metadata; the trips database name comes from the
# # DB_NAME environment variable)
stations_db_name = "torbikestations"
# # Tables
trips_table_name = "trips"
station_stats_table_name = "station_stats"
# # Stage
trips_stage_name = "bikes_stage"
# # File Format
trips_file_format_name = "COMMASEP_ONEHEADROW"

# When "no", environment variables are loaded from a local .env file
ci_run = "no"

In [None]:
# Ridership dtypes dict
# Nullable pandas dtypes are used so that rows with missing values can still be
# parsed. The start-station keys were previously listed twice, which left the
# end-station columns without an explicit dtype.
dtypes_dict = {
    "Trip Id": pd.Int64Dtype(),
    "Trip Duration": pd.Int64Dtype(),
    "Start Station Id": pd.Int64Dtype(),
    "Start Station Name": pd.StringDtype(),
    "End Station Id": pd.Int64Dtype(),
    "End Station Name": pd.StringDtype(),
    "Bike Id": pd.Float64Dtype(),
    "User Type": pd.StringDtype(),
}

# Outside CI, load credentials from a local .env file into the environment
if ci_run == "no":
    load_dotenv(find_dotenv())

trips_db_name = os.getenv("DB_NAME")

# Login settings shared by every Snowflake connection in this notebook
_snowflake_login = {
    "user": os.getenv("SNOWFLAKE_USER"),
    "password": os.getenv("SNOWFLAKE_PASS"),
    "account": os.getenv("SNOWFLAKE_ACCOUNT"),
    "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
}
_snowflake_schema = os.getenv("SNOWFLAKE_DB_SCHEMA")

# Connection without a database (used to create/drop the databases)
snowflake_dict_no_db = {**_snowflake_login, "role": "sysadmin"}

# Connection to the trips database
snowflake_dict = {
    **_snowflake_login,
    "database": trips_db_name,
    "schema": _snowflake_schema,
    "role": "sysadmin",
}

# Connection to the stations metadata database
snowflake_station_stats_dict = {
    **_snowflake_login,
    "database": stations_db_name,
    "schema": _snowflake_schema,
    "role": "sysadmin",
}

In [None]:
# AWS region comes from the environment; the account id is looked up through
# STS for whichever credentials boto3 resolves
aws_region = os.getenv("AWS_REGION")
account_id = (
    boto3.client("sts", region_name=aws_region).get_caller_identity().get("Account")
)

In [None]:
def get_file_urls(
    main_dataset_url: str, dataset_params: Dict, years_wanted: Dict[int, List]
) -> List:
    """Return the download URLs of the resources for the wanted months.

    Parameters
    ----------
    main_dataset_url : str
        URL queried with ``dataset_params`` to list the package resources.
    dataset_params : Dict
        Query parameters sent with the request.
    years_wanted : Dict[int, List]
        Mapping of year to the month numbers wanted for that year.

    Returns
    -------
    List
        ``url`` of every resource whose ``name`` contains one of the
        ``YYYY-MM`` strings built from ``years_wanted``.
    """
    response = requests.get(main_dataset_url, params=dataset_params)
    resources_df = pd.DataFrame.from_records(response.json()["result"]["resources"])

    # One zero-padded "YYYY-MM" token per requested month
    tokens = []
    for year, months in years_wanted.items():
        for month in months:
            tokens.append(f"{year}-{str(month).zfill(2)}")

    # The tokens are OR-ed together and matched against the resource name
    year_month_wanted_str = "|".join(tokens)
    matching = resources_df.query("name.str.contains(@year_month_wanted_str)")
    return matching["url"].tolist()


def read_data(
    url: str, dtypes_dict: Dict, date_cols: List[str], nan_cols: List[str]
) -> Dict[str, Dict[str, Union[List[str], int]]]:
    """Read one raw trips CSV, clean it and cache it under ``data/raw``.

    Parameters
    ----------
    url : str
        Location of the CSV (anything ``pandas.read_csv`` accepts).
    dtypes_dict : Dict
        Column name to dtype mapping passed to ``pandas.read_csv``.
    date_cols : List[str]
        Columns parsed as datetimes.
    nan_cols : List[str]
        Rows with a missing value in any of these columns are dropped.

    Returns
    -------
    Dict
        ``{<file name>: {"columns": [...], "nrows": <int>}}`` describing the
        cleaned data, where the column names have punctuation removed, spaces
        replaced by underscores and are upper-cased.

    Notes
    -----
    The cleaned CSV is only written when ``data/raw/<file name>`` does not
    exist yet, so an older cached file is left untouched even though the
    returned summary describes the freshly read data.
    """
    df = pd.read_csv(
        url,
        encoding="cp1252",
        parse_dates=date_cols,
        dtype=dtypes_dict,
    ).dropna(subset=nan_cols)
    # Raw string so that ``\s`` is a regex class, not an invalid string escape
    df.columns = [
        re.sub(r"[^A-Za-z0-9\s]+", "", c).replace(" ", "_").upper() for c in list(df)
    ]
    fname = os.path.basename(url)
    fpath = f"data/raw/{fname.replace('.csv', '')}.csv"
    if not os.path.exists(fpath):
        # The output folder is not guaranteed to exist on a fresh checkout
        os.makedirs(os.path.dirname(fpath), exist_ok=True)
        df.to_csv(fpath, index=False)
    return {fname: {"columns": list(df), "nrows": len(df)}}


def get_single_ridership_data_file(
    url: str, dtypes_dict: Dict, date_cols: List[str], nan_cols: List[str]
) -> Dict[str, List[str]]:
    """Load a single ridership file with ``read_data`` and report progress.

    Prints a "Loading ... Done." message around the call and returns the
    summary dictionary produced by ``read_data`` unchanged.
    """
    print(f"Loading data from {os.path.basename(url)}...", end="")
    file_summary = read_data(url, dtypes_dict, date_cols, nan_cols)
    print("Done.")
    return file_summary


def get_all_data_files(
    urls_list: List, dtypes_dict: Dict, date_cols: List[str], nan_cols: List[str]
) -> Dict[str, List[str]]:
    """Load every ridership file in parallel, one job per URL.

    Jobs run on a joblib ``multiprocessing`` pool sized to the CPU count.
    The value returned is whatever the joblib executor yields for the jobs:
    one ``get_single_ridership_data_file`` result per URL.
    """
    jobs = [
        delayed(get_single_ridership_data_file)(url, dtypes_dict, date_cols, nan_cols)
        for url in urls_list
    ]
    pool = Parallel(n_jobs=cpu_count(), backend="multiprocessing")
    return pool(jobs)


def get_stations_metadata(stations_url: str, stations_params: Dict) -> pd.DataFrame:
    """Download the bikeshare station metadata as a DataFrame.

    Parameters
    ----------
    stations_url : str
        URL queried with ``stations_params`` to list the package resources.
    stations_params : Dict
        Query parameters identifying the stations package.

    Returns
    -------
    pd.DataFrame
        One row per entry of ``data.stations`` in the station feed.
    """
    # Use the argument; the global ``about_params`` was read here by mistake
    package = requests.get(stations_url, params=stations_params).json()
    resources = package["result"]["resources"]
    # The first resource points at the feed index (JSON listing the feeds)
    feed_index = requests.get(resources[0]["url"]).json()
    feeds = feed_index["data"]["en"]["feeds"]
    # Prefer looking the feed up by name; fall back to the position that was
    # previously hard-coded when no feed carries that name
    url_stations = next(
        (feed["url"] for feed in feeds if feed.get("name") == "station_information"),
        None,
    )
    if url_stations is None:
        url_stations = feeds[2]["url"]
    df_stations = pd.DataFrame.from_records(
        requests.get(url_stations).json()["data"]["stations"]
    )
    return df_stations


def transform_metadata(
    df: pd.DataFrame, stations_cols_wanted: List[str]
) -> pd.DataFrame:
    """Convert station ids to integers and expand ``rental_methods`` to flags.

    Parameters
    ----------
    df : pd.DataFrame
        Stations metadata with at least ``station_id`` and ``rental_methods``
        (a list of method names per station). It is not modified.
    stations_cols_wanted : List[str]
        Columns to return, in this order.

    Returns
    -------
    pd.DataFrame
        The wanted columns. ``physicalkey``, ``transitcard``, ``creditcard``
        and ``phone`` are 1 when the station lists ``KEY``, ``TRANSITCARD``,
        ``CREDITCARD`` or ``PHONE`` respectively, and 0 otherwise.
    """
    rental_methods = ("KEY", "TRANSITCARD", "CREDITCARD", "PHONE")
    # Work on a copy so the caller's frame is left untouched
    out = df.copy()
    out["station_id"] = out["station_id"].astype(int)
    # Flags are derived from membership, not from the position of a method in
    # the list, so stations offering only some methods are flagged correctly
    offered = [
        set(methods) if isinstance(methods, (list, tuple, set)) else set()
        for methods in out["rental_methods"]
    ]
    for method in rental_methods:
        out[method.lower()] = pd.Series(
            [int(method in station_methods) for station_methods in offered],
            index=out.index,
            dtype=int,
        )
    out = out.drop(columns=["groups", "rental_methods"], errors="ignore")
    return out.rename(columns={"key": "physicalkey"})[stations_cols_wanted]


def get_toronto_open_data(url, params, col_rename_dict=None):
    """Return the records of a package's first datastore-active resource.

    Parameters
    ----------
    url : str
        URL queried with ``params`` to list the package resources.
    params : dict
        Query parameters identifying the package.
    col_rename_dict : dict, optional
        Column renaming applied to the result when given and non-empty.

    Returns
    -------
    pd.DataFrame
        Records returned by the ``datastore_search`` endpoint.

    Raises
    ------
    ValueError
        If no resource of the package is datastore-active.
    """
    package = requests.get(url, params=params).json()
    datastore_url = (
        "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/"
        "action/datastore_search"
    )
    df = None
    for resource in package["result"]["resources"]:
        if resource["datastore_active"]:
            # NOTE(review): no `limit` is sent, so the number of records is
            # whatever the endpoint returns by default - confirm this covers
            # every row needed downstream
            data = requests.get(datastore_url, params={"id": resource["id"]}).json()
            df = pd.DataFrame(data["result"]["records"])
            break
    if df is None:
        raise ValueError(f"No datastore-active resource found for {params}")
    if col_rename_dict:
        df = df.rename(columns=col_rename_dict)
    return df


def get_lat_long(row):
    """Return the value stored under the ``coordinates`` key of ``row``."""
    coordinates = row["coordinates"]
    return coordinates


def get_poi_data(url: str, params: Dict) -> pd.DataFrame:
    """Download the places-of-interest CSV and add its coordinates as columns.

    Parameters
    ----------
    url : str
        URL queried with ``params`` to list the package resources.
    params : Dict
        Query parameters identifying the places-of-interest package.

    Returns
    -------
    pd.DataFrame
        The CSV of the first resource, with its first column renamed to
        ``ID`` and the two values of each ``geometry`` entry's coordinates
        stored as ``POI_LONGITUDE`` and ``POI_LATITUDE`` (in that order).

    Raises
    ------
    AssertionError
        If two rows share the same ``NAME``.
    """
    # Function-scope import keeps this definition self-contained
    import ast

    # Use the argument; the global ``poi_params`` was read here by mistake
    package = requests.get(url, params=params).json()
    poi_url = package["result"]["resources"][0]["url"]
    df = pd.read_csv(poi_url)
    df = df.rename(columns={list(df)[0]: "ID"})

    # The geometry column holds literal text; parse it without executing
    # downloaded content (``eval`` was used here before)
    coordinates = (
        df["geometry"].apply(ast.literal_eval).apply(get_lat_long).tolist()
    )
    df[["POI_LONGITUDE", "POI_LATITUDE"]] = pd.DataFrame(coordinates, index=df.index)
    # Verify no duplicates (by name) are in the data
    assert df[df.duplicated(subset=["NAME"], keep=False)].empty
    return df


def get_cultural_hotspots(url: str, params: Dict) -> pd.DataFrame:
    """Download the cultural hotspots shapefile and return its key columns.

    The zip archive of the package's first resource is extracted under
    ``data/raw``, the shapefile is read, duplicated points of interest are
    removed (first occurrence kept) and the id, name, latitude and longitude
    columns are returned as ``ID``, ``NAME``, ``POI_LATITUDE`` and
    ``POI_LONGITUDE``.
    """
    extract_dir = "data/raw/cultural-hotspot-points-of-interest-wgs84"
    resources = requests.get(url, params=params).json()["result"]["resources"]
    zip_url = resources[0]["url"]
    with urlopen(zip_url) as response, ZipFile(BytesIO(response.read())) as archive:
        archive.extractall(extract_dir)

    hotspots = gpd.read_file(f"{extract_dir}/CULTURAL_HOTSPOT_WGS84.shp")
    # De-duplicate first on name + location, then on name alone
    for dedup_cols in (["PNT_OF_INT", "LATITUDE", "LONGITUDE"], ["PNT_OF_INT"]):
        hotspots = (
            hotspots.drop_duplicates(subset=dedup_cols, keep="first")
            .reset_index(drop=True)
            .copy()
        )
    assert hotspots[hotspots.duplicated(subset=["PNT_OF_INT"], keep=False)].empty

    new_names = {
        "RID": "ID",
        "PNT_OF_INT": "NAME",
        "LATITUDE": "POI_LATITUDE",
        "LONGITUDE": "POI_LONGITUDE",
    }
    return hotspots[list(new_names)].rename(columns=new_names)


def get_neighbourhood_boundary_land_area_data(url: str, params: Dict) -> pd.DataFrame:
    """Read the neighbourhood boundaries and add each centroid's coordinates.

    The first resource whose URL ends with ``4326.geojson`` is read. The
    centroid of every geometry is computed after reprojecting to EPSG:3395
    and converted back to EPSG:4326; it is stored in ``centroid`` and its
    y/x values in ``AREA_LATITUDE``/``AREA_LONGITUDE``. Exactly 140 rows are
    expected.
    """
    resources = requests.get(url, params=params).json()["result"]["resources"]
    geojson_urls = [
        resource["url"]
        for resource in resources
        if resource["url"].endswith("4326.geojson")
    ]
    gdf = gpd.read_file(geojson_urls[0])

    projected = gdf["geometry"].to_crs(epsg=3395)
    gdf["centroid"] = projected.centroid.to_crs(epsg=4326)
    gdf["AREA_LATITUDE"] = gdf["centroid"].y
    gdf["AREA_LONGITUDE"] = gdf["centroid"].x
    assert len(gdf) == 140
    return gdf


def get_public_transit_locations(url: str, params: Dict) -> pd.DataFrame:
    """Download the transit schedules archive and return its stops table.

    The zip archive of the package's first resource is extracted under
    ``data/raw``; ``stops.txt`` is read, previewed with ``display`` and
    returned with ``stop_lat``/``stop_lon`` renamed to ``lat``/``lon``.
    """
    extract_dir = "data/raw/opendata_ttc_schedules"
    resources = requests.get(url, params=params).json()["result"]["resources"]
    zip_url = resources[0]["url"]
    with urlopen(zip_url) as response, ZipFile(BytesIO(response.read())) as archive:
        archive.extractall(extract_dir)

    stops = pd.read_csv(f"{extract_dir}/stops.txt")
    display(stops.head())
    return stops.rename(columns={"stop_lat": "lat", "stop_lon": "lon"})


def get_coll_univ_locations() -> pd.DataFrame:
    """Return the hard-coded locations of colleges and universities.

    Returns
    -------
    pd.DataFrame
        Columns ``institution_id`` (0-based running number),
        ``institution_name``, ``lat`` and ``lon``.
    """
    coll_univ_locations = {
        "centennial": {"lat": 43.7854, "lon": -79.22664},
        "george-brown": {"lat": 43.6761, "lon": -79.4111},
        "humber": {"lat": 43.7290, "lon": -79.6074},
        "ocad": {"lat": 43.6530, "lon": -79.3912},
        "ryerson": {"lat": 43.6577, "lon": -79.3788},
        "seneca": {"lat": 43.7955, "lon": -79.3496},
        # NOTE(review): key is possibly a misspelling of "tyndale"; left
        # unchanged in case the name is referenced elsewhere - confirm
        "tynedale": {"lat": 43.7970, "lon": -79.3945},
        "uoft-scarborough": {"lat": 43.7844, "lon": -79.1851},
        # Longitude was a copy of the "yorku" value (-79.5019); replaced with
        # the longitude of the downtown campus
        "uoft": {"lat": 43.6629, "lon": -79.3957},
        "yorku": {"lat": 43.7735, "lon": -79.5019},
        "yorku-glendon": {"lat": 43.7279, "lon": -79.3780},
    }
    df_coll_univ = (
        pd.DataFrame.from_dict(coll_univ_locations, orient="index")
        .reset_index()
        .rename(columns={"index": "institution_name"})
        # Second reset turns the running row number into the institution id
        .reset_index()
        .rename(columns={"index": "institution_id"})
    )
    return df_coll_univ


def get_neighbourhood_profile_data(url: str, params: Dict) -> pd.DataFrame:
    """Return selected profile characteristics with one row per neighbourhood.

    The profile records are filtered to four characteristics, transposed so
    each remaining column becomes a row, and an ``AREA_NAME`` column of the
    form ``"<name> (<Neighbourhood Number>)"`` is added. Exactly 140 rows
    are expected.
    """
    characteristics = [
        "Neighbourhood Number",
        "Youth (15-24 years)",
        "Working Age (25-54 years)",
        "Population, 2016",
    ]
    profiles = get_toronto_open_data(url, params)
    selected = profiles[profiles["Characteristic"].isin(characteristics)]

    # Drop the first four columns, then flip characteristics into columns
    transposed = selected.iloc[:, 4:].set_index("Characteristic").T.reset_index()
    # The first transposed row is discarded before renumbering the rows
    df_neigh_demog = (
        transposed.iloc[1:].reset_index(drop=True).rename(columns={"index": "name"})
    )
    assert len(df_neigh_demog) == 140

    names = df_neigh_demog["name"]
    numbers = df_neigh_demog["Neighbourhood Number"]
    df_neigh_demog["AREA_NAME"] = names + " (" + numbers + ")"
    return df_neigh_demog


def get_neighbourhood_containing_point(
    gdf: gpd.GeoDataFrame,
    df: pd.DataFrame,
    lat: str = "Latitude",
    lon: str = "Longitude",
    crs: int = 4326,
) -> gpd.GeoDataFrame:
    """Spatially join points to the polygons that contain them.

    Points are built from the ``lon``/``lat`` columns of ``df`` and joined
    to ``gdf`` with the ``contains`` predicate. The result lists the columns
    of ``df`` first, followed by the columns of ``gdf``.
    """
    # Column order is captured before the point GeoDataFrame is constructed
    cols_order = list(df) + list(gdf)
    points = gpd.GeoDataFrame(
        df, geometry=gpd.points_from_xy(df[lon], df[lat]), crs=crs
    )
    joined = gpd.sjoin(gdf, points, predicate="contains")
    joined = joined.reset_index(drop=True).drop(columns=["index_right"])
    return joined[cols_order]


def get_data_with_neighbourhood(
    gdf: gpd.GeoDataFrame,
    df: pd.DataFrame,
    lat: str,
    lon: str,
    col_to_join: str,
    crs: int = 4326,
) -> gpd.GeoDataFrame:
    """Append the containing neighbourhood's name and area to each row.

    Parameters
    ----------
    gdf : gpd.GeoDataFrame
        Neighbourhood polygons with ``AREA_NAME``, ``geometry`` and
        ``Shape__Area`` columns.
    df : pd.DataFrame
        Point data with latitude/longitude columns.
    lat, lon : str
        Names of the latitude and longitude columns of ``df``.
    col_to_join : str
        Column of ``df`` used to merge the neighbourhood back onto it.
    crs : int
        EPSG code used for the points.

    Returns
    -------
    DataFrame
        ``df`` with ``AREA_NAME`` and ``Shape__Area`` added; rows that fall
        in no neighbourhood are removed.
    """
    cols_to_keep = [col_to_join, "AREA_NAME", "geometry", "Shape__Area"]
    df_check = get_neighbourhood_containing_point(gdf, df, lat, lon, crs)[cols_to_keep]
    display(df_check.head(2))
    # NOTE(review): ``df`` presumably only carries a ``geometry`` column when
    # building the point GeoDataFrame modified it in place; tolerate both cases
    df = df.merge(df_check.drop(columns=["geometry"]), on=col_to_join, how="left").drop(
        columns=["geometry"], errors="ignore"
    )
    # Count the unmatched rows themselves; the previous expression measured
    # the length of a one-element Series and therefore always reported 1
    n_missing = int(df["AREA_NAME"].isna().sum())
    print(f"Dropped {n_missing} rows with a missing AREA_NAME")
    df = df.dropna(subset=["AREA_NAME"])
    return df


def summarize_df(df: pd.DataFrame) -> None:
    """Display per-column properties of a DataFrame.

    One row per column: dtype, number of missing values, total row count,
    number of unique values and one value taken from a randomly sampled row
    that has no missing values.
    """
    on_index = dict(left_index=True, right_index=True, how="left")

    dtypes = df.dtypes.rename("dtype").to_frame()
    missing = df.isna().sum().rename("num_missing").to_frame()
    uniques = df.nunique().rename("nunique").to_frame()
    # A single complete row, reshaped to one value per column
    example = (
        df.dropna(how="any")
        .sample(1)
        .squeeze()
        .rename("single_non_nan_value")
        .to_frame()
    )

    summary = dtypes.merge(missing, **on_index).assign(num=len(df))
    summary = summary.merge(uniques, **on_index).merge(example, **on_index)
    display(summary)


def show_sql_df(
    query: str,
    cursor,
    table_output: bool = False,
) -> Union[None, pd.DataFrame]:
    """Execute ``query`` and optionally display its result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement passed to ``cursor.execute``.
    cursor
        Cursor object providing ``execute``, ``description`` and ``fetchall``.
    table_output : bool
        When true, the fetched rows are displayed and returned.

    Returns
    -------
    pd.DataFrame or None
        The fetched rows with lower-cased column names, or ``None`` when no
        table output was requested or no rows were fetched.
    """
    cursor.execute(query)
    if not table_output:
        return None

    colnames = [column[0].lower() for column in cursor.description]
    rows = cursor.fetchall()
    if not rows:
        return None

    df_query_output = pd.DataFrame.from_records(rows, columns=colnames)
    display(df_query_output)
    return df_query_output

## Create AWS Python SDK Objects for Creating QuickSight Resources

In [None]:
# QuickSight client pinned to us-east-1; used to list the QuickSight users
qs_client_user = boto3.client("quicksight", region_name="us-east-1")
# QuickSight client in the configured region; used to create the data source
qs_client = boto3.client("quicksight", region_name=aws_region)

## Get Bikeshare Trips Data

### Get URLs for Raw Trips Data Files

In [None]:
%%time
# URLs of the ridership files for the months listed in `years_wanted`
all_urls = get_file_urls(url, params, years_wanted)

### Download Raw Trips Data Files

In [None]:
%%time
# One summary (column names and row count) per downloaded ridership file
cols_dict_list = get_all_data_files(all_urls, dtypes_dict, date_cols, nan_cols)

Perform sanity checks on column names and column order in raw trips data files

In [None]:
# Strip punctuation from the column names reported for each downloaded file
# (raw string so that `\s` is a regex class, not an invalid string escape)
cols_cleaned = {
    fname: [re.sub(r"[^A-Za-z0-9\s]+", "", col) for col in info["columns"]]
    for file_summary in cols_dict_list
    for fname, info in file_summary.items()
}
# Every file must contribute exactly one entry (no duplicated file names)
assert len(cols_cleaned) == len(cols_dict_list)

# Compare the columns (names and order) of every other file to the first file
_reference_cols = next(iter(cols_cleaned.values()), None)
cols_equality_checks = {
    fname: cols == _reference_cols for fname, cols in list(cols_cleaned.items())[1:]
}
if not all(cols_equality_checks.values()):
    # Best-effort check: report the per-file result instead of stopping the run
    print(cols_equality_checks)

## Get Supplementary Datasets

### Stations Metadata

In [None]:
%%time
# Download the stations metadata, then keep only the wanted columns with the
# rental methods expanded into indicator columns
df_stations = get_stations_metadata(url, about_params)
df_stations = transform_metadata(df_stations, stations_cols_wanted)
display(df_stations.head(2))
summarize_df(df_stations)

### Cultural Hotspots

In [None]:
%%time
# Dedicated name for the cultural hotspots package id, so the ridership
# `params` defined in the user inputs is not overwritten
ch_params = {"id": "c7be2ee7-d317-4a28-8cbe-bff1ce116b46"}
dfch_essentials = get_cultural_hotspots(url, ch_params)
dfch_essentials.head(2)

### Places of Interest

In [None]:
%%time
# Package id of the places-of-interest dataset
poi_params = {"id": "965247c0-c72e-49b4-bb1a-879cf98e1a32"}
df_poi = get_poi_data(url, poi_params)

Note that duplicate lat-long pairs are permitted here, since multiple places of interest may share the same physical location or an immediately adjacent area. Some examples of such places of interest with a duplicated latitude and longitude are shown in `0_get_bikeshare_data.ipynb`. So, the duplicate lat-long sites will be retained in this dataset.

### Neighbourhood Boundary and Land Area Data

In [None]:
%%time
# Package id of the neighbourhood boundaries dataset
neigh_params = {"id": "4def3f65-2a65-4a4f-83c4-b2a4aed72d46"}
gdf = get_neighbourhood_boundary_land_area_data(url, neigh_params)
# Columns shown in the preview below
neigh_cols_to_show = [
    "AREA_ID",
    "AREA_SHORT_CODE",
    "AREA_LONG_CODE",
    "AREA_NAME",
    "Shape__Area",
    "LATITUDE",
    "AREA_LATITUDE",
    "LONGITUDE",
    "AREA_LONGITUDE",
]
# Preview a handful of neighbourhoods, selected by name and sorted by name
gdf[
    gdf["AREA_NAME"].str.contains(
        "Wychwood|Yonge-Eglinton|Yonge-St.|York Univ|Yorkdale-Glen"
    )
][neigh_cols_to_show].sort_values(by=["AREA_NAME"])

In order to use the correct CRS for allowing an area calculation in square km, we'll get the current EPSG ([link](https://epsg.io/4326)) from the geodata

In [None]:
# Show the coordinate reference system attached to the boundary data
print(gdf.crs)

Fix typographic errors in the name of the neighbourhood in this dataset
- [North St. James Town](https://www.toronto.ca/ext/sdfa/Neighbourhood%20Profiles/pdf/2016/pdf1/cpa74.pdf) and [Cabbagetown-South St. James Town](https://www.toronto.com/community-static/4550668-cabbagetown-south-st-james-town/)
  - missing space between ...St. and Ja...
- Weston-Pelham Park
  - incorrectly listed as its old name (from 2011) of Weston-Pellam Park ([link](https://www.toronto.ca/wp-content/uploads/2017/11/900b-91-Weston-Pellam-Park.pdf))
  - replace with [new name from 2016](https://www.toronto.ca/ext/sdfa/Neighbourhood%20Profiles/pdf/2016/pdf1/cpa91.pdf)

In [None]:
# Misspelled fragment -> corrected fragment of the neighbourhood name
d_renaming = {
    "St.James": "St. James",
    "Weston-Pellam": "Weston-Pelham",
}
# Apply the literal (non-regex) replacements one after the other
_area_names = gdf["AREA_NAME"]
for _wrong, _right in d_renaming.items():
    _area_names = _area_names.str.replace(_wrong, _right, regex=False)
gdf["AREA_NAME"] = _area_names

The incorrect names have been successfully replaced as shown below

In [None]:
# Neighbourhood GeoData columns to use (name, boundary polygon, provided area)
geo_cols = ["AREA_NAME", "geometry", "Shape__Area"]

In [None]:
# Show the neighbourhoods whose names were affected by the renaming
gdf.query("AREA_NAME.str.contains('James Town|Weston-|Cabbage')")[geo_cols]

Compare manually calculated to provided neighbourhood areas (in square metres)
- first, change the geodata projection to a cartesian system (EPSG = 3857, in units of m) ([1](https://epsg.io/3857))

In [None]:
# Area computed from the geometry after reprojecting to EPSG:3857, minus the
# provided `Shape__Area`; the smallest and largest differences are printed
area_diff = (gdf["geometry"].to_crs(epsg=3857).area) - gdf["Shape__Area"]
print(area_diff.min(), area_diff.max())

Since these are small differences (in units of square metres, as the EPSG 3857 projection is in metres), we'll use the provided neighbourhood areas from the `Shape__Area` column of the neighbourhood boundary file.

### Public Transit Locations

In [None]:
%%time
# Dedicated name for the transit schedules package id, so the ridership
# `params` defined in the user inputs is not overwritten
pt_params = {"id": "7795b45e-e65a-4465-81fc-c36b9dfff169"}
df_pt_slice = get_public_transit_locations(url, pt_params)

### Colleges and Universities

In [None]:
# Hard-coded college and university locations
df_coll_univ = get_coll_univ_locations()

### Neighbourhood Profile Data - Population

In [None]:
%%time
# Package id of the neighbourhood profiles dataset
neigh_profile_params = {"id": "6e19a90f-971c-46b3-852c-0c48c436d1fc"}
df_neigh_demog = get_neighbourhood_profile_data(url, neigh_profile_params)
df_neigh_demog.head(6)

### Number of Locations Per Neighbourhood

#### Places of Interest

In [None]:
# Number of distinct ids next to the number of rows, followed by a preview
# without row/column truncation
print(df_poi["ID"].nunique(), len(df_poi))
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_poi.head(2))

In [None]:
%%time
# Attach the containing neighbourhood to every place of interest; the
# coordinate columns are renamed to `lat`/`lon` and joined back on `ID`
df_poi_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_poi.rename(columns={"POI_LATITUDE": "lat", "POI_LONGITUDE": "lon",})[
        ["ID", "NAME", "lat", "lon"]
    ],
    "lat",
    "lon",
    "ID",
)
display(df_poi_new.head(2))

#### Cultural Hotspots

In [None]:
# Every cultural hotspot id must be unique
assert dfch_essentials["ID"].nunique() == len(dfch_essentials)
dfch_essentials.head(2)

In [None]:
%%time
# Attach the containing neighbourhood to every cultural hotspot; the
# coordinate columns are renamed to `lat`/`lon` and joined back on `ID`
dfch_essentials_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    dfch_essentials.rename(columns={"POI_LATITUDE": "lat", "POI_LONGITUDE": "lon",})[
        ["ID", "NAME", "lat", "lon"]
    ],
    "lat",
    "lon",
    "ID",
)
display(dfch_essentials_new.head(2))

#### Colleges and Universities

In [None]:
# Number of distinct institution ids next to the number of rows
print(df_coll_univ["institution_id"].nunique(), len(df_coll_univ))
df_coll_univ.head(2)

### Get Neighbourhood Data for Supplementary Datasets

#### Colleges and Universities

In [None]:
%%time
# Attach the containing neighbourhood to every college/university, joined
# back on `institution_id`
df_coll_univ_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_coll_univ,
    "lat",
    "lon",
    "institution_id",
)
display(df_coll_univ_new.head(2))

#### Public Transit Locations

In [None]:
# Number of distinct stop ids next to the number of rows
print(df_pt_slice["stop_id"].nunique(), len(df_pt_slice))
df_pt_slice.head(2)

In [None]:
%%time
# Attach the containing neighbourhood to every transit stop, joined back on
# `stop_id`
df_pt_slice_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_pt_slice,
    "lat",
    "lon",
    "stop_id",
)
display(df_pt_slice_new.head(2))

### Merge Neighbourhood Aggregations with GeoData and Population Data

In [None]:
# Neighbourhood area/geometry columns, indexed by neighbourhood name
df_neigh_stats = gdf.set_index("AREA_NAME")[
    [
        "Shape__Area",
        "Shape__Length",
        "geometry",
        "AREA_LATITUDE",
        "AREA_LONGITUDE",
    ]
]

# (located points, id column to count, name of the resulting count column)
_count_sources = [
    (df_pt_slice_new, "stop_id", "transit_stops"),
    (df_coll_univ_new, "institution_id", "colleges_univs"),
    (dfch_essentials_new, "ID", "cultural_attractions"),
    (df_poi_new, "ID", "places_of_interest"),
]
# Left-merge the number of locations per neighbourhood, one source at a time
for _located, _id_col, _count_name in _count_sources:
    _counts = (
        _located.groupby("AREA_NAME")[_id_col].count().rename(_count_name).to_frame()
    )
    df_neigh_stats = df_neigh_stats.merge(
        _counts, left_index=True, right_index=True, how="left"
    )

# Neighbourhoods without any location get a count of zero
df_neigh_stats = df_neigh_stats.fillna(0).astype(
    {_count_name: int for _, _, _count_name in _count_sources}
)

# Population columns from the neighbourhood profiles, with short names
_demog = df_neigh_demog.set_index("AREA_NAME")[
    ["Population, 2016", "Youth (15-24 years)", "Working Age (25-54 years)"]
].rename(
    columns={
        "Population, 2016": "pop_2016",
        "Youth (15-24 years)": "youth_15_24",
        "Working Age (25-54 years)": "work_age_25_54",
    }
)
df_neigh_stats = (
    df_neigh_stats.merge(_demog, left_index=True, right_index=True, how="left")
    .add_prefix("neigh_")
    # The geometry column keeps its original, un-prefixed name
    .rename(columns={"neigh_geometry": "geometry"})
)
df_neigh_stats.columns = df_neigh_stats.columns.str.lower().str.replace("__", "_")
df_neigh_stats = df_neigh_stats.reset_index()
# Population figures are text with thousands separators; convert to numbers
for _pop_col in ["neigh_pop_2016", "neigh_youth_15_24", "neigh_work_age_25_54"]:
    df_neigh_stats[_pop_col] = (
        df_neigh_stats[_pop_col].str.replace(",", "").astype(float)
    )
df_neigh_stats.head()

In [None]:
# The merged neighbourhood stats must still be a GeoDataFrame (checked with
# isinstance rather than by comparing the class name as a string)
assert isinstance(df_neigh_stats, gpd.GeoDataFrame)
# Every station id must be unique
assert df_stations["station_id"].nunique() == len(df_stations)
df_stations.head(2)

### Merge Stations Metadata with Aggregated Neighbourhood Stats

Append the neighbourhood containing each bikeshare station to the station metadata

In [None]:
%%time
# Attach the containing neighbourhood to every bikeshare station, joined
# back on `station_id`
df_stations_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_stations,
    "lat",
    "lon",
    "station_id",
)
display(df_stations_new.head(2))

Merge the modified stations metadata with the neighbourhood stats

In [None]:
# Left-merge the per-neighbourhood stats onto the stations by neighbourhood
_neigh_lookup = df_neigh_stats.set_index("AREA_NAME")
_stations_by_area = df_stations_new.set_index("AREA_NAME")
df_stations_new = _stations_by_area.merge(
    _neigh_lookup, left_index=True, right_index=True, how="left"
)
df_stations_new = df_stations_new.reset_index().rename(
    columns={"Shape__Area": "Shape_Area"}
)
# Upper-case all column names
df_stations_new.columns = df_stations_new.columns.str.upper()

print(df_stations_new.shape)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_stations_new.head(4))
display(df_stations_new.dtypes.rename("dtype").to_frame())

## Database Administration

### Create bikeshare trips and station metadata databases

In [None]:
# Connect without selecting a database (the databases are created below)
conn = snowflake.connector.connect(**snowflake_dict_no_db)
cur = conn.cursor()

In [None]:
%%time
# WARNING: destructive - drops both databases (and their contents) if present
for db_name in [trips_db_name, stations_db_name]:
    _ = cur.execute(f"DROP DATABASE IF EXISTS {db_name}")

In [None]:
%%time
# Create the trips and stations metadata databases
for db_name in [trips_db_name, stations_db_name]:
    _ = cur.execute(f"CREATE DATABASE IF NOT EXISTS {db_name}")

In [None]:
%%time
# Show the entry for each of the two databases
for db_name in [trips_db_name, stations_db_name]:
    _ = show_sql_df(f"SHOW DATABASES LIKE '{db_name}'", cur, table_output=True)

In [None]:
%%time
# Show every database returned for this connection
_ = show_sql_df("SHOW DATABASES", cur, table_output=True)

In [None]:
# Close the cursor and the database-less connection
cur.close()
conn.close()

### Create bikeshare trips File Format

In [None]:
# Connect to the trips database
conn = snowflake.connector.connect(**snowflake_dict)
cur = conn.cursor()

In [None]:
%%time
# File format for comma-separated files with a single header row.
# Raw f-string: the backslashes in '\n' and '\\N' are sent to Snowflake
# exactly as written instead of being interpreted by Python.
query = fr"""
        CREATE OR REPLACE FILE FORMAT {trips_file_format_name}
        TYPE = 'CSV'
        COMPRESSION = 'AUTO'
        FIELD_DELIMITER = ','
        RECORD_DELIMITER = '\n'
        SKIP_HEADER = 1
        TRIM_SPACE = FALSE
        ERROR_ON_COLUMN_COUNT_MISMATCH = TRUE
        ESCAPE = 'NONE'
        DATE_FORMAT = 'AUTO'
        TIMESTAMP_FORMAT = 'AUTO'
        NULL_IF = ('\\N')
        """
_ = cur.execute(query)

### Create Internal Stage for bikeshare trips data

In [None]:
%%time
# Stage for the trips files, using the file format created above
query = f"""
        CREATE OR REPLACE STAGE {trips_stage_name}
        FILE_FORMAT = {trips_file_format_name}
        """
_ = cur.execute(query)

In [None]:
%%time
# List the stages as a table
query = """
        SHOW STAGES
        """
_ = show_sql_df(query, cur, True)

### Stage Local Raw Trips Data

In [None]:
%%time
# Upload every locally cached trips CSV to the stage, echoing each command
for file in glob("data/raw/*.csv"):
    query = f"""
            PUT file://{file} @{trips_stage_name}
            """
    print(query.strip())
    _ = cur.execute(query)

In [None]:
# List the files currently in the stage
query = f"""
        LIST @{trips_stage_name}/
        """
_ = show_sql_df(query, cur, True)

### Create bikeshare trips Table

In [None]:
%%time
# (Re)create the trips table.
# NOTE(review): the COPY INTO below names no columns, so this column order
# presumably has to match the column order of the staged CSVs - confirm
_ = cur.execute(
    f"""
    CREATE OR REPLACE TABLE {trips_table_name} (
        trip_id integer,
        trip_duration integer,
        start_station_id integer,
        start_time timestamp,
        start_station_name string,
        end_station_id integer,
        end_time timestamp,
        end_station_name string,
        bike_id integer,
        user_type string
    )
    """
)

In [None]:
# Show the tables whose name contains the trips table name
_ = show_sql_df(f"SHOW TABLES LIKE '%{trips_table_name}%'", cur, True)

In [None]:
# Show the columns of the trips table
_ = show_sql_df(f"SHOW COLUMNS IN TABLE {trips_table_name}", cur, True)

### Add Staged Trips Data to Trips Table

In [None]:
%%time
# Load all staged files into the trips table
query = f"""
        COPY INTO {trips_table_name} from @{trips_stage_name}
        """
_ = cur.execute(query)

In [None]:
%%time
# Preview five rows of the loaded trips table
query = f"""
        SELECT *
        FROM {trips_table_name}
        LIMIT 5
        """
_ = show_sql_df(query, cur, True)

In [None]:
%%time
# Count the rows loaded into the trips table (result kept for the check below)
query = f"""
        SELECT COUNT(*) AS num_rows
        FROM {trips_table_name}
        """
df_query_nrows_trips = show_sql_df(query, cur, True)

In [None]:
# The table must hold as many rows as were counted across the raw files
_expected_trip_rows = sum(
    _info["nrows"]
    for _file_summary in cols_dict_list
    for _info in _file_summary.values()
)
assert df_query_nrows_trips.loc[0, "num_rows"] == _expected_trip_rows

In [None]:
# Close the cursor and the trips database connection
cur.close()
conn.close()

### Create Stations Metadata Table

In [None]:
# Connect to the stations metadata database
conn = snowflake.connector.connect(**snowflake_station_stats_dict)
cur = conn.cursor()

In [None]:
%%time
# (Re)create the station stats table. The column names and their order are
# compared with the stations DataFrame (minus GEOMETRY) before loading.
_ = cur.execute(
    f"""
    CREATE OR REPLACE TABLE {station_stats_table_name} (
        area_name string,
        station_id integer,
        name string,
        physical_configuration string,
        lat float,
        lon float,
        altitude float,
        address string,
        capacity integer,
        physicalkey integer,
        transitcard integer,
        creditcard integer,
        phone integer,
        shape_area float,
        neigh_shape_area float,
        neigh_shape_length float,
        neigh_area_latitude float,
        neigh_area_longitude float,
        neigh_transit_stops integer,
        neigh_colleges_univs integer,
        neigh_cultural_attractions integer,
        neigh_places_of_interest integer,
        neigh_pop_2016 float,
        neigh_youth_15_24 float,
        neigh_work_age_25_54 float
    )
    """
)

In [None]:
# Show the tables whose name contains the station stats table name
_ = show_sql_df(f"SHOW TABLES LIKE '%{station_stats_table_name}%'", cur, True)

In [None]:
# Columns of the station stats table (kept for the checks below)
df_cols_stations_table = show_sql_df(
    f"SHOW COLUMNS IN TABLE {station_stats_table_name}", cur, True
)

In [None]:
# The table must have as many columns as the DataFrame without GEOMETRY ...
assert (
    len(df_cols_stations_table) == df_stations_new.drop(columns=["GEOMETRY"]).shape[1]
)
# ... and the column names must match, in the same order
assert pd.Series(
    df_stations_new.drop(columns=["GEOMETRY"]).columns.rename("column_name")
).equals(df_cols_stations_table["column_name"])

### Add Stations Metadata to Table

In [None]:
# Datatypes of the columns that will be written to the table
df_stations_new.drop(columns=["GEOMETRY"]).dtypes.rename("dtype").to_frame()

In [None]:
%%time
# Write the stations data (without GEOMETRY) to the upper-cased table name
success, nchunks, nrows, _ = write_pandas(
    conn, df_stations_new.drop(columns=['GEOMETRY']), station_stats_table_name.upper()
)

In [None]:
%%time
# Count the rows in the station stats table (result kept for the check below)
query = f"""
        SELECT COUNT(*) AS num_rows
        FROM {station_stats_table_name}
        """
df_query_nrows_stations = show_sql_df(query, cur, True)

In [None]:
assert success
# Compare the expected row count with both the number of rows written and the
# number of rows found in the table; report both on a mismatch so it is clear
# which of the two checks failed
_expected_rows = len(df_stations_new)
_queried_rows = df_query_nrows_stations.loc[0, "num_rows"]
if nrows == _expected_rows and _queried_rows == _expected_rows:
    print(f"Exported: {_expected_rows:,} rows, as expected")
else:
    print(
        f"Expected: {_expected_rows:,} rows\n"
        f"Actual: {nrows:,} rows written, {_queried_rows:,} rows in table"
    )

In [None]:
# Close the cursor and the stations metadata connection
cur.close()
conn.close()

## Query Data From Databases

In [None]:
# Reconnect to the trips database for the example queries
conn = snowflake.connector.connect(**snowflake_dict)
cur = conn.cursor()

In [None]:
%%time
# Ten earliest trips by start time.
# NOTE(review): the schema is hard-coded as `public` here, while the
# connection takes its schema from SNOWFLAKE_DB_SCHEMA - confirm they agree
query = f"""
        SELECT *
        FROM {trips_db_name}.public.{trips_table_name}
        ORDER BY start_time
        LIMIT 10
        """
_ = show_sql_df(query, cur, True)

In [None]:
%%time
# Ten trips, restricted to a subset of the columns
query = f"""
        SELECT trip_id,
               trip_duration,
               start_time,
               start_station_name,
               user_type
        FROM {trips_db_name}.public.{trips_table_name}
        LIMIT 10
        """
_ = show_sql_df(query, cur, True)

In [None]:
%%time
# Number of distinct trips per start station, year, month, day, hour and user
# type (first 10,000 groups returned)
query = f"""
        SELECT start_station_name AS station_name,
               DATE_PART(year, start_time) AS year,
               DATE_PART(month, start_time) AS month,
               DATE_PART(day, start_time) AS day,
               DATE_PART(hour, start_time) AS hour,
               user_type,
               COUNT(DISTINCT trip_id) AS num_trips
        FROM {trips_db_name}.public.{trips_table_name}
        GROUP BY 1,2,3,4,5,6
        LIMIT 10000
        """
_ = show_sql_df(query, cur, True)

In [None]:
# Close the cursor and the connection used for the example queries
cur.close()
conn.close()

## AWS QuickSight Data Source

In [None]:
# ARN of the first listed QuickSight user (default namespace) whose user name
# starts with "els"; an IndexError is raised when no such user is listed
user_arn = [
    u["Arn"]
    for u in qs_client_user.list_users(AwsAccountId=account_id, Namespace="default")[
        "UserList"
    ]
    if u["UserName"].startswith("els")
][0]
user_arn

In [None]:
%%time
# Register the trips database as a QuickSight data source of type SNOWFLAKE.
# The host is built from SNOWFLAKE_ACCOUNT, the credentials come from the
# environment, and the listed data-source actions are granted to `user_arn`.
dso_creation_response = qs_client.create_data_source(
    AwsAccountId=account_id,
    DataSourceId=f"snowflake-{trips_db_name}",
    Name=trips_db_name,
    Type='SNOWFLAKE',
    DataSourceParameters={
        'SnowflakeParameters': {
            'Host': os.getenv("SNOWFLAKE_ACCOUNT")+".snowflakecomputing.com",
            'Database': trips_db_name.upper(),
            'Warehouse': os.getenv("SNOWFLAKE_WAREHOUSE"),
        },
    },
    Credentials={
        'CredentialPair': {
            'Username': os.getenv("SNOWFLAKE_USER"),
            'Password': os.getenv("SNOWFLAKE_PASS"),
        },
    },
    Permissions= [
      {
        'Principal': user_arn,
        'Actions': [
          'quicksight:DescribeDataSource',
          'quicksight:DescribeDataSourcePermissions',
          'quicksight:UpdateDataSource',
          'quicksight:UpdateDataSourcePermissions',
          'quicksight:DeleteDataSource',
          'quicksight:PassDataSource'
        ]
      }
    ],
)
dso_creation_response