# Get Data

In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import configparser
import os
import urllib
from io import BytesIO
from multiprocessing import cpu_count
from typing import Dict, List
from urllib.request import urlopen
from zipfile import ZipFile

import geopandas as gpd
import pandas as pd
import requests
from joblib import Parallel, delayed
from sqlalchemy import create_engine

In [None]:
%aimport src.utils
from src.utils import summarize_df

In [None]:
# Datasets
# # Open Data Portal (CKAN `package_show` endpoint; every dataset below is
# # looked up through this URL using its package id)
url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"
# # Ridership
# # NOTE: `params` is rebound by later cells (cultural hotspots, public transit),
# # so it only holds the ridership package id until those cells are run
params = {"id": "7e876c24-177c-4605-9cef-e50dd74c617f"}
# # Stations Metadata
about_params = {"id": "2b44db0d-eea9-442d-b038-79335368ad5a"}

# Name of database table to store merged hourly aggregated data
table_name = "ridership"

# Ridership dtypes dict (keys are the raw CSV column names, passed to
# `pd.read_csv(dtype=...)` before the columns are renamed)
# "End Station Id" and "Bike Id" are read as float - presumably because they can
# contain missing values; verify against the raw files
dtypes_dict = {
    "Trip Duration": int,
    "Start Station Id": int,
    "End Station Id": float,
    "Bike Id": float,
}

# Neighbourhood GeoData columns to keep when getting neighbourhood
# containing a location
geo_cols = ["AREA_NAME", "geometry", "Shape__Area"]

In [None]:
config = configparser.ConfigParser()
config.read("../sql.ini")
default_cfg = config["default"]

In [None]:
DB_TYPE = default_cfg["DB_TYPE"]
DB_DRIVER = default_cfg["DB_DRIVER"]
DB_USER = default_cfg["DB_USER"]
DB_PASS = default_cfg["DB_PASS"]
DB_HOST = default_cfg["DB_HOST"]
DB_PORT = default_cfg["DB_PORT"]
DB_NAME = "bikeshare"

In [None]:
# Server-level URI with no database selected (required to create the database)
# NOTE(review): DB_USER / DB_PASS are interpolated as-is; credentials containing
# characters such as "@" or "/" would presumably need URL-encoding - confirm
URI_NO_DB = f"{DB_TYPE}+{DB_DRIVER}://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}"

# URI of the single `DB_NAME` database (required to perform CRUD operations and
# submit queries)
URI = f"{DB_TYPE}+{DB_DRIVER}://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

In [None]:
def get_file_urls(main_dataset_url: str, dataset_params: Dict) -> List:
    """Return the URLs of the ridership files for January-October 2021.

    The package description is requested from `main_dataset_url` with
    `dataset_params`, and a resource is kept when its `name` contains one of
    the wanted `YYYY-MM` strings.
    """
    response = requests.get(main_dataset_url, params=dataset_params)
    resource_records = response.json()["result"]["resources"]
    df_resources = pd.DataFrame.from_records(resource_records)

    # "2021-01" ... "2021-10", joined into a single alternation pattern
    wanted_periods = []
    for month in range(1, 10 + 1):
        wanted_periods.append(f"2021-{month:02d}")
    wanted_pattern = "|".join(wanted_periods)

    is_wanted = df_resources["name"].str.contains(wanted_pattern)
    return df_resources.loc[is_wanted, "url"].tolist()


def read_data(url: str, dtypes_dict: Dict) -> pd.DataFrame:
    """Read one ridership CSV and normalise its column names.

    The file is decoded as cp1252, `Start Time` / `End Time` are parsed as
    datetimes and `dtypes_dict` is applied to the raw column names. The first
    column is then renamed to `Trip Id` (presumably to discard stray leading
    characters in the header - verify against the raw files) and every column
    name is converted to lower snake case.
    """
    raw = pd.read_csv(
        url,
        dtype=dtypes_dict,
        parse_dates=["Start Time", "End Time"],
        encoding="cp1252",
    )
    first_column = raw.columns[0]
    trips = raw.rename(columns={first_column: "Trip Id"})
    # Collapse double spaces first so they do not become double underscores
    single_spaced = trips.columns.str.replace("  ", " ")
    trips.columns = single_spaced.str.replace(" ", "_").str.lower()
    return trips


def load(df: pd.DataFrame, table_name: str, uri: str) -> None:
    """Append `df` to the database table `table_name`.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to append (the index is not written).
    table_name : str
        Name of the destination table.
    uri : str
        Connection URI passed to `create_engine`.
    """
    # Use the `uri` and `df` arguments rather than the notebook globals
    engine = create_engine(uri)
    try:
        # The context manager closes the connection even if the write fails
        with engine.connect() as conn:
            df.to_sql(table_name, index=False, con=conn, if_exists="append")
    finally:
        engine.dispose()


def get_single_ridership_data_file(url: str, dtypes_dict: Dict) -> pd.DataFrame:
    """Load a single ridership file with `read_data`, printing progress."""
    # Only the final path component of the URL is shown in the progress message
    _, fname = os.path.split(url)
    print(f"Loading data from {fname}...", end="")
    ridership = read_data(url, dtypes_dict)
    print("Done.")
    return ridership


def get_all_data_files(urls_list: List, dtypes_dict: Dict) -> pd.DataFrame:
    """Load every ridership file in parallel and stack them into one frame.

    One job per URL is dispatched to a `multiprocessing` pool with one worker
    per CPU; the per-file frames are concatenated with a fresh index.
    """
    n_workers = cpu_count()
    jobs = [
        delayed(get_single_ridership_data_file)(file_url, dtypes_dict)
        for file_url in urls_list
    ]
    frames = Parallel(n_jobs=n_workers, backend="multiprocessing")(jobs)
    return pd.concat(frames, ignore_index=True)


def get_stations_metadata(stations_url: str, stations_params: Dict) -> pd.DataFrame:
    """Download the bikeshare stations metadata.

    Parameters
    ----------
    stations_url : str
        Open data portal `package_show` endpoint.
    stations_params : Dict
        Query parameters identifying the stations package.

    Returns
    -------
    pd.DataFrame
        One row per record under `data.stations` of the selected feed.
    """
    # Use the `stations_params` argument rather than the global `about_params`
    package = requests.get(stations_url, params=stations_params).json()
    resources = package["result"]["resources"]
    df_about = pd.DataFrame.from_records(resources)
    # The first resource URL returns the list of available feeds
    r = requests.get(df_about["url"].tolist()[0]).json()
    # NOTE(review): the feed is selected by position; index 2 is assumed to be
    # the station information feed - TODO confirm it cannot be reordered
    url_stations = r["data"]["en"]["feeds"][2]["url"]
    df_stations = pd.DataFrame.from_records(
        requests.get(url_stations).json()["data"]["stations"]
    )
    return df_stations


def transform_metadata(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the stations metadata and one-hot encode the rental methods.

    `station_id` is cast to int, the `groups` and `rental_methods` columns are
    dropped, and four 0/1 columns (`physicalkey`, `transitcard`, `creditcard`,
    `phone`) are appended, each indicating whether the corresponding method
    appears in the station's `rental_methods` list.

    The input frame is not modified.
    """
    rental_method_names = ["KEY", "TRANSITCARD", "CREDITCARD", "PHONE"]

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    df = df.assign(station_id=df["station_id"].astype(int))

    # Test membership rather than list position: a station offering fewer than
    # four methods (or listing them in a different order) is still flagged
    # correctly. Non-list values (e.g. missing) count as "no methods".
    offered_methods = [
        set(methods) if isinstance(methods, (list, tuple, set)) else set()
        for methods in df["rental_methods"]
    ]
    flags = pd.DataFrame(
        {
            name.lower(): [int(name in offered) for offered in offered_methods]
            for name in rental_method_names
        },
        # Share the input index so the concat below aligns row-for-row
        index=df.index,
    ).astype(int)

    df = pd.concat(
        [
            df.drop(columns=["groups", "rental_methods"]),
            flags,
        ],
        axis=1,
    ).rename(columns={"key": "physicalkey"})
    return df


def get_toronto_open_data(url, params, col_rename_dict=None):
    """Fetch the first datastore-active resource of a package as a DataFrame.

    Parameters
    ----------
    url : str
        Open data portal `package_show` endpoint.
    params : dict
        Query parameters identifying the package.
    col_rename_dict : dict, optional
        Column renames applied to the result; nothing is renamed when omitted
        or empty.

    Returns
    -------
    pd.DataFrame
        Records returned by `datastore_search` for the selected resource.

    Raises
    ------
    ValueError
        If the package has no datastore-active resource.

    Notes
    -----
    NOTE(review): `datastore_search` is called without a `limit`, so only the
    first page of records is returned (CKAN's default page size is presumably
    100) - confirm this is sufficient for the datasets requested.
    """
    package = requests.get(url, params=params).json()
    datastore_url = (
        "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/"
        "action/datastore_search"
    )
    df = None
    for resource in package["result"]["resources"]:
        if resource["datastore_active"]:
            data = requests.get(datastore_url, params={"id": resource["id"]}).json()
            df = pd.DataFrame(data["result"]["records"])
            break
    if df is None:
        # Previously this surfaced as an UnboundLocalError further down
        raise ValueError(f"No datastore-active resource found for {params}")
    if col_rename_dict:
        df = df.rename(columns=col_rename_dict)
    return df


def get_lat_long(row):
    """Return the value stored under the `coordinates` key of `row`."""
    coordinates = row["coordinates"]
    return coordinates


def get_poi_data(url: str, params: Dict) -> pd.DataFrame:
    """Download the places-of-interest dataset and extract point coordinates.

    Parameters
    ----------
    url : str
        Open data portal `package_show` endpoint.
    params : Dict
        Query parameters identifying the places-of-interest package.

    Returns
    -------
    pd.DataFrame
        The raw CSV contents plus `POI_LONGITUDE` and `POI_LATITUDE` columns
        taken from the `coordinates` entry of each `geometry` value.

    Raises
    ------
    AssertionError
        If the file does not contain exactly 175 rows, or if any `NAME` is
        duplicated.
    """
    import ast

    # Use the `params` argument rather than the global `poi_params`
    package = requests.get(url, params=params).json()
    poi_url = package["result"]["resources"][0]["url"]
    df = pd.read_csv(poi_url)
    assert len(df) == 175
    # SECURITY: the geometry strings are downloaded text, so they are parsed
    # with ast.literal_eval (literals only) instead of eval, which would
    # execute arbitrary code embedded in the file
    coordinates = (
        df["geometry"].apply(ast.literal_eval).apply(get_lat_long).tolist()
    )
    # Each coordinate pair is assigned as (longitude, latitude)
    df[["POI_LONGITUDE", "POI_LATITUDE"]] = pd.DataFrame(coordinates)
    # Verify no duplicates (by name) are in the data
    assert df[df.duplicated(subset=["NAME"], keep=False)].empty
    return df


def get_cultural_hotspots(url: str, params: Dict) -> pd.DataFrame:
    """Download the cultural hotspots shapefile and return its key columns.

    The zip archive of the package's first resource is extracted under
    `data/raw/`, the shapefile is read, duplicates are removed (first by name
    and location, then by name alone) and the id, name, latitude and longitude
    columns are returned under the names `ID`, `NAME`, `POI_LATITUDE` and
    `POI_LONGITUDE`.
    """
    extract_dir = "data/raw/cultural-hotspot-points-of-interest-wgs84"
    resources = requests.get(url, params=params).json()["result"]["resources"]
    archive_url = resources[0]["url"]

    # Download the whole archive into memory, then unpack it to disk
    with urlopen(archive_url) as response:
        archive_bytes = BytesIO(response.read())
    with ZipFile(archive_bytes) as archive:
        archive.extractall(extract_dir)

    hotspots = gpd.read_file(f"{extract_dir}/CULTURAL_HOTSPOT_WGS84.shp")
    dedup_passes = (
        ["PNT_OF_INT", "LATITUDE", "LONGITUDE"],
        ["PNT_OF_INT"],
    )
    for dedup_keys in dedup_passes:
        hotspots = (
            hotspots.drop_duplicates(subset=dedup_keys, keep="first")
            .reset_index(drop=True)
            .copy()
        )
    assert hotspots[hotspots.duplicated(subset=["PNT_OF_INT"], keep=False)].empty

    new_names = {
        "RID": "ID",
        "PNT_OF_INT": "NAME",
        "LATITUDE": "POI_LATITUDE",
        "LONGITUDE": "POI_LONGITUDE",
    }
    return hotspots[list(new_names)].rename(columns=new_names)


def get_neighbourhood_boundary_land_area_data(url: str, params: Dict) -> pd.DataFrame:
    """Download the neighbourhood boundaries and add centroid coordinates.

    Parameters
    ----------
    url : str
        Open data portal `package_show` endpoint.
    params : Dict
        Query parameters identifying the neighbourhoods package.

    Returns
    -------
    pd.DataFrame
        The boundary geodata read by `gpd.read_file`, with added `centroid`,
        `AREA_LATITUDE` and `AREA_LONGITUDE` columns.

    Raises
    ------
    AssertionError
        If the file does not contain exactly 140 neighbourhoods.
    """
    package = requests.get(url, params=params).json()
    # Rewrite the resource URL so that it serves GeoJSON in EPSG 4326
    n_url = (
        package["result"]["resources"][0]["url"].replace(
            "datastore/dump", "download_resource"
        )
        + "?format=geojson&projection=4326"
    )
    gdf = gpd.read_file(n_url)
    # Compute centroids in a projected CRS (EPSG 3395), then convert the
    # resulting points back to EPSG 4326 to read them as longitude / latitude
    gdf["centroid"] = gdf["geometry"].to_crs(epsg=3395).centroid.to_crs(epsg=4326)
    gdf["AREA_LATITUDE"] = gdf["centroid"].y
    gdf["AREA_LONGITUDE"] = gdf["centroid"].x
    assert len(gdf) == 140
    return gdf


def get_public_transit_locations(url: str, params: Dict) -> pd.DataFrame:
    """Download the transit schedules archive and return its stops.

    The zip archive of the package's first resource is extracted under
    `data/raw/`, `stops.txt` is read, its first rows are displayed, and the
    stop coordinate columns are renamed to `lat` / `lon`.
    """
    extract_dir = "data/raw/opendata_ttc_schedules"
    resources = requests.get(url, params=params).json()["result"]["resources"]
    archive_url = resources[0]["url"]

    # Download the whole archive into memory, then unpack it to disk
    with urlopen(archive_url) as response:
        archive_bytes = BytesIO(response.read())
    with ZipFile(archive_bytes) as archive:
        archive.extractall(extract_dir)

    stops = pd.read_csv(f"{extract_dir}/stops.txt")
    display(stops.head())
    return stops.rename(columns={"stop_lat": "lat", "stop_lon": "lon"})


def get_coll_univ_locations() -> pd.DataFrame:
    """Return hand-entered coordinates of colleges and universities.

    Returns
    -------
    pd.DataFrame
        One row per institution with columns `institution_id` (0-based row
        number), `institution_name`, `lat` and `lon`.
    """
    coll_univ_locations = {
        "centennial": {"lat": 43.7854, "lon": -79.22664},
        "george-brown": {"lat": 43.6761, "lon": -79.4111},
        "humber": {"lat": 43.7290, "lon": -79.6074},
        "ocad": {"lat": 43.6530, "lon": -79.3912},
        "ryerson": {"lat": 43.6577, "lon": -79.3788},
        "seneca": {"lat": 43.7955, "lon": -79.3496},
        "tynedale": {"lat": 43.7970, "lon": -79.3945},
        "uoft-scarborough": {"lat": 43.7844, "lon": -79.1851},
        # Longitude was previously a copy of the yorku value (-79.5019), which
        # placed the St. George campus far to the west of its actual location
        "uoft": {"lat": 43.6629, "lon": -79.3957},
        "yorku": {"lat": 43.7735, "lon": -79.5019},
        "yorku-glendon": {"lat": 43.7279, "lon": -79.3780},
    }
    df_coll_univ = (
        pd.DataFrame.from_dict(coll_univ_locations, orient="index")
        # First reset: dict keys become the institution_name column
        .reset_index()
        .rename(columns={"index": "institution_name"})
        # Second reset: the new 0..n-1 index becomes the institution_id column
        .reset_index()
        .rename(columns={"index": "institution_id"})
    )
    return df_coll_univ


def get_neighbourhood_profile_data(url: str, params: Dict) -> pd.DataFrame:
    """Return selected neighbourhood profile characteristics, one row per area.

    The profile table is fetched with `get_toronto_open_data`, reduced to four
    characteristics (neighbourhood number, youth, working age and 2016
    population) and transposed so that each row is one area. An `AREA_NAME`
    column of the form `<name> (<neighbourhood number>)` is added.
    """
    wanted_characteristics = [
        "Neighbourhood Number",
        "Youth (15-24 years)",
        "Working Age (25-54 years)",
        "Population, 2016",
    ]
    profiles = get_toronto_open_data(url, params)
    is_wanted = profiles["Characteristic"].isin(wanted_characteristics)
    # Keep the columns from position 4 onwards, which include "Characteristic"
    by_characteristic = profiles[is_wanted].iloc[:, 4:].set_index("Characteristic")
    by_area = by_characteristic.T.reset_index()
    # Skip the first transposed row (presumably the city-wide total - verify)
    by_area = by_area.iloc[1:].reset_index(drop=True)
    by_area = by_area.rename(columns={"index": "name"})
    assert len(by_area) == 140
    area_number = by_area["Neighbourhood Number"]
    by_area["AREA_NAME"] = by_area["name"] + " (" + area_number + ")"
    return by_area


def get_neighbourhood_containing_point(
    gdf: gpd.GeoDataFrame,
    df: pd.DataFrame,
    lat: str = "Latitude",
    lon: str = "Longitude",
    crs: int = 4326,
) -> gpd.GeoDataFrame:
    """Spatially join each point in `df` to the polygon of `gdf` containing it.

    Points are built from the `lon` / `lat` columns of `df` in the given `crs`.
    The result holds the columns of `df` followed by the columns of `gdf`.
    """
    # Capture the column order before `df` is wrapped in a GeoDataFrame
    ordered_cols = list(df) + list(gdf)
    point_geometry = gpd.points_from_xy(df[lon], df[lat])
    points = gpd.GeoDataFrame(df, geometry=point_geometry, crs=crs)
    joined = gpd.sjoin(gdf, points, predicate="contains")
    joined = joined.reset_index(drop=True)
    joined = joined.drop(columns=["index_right"])
    return joined[ordered_cols]


def get_data_with_neighbourhood(
    gdf: gpd.GeoDataFrame,
    df: pd.DataFrame,
    lat: str,
    lon: str,
    col_to_join: str,
    crs: int = 4326,
) -> gpd.GeoDataFrame:
    """Append the containing neighbourhood to each location in `df`.

    Parameters
    ----------
    gdf : gpd.GeoDataFrame
        Neighbourhood polygons with `AREA_NAME`, `geometry` and `Shape__Area`.
    df : pd.DataFrame
        Locations to look up.
    lat, lon : str
        Names of the latitude and longitude columns of `df`.
    col_to_join : str
        Column of `df` identifying each location, used as the merge key.
    crs : int
        EPSG code of the location coordinates.

    Returns
    -------
    `df` with `AREA_NAME` and `Shape__Area` appended; locations that fall in
    no neighbourhood are dropped.
    """
    cols_to_keep = [col_to_join, "AREA_NAME", "geometry", "Shape__Area"]
    df_check = get_neighbourhood_containing_point(gdf, df, lat, lon, crs)[cols_to_keep]
    display(df_check.head(2))
    # `df` only carries a geometry column if building the GeoDataFrame in the
    # helper modified it in place, so tolerate its absence when dropping it
    df = df.merge(df_check.drop(columns=["geometry"]), on=col_to_join, how="left").drop(
        columns=["geometry"], errors="ignore"
    )
    # Count the missing rows themselves; len() of the one-element Series
    # returned by DataFrame.isna().sum() was always 1
    num_missing = int(df["AREA_NAME"].isna().sum())
    print(f"Dropped {num_missing} rows with a missing AREA_NAME")
    df = df.dropna(subset=["AREA_NAME"])
    return df

## Get Bikeshare Data

### Get List of Ridership URLs from Open Data Platform

In [None]:
%%time
all_urls = get_file_urls(url, params)

### Retrieve Ridership Data

In [None]:
%%time
df = get_all_data_files(all_urls, dtypes_dict)
df.head(4)

In [None]:
# Extract attributes
# # Trip duration (in seconds)
# # `.dt.total_seconds()` is used instead of `.dt.seconds`: the latter returns
# # only the seconds component (0-86399) of each timedelta and silently drops
# # whole days, so trips of 24 hours or more got a wrong duration.
# # NOTE: the resulting column is float rather than int
df["duration"] = (df["end_time"] - df["start_time"]).dt.total_seconds()
# # Datetime attributes (computed for both the start and the end of each trip)
for trip_point in ["start", "end"]:
    df[f"{trip_point}_year"] = df[f"{trip_point}_time"].dt.year
    df[f"{trip_point}_month"] = df[f"{trip_point}_time"].dt.month
    df[f"{trip_point}_day"] = df[f"{trip_point}_time"].dt.day
    df[f"{trip_point}_hour"] = df[f"{trip_point}_time"].dt.hour
    df[f"{trip_point}_minute"] = df[f"{trip_point}_time"].dt.minute
    df[f"{trip_point}_quarter"] = df[f"{trip_point}_time"].dt.quarter
display(df)
summarize_df(df)

### Drop Rows with Missing Values and Duplicates from Ridership Data

In [None]:
# Columns in which to drop missing values
nan_cols = [
    "start_station_id",
    "end_station_id",
    "start_station_name",
    "end_station_name",
]

# Columns with duplicates, in which to drop rows
duplicated_cols = ["trip_id", "start_time", "end_time"]

In [None]:
%%time
# Drop rows with missing station info once and reuse the result; the ridership
# frame is large, so repeating the same dropna three times is wasteful
df_not_missing = df.dropna(subset=nan_cols)
dups_to_drop = df_not_missing[
    df_not_missing.duplicated(subset=duplicated_cols, keep="first")
]
not_missing = len(df_not_missing)
# Release the temporary copy so it does not linger in the notebook namespace
del df_not_missing
# NOTE: `frac_to_drop` and `duplicates_to_drop` are percentages of all rows
d_nan = {
    "all": len(df),
    "non_missing": not_missing,
    "frac_to_drop": ((len(df) - not_missing) / len(df)) * 100,
    "duplicates_to_drop": (len(dups_to_drop) / len(df)) * 100,
}
df_nan = pd.DataFrame.from_dict(d_nan, orient="index").T
summarize_df(df)
df_nan

In [None]:
%%time
df = df.dropna(subset=nan_cols).drop_duplicates(subset=duplicated_cols, keep="first")
summarize_df(df)

### Get Stations Metadata

In [None]:
%%time
df_stations = get_stations_metadata(url, about_params)
df_stations = transform_metadata(df_stations)
df_stations.head(2)

In [None]:
summarize_df(df_stations)

## Get Supplementary Datasets

### Cultural Hotspots

In [None]:
%%time
params = {"id": "c7be2ee7-d317-4a28-8cbe-bff1ce116b46"}
dfch_essentials = get_cultural_hotspots(url, params)
dfch_essentials.head(2)

### Places of Interest

In [None]:
%%time
poi_params = {"id": "965247c0-c72e-49b4-bb1a-879cf98e1a32"}
df_poi = get_poi_data(url, poi_params)

Note that duplicate lat-long pairs will be permitted here, as multiple places of interest may share the same physical location or an immediately adjacent area. Places of interest with a duplicated latitude and longitude are shown below.

In [None]:
display(
    df_poi[df_poi.duplicated(subset=["POI_LATITUDE", "POI_LONGITUDE"], keep=False)][
        ["ID", "NAME", "POI_LATITUDE", "POI_LONGITUDE"]
    ]
    .sort_values(by=["POI_LATITUDE", "POI_LONGITUDE"])
    .style.set_caption("Duplicates of Latitude-Longitude")
)

These duplicated lat-long locations are shown below to be different points of interest based at the same site
- `ID`=40, `ID`=42
  - Enercare Centre and Exhibition Place are at the same site
- 57, 171
  - York Quay Centre [is at](https://www.museumsontario.ca/museum/York-Quay-Centre-at-Harbourfro) the Harbourfront Centre
- 66, 70
  - both places are based at the Ferry Terminal, so can correctly have the same lat-long
- 68, 124, 154
  - the Brewery and the Toronto Railway Museum are based at Roundhouse Park
- 24, 54
  - [Glenn Gould Studio](https://www.cbc.ca/glenngouldstudio/) is based at the CBC Museum
- 157, 160, 162
  - the [Tourist Information Centre](https://www.toronto.ca/explore-enjoy/visitor-services/tourist-information-centres/) is at the same site as the [Traveller's Aid Society](http://travellersaid.ca/contact.html) and [Union Station](https://torontounion.ca/contact/)
- 67, 145
  - a tourist information centre that is also based at Nathan Phillips Square
- 8, 167
  - [Ashbridges Bay Park](https://www.toronto.ca/data/parks/prd/facilities/complex/1/index.html) is along [Woodbine Beach](https://www.toronto.ca/data/parks/prd/facilities/complex/311/index.html)
- 75, 111
  - Koerner Hall is at the Royal Conservatory of Music
- 73, 74
  - [Kew Balmy Beach](https://www.tripadvisor.ca/Attraction_Review-g155019-d14788092-Reviews-Kew_Balmy_Beach-Toronto_Ontario.html#MAPVIEW-14788092) is at the same site as [Kew Gardens Park](https://www.toronto.ca/data/parks/prd/facilities/complex/107/index.html)
- 93, 141
  - both locations are at Todmorden Mills Park
- 9, 21
  - the Canadian Museum of Cultural Heritage of Indo-Canadians is based at the site of BAPS Shri Swaminarayan Mandir ([link](https://www.baps.org/cultureandheritage/ExperienceIndia/Exhibitions/CanadianMuseumofCulturalHeritageofIndo-Canadians.aspx))

So, the duplicate lat-long sites will be retained in this dataset.

### Neighbourhood Boundary and Land Area Data

In [None]:
%%time
neigh_params = {"id": "4def3f65-2a65-4a4f-83c4-b2a4aed72d46"}
gdf = get_neighbourhood_boundary_land_area_data(url, neigh_params)

Print the data for a few neighbourhoods

In [None]:
neigh_cols_to_show = [
    "AREA_ID",
    "AREA_SHORT_CODE",
    "AREA_LONG_CODE",
    "AREA_NAME",
    "Shape__Area",
    "LATITUDE",
    "AREA_LATITUDE",
    "LONGITUDE",
    "AREA_LONGITUDE",
]
gdf[
    gdf["AREA_NAME"].str.contains(
        "Wychwood|Yonge-Eglinton|Yonge-St.|York Univ|Yorkdale-Glen"
    )
][neigh_cols_to_show].sort_values(by=["AREA_NAME"])

In order to use the correct CRS for allowing an area calculation in square km, we'll get the current EPSG ([link](https://epsg.io/4326)) from the geodata

In [None]:
print(gdf.crs)

Fix typographic errors in the name of the neighbourhood in this dataset
- [North St. James Town](https://www.toronto.ca/ext/sdfa/Neighbourhood%20Profiles/pdf/2016/pdf1/cpa74.pdf) and [Cabbagetown-South St. James Town](https://www.toronto.com/community-static/4550668-cabbagetown-south-st-james-town/)
  - missing space between ...St. and Ja...
- Weston-Pelham Park
  - incorrectly listed as its old name (from 2011) of Weston-Pellam Park ([link](https://www.toronto.ca/wp-content/uploads/2017/11/900b-91-Weston-Pellam-Park.pdf))
  - replace with [new name from 2016](https://www.toronto.ca/ext/sdfa/Neighbourhood%20Profiles/pdf/2016/pdf1/cpa91.pdf)

In [None]:
d_renaming = {
    "St.James": "St. James",
    "Weston-Pellam": "Weston-Pelham",
}
for k, v in d_renaming.items():
    gdf["AREA_NAME"] = gdf["AREA_NAME"].str.replace(k, v, regex=False)

The incorrect names have been successfully replaced as shown below

In [None]:
# Neighbourhood GeoData columns to use
geo_cols = ["AREA_NAME", "geometry", "Shape__Area"]

In [None]:
gdf.query("AREA_NAME.str.contains('James Town|Weston-|Cabbage')")[geo_cols]

Compare manually computed neighbourhood areas to the provided ones (in square km)
- first, change the geodata projection to a Cartesian system (EPSG = 3857, in units of m) ([1](https://epsg.io/3857))

In [None]:
area_diff = (gdf["geometry"].to_crs(epsg=3857).area) - gdf["Shape__Area"]
print(area_diff.min(), area_diff.max())

Since these are small differences (in units of square km), we'll use the provided neighbourhood areas from the `Shape__Area` column of the neighbourhood boundary file.

### Public Transit Locations

In [None]:
%%time
params = {"id": "7795b45e-e65a-4465-81fc-c36b9dfff169"}
df_pt_slice = get_public_transit_locations(url, params)

### Colleges and Universities

In [None]:
df_coll_univ = get_coll_univ_locations()

### Neighbourhood Profile Data - Population

In [None]:
%%time
neigh_profile_params = {"id": "6e19a90f-971c-46b3-852c-0c48c436d1fc"}
df_neigh_demog = get_neighbourhood_profile_data(url, neigh_profile_params)
df_neigh_demog.head(6)

### Number of Locations Per Neighbourhood

#### Places of Interest

In [None]:
print(df_poi["ID"].nunique(), len(df_poi))
df_poi.head(2)

In [None]:
%%time
df_poi_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_poi.rename(columns={"POI_LATITUDE": "lat", "POI_LONGITUDE": "lon",})[
        ["ID", "NAME", "lat", "lon"]
    ],
    "lat",
    "lon",
    "ID",
)
display(df_poi_new.head(2))

#### Cultural Hotspots

In [None]:
print(dfch_essentials["ID"].nunique(), len(dfch_essentials))
dfch_essentials.head(2)

In [None]:
%%time
dfch_essentials_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    dfch_essentials.rename(columns={"POI_LATITUDE": "lat", "POI_LONGITUDE": "lon",})[
        ["ID", "NAME", "lat", "lon"]
    ],
    "lat",
    "lon",
    "ID",
)
display(dfch_essentials_new.head(2))

#### Colleges and Universities

In [None]:
print(df_coll_univ["institution_id"].nunique(), len(df_coll_univ))
df_coll_univ.head(2)

In [None]:
%%time
df_coll_univ_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_coll_univ,
    "lat",
    "lon",
    "institution_id",
)
display(df_coll_univ_new.head(2))

#### Public Transit Locations

In [None]:
print(df_pt_slice["stop_id"].nunique(), len(df_pt_slice))
df_pt_slice.head(2)

In [None]:
%%time
df_pt_slice_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_pt_slice,
    "lat",
    "lon",
    "stop_id",
)
display(df_pt_slice_new.head(2))

#### Merge Neighbourhood Aggregations with GeoData and Population Data

In [None]:
df_neigh_stats = (
    (
        gdf.set_index("AREA_NAME")[
            [
                "Shape__Area",
                "Shape__Length",
                "geometry",
                "CLASSIFICATION",
                "CLASSIFICATION_CODE",
                "AREA_LATITUDE",
                "AREA_LONGITUDE",
            ]
        ]
        .merge(
            df_pt_slice_new.groupby("AREA_NAME")["stop_id"]
            .count()
            .rename("transit_stops")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .merge(
            df_coll_univ_new.groupby("AREA_NAME")["institution_id"]
            .count()
            .rename("colleges_univs")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .merge(
            dfch_essentials_new.groupby("AREA_NAME")["ID"]
            .count()
            .rename("cultural_attractions")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .merge(
            df_poi_new.groupby("AREA_NAME")["ID"]
            .count()
            .rename("places_of_interest")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .fillna(0)
        .astype(
            {
                k: int
                for k in [
                    "transit_stops",
                    "colleges_univs",
                    "cultural_attractions",
                    "places_of_interest",
                ]
            }
        )
        .merge(
            df_neigh_demog.set_index("AREA_NAME")[
                ["Population, 2016", "Youth (15-24 years)", "Working Age (25-54 years)"]
            ].rename(
                columns={
                    "Population, 2016": "pop_2016",
                    "Youth (15-24 years)": "youth_15_24",
                    "Working Age (25-54 years)": "work_age_25_54",
                }
            ),
            left_index=True,
            right_index=True,
            how="left",
        )
    )
    .add_prefix("neigh_")
    .rename(columns={"neigh_geometry": "geometry"})
)
df_neigh_stats.columns = df_neigh_stats.columns.str.lower().str.replace("__", "_")
df_neigh_stats = df_neigh_stats.reset_index()
df_neigh_stats.head()

In [None]:
type(df_neigh_stats)

### Merge Stations Metadata with Aggregated Neighbourhood Stats

Append the neighbourhood containing each bikeshare station to the station metadata

In [None]:
print(df_stations["station_id"].nunique(), len(df_stations))
df_stations.head(2)

In [None]:
%%time
df_stations_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_stations[
        ["station_id", "name", "physical_configuration", "lat", "lon", "altitude", "address", "capacity", "physicalkey", "transitcard", "creditcard", "phone"]
    ],
    "lat",
    "lon",
    "station_id",
)
display(df_stations_new.head(2))

Merge the modified stations metadata with the neighbourhood stats

In [None]:
df_stations_new = (
    df_stations_new.set_index("AREA_NAME")
    .merge(
        df_neigh_stats.set_index("AREA_NAME"),
        left_index=True,
        right_index=True,
        how="left",
    )
    .reset_index()
)
df_stations_new.head(4)

## Merge Modified Stations Metadata With Ridership Data

In [None]:
def get_aggregated_station_hourly_trips(
    df: pd.DataFrame, cols: List, trip_point: str = "start"
) -> pd.DataFrame:
    """Aggregate trips per station, time bucket and user type.

    Trips are grouped by the `cols` attributes of the chosen `trip_point`
    (`"start"` or `"end"`) plus `user_type`. Each group gets its trip count and
    the min / median / mean / max of `duration`. The grouping columns are
    returned without the `trip_point` prefix, a `station_type` column records
    the trip point, and rows are sorted by descending trip count.
    """
    prefixed_keys = [f"{trip_point}_{c}" for c in cols]
    grouped = df.groupby(prefixed_keys + ["user_type"], as_index=False)
    summary = grouped.agg(
        num_trips=("trip_id", "count"),
        duration_min=("duration", "min"),
        duration_median=("duration", "median"),
        duration_mean=("duration", "mean"),
        duration_max=("duration", "max"),
    )
    # Strip the start_/end_ prefix so both trip points share column names
    summary = summary.rename(columns=dict(zip(prefixed_keys, cols)))
    summary["station_type"] = trip_point
    return summary.sort_values(by="num_trips", ascending=False)

In [None]:
%%time
# Attributes (besides user type) that define one hourly bucket per station
cols = ["station_name", "year", "month", "day", "hour"]
# Rename once, outside the comprehension, instead of once per trip point
df_stations_renamed = df_stations_new.rename(columns={"name": "station_name"})
# Aggregate departures and arrivals separately, attach the station and
# neighbourhood attributes, and stack both; rows whose station name has no
# match in the stations metadata (missing capacity) are dropped
df_hour_by_station_merged = pd.concat(
    [
        get_aggregated_station_hourly_trips(
            df,
            cols,
            trip_point,
        ).merge(
            df_stations_renamed,
            on="station_name",
            how="left",
        ).dropna(subset=["capacity"])
        for trip_point in ["start", "end"]
    ],
    ignore_index=True,
)
# Population columns are strings with thousands separators; convert to float
for c in ["neigh_pop_2016", "neigh_youth_15_24", "neigh_work_age_25_54"]:
    df_hour_by_station_merged[c] = df_hour_by_station_merged[c].str.replace(",", "").astype(float)
# DataFrame.append was removed in pandas 2.0, so stack head and tail with concat
display(
    pd.concat([df_hour_by_station_merged.head(4), df_hour_by_station_merged.tail(4)])
)
display(
    df_hour_by_station_merged.isna().sum().rename("num_missing").to_frame().merge(
        df_hour_by_station_merged.dtypes.rename("dtype").to_frame(), left_index=True, right_index=True
    )
)

## Database Administration

Create the `bikeshare` database

In [None]:
engine = create_engine(URI_NO_DB)
conn = engine.connect()

In [None]:
# WARNING: destructive - an existing database with this name is dropped
# (together with all of its tables) before an empty one is created.
# NOTE(review): raw SQL strings passed to Connection.execute() are presumably
# rejected by SQLAlchemy 2.0 (wrap them in sqlalchemy.text()) - confirm against
# the installed version
_ = conn.execute(f"DROP DATABASE IF EXISTS {DB_NAME};")
_ = conn.execute(f"CREATE DATABASE IF NOT EXISTS {DB_NAME};")

In [None]:
conn.close()
engine.dispose()

## Create Database Table

In [None]:
engine = create_engine(URI)
conn = engine.connect()

Create the `ridership` table in the `bikeshare` database

In [None]:
# _ = conn.execute(f"DROP TABLE IF EXISTS {table_name}")

In [None]:
# create_table_query = f"""
#                      CREATE TABLE IF NOT EXISTS {table_name} (
#                          station_name VARCHAR(100),
#                          year INT,
#                          month INT,
#                          day INT,
#                          hour INT,
#                          user_type VARCHAR(20),
#                          num_trips INT,
#                          duration_min INT,
#                          duration_median FLOAT,
#                          duration_mean FLOAT,
#                          duration_max INT,
#                          station_type VARCHAR(10),
#                          area_name TEXT,
#                          station_id FLOAT,
#                          physical_configuration TEXT,
#                          lat FLOAT,
#                          lon FLOAT,
#                          altitude FLOAT,
#                          address TEXT,
#                          capacity INT,
#                          physicalkey INT,
#                          transitcard INT,
#                          creditcard INT,
#                          phone INT,
#                          neigh_shape_area FLOAT,
#                          neigh_shape_length FLOAT,
#                          neigh_classification TEXT,
#                          neigh_classification_code TEXT,
#                          neigh_area_latitude FLOAT,
#                          neigh_area_longitude FLOAT,
#                          neigh_transit_stops INT,
#                          neigh_colleges_univs INT,
#                          neigh_cultural_attractions INT,
#                          neigh_places_of_interest INT,
#                          neigh_pop_2016 FLOAT,
#                          neigh_youth_15_24 FLOAT,
#                          neigh_work_age_25_54 FLOAT
#                      )
#                      """
# _ = conn.execute(create_table_query)

In [None]:
_ = conn.execute(
    f"ALTER TABLE {table_name} ADD UNIQUE unique_index(station_name, year, month, day, hour, user_type, station_type)"
)

## Append Merged Data to Database

In [None]:
# %%time
# df_hour_by_station_merged.drop(
#     columns=["geometry", "Shape__Area"]
# ).iloc[0:10_000].to_sql(table_name, con=conn, index=False, if_exists="append")

(NOT DONE HERE) Change the datatype for the neighbourhood stats columns to `INT`

In [None]:
# %%time
# for c in ["station_id", "neigh_pop_2016", "neigh_youth_15_24", "neigh_work_age_25_54"]:
#     _ = conn.execute(f"ALTER TABLE {table_name} MODIFY {c} INTEGER")

Query the data in the database

In [None]:
%%time
df_query = pd.read_sql(
    f"""
    SELECT *
    FROM {table_name}
    WHERE station_type = 'start'
    LIMIT 900000
    """,
    con=conn
)
df_query

## Close MySQL Database Connection

In [None]:
conn.close()
engine.dispose()