# Get Data

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import configparser
import os
import urllib
from io import BytesIO
from multiprocessing import cpu_count
from typing import Dict, List
from urllib.request import urlopen
from zipfile import ZipFile

import geopandas as gpd
import pandas as pd
import requests
from joblib import Parallel, delayed
from sqlalchemy import create_engine

In [3]:
# Datasets
# # Open Data Portal
url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"
# # Ridership
params = {"id": "7e876c24-177c-4605-9cef-e50dd74c617f"}
# # Stations Metadata
about_params = {"id": "2b44db0d-eea9-442d-b038-79335368ad5a"}

# Database
path_to_sql_cfg = "../sql.ini"
table_name = "ridership"
n_rows_to_append_to_db = 100_000

# Ridership dtypes dict
dtypes_dict = {
    "Trip Duration": int,
    "Start Station Id": int,
    "End Station Id": float,
    "Bike Id": float,
}

# Neighbourhood GeoData columns to keep when getting neighbourhood
# containing a location
geo_cols = ["AREA_NAME", "geometry", "Shape__Area"]

In [4]:
if not os.path.isfile(path_to_sql_cfg):
    DB_TYPE = os.getenv("DB_TYPE")
    DB_DRIVER = os.getenv("DB_DRIVER")
    DB_USER = os.getenv("DB_USER")
    DB_PASS = os.getenv("DB_PASS")
    DB_HOST = os.getenv("DB_HOST")
    DB_PORT = os.getenv("DB_PORT")
else:
    config = configparser.ConfigParser()
    config.read("../sql.ini")
    default_cfg = config["default"]
    DB_TYPE = default_cfg["DB_TYPE"]
    DB_DRIVER = default_cfg["DB_DRIVER"]
    DB_USER = default_cfg["DB_USER"]
    DB_PASS = default_cfg["DB_PASS"]
    DB_HOST = default_cfg["DB_HOST"]
    DB_PORT = default_cfg["DB_PORT"]
DB_NAME = "bikeshare"

In [5]:
# Connect to single database (required to create database)
URI_NO_DB = f"{DB_TYPE}+{DB_DRIVER}://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}"

# Connect to all databases (required to perform CRUD operations and submit queries)
URI = f"{DB_TYPE}+{DB_DRIVER}://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

In [6]:
def get_file_urls(main_dataset_url: str, dataset_params: Dict) -> List:
    package = requests.get(main_dataset_url, params=dataset_params).json()
    resources = package["result"]["resources"]
    df = pd.DataFrame.from_records(resources)
    year_month_wanted = [
        f"{y}-{str(m).zfill(2)}" for y in [2021] for m in range(1, 10 + 1)
    ]
    year_month_wanted_str = "|".join(year_month_wanted)
    urls_list = df.query("name.str.contains(@year_month_wanted_str)")["url"].tolist()
    return urls_list


def read_data(url: str, dtypes_dict: Dict) -> pd.DataFrame:
    df = pd.read_csv(
        url,
        encoding="cp1252",
        parse_dates=["Start Time", "End Time"],
        dtype=dtypes_dict,
    )
    df = df.rename(columns={list(df)[0]: "Trip Id"})
    df.columns = df.columns.str.replace("  ", " ").str.replace(" ", "_").str.lower()
    return df


def load(df: pd.DataFrame, table_name: str, uri: str) -> None:
    engine = create_engine(URI)
    conn = engine.connect()
    df_stations.to_sql(table_name, index=False, con=conn, if_exists="append")
    conn.close()
    engine.dispose()


def get_single_ridership_data_file(url: str, dtypes_dict: Dict) -> pd.DataFrame:
    fname = os.path.basename(url)
    print(f"Loading data from {fname}...", end="")
    df = read_data(url, dtypes_dict)
    print("Done.")
    return df


def get_all_data_files(urls_list: List, dtypes_dict: Dict) -> pd.DataFrame:
    executor = Parallel(n_jobs=cpu_count(), backend="multiprocessing")
    tasks = (
        delayed(get_single_ridership_data_file)(url, dtypes_dict) for url in urls_list
    )
    dfs = executor(tasks)
    df = pd.concat(dfs, ignore_index=True)
    return df


def get_stations_metadata(stations_url: str, stations_params: Dict) -> pd.DataFrame:
    package = requests.get(stations_url, params=about_params).json()
    resources = package["result"]["resources"]
    df_about = pd.DataFrame.from_records(resources)
    r = requests.get(df_about["url"].tolist()[0]).json()
    url_stations = r["data"]["en"]["feeds"][2]["url"]
    df_stations = pd.DataFrame.from_records(
        requests.get(url_stations).json()["data"]["stations"]
    )
    return df_stations


def transform_metadata(df: pd.DataFrame) -> pd.DataFrame:
    df["station_id"] = df["station_id"].astype(int)
    dfa = pd.DataFrame(
        df.set_index("station_id")["rental_methods"].tolist(),
        columns=["key", "transitcard", "creditcard", "phone"],
    )
    for c in ["KEY", "TRANSITCARD", "CREDITCARD", "PHONE"]:
        dfa[c.lower()] = dfa[c.lower()].map({c: 1}).fillna(0).astype(int)
    df = pd.concat(
        [
            df.drop(columns=["groups", "rental_methods"]),
            dfa,
        ],
        axis=1,
    ).rename(columns={"key": "physicalkey"})
    return df


def get_toronto_open_data(url, params, col_rename_dict={}):
    package = requests.get(url, params=params).json()
    datastore_url = (
        "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/"
        "action/datastore_search"
    )
    for _, resource in enumerate(package["result"]["resources"]):
        if resource["datastore_active"]:
            url = datastore_url
            p = {"id": resource["id"]}
            data = requests.get(url, params=p).json()
            df = pd.DataFrame(data["result"]["records"])
            break
    if col_rename_dict:
        df = df.rename(columns=col_rename_dict)
    return df


def get_lat_long(row):
    return row["coordinates"]


def get_poi_data(url: str, params: Dict) -> pd.DataFrame:
    poi_cols = [
        "ID",
        "NAME",
        "PLACE_NAME",
        "ADDRESS_FULL",
        "POSTAL_CODE",
        "ATTRACTION_DESC",
        "POI_LATITUDE",
        "POI_LONGITUDE",
    ]
    package = requests.get(url, params=poi_params).json()
    poi_url = package["result"]["resources"][0]["url"]
    df = pd.read_csv(poi_url)
    assert len(df) == 175
    df[["POI_LONGITUDE", "POI_LATITUDE"]] = pd.DataFrame(
        df["geometry"].apply(eval).apply(get_lat_long).tolist()
    )
    # Verify no duplicates (by name) are in the data
    assert df[df.duplicated(subset=["NAME"], keep=False)].empty
    return df


def get_cultural_hotspots(url: str, params: Dict) -> pd.DataFrame:
    package = requests.get(url, params=params).json()
    ch_locations = package["result"]["resources"][0]["url"]
    ch_locs_dir_path = "data/raw/cultural-hotspot-points-of-interest-wgs84"
    with urlopen(ch_locations) as zipresp:
        with ZipFile(BytesIO(zipresp.read())) as zfile:
            zfile.extractall(ch_locs_dir_path)
    df = gpd.read_file(f"{ch_locs_dir_path}/CULTURAL_HOTSPOT_WGS84.shp")
    df = (
        df.drop_duplicates(
            subset=["PNT_OF_INT", "LATITUDE", "LONGITUDE"],
            keep="first",
        )
        .reset_index(drop=True)
        .copy()
    )
    df = (
        df.drop_duplicates(
            subset=["PNT_OF_INT"],
            keep="first",
        )
        .reset_index(drop=True)
        .copy()
    )
    assert df[df.duplicated(subset=["PNT_OF_INT"], keep=False)].empty
    df_essentials = df[["RID", "PNT_OF_INT", "LATITUDE", "LONGITUDE"]].rename(
        columns={
            "RID": "ID",
            "PNT_OF_INT": "NAME",
            "LATITUDE": "POI_LATITUDE",
            "LONGITUDE": "POI_LONGITUDE",
        }
    )
    return df_essentials


def get_neighbourhood_boundary_land_area_data(url: str, params: Dict) -> pd.DataFrame:
    package = requests.get(url, params=params).json()
    n_url = (
        package["result"]["resources"][0]["url"].replace(
            "datastore/dump", "download_resource"
        )
        + "?format=geojson&projection=4326"
    )
    gdf = gpd.read_file(n_url)
    gdf["centroid"] = gdf["geometry"].to_crs(epsg=3395).centroid.to_crs(epsg=4326)
    gdf["AREA_LATITUDE"] = gdf["centroid"].y
    gdf["AREA_LONGITUDE"] = gdf["centroid"].x
    assert len(gdf) == 140
    neigh_cols_to_show = [
        "AREA_ID",
        "AREA_SHORT_CODE",
        "AREA_LONG_CODE",
        "AREA_NAME",
        "Shape__Area",
        "LATITUDE",
        "AREA_LATITUDE",
        "LONGITUDE",
        "AREA_LONGITUDE",
    ]
    return gdf


def get_public_transit_locations(url: str, params: Dict) -> pd.DataFrame:
    package = requests.get(url, params=params).json()
    pt_locations = package["result"]["resources"][0]["url"]
    pt_locs_dir_path = "data/raw/opendata_ttc_schedules"
    with urlopen(pt_locations) as zipresp:
        with ZipFile(BytesIO(zipresp.read())) as zfile:
            zfile.extractall(pt_locs_dir_path)
    df_pt = pd.read_csv(f"{pt_locs_dir_path}/stops.txt")
    display(df_pt.head())
    df_pt = df_pt.rename(columns={"stop_lat": "lat", "stop_lon": "lon"})
    return df_pt


def get_coll_univ_locations() -> pd.DataFrame:
    coll_univ_locations = {
        "centennial": {"lat": 43.7854, "lon": -79.22664},
        "george-brown": {"lat": 43.6761, "lon": -79.4111},
        "humber": {"lat": 43.7290, "lon": -79.6074},
        "ocad": {"lat": 43.6530, "lon": -79.3912},
        "ryerson": {"lat": 43.6577, "lon": -79.3788},
        "seneca": {"lat": 43.7955, "lon": -79.3496},
        "tynedale": {"lat": 43.7970, "lon": -79.3945},
        "uoft-scarborough": {"lat": 43.7844, "lon": -79.1851},
        "uoft": {"lat": 43.6629, "lon": -79.5019},
        "yorku": {"lat": 43.7735, "lon": -79.5019},
        "yorku-glendon": {"lat": 43.7279, "lon": -79.3780},
    }
    df_coll_univ = (
        pd.DataFrame.from_dict(coll_univ_locations, orient="index")
        .reset_index()
        .rename(columns={"index": "institution_name"})
        .reset_index()
        .rename(columns={"index": "institution_id"})
    )
    return df_coll_univ


def get_neighbourhood_profile_data(url: str, params: Dict) -> pd.DataFrame:
    df_neigh_demog = get_toronto_open_data(url, params)
    df_neigh_demog = (
        df_neigh_demog[
            df_neigh_demog["Characteristic"].isin(
                [
                    "Neighbourhood Number",
                    "Youth (15-24 years)",
                    "Working Age (25-54 years)",
                    "Population, 2016",
                ]
            )
        ]
        .iloc[:, slice(4, None)]
        .set_index("Characteristic")
        .T.reset_index()
        .iloc[1:]
        .reset_index(drop=True)
        .rename(columns={"index": "name"})
    )
    assert len(df_neigh_demog) == 140
    df_neigh_demog["AREA_NAME"] = (
        df_neigh_demog["name"] + " (" + df_neigh_demog["Neighbourhood Number"] + ")"
    )
    return df_neigh_demog


def get_neighbourhood_containing_point(
    gdf: gpd.GeoDataFrame,
    df: pd.DataFrame,
    lat: str = "Latitude",
    lon: str = "Longitude",
    crs: int = 4326,
) -> gpd.GeoDataFrame:
    cols_order = list(df) + list(gdf)
    polygons_contains = (
        gpd.sjoin(
            gdf,
            gpd.GeoDataFrame(
                df, geometry=gpd.points_from_xy(df[lon], df[lat]), crs=crs
            ),
            predicate="contains",
        )
        .reset_index(drop=True)
        .drop(columns=["index_right"])[cols_order]
    )
    # print(polygons_contains)
    return polygons_contains


def get_data_with_neighbourhood(
    gdf: gpd.GeoDataFrame,
    df: pd.DataFrame,
    lat: int,
    lon: int,
    col_to_join: str,
    crs: int = 4326,
) -> gpd.GeoDataFrame:
    cols_to_keep = [col_to_join, "AREA_NAME", "geometry", "Shape__Area"]
    df_check = get_neighbourhood_containing_point(gdf, df, lat, lon, crs)[cols_to_keep]
    display(df_check.head(2))
    df = df.merge(df_check.drop(columns=["geometry"]), on=col_to_join, how="left").drop(
        columns=["geometry"]
    )
    print(
        f"Dropped {len(df[['AREA_NAME']].isna().sum())} rows with a missing AREA_NAME"
    )
    df = df.dropna(subset=["AREA_NAME"])
    return df


def summarize_df(df: pd.DataFrame) -> None:
    """Show properties of a DataFrame."""
    display(
        df.dtypes.rename("dtype")
        .to_frame()
        .merge(
            df.isna().sum().rename("num_missing").to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .assign(num=len(df))
        .merge(
            df.nunique().rename("nunique").to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .merge(
            df.dropna(how="any")
            .sample(1)
            .squeeze()
            .rename("single_non_nan_value")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
    )

## Get Bikeshare Data

### Get List of Ridership URLs from Open Data Platform

In [7]:
%%time
all_urls = get_file_urls(url, params)

CPU times: user 24.9 ms, sys: 873 µs, total: 25.8 ms
Wall time: 256 ms


### Retrieve Ridership Data

In [8]:
%%time
df = get_all_data_files(all_urls, dtypes_dict)
df.head(4)

Loading data from 2021-01.csv...Loading data from bike-share-ridership-2021-02.csv...Loading data from bike-share-ridership-2021-03.csv...Loading data from bike-share-ridership-2021-05-.csv...Loading data from bike-share-ridership-2021-06.csv...Loading data from bike-share-ridership-2021-07.csv...Loading data from bike-share-ridership-2021-08.csv...Loading data from bike-share-ridership-2021-09.csv...Loading data from bike-share-ridership-2021-10.csv...Loading data from bike-share-ridership-2021-04.csv...Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
CPU times: user 625 ms, sys: 241 ms, total: 867 ms
Wall time: 12.1 s


Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
0,10644218,1315,7021,2021-01-01 00:04:00,Bay St / Albert St,7164.0,2021-01-01 00:26:00,Gould St / Yonge St (Ryerson University),6795.0,Annual Member
1,10644220,396,7534,2021-01-01 00:07:00,Walnut Ave / Queen St W,7524.0,2021-01-01 00:13:00,Lisgar Park,4176.0,Casual Member
2,10644221,86,7162,2021-01-01 00:10:00,Hayter St / Laplante Ave,7006.0,2021-01-01 00:11:00,Bay St / College St (East Side),1814.0,Annual Member
3,10644222,741,7003,2021-01-01 00:10:00,Madison Ave / Bloor St W,7272.0,2021-01-01 00:22:00,Yonge St / Dundonald St - SMART,198.0,Casual Member


In [9]:
%%time
# Extract attributes
# # Trip duration
df["duration"] = (df["end_time"] - df["start_time"]).dt.seconds
# # Datetime attributes
for trip_point in ["start", "end"]:
    df[f"{trip_point}_year"] = df[f"{trip_point}_time"].dt.year
    df[f"{trip_point}_month"] = df[f"{trip_point}_time"].dt.month
    df[f"{trip_point}_day"] = df[f"{trip_point}_time"].dt.day
    df[f"{trip_point}_hour"] = df[f"{trip_point}_time"].dt.hour
    df[f"{trip_point}_minute"] = df[f"{trip_point}_time"].dt.minute
    df[f"{trip_point}_quarter"] = df[f"{trip_point}_time"].dt.quarter
display(df)
summarize_df(df)

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type,...,start_day,start_hour,start_minute,start_quarter,end_year,end_month,end_day,end_hour,end_minute,end_quarter
0,10644218,1315,7021,2021-01-01 00:04:00,Bay St / Albert St,7164.0,2021-01-01 00:26:00,Gould St / Yonge St (Ryerson University),6795.0,Annual Member,...,1,0,4,1,2021,1,1,0,26,1
1,10644220,396,7534,2021-01-01 00:07:00,Walnut Ave / Queen St W,7524.0,2021-01-01 00:13:00,Lisgar Park,4176.0,Casual Member,...,1,0,7,1,2021,1,1,0,13,1
2,10644221,86,7162,2021-01-01 00:10:00,Hayter St / Laplante Ave,7006.0,2021-01-01 00:11:00,Bay St / College St (East Side),1814.0,Annual Member,...,1,0,10,1,2021,1,1,0,11,1
3,10644222,741,7003,2021-01-01 00:10:00,Madison Ave / Bloor St W,7272.0,2021-01-01 00:22:00,Yonge St / Dundonald St - SMART,198.0,Casual Member,...,1,0,10,1,2021,1,1,0,22,1
4,10644223,2073,7562,2021-01-01 00:11:00,Priscilla Ave / Dundas St W - SMART,7562.0,2021-01-01 00:45:00,Priscilla Ave / Dundas St W - SMART,6688.0,Casual Member,...,1,0,11,1,2021,1,1,0,45,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3186400,14348496,613,7024,2021-10-31 23:58:00,Dundonald St / Church St,7192.0,2021-11-01 00:09:00,Harbord St / Clinton St,2630.0,Annual Member,...,31,23,58,4,2021,11,1,0,9,4
3186401,14348499,710,7021,2021-10-31 23:59:00,Bay St / Albert St,7002.0,2021-11-01 00:11:00,St. George St / Bloor St W,6360.0,Annual Member,...,31,23,59,4,2021,11,1,0,11,4
3186402,14348500,521,7033,2021-10-31 23:59:00,Union Station,7576.0,2021-11-01 00:08:00,Front St E / Bayview Avenue,391.0,Casual Member,...,31,23,59,4,2021,11,1,0,8,4
3186403,14348501,520,7033,2021-10-31 23:59:00,Union Station,7576.0,2021-11-01 00:08:00,Front St E / Bayview Avenue,4474.0,Casual Member,...,31,23,59,4,2021,11,1,0,8,4


Unnamed: 0,dtype,num_missing,num,nunique,single_non_nan_value
trip_id,int64,0,3186405,3186392,11553090
trip_duration,int64,0,3186405,18804,485
start_station_id,int64,0,3186405,623,7367
start_time,datetime64[ns],0,3186405,353587,2021-05-20 16:51:00
start_station_name,object,1452,3186405,735,Alma Ave / Gladstone Ave SMART
end_station_id,float64,1758,3186405,625,7150.0
end_time,datetime64[ns],0,3186405,353336,2021-05-20 16:59:00
end_station_name,object,3264,3186405,739,Dufferin St / Sylvan Av (Dufferin Grove Park)
bike_id,float64,203,3186405,6484,556.0
user_type,object,0,3186405,2,Annual Member


CPU times: user 4.44 s, sys: 407 ms, total: 4.85 s
Wall time: 4.85 s


### Drop Rows with Missing Values and Duplicates from Ridership Data

In [10]:
# Columns in which to drop missing values
nan_cols = [
    "start_station_id",
    "end_station_id",
    "start_station_name",
    "end_station_name",
]

# Columns with duplicates, in which to drop rows
duplicated_cols = ["trip_id", "start_time", "end_time"]

In [11]:
%%time
dups_to_drop = df.dropna(subset=nan_cols)[
    df.dropna(subset=nan_cols).duplicated(subset=duplicated_cols, keep="first")
]
not_missing = len(df.dropna(subset=nan_cols))
d_nan = {
    "all": len(df),
    "non_missing": not_missing,
    "frac_to_drop": ((len(df) - not_missing) / len(df)) * 100,
    "duplicates_to_drop": (len(dups_to_drop) / len(df)) * 100,
}
df_nan = pd.DataFrame.from_dict(d_nan, orient="index").T
summarize_df(df)
df_nan

Unnamed: 0,dtype,num_missing,num,nunique,single_non_nan_value
trip_id,int64,0,3186405,3186392,11703909
trip_duration,int64,0,3186405,18804,950
start_station_id,int64,0,3186405,623,7061
start_time,datetime64[ns],0,3186405,353587,2021-05-28 19:23:00
start_station_name,object,1452,3186405,735,Dalton Rd / Bloor St W
end_station_id,float64,1758,3186405,625,7155.0
end_time,datetime64[ns],0,3186405,353336,2021-05-28 19:39:00
end_station_name,object,3264,3186405,739,Bathurst St / Lennox St
bike_id,float64,203,3186405,6484,5772.0
user_type,object,0,3186405,2,Annual Member


CPU times: user 3.03 s, sys: 322 ms, total: 3.35 s
Wall time: 3.35 s


Unnamed: 0,all,non_missing,frac_to_drop,duplicates_to_drop
0,3186405.0,3181768.0,0.145525,0.0


In [12]:
%%time
df = df.dropna(subset=nan_cols).drop_duplicates(subset=duplicated_cols, keep="first")
summarize_df(df)

Unnamed: 0,dtype,num_missing,num,nunique,single_non_nan_value
trip_id,int64,0,3181768,3181768,12585986
trip_duration,int64,0,3181768,18790,596
start_station_id,int64,0,3181768,620,7336
start_time,datetime64[ns],0,3181768,353456,2021-07-18 15:49:00
start_station_name,object,0,3181768,735,Queen St E / Alton Av
end_station_id,float64,0,3181768,622,7427.0
end_time,datetime64[ns],0,3181768,353211,2021-07-18 15:59:00
end_station_name,object,0,3181768,739,Northern Dancer Blvd / Lake Shore Blvd E
bike_id,float64,201,3181768,6484,2234.0
user_type,object,0,3181768,2,Annual Member


CPU times: user 2.53 s, sys: 215 ms, total: 2.74 s
Wall time: 2.74 s


### Get Stations Metadata

In [13]:
%%time
df_stations = get_stations_metadata(url, about_params)
df_stations = transform_metadata(df_stations)
df_stations.head(2)

CPU times: user 69.6 ms, sys: 3.5 ms, total: 73.1 ms
Wall time: 553 ms


Unnamed: 0,station_id,name,physical_configuration,lat,lon,altitude,address,capacity,obcn,nearby_distance,post_code,cross_street,physicalkey,transitcard,creditcard,phone
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,0.0,Fort York Blvd / Capreol Ct,35,647-643-9607,500.0,,,1,1,1,1
1,7001,Wellesley Station Green P,REGULAR,43.664964,-79.38355,0.0,Yonge / Wellesley,17,416-617-9576,500.0,M4Y 1G7,,1,1,1,1


In [14]:
summarize_df(df_stations)

Unnamed: 0,dtype,num_missing,num,nunique,single_non_nan_value
station_id,int64,0,615,615,7656
name,object,0,615,615,Bloor St W / Brock Ave
physical_configuration,object,0,615,5,REGULAR
lat,float64,0,615,611,43.658988
lon,float64,0,615,614,-79.438715
altitude,float64,10,615,2,0.0
address,object,0,615,615,Bloor St W / Brock Av
capacity,int64,0,615,35,11
obcn,object,0,615,569,
nearby_distance,float64,0,615,6,1609.34


## Get Supplementary Datasets

### Cultural Hotspots

In [15]:
%%time
params = {"id": "c7be2ee7-d317-4a28-8cbe-bff1ce116b46"}
dfch_essentials = get_cultural_hotspots(url, params)
dfch_essentials.head(2)

CPU times: user 85.1 ms, sys: 2.63 ms, total: 87.7 ms
Wall time: 385 ms


Unnamed: 0,ID,NAME,POI_LATITUDE,POI_LONGITUDE
0,1,21 Points in Equilibrium (Sculpture),43.772936,-79.257067
1,2,Crucified Again (Sculpture),43.753806,-79.21617


### Places of Interest

In [16]:
%%time
poi_params = {"id": "965247c0-c72e-49b4-bb1a-879cf98e1a32"}
df_poi = get_poi_data(url, poi_params)

CPU times: user 32.8 ms, sys: 2.62 ms, total: 35.4 ms
Wall time: 347 ms


Note that duplicate lat-long will be permitted here as multiple places of interest may share the same physical location, or immediately adjacent area. Such places of interest with a duplicated latitude and longitde are shown below

In [17]:
display(
    df_poi[df_poi.duplicated(subset=["POI_LATITUDE", "POI_LONGITUDE"], keep=False)][
        ["ID", "NAME", "POI_LATITUDE", "POI_LONGITUDE"]
    ]
    .sort_values(by=["POI_LATITUDE", "POI_LONGITUDE"])
    .style.set_caption("Duplicates of Latitude-Longitude")
)

Unnamed: 0,ID,NAME,POI_LATITUDE,POI_LONGITUDE
39,40,Enercare Centre (formely known as Direct Energy Centre),43.63453,-79.412552
41,42,Exhibition Place,43.63453,-79.412552
57,57,Harbourfront Centre,43.639232,-79.383105
171,171,York Quay Centre,43.639232,-79.383105
66,66,INFOTOGO Tourist Info Trailer - Ferry Terminal,43.640441,-79.375091
70,70,Jack Layton Ferry Terminal,43.640441,-79.375091
68,68,INFOTOGO Tourist Info Trailer - Roundhouse Park,43.641692,-79.385567
124,124,Steam Whistle Brewery,43.641692,-79.385567
153,154,Toronto Railway Museum,43.641692,-79.385567
23,24,CBC Museum,43.644418,-79.387703


These duplicated lat-long locations are shown below to be different points of interest based at the same site
- `ID`=40, `ID`=42
  - Enercare Centre and Exhibition Place are at the same site
- 57, 171
  - York Quay Centre [is at](https://www.museumsontario.ca/museum/York-Quay-Centre-at-Harbourfro) the HarborFront Centre
- 66, 70
  - both places are based at the the Ferry Terminal, so can correctly have the same lat-long
- 68, 124, 154
  - the Brewery and the Toronto Railway Museum are based at Roundhouse Park
- 24, 54
  - [Glenn Gould Studio](https://www.cbc.ca/glenngouldstudio/) is based at the CBC Museum
- 157, 160, 162
  - the [Tourist Information Centre](https://www.toronto.ca/explore-enjoy/visitor-services/tourist-information-centres/) is at the same site as the [Traveller's Aid Society](http://travellersaid.ca/contact.html) and [Union Station](https://torontounion.ca/contact/)
- 67, 145
  - a tourist information centre that is also baed at Nathan Phillips Square
- 8, 167
  - [Ashbridges Bay Park](https://www.toronto.ca/data/parks/prd/facilities/complex/1/index.html) is along [Woodbine Beach](https://www.toronto.ca/data/parks/prd/facilities/complex/311/index.html)
- 75, 111
  - Koerner Hall is at the Royal Observatory of Music
- 73, 74
  - [Kew Balmy Beach](https://www.tripadvisor.ca/Attraction_Review-g155019-d14788092-Reviews-Kew_Balmy_Beach-Toronto_Ontario.html#MAPVIEW-14788092) is at the same site as [Kew Gardens Park](https://www.toronto.ca/data/parks/prd/facilities/complex/107/index.html)
- 93, 141
  - both locations are at Todmorden Mills Park
- 9, 21
  - the Canadian Museum of Cultural Heritage of Indo-Canadians is based at the site of BAPS Shri Swaminarayan Mandir ([link](https://www.baps.org/cultureandheritage/ExperienceIndia/Exhibitions/CanadianMuseumofCulturalHeritageofIndo-Canadians.aspx))

So, the duplicate lat-long sites will be retained in this dataset.

### Neighbourhood Boundary and Land Area Data

In [18]:
%%time
neigh_params = {"id": "4def3f65-2a65-4a4f-83c4-b2a4aed72d46"}
gdf = get_neighbourhood_boundary_land_area_data(url, neigh_params)

CPU times: user 181 ms, sys: 15.5 ms, total: 197 ms
Wall time: 2.06 s


Print the data for a few neighbourhoods

In [19]:
neigh_cols_to_show = [
    "AREA_ID",
    "AREA_SHORT_CODE",
    "AREA_LONG_CODE",
    "AREA_NAME",
    "Shape__Area",
    "LATITUDE",
    "AREA_LATITUDE",
    "LONGITUDE",
    "AREA_LONGITUDE",
]
gdf[
    gdf["AREA_NAME"].str.contains(
        "Wychwood|Yonge-Eglinton|Yonge-St.|York Univ|Yorkdale-Glen"
    )
][neigh_cols_to_show].sort_values(by=["AREA_NAME"])

Unnamed: 0,AREA_ID,AREA_SHORT_CODE,AREA_LONG_CODE,AREA_NAME,Shape__Area,LATITUDE,AREA_LATITUDE,LONGITUDE,AREA_LONGITUDE
133,2480144,94,94,Wychwood (94),3217960.0,,43.67692,,-79.425515
16,2480057,100,100,Yonge-Eglinton (100),3160334.0,,43.70469,,-79.40359
134,2480143,97,97,Yonge-St.Clair (97),2222464.0,,43.68786,,-79.397871
131,2480146,27,27,York University Heights (27),25418210.0,,43.76574,,-79.488883
69,2480105,31,31,Yorkdale-Glen Park (31),11566690.0,,43.714673,,-79.457108


In order to use the correct CRS for allowing an area calculation in square km, we'll get the current EPSG ([link](https://epsg.io/4326)) from the geodata

In [20]:
print(gdf.crs)

epsg:4326


Fix typographic errors in the name of the neighbourhood in this dataset
- [North St. James Town](https://www.toronto.ca/ext/sdfa/Neighbourhood%20Profiles/pdf/2016/pdf1/cpa74.pdf) and [Cabbagetown-South St. James Town](https://www.toronto.com/community-static/4550668-cabbagetown-south-st-james-town/)
  - missing space between ...St. and Ja...
- Weston-Pelham Park
  - incorrectly listed as its old name (from 2011) of Weston-Pellam Park ([link](https://www.toronto.ca/wp-content/uploads/2017/11/900b-91-Weston-Pellam-Park.pdf))
  - replace with [new name from 2016](https://www.toronto.ca/ext/sdfa/Neighbourhood%20Profiles/pdf/2016/pdf1/cpa91.pdf)

In [21]:
d_renaming = {
    "St.James": "St. James",
    "Weston-Pellam": "Weston-Pelham",
}
for k, v in d_renaming.items():
    gdf["AREA_NAME"] = gdf["AREA_NAME"].str.replace(k, v, regex=False)

The incorrect names have been successfully replaced as shown below

In [22]:
# Neighbourhood GeoData columns to use
geo_cols = ["AREA_NAME", "geometry", "Shape__Area"]

In [23]:
gdf.query("AREA_NAME.str.contains('James Town|Weston-|Cabbage')")[geo_cols]

Unnamed: 0,AREA_NAME,geometry,Shape__Area
18,North St. James Town (74),"POLYGON ((-79.38057 43.67161, -79.37947 43.671...",811303.9
40,Weston-Pelham Park (91),"POLYGON ((-79.46005 43.66723, -79.46092 43.668...",2794057.0
114,Cabbagetown-South St. James Town (71),"POLYGON ((-79.37672 43.66242, -79.37721 43.663...",2711742.0


Compare manual to provided neighbourhood areas (in square km)
- first, changes geodata projection to a cartesian system (EPSG = 3857, in units of m) ([1](https://epsg.io/3857))

In [24]:
area_diff = (gdf["geometry"].to_crs(epsg=3857).area) - gdf["Shape__Area"]
print(area_diff.min(), area_diff.max())

-0.10295796953141689 0.147477675229311


Since these are small differences (in units of square km), we'll use the provided neighbourhood areas from the `Shape__Area` column of the neighbourhood boundary file.

### Public Transit Locations

In [25]:
%%time
params = {"id": "7795b45e-e65a-4465-81fc-c36b9dfff169"}
df_pt_slice = get_public_transit_locations(url, params)

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,262,662,Danforth Rd at Kennedy Rd,,43.714379,-79.260939,,,,,,2
1,263,929,Davenport Rd at Bedford Rd,,43.674448,-79.399659,,,,,,1
2,264,940,Davenport Rd at Dupont St,,43.675511,-79.401938,,,,,,2
3,265,1871,Davisville Ave at Cleveland St,,43.702088,-79.378112,,,,,,1
4,266,11700,Disco Rd at Attwell Dr,,43.701362,-79.594843,,,,,,1


CPU times: user 676 ms, sys: 92 ms, total: 768 ms
Wall time: 1.24 s


### Colleges and Universities

In [26]:
df_coll_univ = get_coll_univ_locations()

### Neighbourhood Profile Data - Population

In [27]:
%%time
neigh_profile_params = {"id": "6e19a90f-971c-46b3-852c-0c48c436d1fc"}
df_neigh_demog = get_neighbourhood_profile_data(url, neigh_profile_params)
df_neigh_demog.head(6)

CPU times: user 82.5 ms, sys: 3.23 ms, total: 85.8 ms
Wall time: 454 ms


Characteristic,name,Neighbourhood Number,"Population, 2016",Youth (15-24 years),Working Age (25-54 years),AREA_NAME
0,Agincourt North,129,29113,3705,11305,Agincourt North (129)
1,Agincourt South-Malvern West,128,23757,3360,9965,Agincourt South-Malvern West (128)
2,Alderwood,20,12054,1235,5220,Alderwood (20)
3,Annex,95,30526,3750,15040,Annex (95)
4,Banbury-Don Mills,42,27695,2730,10810,Banbury-Don Mills (42)
5,Bathurst Manor,34,15873,1940,6655,Bathurst Manor (34)


### Number of Locations Per Neighbourhood

#### Places of Interest

In [28]:
print(df_poi["ID"].nunique(), len(df_poi))
df_poi.head(2)

175 175


Unnamed: 0,_id,ID,ADDRESS_INFO,NAME,CATEGORY,PHONE,EMAIL,WEBSITE,GEOID,RECEIVED_DATA_DATE,...,LATITUDE,OBJECTID,MI_PRINX,ATTRACTION_LEVEL,ATTRACTION_DESC,IMAGE_NAME,MAP_ACCESS,geometry,POI_LONGITUDE,POI_LATITUDE
0,1,16,,BMO Field,Sports / Entertainment Venue,416-815-5982,,www.bmofield.com,20229243.0,,...,,16,4163950.0,2,BMO Field is home to the Toronto FC (Major Lea...,BMOField.jpg,Y,"{""type"": ""Point"", ""coordinates"": [-79.41861429...",-79.418614,43.632664
1,2,1,,Aga Khan Museum,Museum,416-646-4677,,www.agakhanmuseum.org,10142948.0,,...,,1,4094277.0,1,"Dedicated to sharing the artistic, intellectua...",AgaKhan.jpg,Y,"{""type"": ""Point"", ""coordinates"": [-79.33233113...",-79.332331,43.725386


In [29]:
%%time
df_poi_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_poi.rename(columns={"POI_LATITUDE": "lat", "POI_LONGITUDE": "lon",})[
        ["ID", "NAME", "lat", "lon"]
    ],
    "lat",
    "lon",
    "ID",
)
display(df_poi_new.head(2))

Unnamed: 0,ID,AREA_NAME,geometry,Shape__Area
0,23,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0
1,120,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0


Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,ID,NAME,lat,lon,AREA_NAME,Shape__Area
0,16,BMO Field,43.632664,-79.418614,Niagara (82),6192651.0
1,1,Aga Khan Museum,43.725386,-79.332331,Banbury-Don Mills (42),19248970.0


CPU times: user 36.9 ms, sys: 117 µs, total: 37 ms
Wall time: 34.6 ms


#### Cultural Hotspots

In [30]:
print(dfch_essentials["ID"].nunique(), len(dfch_essentials))
dfch_essentials.head(2)

470 470


Unnamed: 0,ID,NAME,POI_LATITUDE,POI_LONGITUDE
0,1,21 Points in Equilibrium (Sculpture),43.772936,-79.257067
1,2,Crucified Again (Sculpture),43.753806,-79.21617


In [31]:
%%time
dfch_essentials_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    dfch_essentials.rename(columns={"POI_LATITUDE": "lat", "POI_LONGITUDE": "lon",})[
        ["ID", "NAME", "lat", "lon"]
    ],
    "lat",
    "lon",
    "ID",
)
display(dfch_essentials_new.head(2))

Unnamed: 0,ID,AREA_NAME,geometry,Shape__Area
0,284,Downsview-Roding-CFB (26),"POLYGON ((-79.50783 43.71776, -79.50854 43.717...",28736800.0
1,45,Kennedy Park (124),"POLYGON ((-79.24549 43.73060, -79.24555 43.730...",6861056.0


Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,ID,NAME,lat,lon,AREA_NAME,Shape__Area
0,1,21 Points in Equilibrium (Sculpture),43.772936,-79.257067,Bendale (127),14303500.0
1,2,Crucified Again (Sculpture),43.753806,-79.21617,Woburn (137),23664990.0


CPU times: user 37.3 ms, sys: 186 µs, total: 37.5 ms
Wall time: 35.1 ms


#### Colleges and Universities

In [32]:
print(df_coll_univ["institution_id"].nunique(), len(df_coll_univ))
df_coll_univ.head(2)

11 11


Unnamed: 0,institution_id,institution_name,lat,lon
0,0,centennial,43.7854,-79.22664
1,1,george-brown,43.6761,-79.4111


In [33]:
%%time
df_coll_univ_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_coll_univ,
    "lat",
    "lon",
    "institution_id",
)
display(df_coll_univ_new.head(2))

Unnamed: 0,institution_id,AREA_NAME,geometry,Shape__Area
0,1,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0
1,7,Highland Creek (134),"POLYGON ((-79.17527 43.78021, -79.17535 43.780...",10077020.0


Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,institution_id,institution_name,lat,lon,AREA_NAME,Shape__Area
0,0,centennial,43.7854,-79.22664,Woburn (137),23664990.0
1,1,george-brown,43.6761,-79.4111,Casa Loma (96),3678385.0


CPU times: user 32.6 ms, sys: 65 µs, total: 32.6 ms
Wall time: 30.6 ms


#### Public Transit Locations

In [34]:
print(df_pt_slice["stop_id"].nunique(), len(df_pt_slice))
df_pt_slice.head(2)

9476 9476


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,lat,lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,262,662,Danforth Rd at Kennedy Rd,,43.714379,-79.260939,,,,,,2
1,263,929,Davenport Rd at Bedford Rd,,43.674448,-79.399659,,,,,,1


In [35]:
%%time
df_pt_slice_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_pt_slice,
    "lat",
    "lon",
    "stop_id",
)
display(df_pt_slice_new.head(2))

Unnamed: 0,stop_id,AREA_NAME,geometry,Shape__Area
0,1857,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0
1,8268,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0


Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,lat,lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,AREA_NAME,Shape__Area
0,262,662,Danforth Rd at Kennedy Rd,,43.714379,-79.260939,,,,,,2,Clairlea-Birchmount (120),14168540.0
1,263,929,Davenport Rd at Bedford Rd,,43.674448,-79.399659,,,,,,1,Annex (95),5337192.0


CPU times: user 52.2 ms, sys: 0 ns, total: 52.2 ms
Wall time: 50.6 ms


#### Merge Neighbourhood Aggregations with GeoData and Population Data

In [36]:
df_neigh_stats = (
    (
        gdf.set_index("AREA_NAME")[
            [
                "Shape__Area",
                "Shape__Length",
                "geometry",
                "CLASSIFICATION",
                "CLASSIFICATION_CODE",
                "AREA_LATITUDE",
                "AREA_LONGITUDE",
            ]
        ]
        .merge(
            df_pt_slice_new.groupby("AREA_NAME")["stop_id"]
            .count()
            .rename("transit_stops")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .merge(
            df_coll_univ_new.groupby("AREA_NAME")["institution_id"]
            .count()
            .rename("colleges_univs")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .merge(
            dfch_essentials_new.groupby("AREA_NAME")["ID"]
            .count()
            .rename("cultural_attractions")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .merge(
            df_poi_new.groupby("AREA_NAME")["ID"]
            .count()
            .rename("places_of_interest")
            .to_frame(),
            left_index=True,
            right_index=True,
            how="left",
        )
        .fillna(0)
        .astype(
            {
                k: int
                for k in [
                    "transit_stops",
                    "colleges_univs",
                    "cultural_attractions",
                    "places_of_interest",
                ]
            }
        )
        .merge(
            df_neigh_demog.set_index("AREA_NAME")[
                ["Population, 2016", "Youth (15-24 years)", "Working Age (25-54 years)"]
            ].rename(
                columns={
                    "Population, 2016": "pop_2016",
                    "Youth (15-24 years)": "youth_15_24",
                    "Working Age (25-54 years)": "work_age_25_54",
                }
            ),
            left_index=True,
            right_index=True,
            how="left",
        )
    )
    .add_prefix("neigh_")
    .rename(columns={"neigh_geometry": "geometry"})
)
df_neigh_stats.columns = df_neigh_stats.columns.str.lower().str.replace("__", "_")
df_neigh_stats = df_neigh_stats.reset_index()
df_neigh_stats.head()

Unnamed: 0,AREA_NAME,neigh_shape_area,neigh_shape_length,geometry,neigh_classification,neigh_classification_code,neigh_area_latitude,neigh_area_longitude,neigh_transit_stops,neigh_colleges_univs,neigh_cultural_attractions,neigh_places_of_interest,neigh_pop_2016,neigh_youth_15_24,neigh_work_age_25_54
0,Casa Loma (96),3678385.0,8214.176485,"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",0,0,43.681853,-79.408007,42,1,0,3,10968,1080,4555
1,Annex (95),5337192.0,10513.883143,"POLYGON ((-79.39414 43.66872, -79.39588 43.668...",0,0,43.671586,-79.404,98,0,0,1,30526,3750,15040
2,Caledonia-Fairbank (109),2955857.0,6849.911724,"POLYGON ((-79.46021 43.68156, -79.46044 43.681...",0,0,43.688569,-79.455212,35,0,0,0,9955,1220,4570
3,Woodbine Corridor (64),3052518.0,7512.966773,"POLYGON ((-79.31485 43.66674, -79.31660 43.666...",0,0,43.676774,-79.315408,29,0,0,0,12541,1035,6165
4,Lawrence Park South (103),6211341.0,13530.370002,"POLYGON ((-79.41096 43.70408, -79.41165 43.703...",0,0,43.717213,-79.406038,42,0,0,1,15179,2095,5870


In [37]:
type(df_neigh_stats)

geopandas.geodataframe.GeoDataFrame

### Merge Stations Metadata with Aggregated Neighbourhood Stats

Append the neighbourhood containing each bikeshare station to the station metadata

In [38]:
print(df_stations["station_id"].nunique(), len(df_stations))
df_stations.head(2)

615 615


Unnamed: 0,station_id,name,physical_configuration,lat,lon,altitude,address,capacity,obcn,nearby_distance,post_code,cross_street,physicalkey,transitcard,creditcard,phone
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,0.0,Fort York Blvd / Capreol Ct,35,647-643-9607,500.0,,,1,1,1,1
1,7001,Wellesley Station Green P,REGULAR,43.664964,-79.38355,0.0,Yonge / Wellesley,17,416-617-9576,500.0,M4Y 1G7,,1,1,1,1


In [39]:
%%time
df_stations_new = get_data_with_neighbourhood(
    gdf[geo_cols],
    df_stations[
        ["station_id", "name", "physical_configuration", "lat", "lon", "altitude", "address", "capacity", "physicalkey", "transitcard", "creditcard", "phone"]
    ],
    "lat",
    "lon",
    "station_id",
)
display(df_stations_new.head(2))

Unnamed: 0,station_id,AREA_NAME,geometry,Shape__Area
0,7142,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0
1,7141,Casa Loma (96),"POLYGON ((-79.41469 43.67391, -79.41485 43.674...",3678385.0


Dropped 1 rows with a missing AREA_NAME


Unnamed: 0,station_id,name,physical_configuration,lat,lon,altitude,address,capacity,physicalkey,transitcard,creditcard,phone,AREA_NAME,Shape__Area
0,7000,Fort York Blvd / Capreol Ct,REGULAR,43.639832,-79.395954,0.0,Fort York Blvd / Capreol Ct,35,1,1,1,1,Waterfront Communities-The Island (77),25629770.0
1,7001,Wellesley Station Green P,REGULAR,43.664964,-79.38355,0.0,Yonge / Wellesley,17,1,1,1,1,Church-Yonge Corridor (75),2609014.0


CPU times: user 37.1 ms, sys: 147 µs, total: 37.2 ms
Wall time: 35 ms


Merge the modified stations metadata with the neighbourhood stats

In [40]:
df_stations_new = (
    df_stations_new.set_index("AREA_NAME")
    .merge(
        df_neigh_stats.set_index("AREA_NAME"),
        left_index=True,
        right_index=True,
        how="left",
    )
    .reset_index()
)
df_stations_new.head(4)

Unnamed: 0,AREA_NAME,station_id,name,physical_configuration,lat,lon,altitude,address,capacity,physicalkey,...,neigh_classification_code,neigh_area_latitude,neigh_area_longitude,neigh_transit_stops,neigh_colleges_univs,neigh_cultural_attractions,neigh_places_of_interest,neigh_pop_2016,neigh_youth_15_24,neigh_work_age_25_54
0,Annex (95),7003,Madison Ave / Bloor St W,REGULAR,43.667158,-79.402761,,Madison Ave / Bloor St W,15,1,...,0,43.671586,-79.404,98,0,0,1,30526,3750,15040
1,Annex (95),7040,Euclid Ave / Bloor St W,REGULAR,43.664467,-79.414783,0.0,Euclid Ave / Bloor St W,19,1,...,0,43.671586,-79.404,98,0,0,1,30526,3750,15040
2,Annex (95),7061,Dalton Rd / Bloor St W,REGULAR,43.666294,-79.406643,0.0,Dalton Rd / Bloor St W,15,1,...,0,43.671586,-79.404,98,0,0,1,30526,3750,15040
3,Annex (95),7126,Yonge St / Yorkville Ave,REGULAR,43.671944,-79.387778,0.0,Yonge St / Yorkville Ave,17,1,...,0,43.671586,-79.404,98,0,0,1,30526,3750,15040


## Merge Modified Stations Metadata With Ridership Data

In [41]:
def get_aggregated_station_hourly_trips(
    df: pd.DataFrame, cols: List, trip_point: str = "start"
) -> pd.DataFrame:
    trip_point_cols = [f"{trip_point}_{c}" for c in cols] + ["user_type"]
    station_trips = df.groupby(trip_point_cols, as_index=False).agg(
        {"trip_id": "count", "duration": ["min", "median", "mean", "max"]}
    )
    station_trips.columns = (
        cols
        + ["user_type"]
        + [
            "num_trips",
            "duration_min",
            "duration_median",
            "duration_mean",
            "duration_max",
        ]
    )
    station_trips = (
        station_trips.assign(station_type=trip_point)
        .rename(columns={"trip_id_count": "num_trips"})
        .sort_values(by="num_trips", ascending=False)
    )
    return station_trips

In [42]:
%%time
cols = ["station_name", "year", "month", "day", "hour"]
df_hour_by_station_merged = pd.concat(
    [
        get_aggregated_station_hourly_trips(
            df,
            cols,
            trip_point,
        ).merge(
            df_stations_new.rename(columns={"name": "station_name"}),
            on="station_name",
            how="left",
        ).dropna(subset=["capacity"])
        for trip_point in ["start", "end"]
    ],
    ignore_index=True,
)
for c in ["neigh_pop_2016", "neigh_youth_15_24", "neigh_work_age_25_54"]:
    df_hour_by_station_merged[c] = df_hour_by_station_merged[c].str.replace(",", "").astype(float)
display(df_hour_by_station_merged.head(4).append(df_hour_by_station_merged.tail(4)))
display(
    df_hour_by_station_merged.isna().sum().rename("num_missing").to_frame().merge(
        df_hour_by_station_merged.dtypes.rename("dtype").to_frame(), left_index=True, right_index=True
    )
)

Unnamed: 0,station_name,year,month,day,hour,user_type,num_trips,duration_min,duration_median,duration_mean,...,neigh_classification_code,neigh_area_latitude,neigh_area_longitude,neigh_transit_stops,neigh_colleges_univs,neigh_cultural_attractions,neigh_places_of_interest,neigh_pop_2016,neigh_youth_15_24,neigh_work_age_25_54
0,Ontario Place Blvd / Lake Shore Blvd W (East),2021,5,30,16,Casual Member,79,300,1320.0,1740.759494,...,0,43.636682,-79.41242,63.0,0.0,0.0,10.0,31180.0,2415.0,23320.0
1,Lake Shore Blvd W / Ontario Dr,2021,5,23,19,Casual Member,73,540,1500.0,2688.493151,...,0,43.636682,-79.41242,63.0,0.0,0.0,10.0,31180.0,2415.0,23320.0
2,Lake Shore Blvd W / Ontario Dr,2021,5,24,19,Casual Member,72,60,1500.0,1632.5,...,0,43.636682,-79.41242,63.0,0.0,0.0,10.0,31180.0,2415.0,23320.0
3,Lake Shore Blvd W / Ontario Dr,2021,5,24,15,Casual Member,70,60,1560.0,1738.285714,...,0,43.636682,-79.41242,63.0,0.0,0.0,10.0,31180.0,2415.0,23320.0
3122844,College St / Huron St,2021,5,11,17,Annual Member,1,300,300.0,300.0,...,0,43.653554,-79.39724,40.0,1.0,0.0,3.0,17945.0,3350.0,8760.0
3122845,Lower Simcoe St / Bremner Blvd,2021,7,25,14,Casual Member,1,960,960.0,960.0,...,0,43.633884,-79.377201,103.0,0.0,0.0,50.0,65913.0,7840.0,45105.0
3122846,College St / Huron St,2021,5,11,15,Casual Member,1,600,600.0,600.0,...,0,43.653554,-79.39724,40.0,1.0,0.0,3.0,17945.0,3350.0,8760.0
3122847,Lower Simcoe St / Bremner Blvd,2021,7,25,17,Annual Member,1,1140,1140.0,1140.0,...,0,43.633884,-79.377201,103.0,0.0,0.0,50.0,65913.0,7840.0,45105.0


Unnamed: 0,num_missing,dtype
station_name,0,object
year,0,int64
month,0,int64
day,0,int64
hour,0,int64
user_type,0,object
num_trips,0,int64
duration_min,0,int64
duration_median,0,float64
duration_mean,0,float64


CPU times: user 10.5 s, sys: 1.23 s, total: 11.8 s
Wall time: 11.8 s


## Database Administration

Create the `bikeshare` database

In [43]:
engine = create_engine(URI_NO_DB)
conn = engine.connect()

In [44]:
_ = conn.execute(f"DROP DATABASE IF EXISTS {DB_NAME};")
_ = conn.execute(f"CREATE DATABASE IF NOT EXISTS {DB_NAME};")

In [45]:
conn.close()
engine.dispose()

## Create Database Table

In [46]:
engine = create_engine(URI)
conn = engine.connect()

Create the `ridership` table in the `bikeshare` database

In [47]:
_ = conn.execute(f"DROP TABLE IF EXISTS {table_name}")

In [48]:
create_table_query = f"""
                     CREATE TABLE IF NOT EXISTS {table_name} (
                         station_name VARCHAR(100),
                         year INT,
                         month INT,
                         day INT,
                         hour INT,
                         user_type VARCHAR(20),
                         num_trips INT,
                         duration_min INT,
                         duration_median FLOAT,
                         duration_mean FLOAT,
                         duration_max INT,
                         station_type VARCHAR(10),
                         area_name TEXT,
                         station_id FLOAT,
                         physical_configuration TEXT,
                         lat FLOAT,
                         lon FLOAT,
                         altitude FLOAT,
                         address TEXT,
                         capacity INT,
                         physicalkey INT,
                         transitcard INT,
                         creditcard INT,
                         phone INT,
                         neigh_shape_area FLOAT,
                         neigh_shape_length FLOAT,
                         neigh_classification TEXT,
                         neigh_classification_code TEXT,
                         neigh_area_latitude FLOAT,
                         neigh_area_longitude FLOAT,
                         neigh_transit_stops INT,
                         neigh_colleges_univs INT,
                         neigh_cultural_attractions INT,
                         neigh_places_of_interest INT,
                         neigh_pop_2016 FLOAT,
                         neigh_youth_15_24 FLOAT,
                         neigh_work_age_25_54 FLOAT
                     )
                     """
_ = conn.execute(create_table_query)

In [49]:
_ = conn.execute(
    f"ALTER TABLE {table_name} ADD UNIQUE unique_index(station_name, year, month, day, hour, user_type, station_type)"
)

## Append Merged Data to Database

In [50]:
%%time
df_hour_by_station_merged.drop(
    columns=["geometry", "Shape__Area"]
).iloc[0:n_rows_to_append_to_db].to_sql(table_name, con=conn, index=False, if_exists="append")

CPU times: user 5.99 s, sys: 271 ms, total: 6.26 s
Wall time: 10.1 s


(NOT DONE HERE) Change the datatype for the neighbourhood stats columns to `INT`

In [51]:
%%time
for c in ["station_id", "neigh_pop_2016", "neigh_youth_15_24", "neigh_work_age_25_54"]:
    _ = conn.execute(f"ALTER TABLE {table_name} MODIFY {c} INTEGER")

CPU times: user 3.34 ms, sys: 442 µs, total: 3.78 ms
Wall time: 9.75 s


Query the data in the database

In [52]:
%%time
df_query = pd.read_sql(
    f"""
    SELECT *
    FROM {table_name}
    WHERE station_type = 'start'
    LIMIT 900000
    """,
    con=conn
)
df_query

CPU times: user 3.69 s, sys: 85.8 ms, total: 3.78 s
Wall time: 3.79 s


Unnamed: 0,station_name,year,month,day,hour,user_type,num_trips,duration_min,duration_median,duration_mean,...,neigh_classification_code,neigh_area_latitude,neigh_area_longitude,neigh_transit_stops,neigh_colleges_univs,neigh_cultural_attractions,neigh_places_of_interest,neigh_pop_2016,neigh_youth_15_24,neigh_work_age_25_54
0,Ontario Place Blvd / Lake Shore Blvd W (East),2021,5,30,16,Casual Member,79,300,1320.0,1740.76,...,0,43.6367,-79.4124,63,0,0,10,31180,2415,23320
1,Lake Shore Blvd W / Ontario Dr,2021,5,23,19,Casual Member,73,540,1500.0,2688.49,...,0,43.6367,-79.4124,63,0,0,10,31180,2415,23320
2,Lake Shore Blvd W / Ontario Dr,2021,5,24,19,Casual Member,72,60,1500.0,1632.50,...,0,43.6367,-79.4124,63,0,0,10,31180,2415,23320
3,Lake Shore Blvd W / Ontario Dr,2021,5,24,15,Casual Member,70,60,1560.0,1738.29,...,0,43.6367,-79.4124,63,0,0,10,31180,2415,23320
4,Lake Shore Blvd W / The Boulevard Club,2021,4,10,17,Casual Member,68,480,1260.0,1743.53,...,NIA,43.6367,-79.4393,33,0,1,1,21849,2275,12440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Princess St / Adelaide St E,2021,10,8,20,Annual Member,4,480,900.0,930.00,...,0,43.6565,-79.3673,61,0,0,3,20506,2060,12530
99996,Bay St / Wellesley St W,2021,3,7,11,Annual Member,4,600,1020.0,930.00,...,0,43.6575,-79.3857,93,0,0,18,25797,6860,13065
99997,Bay St / Wellesley St W,2021,10,19,7,Annual Member,4,240,720.0,8055.00,...,0,43.6575,-79.3857,93,0,0,18,25797,6860,13065
99998,College Park- Gerrard Entrance,2021,6,4,7,Annual Member,4,300,690.0,690.00,...,0,43.6575,-79.3857,93,0,0,18,25797,6860,13065


## Close MySQL Database Connection

In [53]:
conn.close()
engine.dispose()