# Data Pipeline to Get Data

In [1]:
# Load the lab_black extension (auto-formats notebook code cells with Black).
%load_ext lab_black
# Load autoreload and set mode 2 so imported modules are reloaded before each
# cell executes; edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from prefect import Flow

## About

This notebook defines and runs a Prefect flow (a data pipeline) that assembles the data used in the dashboard.

## User Inputs

In [3]:
# --- Remote endpoint ----------------------------------------------------------
# Passed, together with one of the `*_params` dicts below, to most of the
# `dpu.get_*` tasks in the pipeline definition.
open_tor_data_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"

# --- Dataset identifiers (one `id` per remote dataset) ------------------------
stations_params = {"id": "2b44db0d-eea9-442d-b038-79335368ad5a"}
neigh_boundary_params = {"id": "4def3f65-2a65-4a4f-83c4-b2a4aed72d46"}
neigh_profile_params = {"id": "6e19a90f-971c-46b3-852c-0c48c436d1fc"}
pt_params = {"id": "7795b45e-e65a-4465-81fc-c36b9dfff169"}
poi_params = {"id": "965247c0-c72e-49b4-bb1a-879cf98e1a32"}
ch_params = {"id": "c7be2ee7-d317-4a28-8cbe-bff1ce116b46"}

# --- Local trips data ---------------------------------------------------------
# Glob pattern handed to the trips-loading task.
trips_data_glob_str = "data/raw/*.csv"
# Column lists handed to the trips-loading task alongside the glob pattern
# (presumably used for NaN handling and de-duplication -- confirm in dpu).
trips_nan_cols = [
    "START_STATION_ID", "END_STATION_ID",
    "START_STATION_NAME", "END_STATION_NAME",
]
trips_duplicated_cols = ["TRIP_ID", "START_TIME", "END_TIME"]

# --- Column selections --------------------------------------------------------
# Columns requested from the stations metadata.
stations_cols_wanted = [
    "station_id", "name", "physical_configuration",
    "lat", "lon", "altitude",
    "address", "capacity",
    "physicalkey", "transitcard", "creditcard", "phone",
]
# Columns requested from the neighbourhood boundary data.
neigh_cols_to_show = [
    "AREA_ID", "AREA_SHORT_CODE", "AREA_LONG_CODE", "AREA_NAME",
    "Shape__Area", "Shape__Length",
    "LATITUDE", "AREA_LATITUDE",
    "LONGITUDE", "AREA_LONGITUDE",
    "geometry",
]
# Columns passed to the step that combines trips with neighbourhood data.
cols = ["STATION_NAME", "year", "month", "day", "hour"]

# --- Export to staged CSV files -----------------------------------------------
nrows_per_staged_csv_file = 350_000
cols_to_export = [
    "STATION_NAME", "YEAR", "MONTH", "DAY", "HOUR",
    "USER_TYPE", "NUM_TRIPS", "DURATION_MEAN",
    "AREA_NAME",
    "PHYSICAL_CONFIGURATION", "CAPACITY",
    "PHYSICALKEY", "TRANSITCARD", "CREDITCARD", "PHONE",
    "NEIGH_TRANSIT_STOPS", "NEIGH_COLLEGES_UNIVS",
    "NEIGH_CULTURAL_ATTRACTIONS", "NEIGH_PLACES_OF_INTEREST",
]

In [4]:
%aimport src.data_pipe_utils
import src.data_pipe_utils as dpu



## Data Pipeline

### Define Pipeline

In [7]:
# Define the pipeline as a Prefect flow. The names bound inside this block are
# used later as keys into `state.result` to look up each step's output, so
# they should not be renamed without updating the inspection cell as well.
# NOTE(review): the `dpu.*` callables are assumed to be Prefect tasks (the run
# log reports them as tasks) -- confirm in src/data_pipe_utils.py.
with Flow("My Functional Flow") as flow:
    # --- Extract: bike share stations and trips ---
    df_stations = dpu.get_bikeshare_stations_metadata(
        open_tor_data_url,
        stations_params,
        stations_cols_wanted,
    )
    df = dpu.get_bikeshare_trips_data(
        trips_data_glob_str,
        trips_nan_cols,
        trips_duplicated_cols,
    )
    # --- Extract: citywide datasets ---
    dfch_essentials = dpu.get_city_cultural_hotspots_data(open_tor_data_url, ch_params)
    df_poi = dpu.get_city_points_of_interest_data(open_tor_data_url, poi_params)
    gdf = dpu.get_city_neighbourhood_boundary_data(
        open_tor_data_url,
        neigh_boundary_params,
        neigh_cols_to_show,
    )
    df_pt_slice = dpu.get_city_public_transit_locations_data(
        open_tor_data_url, pt_params
    )
    # Takes no arguments, unlike the other loaders in this flow.
    df_coll_univ = dpu.get_city_college_university_locations_data()
    df_neigh_demog = dpu.get_neighbourhood_profile_data(
        open_tor_data_url, neigh_profile_params
    )

    # --- Aggregate: combine the boundary data with the other datasets ---
    # Six outputs are unpacked from a single call.
    # NOTE(review): unpacking a task result inside a flow presumably requires
    # the task to declare its number of outputs -- confirm in dpu.
    (
        df_poi_new,
        dfch_essentials_new,
        df_coll_univ_new,
        df_pt_slice_new,
        df_neigh_stats,
        df_stations_new,
    ) = dpu.aggregate_data(
        gdf,
        df_poi,
        dfch_essentials,
        df_coll_univ,
        df_pt_slice,
        df_neigh_demog,
        df_stations,
    )

    # --- Combine: trips with the aggregated stations output ---
    df_hour_by_station_merged = dpu.combine_trips_neighbourhood_data(
        df, cols, df_stations_new
    )

    # --- Export: write the combined data out (return value not captured) ---
    dpu.export_aggregated_data_multiple_csvs(
        df_hour_by_station_merged,
        cols_to_export,
        nrows_per_staged_csv_file,
    )

### Run Pipeline

In [8]:
%%time
state = flow.run()

[2022-04-10 12:11:23-0400] INFO - prefect.FlowRunner | Beginning Flow run for 'My Functional Flow'
[2022-04-10 12:11:23-0400] INFO - prefect.TaskRunner | Task 'get_city_cultural_hotspots_data': Starting task run...
[2022-04-10 12:11:24-0400] INFO - prefect.get_city_cultural_hotspots_data | Retrieved 470 rows of citywide cultural hotspot data.
[2022-04-10 12:11:24-0400] INFO - prefect.TaskRunner | Task 'get_city_cultural_hotspots_data': Finished task run for task with final state: 'Success'
[2022-04-10 12:11:24-0400] INFO - prefect.TaskRunner | Task 'get_city_points_of_interest_data': Starting task run...
[2022-04-10 12:11:24-0400] INFO - prefect.get_city_points_of_interest_data | Retrieved 174 rows of citywide points-of-interest data.
[2022-04-10 12:11:24-0400] INFO - prefect.TaskRunner | Task 'get_city_points_of_interest_data': Finished task run for task with final state: 'Success'
[2022-04-10 12:11:24-0400] INFO - prefect.TaskRunner | Task 'get_neighbourhood_profile_data': Starting t

In [None]:
%%time
# print(state.result[gdf].shape)
# display(state.result[gdf].result.describe())
display(state.result[df_neigh_demog].result.describe())
# display(state.result[df_poi_new].result.describe())
# display(state.result[dfch_essentials_new].result.describe())
# display(state.result[df_coll_univ_new].result.describe())
# display(state.result[df_pt_slice_new].result.describe())
with pd.option_context('display.max_columns', 100):
    display(state.result[df_neigh_stats].result.describe())
    display(state.result[df_stations_new].result.describe())
    display(state.result[df_hour_by_station_merged].result.describe())