# Generate Recommendations

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import warnings
from glob import glob
from itertools import product

import duckdb
import pandas as pd
from watermark import watermark

In [3]:
PROJ_ROOT = os.pardir
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path.append(src_dir)

In [4]:
%aimport file_utils
import file_utils as flut

%aimport pandas_utils
import pandas_utils as pu

## About

### Objective

The previously evaluated station performance, temporal trends and geospatial patterns are combined in order to make recommendations for

1. which stations to target as part of the campaign (using geospatial insights)
2. when stations should be prioritized (using temporal insights)

### Data

The following previously-created processed dataset is used to generate recommendations

1. identification of bike share stations as top-performers or not (processed)

### Assumptions

1. Same as in data retrieval and processing steps.

## User Inputs

In [5]:
# calendar
start_date = '2023-05-22 00:00:00'
end_date = '2023-12-31 23:00:00'
start_prime_date = '2023-05-01 00:00:00'
end_prime_date = '2023-09-30 23:00:00'

# export to disk
my_timezone = 'America/Toronto'

In [6]:
data_dir = os.path.join(PROJ_ROOT, 'data')
processed_data_dir = os.path.join(data_dir, 'processed')

fpath_top_stations = glob(
    os.path.join(processed_data_dir, 'stations_performance__*.parquet.gzip')
)[0]

# recommended performance filters
fpath_recommends_performance = glob(
    os.path.join(processed_data_dir, 'recommendations_performance__*.parquet.gzip')
)[0]

# recommended temporal filters
fpath_recommends_temporal = glob(
    os.path.join(processed_data_dir, 'recommendations_temporal__*.parquet.gzip')
)[0]

# recommended geospatial filters
fpath_recommends_geospatial = glob(
    os.path.join(processed_data_dir, 'recommendations_geospatial__*.parquet.gzip')
)[0]

In [7]:
def run_sql_query(query: str, verbose: bool=False) -> pd.DataFrame:
    """Run SQL query using DuckDB."""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        df_query = duckdb.sql(query).df()
    if verbose:
        print(f"Query returned {len(df_query):,} rows")
    return df_query

## Get Recommendation Filters

Load geospatial filters to generate recommendations for station locations to be used during the campaign

In [8]:
df_geo_recommends = pd.read_parquet(fpath_recommends_geospatial)
recommended_station_filters = df_geo_recommends.squeeze()

Load temporal filters to generate recommendations for an hourly calendar to be followed during the campaign

In [9]:
df_temporal_recommends = pd.read_parquet(fpath_recommends_temporal)
recommended_hour_filters = {
    idx: row.squeeze()
    for idx, row in df_temporal_recommends.transpose().iterrows()
}

Load combined temporal+performance filters to generate recommendations for stations to be used during the campaign

In [10]:
df_performance_recommends = pd.read_parquet(fpath_recommends_performance)
recommended_performance_filters = df_performance_recommends.squeeze()

## Generate Recommendations

Generate recommendations using the above filters by following the workflow below

1. Extract
   - get previously classified stations as top-performers or not
2. Transform
   - get recommended stations based on geospatial analytis and current station attributes and overall performance
     - apply filters to select the recommended stations based on the outcome of the two-part EDA performed in the two previous steps
     - calculate the [market penetration](https://www.wordhippo.com/what-is/another-word-for/market_penetration.html) of the recommded stations
       - this is the fraction of overall ridership from the previous full year (2022) that would have been captured by these stations
       - this is analogous to [market share](https://www.wordhippo.com/what-is/another-word-for/market_share.html)
   - get recommended hourly schedule based on temporal analysis and available hours during campaign execution window
     - create an calendar with daily recommended hours between May 1, 2023 and December 31, 2023
       - these two dates are chosen since one of the client's constraints in the project scope was that the campaign can be run between May 1, 2023 and December 31, 2023
     - calculate the fraction of these available hours during which it is recommended to display ads on bike share stations
4. Load
   - export recommendations to disk

### Extract

Show previously classified stations as top-performers or not

In [11]:
%%time
query = f"""
        SELECT *
        FROM read_parquet({[fpath_top_stations]})
        """
df_top_stations = run_sql_query(query).convert_dtypes()
with pd.option_context('display.max_columns', None):
    pu.show_df(df_top_stations)

column,station_id,name,physical_configuration,capacity,is_charging_station,credit,Neighbourhood,Location,census_tract_id,is_active,departures_last_year,arrivals_last_year,departures_last_n_years,arrivals_last_n_years,is_downtown,rank_deps_last_year,rank_deps_last_n_years,rank_arrs_last_year,rank_arrs_last_n_years,is_top_perform_station,departures_weekday_last_year,arrivals_weekday_last_year,departures_weekday_last_n_years,arrivals_weekday_last_n_years,rank_weekday_deps_last_year,rank_weekday_deps_last_n_years,rank_weekday_arrs_last_year,rank_weekday_arrs_last_n_years,is_top_perform_station_weekday,departures_weekend_last_year,arrivals_weekend_last_year,departures_weekend_last_n_years,arrivals_weekend_last_n_years,rank_weekend_deps_last_year,rank_weekend_deps_last_n_years,rank_weekend_arrs_last_year,rank_weekend_arrs_last_n_years,is_top_perform_station_weekend
dtype,Int64,string[python],string[python],Int64,boolean,boolean,string[python],string[python],string[python],boolean,Int64,Int64,Int64,Int64,boolean,Int64,Int64,Int64,Int64,boolean,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,boolean,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,boolean
nunique,627,627,6,39,2,2,83,4,214,1,611,603,616,618,2,611,616,603,618,2,606,594,614,609,606,614,594,609,2,570,576,601,598,570,601,576,598,2
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0
0,7076,York St / Queens Quay W,REGULAR,57,False,True,Harbourfront-CityPlace,Downtown,5350012.04,True,38593,43978,142416,158202,True,1,1,1,1,True,24425,27958,92962,103596,2,1,1,1,True,14168,16020,49454,54606,1,2,1,1,True
1,7016,Bay St / Queens Quay W (Ferry Terminal),REGULAR,35,False,True,St Lawrence-East Bayfront-The Islands,Downtown,5350013.02,True,27451,28291,109448,117385,True,3,2,5,2,True,16903,17456,69306,74888,13,4,13,4,True,10548,10835,40142,42497,5,5,5,5,True
2,7033,Union Station,REGULAR,43,False,True,St Lawrence-East Bayfront-The Islands,Downtown,5350013.01,True,23752,32254,85757,110893,True,14,13,2,3,True,17726,24900,68404,90677,6,5,2,2,True,6026,7354,17353,20216,35,46,20,33,True
3,7175,HTO Park (Queens Quay W),REGULAR,27,False,True,Harbourfront-CityPlace,Downtown,5350012.04,True,25078,29431,98277,110699,True,9,6,4,4,True,13872,15490,56437,63065,31,14,20,9,True,11206,13941,41840,47634,3,4,2,3,True
4,7203,Bathurst St/Queens Quay(Billy Bishop Airport),REGULAR,35,False,True,Fort York-Liberty Village,West of Downtown,5350008.02,True,22130,25285,100580,109788,False,23,5,15,5,True,13160,15270,61724,68428,37,9,21,6,True,8970,10015,38856,41360,8,6,8,6,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622,7156,Salem Ave / Bloor St W,REGULAR,15,False,True,Dovercourt Village,Others,5350096.02,True,5364,5230,20884,20181,False,276,241,282,244,False,3709,3656,14688,14325,279,237,282,242,False,1655,1574,6196,5856,276,232,282,246,False
623,7600,Ursula Franklin St / Huron St - SMART,SMARTMAPFRAME,20,False,False,University,Downtown,5350061.00,True,7406,6742,11968,11152,True,233,334,238,340,False,6453,6002,10363,9898,177,296,192,292,False,953,740,1605,1254,390,467,423,486,False
624,7414,Keele St / Annette St,REGULAR,15,False,True,Junction Area,Others,5350101.00,True,2172,2029,6730,6533,False,446,428,445,421,False,1478,1342,4475,4250,438,428,448,426,False,694,687,2255,2283,442,412,431,401,False
625,7622,Marie Curtis Park,REGULAR,23,False,True,Long Branch,Others,5350206.01,True,1588,1612,4509,4633,False,489,473,472,462,False,847,856,2261,2286,514,498,494,491,False,741,756,2248,2347,434,413,420,398,False


CPU times: user 43.2 ms, sys: 0 ns, total: 43.2 ms
Wall time: 42.5 ms


### Transform

#### Get Recommended Stations

Filter station performance data to identify the recommended stations using the following workflow

1. get recommended stations and append a `is_recommended` column indicating `True`
2. combine all stations with recommended stations and fill missing values in the `is_recommended` column with `False`
3. calculate the market penetration metric on a cumulative basis
   - this metric is defined as the fraction of ridership during last full year that is captured by stations recommended in step 1. above

In [12]:
%%time
query = f"""
        -- 1. get recommended stations
        WITH t1 AS (
            SELECT *,
                   True AS is_recommended
            FROM df_top_stations
            WHERE {recommended_station_filters}
            AND {recommended_performance_filters}
        ),
        -- 2. combine with all stations and fill missing values with FALSE in order to identify
        -- unrecommended stations
        t2 AS (
            SELECT * EXCLUDE(is_recommended),
                   -- fill missing values with FALSE to indicate station is not recommended
                   COALESCE(is_recommended, NULL, FALSE) AS is_recommended
            FROM df_top_stations
            LEFT JOIN (
                SELECT station_id,
                       name,
                       is_recommended
                FROM t1
            ) USING (station_id, name)
        ),
        -- 3. append market penetration column (cumulative)
        t3 AS (
            SELECT *,
                   -- append running total of departures (trips) during last year
                   SUM(departures_last_year) OVER(ORDER BY is_recommended DESC, departures_last_year DESC) AS trips_last_year_cumsum,
                   -- append running fraction of departures (trips) during last year
                   SUM(departures_last_year) OVER() AS total_trips_last_year,
                   -- calculate market penetration metric
                   100* trips_last_year_cumsum/total_trips_last_year AS frac_trips_last_year
            FROM t2
        )
        SELECT *
        FROM t3
        """
df_top_stations_with_recommends = run_sql_query(query).convert_dtypes()
with pd.option_context('display.max_columns', None):
    pu.show_df(df_top_stations_with_recommends)

column,station_id,name,physical_configuration,capacity,is_charging_station,credit,Neighbourhood,Location,census_tract_id,is_active,departures_last_year,arrivals_last_year,departures_last_n_years,arrivals_last_n_years,is_downtown,rank_deps_last_year,rank_deps_last_n_years,rank_arrs_last_year,rank_arrs_last_n_years,is_top_perform_station,departures_weekday_last_year,arrivals_weekday_last_year,departures_weekday_last_n_years,arrivals_weekday_last_n_years,rank_weekday_deps_last_year,rank_weekday_deps_last_n_years,rank_weekday_arrs_last_year,rank_weekday_arrs_last_n_years,is_top_perform_station_weekday,departures_weekend_last_year,arrivals_weekend_last_year,departures_weekend_last_n_years,arrivals_weekend_last_n_years,rank_weekend_deps_last_year,rank_weekend_deps_last_n_years,rank_weekend_arrs_last_year,rank_weekend_arrs_last_n_years,is_top_perform_station_weekend,is_recommended,trips_last_year_cumsum,total_trips_last_year,frac_trips_last_year
dtype,Int64,string[python],string[python],Int64,boolean,boolean,string[python],string[python],string[python],boolean,Int64,Int64,Int64,Int64,boolean,Int64,Int64,Int64,Int64,boolean,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,boolean,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,boolean,boolean,Int64,Int64,Float64
nunique,627,627,6,39,2,2,83,4,214,1,611,603,616,618,2,611,616,603,618,2,606,594,614,609,606,614,594,609,2,570,576,601,598,570,601,576,598,2,2,611,1,611
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7076,York St / Queens Quay W,REGULAR,57,False,True,Harbourfront-CityPlace,Downtown,5350012.04,True,38593,43978,142416,158202,True,1,1,1,1,True,24425,27958,92962,103596,2,1,1,1,True,14168,16020,49454,54606,1,2,1,1,True,True,38593,4216239,0.915342
1,7006,Bay St / College St (East Side),REGULAR,19,False,True,Yonge-Bay Corridor,Downtown,5350035.00,True,32021,32104,100853,101141,True,2,4,3,7,True,24624,24582,78925,78918,1,2,3,3,True,7397,7522,21928,22223,15,26,18,27,True,True,70614,4216239,1.67481
2,7016,Bay St / Queens Quay W (Ferry Terminal),REGULAR,35,False,True,St Lawrence-East Bayfront-The Islands,Downtown,5350013.02,True,27451,28291,109448,117385,True,3,2,5,2,True,16903,17456,69306,74888,13,4,13,4,True,10548,10835,40142,42497,5,5,5,5,True,True,98065,4216239,2.325888
3,7030,Bay St / Wellesley St W,REGULAR,51,False,True,Bay-Cloverhill,Downtown,5350062.03,True,25864,18196,92838,64322,True,4,8,40,31,True,19108,13467,72224,49228,3,3,37,30,True,6756,4729,20614,15094,21,31,63,66,True,True,123929,4216239,2.939326
4,7322,East Liberty St / Western Battery Rd,REGULAR,18,False,True,Fort York-Liberty Village,West of Downtown,5350008.01,True,25664,26207,48222,50230,False,5,81,9,77,True,18214,18566,34725,36536,4,79,6,74,True,7450,7641,13497,13694,14,86,16,82,True,True,149593,4216239,3.54802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622,7736,Birchmount Rd / Kingston Rd,VAULT,15,False,False,Birchcliffe-Cliffside,Others,5350338.00,True,7,8,7,8,False,623,623,622,622,False,1,4,1,4,625,625,624,624,False,6,4,6,4,617,617,617,617,False,False,4216225,4216239,99.999668
623,7755,Park Lawn Rd / Berry Rd,VAULT,15,False,False,Stonegate-Queensway,Others,5350216.00,True,6,8,6,8,False,624,624,622,622,False,4,6,4,6,622,622,622,622,False,2,2,2,2,621,621,622,622,False,False,4216237,4216239,99.999953
624,7759,The Pond Rd / Shoreham Dr,VAULT,15,False,False,York University Heights,Others,5350311.06,True,6,7,6,7,False,624,624,625,625,False,4,5,4,5,622,622,623,623,False,2,2,2,2,621,621,622,622,False,False,4216237,4216239,99.999953
625,7758,Driftwood Ave / Finch Corridor Trail,VAULT,18,False,False,Black Creek,Others,5350316.05,True,2,2,2,2,False,626,626,626,626,False,1,1,1,1,625,625,625,625,False,1,1,1,1,625,625,624,624,False,False,4216239,4216239,100.0


CPU times: user 62.5 ms, sys: 5.32 ms, total: 67.8 ms
Wall time: 62.9 ms


**Notes**

1. The largest value in the `total_trips_last_year` column is less than the total 2022 bike share ridership across the network (see `02_process_data.ipynb`) since the above only shows the currently active stations. Ridership at the stations that are currently inactive is not included in this value.

#### Get Recommended Schedule

Create hourly calendar from May 1, 2023 to December 31, 2023

In [13]:
%%time
dict_in = dict(
    user_type=['Annual', 'Casual'],
    datetime=pd.date_range(start_date, end_date, freq='h'),
)
cart = list(product(*dict_in.values()))
df_calendar = pd.DataFrame(cart, columns=dict_in.keys())
pu.show_df(df_calendar)

column,user_type,datetime
dtype,object,datetime64[ns]
nunique,2,5376
missing,0,0
0,Annual,2023-05-22 00:00:00
1,Annual,2023-05-22 01:00:00
2,Annual,2023-05-22 02:00:00
3,Annual,2023-05-22 03:00:00
4,Annual,2023-05-22 04:00:00
...,...,...
10747,Casual,2023-12-31 19:00:00
10748,Casual,2023-12-31 20:00:00
10749,Casual,2023-12-31 21:00:00
10750,Casual,2023-12-31 22:00:00


CPU times: user 17 ms, sys: 1.03 ms, total: 18.1 ms
Wall time: 17.6 ms


Filter station performance data to identify the recommended stations using the following workflow

1. get recommended weekday hourly schedule during prime bike share season and append a `is_recommended` column indicating `True`
2. get recommended weekend hourly schedule during prime bike share season and append a `is_recommended` column indicating `True`
3. get recommended weekday hourly schedule during the off-season and append a `is_recommended` column indicating `True`
4. combine recommended hourly schedules
5. combine full hourly calendar with recommended hourly schedule and fill missing values in the `is_recommended` column with `False`
6. calculate the activated fraction of available hours metric on a cumulative basis
   - this metric is defined as the fraction of all possible upcoming hours during the candidate months for running the campaign which are recommended for displaying ads in step 4. above

In [14]:
%%time
query = f"""
        -- 1. get weekday recommendations during prime bike share season
        WITH t1 AS (
            SELECT *,
                   MONTHNAME(datetime) AS month,
                   DAYNAME(datetime) AS day_of_week,
                   HOUR(datetime) AS hour,
                   True AS is_recommended
            FROM df_calendar
            WHERE {recommended_hour_filters['weekday_prime']}
        ),
        -- 2. get weekend recommendations during prime bike share season
        t2 AS (
            SELECT *,
                   MONTHNAME(datetime) AS month,
                   DAYNAME(datetime) AS day_of_week,
                   HOUR(datetime) AS hour,
                   True AS is_recommended
            FROM df_calendar
            WHERE {recommended_hour_filters['weekend_prime']}
        ),
        -- 3. get weekday recommendations during off-season
        t3 AS (
            SELECT *,
                   MONTHNAME(datetime) AS month,
                   DAYNAME(datetime) AS day_of_week,
                   HOUR(datetime) AS hour,
                   True AS is_recommended
            FROM df_calendar
            WHERE {recommended_hour_filters['weekday_offseason']}
        ),
        -- 4. combine recommended schedules
        t4 AS (
            SELECT * EXCLUDE(month, day_of_week, hour)
            FROM (
                SELECT *
                FROM t1
                UNION
                SELECT *
                FROM t2
                UNION
                SELECT *
                FROM t3
            )
            ORDER BY datetime
        ),
        -- 5. combine with full schedule and fill missing values with FALSE in order to identify
        -- unrecommended hours
        t5 AS (
            SELECT * EXCLUDE(is_recommended),
                   MONTHNAME(datetime) AS month,
                   DAYNAME(datetime) AS day_of_week,
                   HOUR(datetime) AS hour,
                   -- append whether datetime is on a weekday or weekend
                   (
                       CASE WHEN day_of_week IN ('Saturday', 'Sunday')
                       THEN 'Weekend'
                       ELSE 'Weekday'
                       END
                   ) AS type_of_day,
                   -- append whether datetime is during prime bike-share season or off-Season
                   (
                       CASE
                           WHEN datetime >= '{start_prime_date}'
                           AND datetime <= '{end_prime_date}'
                           THEN 'Prime Bike Share Season'
                       ELSE
                       'Off-Season'
                       END
                   ) AS season,
                   -- fill missing values with FALSE to indicate hour is not recommended
                   COALESCE(is_recommended, NULL, FALSE) AS is_recommended
            FROM df_calendar
            LEFT JOIN t4 USING (datetime, user_type)
        ),
        -- 6. append fractional activated hours column (cumulative)
        t6 AS (
            SELECT *,
                   -- append running total of hours during which ads are recommended for display
                   SUM(CAST(is_recommended AS INTEGER)) OVER(PARTITION BY user_type ORDER BY datetime) AS hours_cumsum,
                   -- append running fraction of hours during which ads are recommended for display
                   COUNT(CAST(is_recommended AS INTEGER)) OVER(PARTITION BY user_type) AS total_hours,
                   -- calculate recommended hours metric
                   100* hours_cumsum/total_hours AS frac_recommended_hours
            FROM t5
            ORDER BY datetime, user_type
        )
        SELECT *
        FROM t6
        """
df_calendar_recommended = run_sql_query(query).convert_dtypes()
with pd.option_context('display.max_columns', None):
    pu.show_df(df_calendar_recommended)

column,user_type,datetime,month,day_of_week,hour,type_of_day,season,is_recommended,hours_cumsum,total_hours,frac_recommended_hours
dtype,string[python],datetime64[ns],string[python],string[python],Int64,string[python],string[python],boolean,Int64,Int64,Float64
nunique,2,5376,8,7,24,2,2,2,1225,1,1225
missing,0,0,0,0,0,0,0,0,0,0,0
0,Annual,2023-05-22 00:00:00,May,Monday,0,Weekday,Prime Bike Share Season,False,0,5376,0.0
1,Casual,2023-05-22 00:00:00,May,Monday,0,Weekday,Prime Bike Share Season,False,0,5376,0.0
2,Annual,2023-05-22 01:00:00,May,Monday,1,Weekday,Prime Bike Share Season,False,0,5376,0.0
3,Casual,2023-05-22 01:00:00,May,Monday,1,Weekday,Prime Bike Share Season,False,0,5376,0.0
4,Annual,2023-05-22 02:00:00,May,Monday,2,Weekday,Prime Bike Share Season,False,0,5376,0.0
...,...,...,...,...,...,...,...,...,...,...,...
10747,Casual,2023-12-31 21:00:00,December,Sunday,21,Weekend,Off-Season,False,1224,5376,22.767857
10748,Annual,2023-12-31 22:00:00,December,Sunday,22,Weekend,Off-Season,False,948,5376,17.633929
10749,Casual,2023-12-31 22:00:00,December,Sunday,22,Weekend,Off-Season,False,1224,5376,22.767857
10750,Annual,2023-12-31 23:00:00,December,Sunday,23,Weekend,Off-Season,False,948,5376,17.633929


CPU times: user 89.3 ms, sys: 34.5 ms, total: 124 ms
Wall time: 53 ms


The two metrics are summarized below

1. market penetration of the recommended stations, in terms of 2022 bike share ridership
2. fraction of available upcoming hours between May 1, 2023 and December 31, 2023 that are recommended for displaying ads on the faces of bike share stations

In [15]:
num_is_top, num_stations_used, num_recommends = [
    df_top_stations_with_recommends['is_top_perform_station'].sum(),
    len(df_top_stations_with_recommends),
    df_top_stations_with_recommends['is_recommended'].sum(),
]
msg = (
    f'overall performance insights from {num_is_top} top-performing out of '
    f'{num_stations_used} stations used between 2018 and 2022'
)
query = f"""
        -- 1. summarize market penetration of 2022 ridership by recommended stations
        WITH t1 AS (
            SELECT 'Market Penetration (%)' AS metric,
                   'Fraction of bike share ridership accounted for by recommended stations' AS description,
                   'recommendation' AS metric_type,
                   'geospatial insights and current station attributes' AS data_used,
                   make_date(2022, 1, 1) AS start_date,
                   make_date(2022, 12, 31) AS end_date,
                   'historical' AS timeframe,
                   SUM(departures_last_year) AS total_recommended,
                   {df_top_stations['departures_last_year'].sum()} AS total,
                   MAX(frac_trips_last_year) AS value
            FROM df_top_stations_with_recommends
            WHERE is_recommended = True
        ),
        -- 2. summarize recommended fraction of available hours to activate during 2023
        t2 AS (
            SELECT 'Fraction of activated hours (%)' AS metric,
                   'Fraction of available campaign hours during which it is recommended to display ads' AS description,
                   'recommendation' AS metric_type,
                   'temporal insights and available upcoming calendar' AS data_used,
                   make_date(2023, 5, 1) AS start_date,
                   make_date(2023, 12, 31) AS end_date,
                   'future' AS timeframe,
                   MAX(hours_cumsum) AS total_recommended,
                   MAX(total_hours) AS total,
                   MAX(frac_recommended_hours) AS value
            FROM df_calendar_recommended
            WHERE is_recommended = True
        ),
        t3 AS (
            SELECT 'Fraction of stations used that are recommended (%)' AS metric,
                   'Fraction of bike share stations accounted that are recommended' AS description,
                   'recommendation' AS metric_type,
                   '{msg}' AS data_used,
                   make_date(2023, 5, 1) AS start_date,
                   make_date(2023, 12, 31) AS end_date,
                   'historical' AS timeframe,
                   {num_recommends} AS total_recommended,
                   {num_stations_used} AS total,
                   100*{num_recommends}/{num_stations_used} AS value
            FROM df_top_stations_with_recommends
        )
        SELECT *
        FROM (
            SELECT *
            FROM t1
            UNION
            SELECT *
            FROM t2
            UNION
            SELECT *
            FROM t3
        )
        ORDER BY start_date
        """
df_recommendation_summary = run_sql_query(query).convert_dtypes()
with pd.option_context('display.max_colwidth', None):
    pu.show_df(df_recommendation_summary)

column,metric,description,metric_type,data_used,start_date,end_date,timeframe,total_recommended,total,value
dtype,string[python],string[python],string[python],string[python],datetime64[us],datetime64[us],string[python],Int64,Int64,Float64
nunique,3,3,1,3,2,2,2,3,3,3
missing,0,0,0,0,0,0,0,0,0,0
0,Market Penetration (%),Fraction of bike share ridership accounted for by recommended stations,recommendation,geospatial insights and current station attributes,2022-01-01,2022-12-31,historical,1485958,4216239,35.243685
1,Fraction of activated hours (%),Fraction of available campaign hours during which it is recommended to display ads,recommendation,temporal insights and available upcoming calendar,2023-05-01,2023-12-31,future,1224,5376,22.767857
2,Fraction of stations used that are recommended (%),Fraction of bike share stations accounted that are recommended,recommendation,overall performance insights from 98 top-performing out of 627 stations used between 2018 and 2022,2023-05-01,2023-12-31,historical,78,627,12.440191


**Observations**

1. 78 recommended and top-performing stations captured a 35% market penetration of total bike share ridership in 2022 across the Bike Share Toronto network. The remaining ~550 stations are required to capture the remaining 65%.
2. The recommended hourly schedule calls for ads to be displayed during approximately 23% of all available hours during the combined campaign execution window of May 22 to December 31 of 2023.
3. ~12% of the top-performing stations used in 2018 to 2022 ridership are recommended for displaying digital ads.

### Load

Export recommended stations to disk

In [16]:
%%time
fname_prefix = "recommended_stations"
_ = df_top_stations_with_recommends.pipe(
    flut.load,
    processed_data_dir,
    fname_prefix,
    my_timezone,
    True,
)

Exported 627 rows of recommended_stations data to /home/jovyan/data/processed/recommended_stations__20240321_133049.parquet.gzip
CPU times: user 24.4 ms, sys: 4.48 ms, total: 28.8 ms
Wall time: 28.7 ms


Export recommended schedule to disk

In [17]:
%%time
fname_prefix = "recommended_schedule"
_ = df_calendar_recommended.pipe(
    flut.load,
    processed_data_dir,
    fname_prefix,
    my_timezone,
    True,
)

Exported 10,752 rows of recommended_schedule data to /home/jovyan/data/processed/recommended_schedule__20240321_133049.parquet.gzip
CPU times: user 12.1 ms, sys: 4.14 ms, total: 16.2 ms
Wall time: 16 ms


## Discussion

### Conclusions

1. This step has generated station and schedule recommendations that can be used by the client to recommend
   - which stations that should be targeted during the campaign
   - when the targeted stations should be prioritized

## Summary of Assumptions

1. Same as in data retrieval and processing step.

## Version Information

In [18]:
packages = [
    'pandas',
    'pyarrow',
    'duckdb',
]
print(
    watermark(
        updated=True,
        current_date=True,
        current_time=True,
        timezone=True,
        custom_time="%Y-%m-%d %H:%M:%S %Z",
        python=True,
        machine=True,
        packages=','.join(packages),
    )
)

Last updated: 2024-03-21 17:30:49 UTC

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

pandas : 2.2.1
pyarrow: 15.0.1
duckdb : 0.10.0

Compiler    : GCC 12.3.0
OS          : Linux
Release     : 6.6.10-76060610-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 12
Architecture: 64bit

