## Generate Static Graphs

These are the input parameters for the notebook. They will be automatically changed when the scripts to generate monthly statistics are run. You can modify them manually to generate multiple plots locally as well.

Pass in `None` to remove the filters and plot all data. This is not recommended for production settings, but might be useful for reports based on data snapshots.

In [None]:
# Notebook parameters. Per the markdown above, the monthly-statistics scripts
# overwrite these values automatically; edit them by hand for local runs.
# year, month, program, study_type and include_test_users are all forwarded to
# scaffolding.load_viz_notebook_data further down.
year = 2020
month = 11
program = "default"
# Any value other than "program" makes the notebook stop right after the imports.
study_type = "program"
# Compared against the `mode_confirm` column to select the trips that are plotted.
mode_of_interest = "e-bike"
include_test_users = False

In [None]:
from collections import defaultdict
import datetime

import numpy as np
import pandas as pd

from plots import *
import scaffolding

# `sns` is not imported explicitly in this cell; presumably it is provided by
# `from plots import *` — verify.
# NOTE(review): sns.set() applies seaborn's default theme, which presumably resets
# the "whitegrid" style selected on the line before it — confirm the intended order.
sns.set_style("whitegrid")
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# This notebook only applies to programs. Raising here makes nbclient stop at this
# cell, so none of the cells below are executed for other study types.
if study_type != "program":
    # Install the scaffolding traceback handler on the running shell before raising
    # (presumably so that the deliberate failure is reported without a traceback).
    get_ipython()._showtraceback = scaffolding.no_traceback_handler
    raise Exception("The plots in this notebook are only relevant to programs")

In [None]:
# Loading mapping dictionaries from mapping_dictionaries notebook
%store -r dic_re
%store -r dic_pur

# convert a dictionary to a defaultdict
dic_re = defaultdict(lambda: 'Other',dic_re)
dic_pur = defaultdict(lambda: 'Other',dic_pur)

## Collect Data From Database

In [None]:
# Load the notebook inputs through scaffolding. The four returned values (trip table,
# output file suffix, quality text and debug table) are used by the cells below.
expanded_ct, file_suffix, quality_text, debug_df = scaffolding.load_viz_notebook_data(
    year,
    month,
    program,
    study_type,
    dic_re,
    dic_pur=dic_pur,
    include_test_users=include_test_users,
)

In [None]:
# Record, in the debug table, how many trips were confirmed as the mode of interest and
# the "Replaced_mode" label count that scaffolding reports for those trips.
if 'mode_confirm' in expanded_ct.columns:
    # Use a boolean mask rather than a string-built query so that a mode name
    # containing quote characters cannot break the expression.
    mode_of_interest_df = expanded_ct[expanded_ct['mode_confirm'] == mode_of_interest]
    debug_df.loc[f"{mode_of_interest}_trips"] = len(mode_of_interest_df)
    debug_df.loc[f"{mode_of_interest}_trips_with_replaced_mode"] = scaffolding.trip_label_count("Replaced_mode", mode_of_interest_df)

## Data Preprocessing

In [None]:
# Reduce the trip table to the columns needed for daily aggregation and collapse the
# separate year/month/day columns into a single `date_time` column.
sel_cols_no_label_dep = ['user_id','start_local_dt_year','start_local_dt_month','start_local_dt_day','distance_miles']
sel_cols_with_label_dep = sel_cols_no_label_dep + ['mode_confirm']

if len(expanded_ct) == 0:
    # No trips at all: keep an (empty) copy so that the cell still has something to show
    data = expanded_ct.copy()
else:
    # `mode_confirm` is only selected when the trips actually carry that label
    if "mode_confirm" in expanded_ct.columns:
        data = expanded_ct[sel_cols_with_label_dep].copy()
    else:
        data = expanded_ct[sel_cols_no_label_dep].copy()

    data = data.rename(columns={
        'start_local_dt_year': 'year',
        'start_local_dt_month': 'month',
        'start_local_dt_day': 'day',
    })
    data['date_time'] = pd.to_datetime(data[['year','month','day']])
    data = data.drop(columns=['year','month','day'])

    # Categorical type will include all days/modes in groupby even if there is no data for a particular tabulation
    data['user_id'] = pd.Categorical(data['user_id'])
    data['date_time'] = pd.Categorical(data['date_time'])

data.head()

In [None]:
# Build the per-day tables used by every plot below. Nothing is defined when there are
# no trips, and the mode tables are only defined when trips carry `mode_confirm`.
if len(expanded_ct) > 0:
    # Get the count of unique users that were active on each given date
    # observed=False is spelled out because the tabulations below rely on categorical
    # groupers producing a row for every category combination.
    active_users = pd.DataFrame(data.groupby(['date_time'], as_index=False, observed=False)['user_id'].nunique())
    active_users.rename(columns={'user_id':'active_users'}, inplace=True)

    if "mode_confirm" in expanded_ct.columns:
        # Count the number of trips for each confirmed mode
        mode_counts = data.groupby(['user_id','date_time','mode_confirm'], as_index=False, observed=False).size()
        mode_counts.rename(columns={'size':'trip_count'}, inplace=True)

        # Sum daily distance traveled for each mode (the summed column keeps its
        # `distance_miles` name, so no rename is needed)
        mode_distance = data.groupby(['user_id','date_time','mode_confirm'], as_index=False, observed=False)[['distance_miles']].sum()
        mode_distance['distance_miles'] = mode_distance['distance_miles'].fillna(0)

        # Add 7-day rolling avg smoothing to better see trends
        # `transform` returns a result aligned with the original rows; `apply` prepends the
        # group keys to the index on newer pandas, which breaks the column assignment.
        mode_counts['trip_count_smooth'] = (
            mode_counts.groupby(['user_id','mode_confirm'], observed=False)['trip_count']
            .transform(lambda x: x.rolling(7,1).mean())
        )
        mode_distance['distance_miles_smooth'] = (
            mode_distance.groupby(['user_id','mode_confirm'], observed=False)['distance_miles']
            .transform(lambda x: x.rolling(7,1).mean())
        )

        # This is the mode specific part
        mode_counts_interest = mode_counts[mode_counts['mode_confirm']==mode_of_interest].copy()
        mode_distance_interest = mode_distance[mode_distance['mode_confirm']==mode_of_interest].copy()

        # Mapping new mode labels with dictionaries
        mode_counts['Mode_confirm'] = mode_counts['mode_confirm'].map(dic_re)
        mode_counts_interest['Mode_confirm'] = mode_counts_interest['mode_confirm'].map(dic_re)
        mode_distance_interest['Mode_confirm'] = mode_distance_interest['mode_confirm'].map(dic_re)

In [None]:
# `mode_counts_interest` is only created when there are trips with a `mode_confirm`
# column. Without that guard this cell raised a NameError and stopped the run before the
# plot cells could take their missing-data path; in that case keep the quality text that
# was returned when the data was loaded.
if len(expanded_ct) > 0 and "mode_confirm" in expanded_ct.columns:
    quality_text = scaffolding.get_quality_text(expanded_ct, mode_counts_interest, mode_of_interest, include_test_users)

## Generate Timeseries Plots

### Daily trips for mode of interest

In [None]:
# Plot of total daily trips for the mode of interest across all users
plot_title_no_quality = f'Daily {mode_of_interest} Trips'
file_name = f"ts_{mode_of_interest}_share%s"%file_suffix

try:
    if len(mode_counts_interest) == 0:
        # force error generation so that we will go into the "missing" data code path
        raise RuntimeError(f"No {mode_of_interest} trips found")
    # Sum the smoothed per-user counts for each day
    plot_data = mode_counts_interest.groupby(['date_time'])['trip_count_smooth'].agg(['sum']).reset_index()

    plot_title= plot_title_no_quality+"\n"+quality_text
    ylab = 'Trip Count'

    timeseries_plot(plot_data['date_time'], plot_data['sum'], plot_title, ylab, file_name)
    alt_text = store_alt_text_timeseries(plot_data, file_name, plot_title)
except Exception:
    # Any failure, including the deliberate RuntimeError above, produces the placeholder
    # plot instead. `Exception` (not a bare except) lets KeyboardInterrupt through.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Daily mileage for mode of interest

In [None]:
# Plot of total daily mileage for the mode of interest across all users
plot_title_no_quality = f'Daily {mode_of_interest} Mileage'
file_name = f"ts_{mode_of_interest}_miles%s"%file_suffix

try:
    # Check the table this cell actually plots from
    if len(mode_distance_interest) == 0:
        # force error generation so that we will go into the "missing" data code path
        raise RuntimeError(f"No {mode_of_interest} trips found")

    # Sum the (unsmoothed) per-user mileage for each day
    plot_data = mode_distance_interest.groupby(['date_time'])['distance_miles'].agg(['sum']).reset_index()

    plot_title= plot_title_no_quality+"\n"+quality_text
    ylab = 'Miles'

    timeseries_plot(plot_data['date_time'], plot_data['sum'], plot_title, ylab, file_name)
    alt_text = store_alt_text_timeseries(plot_data, file_name, plot_title)
except Exception:
    # Any failure, including the deliberate RuntimeError above, produces the placeholder
    # plot instead. `Exception` (not a bare except) lets KeyboardInterrupt through.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Daily mileage per user for mode of interest

In [None]:
# Plot of total daily mileage for the mode of interest, normalized by the number of
# users that were active on that day
plot_title_no_quality = f'Daily {mode_of_interest} Mileage per Active User'
file_name = f"ts_{mode_of_interest}_miles_user%s"%file_suffix

try:
    # Check the table this cell actually plots from
    if len(mode_distance_interest) == 0:
        # force error generation so that we will go into the "missing" data code path
        raise RuntimeError(f"No {mode_of_interest} trips found")

    plot_data = mode_distance_interest.groupby(['date_time'])['distance_miles'].agg(['sum']).reset_index()
    # Attach the per-day active user count and divide the daily mileage by it
    plot_data = plot_data.merge(active_users, on='date_time')
    plot_data['mileage_per_user'] = plot_data['sum'] / plot_data['active_users']

    plot_title= plot_title_no_quality+"\n"+quality_text
    ylab = 'miles/user'

    timeseries_plot(plot_data['date_time'], plot_data['mileage_per_user'], plot_title, ylab, file_name)
    alt_text = store_alt_text_timeseries(plot_data, file_name, plot_title)
except Exception:
    # Any failure, including the deliberate RuntimeError above, produces the placeholder
    # plot instead. `Exception` (not a bare except) lets KeyboardInterrupt through.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Proportion of total daily mileage for mode of interest

In [None]:
# Plot of the mode of interest's share of the total daily mileage across all users
plot_title_no_quality = f'Daily {mode_of_interest} Mileage Proportion (All Users, excluding air)'
file_name = f"ts_{mode_of_interest}_miles_proportion%s"%file_suffix

try:
    # Check the table this cell actually plots from
    if len(mode_distance_interest) == 0:
        # force error generation so that we will go into the "missing" data code path
        raise RuntimeError(f"No {mode_of_interest} trips found")

    # Smoothed daily mileage for the mode of interest and for all modes together
    plot_data = mode_distance_interest.groupby(['date_time'], as_index=False)['distance_miles_smooth'].sum()
    total_miles = mode_distance.groupby(['date_time'], as_index=False)['distance_miles_smooth'].sum()
    # After the merge, the `_x` column is the mode of interest and `_y` is the all-mode total
    plot_data = plot_data.merge(total_miles, on=['date_time'])
    plot_data['miles_proportion'] = plot_data['distance_miles_smooth_x'] / plot_data['distance_miles_smooth_y']

    plot_title= plot_title_no_quality+"\n"+quality_text
    ylab = 'Proportion of Daily Miles'

    timeseries_plot(plot_data['date_time'], plot_data['miles_proportion'], plot_title, ylab, file_name)
    alt_text = store_alt_text_timeseries(plot_data, file_name, plot_title)
except Exception:
    # Any failure, including the deliberate RuntimeError above, produces the placeholder
    # plot instead. `Exception` (not a bare except) lets KeyboardInterrupt through.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Proportion of total daily trips for mode of interest

In [None]:
# Plot of the mode of interest's share of the total daily trips across all users
plot_title_no_quality = f'Daily {mode_of_interest} Trip Proportion (All Users, excluding air)'
file_name = f"ts_{mode_of_interest}_trips_proportion%s"%file_suffix

try:
    if len(mode_counts_interest) == 0:
        # force error generation so that we will go into the "missing" data code path
        raise RuntimeError(f"No {mode_of_interest} trips found")

    # Smoothed daily trip count per mode
    plot_data = mode_counts.groupby(['date_time','mode_confirm'], as_index=False)['trip_count_smooth'].sum()
    # Sum only the numeric column for the daily total. Summing every column would also
    # aggregate the string `mode_confirm` column on newer pandas, so the merge below would
    # produce `mode_confirm_x`/`mode_confirm_y` and the mode filter would raise a KeyError.
    total_trips = plot_data.groupby(['date_time'], as_index=False)['trip_count_smooth'].sum()
    # After the merge, the `_x` column is the per-mode count and `_y` is the all-mode total
    plot_data = plot_data.merge(total_trips, on='date_time')
    plot_data['trip_proportion'] = plot_data['trip_count_smooth_x'] / plot_data['trip_count_smooth_y']
    plot_data = plot_data[plot_data['mode_confirm']==mode_of_interest]

    plot_title= plot_title_no_quality+"\n"+quality_text
    ylab = 'Proportion of Daily Trips'

    timeseries_plot(plot_data['date_time'], plot_data['trip_proportion'], plot_title, ylab, file_name)
    alt_text = store_alt_text_timeseries(plot_data.drop(columns=['mode_confirm','trip_count_smooth_x','trip_count_smooth_y']), file_name, plot_title)
except Exception:
    # Any failure, including the deliberate RuntimeError above, produces the placeholder
    # plot instead. `Exception` (not a bare except) lets KeyboardInterrupt through.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Daily trip proportion for mode of interest with error bounds

In [None]:
# Plots each user's daily trip proportion for the mode of interest with error bars
# from the seaborn documentation:
# By default, the plot aggregates over multiple y values at each value of x and
# shows an estimate of the central tendency and a confidence interval for that estimate.
# In our case, we have multiple trip proportions (one per user) for each day
# so the band represents the variation of the number of trips and the thick line represents the mean/median (unsure which)
# but this still doesn't tell us which users have dropped their ridership

plot_title_no_quality = f'Daily {mode_of_interest} Trip Proportion for Individual users (Running average and variation)'
file_name = f"ts_{mode_of_interest}_trip_individual_variation%s"%file_suffix

try:
    if len(mode_counts_interest) == 0:
        # force error generation so that we will go into the "missing" data code path
        raise RuntimeError(f"No {mode_of_interest} trips found")

    # Per-user, per-day smoothed trip counts: mode of interest vs. all modes
    plot_data_mode = mode_counts_interest.groupby(['date_time', 'user_id'], as_index=False)["trip_count_smooth"].sum()
    plot_data = mode_counts.groupby(['date_time','user_id'], as_index=False)['trip_count_smooth'].sum()
    # After the merge, the `_x` column is the all-mode total and `_y` is the mode of interest
    plot_data = plot_data.merge(plot_data_mode, on=['date_time', 'user_id'])
    plot_data['mode_trip_proportion'] = plot_data['trip_count_smooth_y'] / plot_data['trip_count_smooth_x']

    plot_title= plot_title_no_quality+"\n"+quality_text
    ylab = 'Proportion of Daily Trips'

    timeseries_plot(plot_data['date_time'], plot_data['mode_trip_proportion'], plot_title, ylab, file_name)
    alt_text = store_alt_text_timeseries(plot_data.drop(columns=['user_id','trip_count_smooth_x','trip_count_smooth_y']), file_name, plot_title)
except Exception:
    # Any failure, including the deliberate RuntimeError above, produces the placeholder
    # plot instead. `Exception` (not a bare except) lets KeyboardInterrupt through.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Daily mileage proportion for mode of interest with error bounds

In [None]:
# Plots each user's daily mileage proportion for the mode of interest with error bars
# from the seaborn documentation:
# By default, the plot aggregates over multiple y values at each value of x and
# shows an estimate of the central tendency and a confidence interval for that estimate.
# In our case, we have multiple mileage proportions (one per user) for each day
# so the band represents the variation of the mileage and the thick line represents the mean/median (unsure which)
# but this still doesn't tell us which users have dropped their ridership

# This must be `plot_title_no_quality`: both the final title and the missing-plot fallback
# read it, and they would otherwise reuse the title left over from the previous cell.
plot_title_no_quality = f'Daily {mode_of_interest} Mileage Proportion for Individual users (Running average and variation)'
file_name = f"ts_{mode_of_interest}_mile_individual_variation%s"%file_suffix

try:
    if len(mode_distance_interest) == 0:
        # force error generation so that we will go into the "missing" data code path
        raise RuntimeError(f"No {mode_of_interest} trips found")

    # Per-user, per-day smoothed mileage: mode of interest vs. all modes
    plot_data_mode = mode_distance_interest.groupby(['date_time', 'user_id'], as_index=False)["distance_miles_smooth"].sum()
    plot_data = mode_distance.groupby(['date_time','user_id'], as_index=False)['distance_miles_smooth'].sum()
    # After the merge, the `_x` column is the all-mode total and `_y` is the mode of interest
    plot_data = plot_data.merge(plot_data_mode, on=['date_time', 'user_id'])
    plot_data['mode_miles_proportion'] = plot_data['distance_miles_smooth_y'] / plot_data['distance_miles_smooth_x']

    plot_title= plot_title_no_quality+"\n"+quality_text
    ylab = 'Proportion of Daily Miles'

    timeseries_plot(plot_data['date_time'], plot_data['mode_miles_proportion'], plot_title, ylab, file_name)
    alt_text = store_alt_text_timeseries(plot_data.drop(columns=['user_id','distance_miles_smooth_x','distance_miles_smooth_y']), file_name, plot_title)
except Exception:
    # Any failure, including the deliberate RuntimeError above, produces the placeholder
    # plot instead. `Exception` (not a bare except) lets KeyboardInterrupt through.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)