## Generate Static Graphs

These are the input parameters for the notebook. They are overwritten automatically when the scripts that generate the monthly statistics are run. You can also modify them manually to generate multiple plots locally.

Pass in `None` to remove the filters and plot all data. This is not recommended for production settings, but might be useful for reports based on data snapshots.

In [None]:
# Notebook input parameters. Kept as one plain assignment per line so that the
# scripts which run this notebook can override each value individually.
year = 2020  # forwarded to scaffolding.load_viz_notebook_data
month = 11  # forwarded to scaffolding.load_viz_notebook_data
program = "default"  # forwarded to scaffolding.load_viz_notebook_data
study_type = "study"  # forwarded to scaffolding.load_viz_notebook_data
mode_of_interest = None  # NOTE(review): not referenced by any cell visible here — confirm it is still needed
include_test_users = False  # forwarded as the include_test_users keyword of load_viz_notebook_data
dynamic_labels = {}  # empty -> 'distance_miles' / lb CO2 columns are used; non-empty -> 'distance' / kg CO2

In [None]:
from collections import defaultdict
import datetime

import numpy as np
import pandas as pd

from plots import *
import scaffolding

# Global plot styling for every figure produced by this notebook.
# NOTE(review): `sns` is presumably brought in by `from plots import *` — confirm.
# NOTE(review): sns.set() applies seaborn's default theme, which likely overrides
# the "whitegrid" style selected on the line before it — confirm the intended order.
sns.set_style("whitegrid")
sns.set()
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Loading mapping dictionaries from mapping_dictionaries notebook
%store -r df_ei
%store -r dic_re
%store -r dic_fuel

# convert a dictionary to a defaultdict
dic_re = defaultdict(lambda: 'Other',dic_re)
dic_fuel = defaultdict(lambda: 'Other',dic_fuel)

## Collect Data From Database

In [None]:
# Load the trip table together with the output-file suffix, the data-quality
# text appended to plot titles, and the debug frame used for "missing" plots
expanded_ct, file_suffix, quality_text, debug_df = scaffolding.load_viz_notebook_data(
    year,
    month,
    program,
    study_type,
    dynamic_labels,
    dic_re,
    include_test_users=include_test_users,
)

# add_energy_labels is only applied when the raw 'mode_confirm' column exists;
# otherwise the table is left exactly as loaded
if "mode_confirm" in expanded_ct.columns:
    expanded_ct = scaffolding.add_energy_labels(expanded_ct, df_ei, dic_fuel, dynamic_labels)

## Data Preprocessing

In [None]:
# The distance and CO2 column names depend on the label scheme:
# dynamic labels -> 'distance' / kg CO2, default labels -> 'distance_miles' / lb CO2
uses_dynamic_labels = len(dynamic_labels) > 0
sel_cols_no_label_dep = [
    'user_id',
    'start_local_dt_year',
    'start_local_dt_month',
    'start_local_dt_day',
    'distance' if uses_dynamic_labels else 'distance_miles',
]
sel_cols_with_label_dep = sel_cols_no_label_dep + [
    'Mode_confirm',
    'Mode_confirm_EI(kWH)',
    'Mode_confirm_kg_CO2' if uses_dynamic_labels else 'Mode_confirm_lb_CO2',
]

if len(expanded_ct) == 0:
    # No trips at all: keep the empty frame untouched
    data = expanded_ct.copy()
else:
    has_mode_labels = "Mode_confirm" in expanded_ct.columns
    chosen_cols = sel_cols_with_label_dep if has_mode_labels else sel_cols_no_label_dep
    data = expanded_ct[chosen_cols].copy()

    # Build a day-resolution timestamp from the separate year/month/day columns
    data = data.rename(columns={'start_local_dt_year':'year','start_local_dt_month':'month','start_local_dt_day':'day'})
    data['date_time'] = pd.to_datetime(data[['year','month','day']])
    data = data.drop(columns=['year','month','day'])

    # Categorical keys make groupby emit every day/user combination,
    # including those without any data for a particular tabulation
    data['user_id'] = pd.Categorical(data['user_id'])
    data['date_time'] = pd.Categorical(data['date_time'])

    if has_mode_labels:
        # The ordered mode categories come from whichever label mapping is active
        if uses_dynamic_labels:
            dic_mode_mapping = scaffolding.mapping_labels(dynamic_labels, "MODE")
            mode_categories = np.unique(list(dic_mode_mapping.values()))
        else:
            mode_categories = np.unique(list(dic_re.values()))
        data['Mode_confirm'] = pd.Categorical(data['Mode_confirm'], ordered=True, categories=mode_categories)
data.head()

In [None]:
def compute_daily_metrics(data, weight_unit, distance_unit, mode_counts=None):
    """Aggregate trips into per-day metrics and add 7-day rolling averages.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per trip with the columns 'user_id', 'date_time',
        'Mode_confirm', ``distance_unit``, 'Mode_confirm_EI(kWH)' and
        ``f'Mode_confirm_{weight_unit}_CO2'``.
    weight_unit : str
        Unit embedded in the CO2 column name (the callers pass 'kg' or 'lb').
    distance_unit : str
        Name of the distance column (the callers pass 'distance' or
        'distance_miles').
    mode_counts : pandas.DataFrame, optional
        Per user/day/mode trip counts with a 'trip_count' column. When omitted,
        the module-level ``mode_counts`` variable is used, which is what the
        function relied on before this parameter existed. The frame is
        modified in place (a 'trip_count_smooth' column is added).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mode_counts, mode_distance, emissions, energy)``. ``mode_distance``,
        ``emissions`` and ``energy`` each carry the summed ``distance_unit``
        column plus its rolling mean in ``f'{distance_unit}_smooth'``.

    Raises
    ------
    NameError
        If ``mode_counts`` is not passed and no module-level ``mode_counts``
        exists.
    """
    if mode_counts is None:
        # Backward compatibility: existing callers rely on the global variable
        try:
            mode_counts = globals()['mode_counts']
        except KeyError:
            raise NameError("name 'mode_counts' is not defined") from None

    co2_col = f'Mode_confirm_{weight_unit}_CO2'
    energy_col = 'Mode_confirm_EI(kWH)'
    smooth_col = f'{distance_unit}_smooth'

    def _rolling_mean(series):
        # 7-row window; min_periods=1 so the first rows are averaged over
        # however many rows are available instead of becoming NaN
        return series.rolling(7, min_periods=1).mean()

    # observed=False keeps every category combination of categorical keys in
    # the output, independent of the pandas version's default.

    # Sum daily distance traveled for each mode
    mode_distance = data.groupby(['user_id','date_time','Mode_confirm'], as_index=False, observed=False)[[distance_unit]].sum()
    mode_distance[distance_unit] = mode_distance[distance_unit].fillna(0)

    # Sum daily emissions for each user
    emissions = data.groupby(['user_id','date_time'], as_index=False, observed=False)[[co2_col, distance_unit]].sum()
    emissions[co2_col] = emissions[co2_col].fillna(0)
    # Only fill gaps in the distance column; it must keep the summed distance
    # because the per-distance emission rate is derived from it
    emissions[distance_unit] = emissions[distance_unit].fillna(0)

    # Sum daily energy for each user
    energy = data.groupby(['user_id','date_time'], as_index=False, observed=False)[[energy_col, distance_unit]].sum()
    energy[energy_col] = energy[energy_col].fillna(0)
    energy[distance_unit] = energy[distance_unit].fillna(0)

    # Add 7-day rolling avg smoothing to better see trends. transform() returns
    # a result aligned with the original row index, so it can be assigned
    # straight back as a column.
    mode_counts['trip_count_smooth'] = mode_counts.groupby(['user_id','Mode_confirm'], observed=False)['trip_count'].transform(_rolling_mean)
    mode_distance[smooth_col] = mode_distance.groupby(['user_id','Mode_confirm'], observed=False)[distance_unit].transform(_rolling_mean)
    emissions[smooth_col] = emissions.groupby(['user_id'], observed=False)[distance_unit].transform(_rolling_mean)
    energy[smooth_col] = energy.groupby(['user_id'], observed=False)[distance_unit].transform(_rolling_mean)

    return mode_counts, mode_distance, emissions, energy

if len(expanded_ct) > 0:
    # Number of distinct users recorded on each date
    active_users = pd.DataFrame(data.groupby(['date_time'], as_index=False)['user_id'].nunique())
    active_users = active_users.rename(columns={'user_id':'active_users'})

    if "Mode_confirm" in expanded_ct.columns:
        # Trips per user, day and confirmed mode
        mode_counts = (
            data.groupby(['user_id','date_time','Mode_confirm'], as_index=False)
            .size()
            .rename(columns={'size':'trip_count'})
        )

        # Dynamic labels use kg / 'distance'; the default labels use lb / 'distance_miles'
        if len(dynamic_labels) > 0:
            unit_kwargs = dict(weight_unit='kg', distance_unit='distance')
        else:
            unit_kwargs = dict(weight_unit='lb', distance_unit='distance_miles')
        mode_counts, mode_distance, emissions, energy = compute_daily_metrics(data, **unit_kwargs)

## Generate Timeseries Plots

### Emissions per day (per active user)

In [None]:
plot_title_no_quality = 'Net Daily Emissions (All Users, excluding air)'
file_name = "ts_emissions_user%s"%file_suffix

def plot_emission_per_week(emissions, unit, active_users):
    """Plot total daily CO2 emissions divided by the number of active users.

    Parameters
    ----------
    emissions : pandas.DataFrame
        Frame with a 'date_time' column and a ``f'Mode_confirm_{unit}_CO2'`` column.
    unit : str
        Weight unit embedded in the CO2 column name and shown on the y-axis label.
    active_users : pandas.DataFrame
        Frame with 'date_time' and 'active_users' columns.

    Reads the module-level ``plot_title_no_quality``, ``quality_text`` and
    ``file_name``, draws the chart with ``timeseries_plot`` and records its
    alt text with ``store_alt_text_timeseries``. Returns None.
    """
    # Total emissions per date across all users (net impact)
    plot_data = emissions.groupby(['date_time'], as_index=False)[f'Mode_confirm_{unit}_CO2'].agg(['sum'])
    # Normalise each day's total by the number of users active on that day
    plot_data = plot_data.merge(active_users, on='date_time')
    plot_data['sum'] = plot_data['sum'] / plot_data['active_users']

    plot_title = plot_title_no_quality + "\n" + quality_text
    ylab = f'Emissions ({unit} CO2/day/user)'
    timeseries_plot(plot_data['date_time'], plot_data['sum'], plot_title, ylab, file_name)
    store_alt_text_timeseries(plot_data, file_name, plot_title)

try:
    if len(dynamic_labels) > 0:
        plot_emission_per_week(emissions, 'kg', active_users)
    else:
        plot_emission_per_week(emissions, 'lb', active_users)
except Exception:
    # Best effort: any failure (including inputs that were never computed
    # because there was no data) produces the "missing plot" placeholder.
    # Exception rather than a bare except, so Ctrl-C / kernel interrupts
    # are not swallowed.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Energy per day (per active user)

In [None]:
# The energy chart is only produced for the default labels (no dynamic labels)
if len(dynamic_labels) == 0:
    plot_title_no_quality = 'Net Daily Energy (All Users, excluding air)'
    file_name = "ts_energy_user%s"%file_suffix

    try:
        # Total energy per date across all users (net impact)
        plot_data = energy.groupby(['date_time'], as_index=False)['Mode_confirm_EI(kWH)'].agg(['sum'])
        # Normalise each day's total by the number of users active on that day
        plot_data = plot_data.merge(active_users, on='date_time')
        plot_data['sum'] = plot_data['sum'] / plot_data['active_users']

        plot_title = plot_title_no_quality + "\n" + quality_text
        ylab = 'Energy (kWH/day/user)'
        timeseries_plot(plot_data['date_time'], plot_data['sum'], plot_title, ylab, file_name)
        alt_text = store_alt_text_timeseries(plot_data, file_name, plot_title)
    except Exception:
        # Best effort: any failure (including inputs that were never computed)
        # produces the "missing plot" placeholder. Exception rather than a bare
        # except, so Ctrl-C / kernel interrupts are not swallowed.
        generate_missing_plot(plot_title_no_quality,debug_df,file_name)
        alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Emissions per mile/kilometer per day

In [None]:
plot_title_no_quality = 'Average Daily Emission Rate (All Users, excluding air)'
file_name = "ts_emissions_vmt%s"%file_suffix

def plot_emissions_per_distance_day(emissions, weight_unit, distance_unit, distance_unit_smooth):
    """Plot the mean daily CO2 emitted per unit of (smoothed) distance.

    Parameters
    ----------
    emissions : pandas.DataFrame
        Frame with 'date_time', ``f'Mode_confirm_{weight_unit}_CO2'`` and
        ``distance_unit_smooth`` columns. It is modified in place: a
        ``f'CO2_per_{distance_unit}'`` column is added.
    weight_unit : str
        Weight unit embedded in the CO2 column name and shown on the y-axis label.
    distance_unit : str
        Distance unit used in the new column name and the y-axis label.
    distance_unit_smooth : str
        Name of the smoothed-distance column used as the divisor.

    Reads the module-level ``plot_title_no_quality``, ``quality_text`` and
    ``file_name``, draws the chart with ``timeseries_plot`` and records its
    alt text with ``store_alt_text_timeseries``. Returns None.
    """
    rate_col = f'CO2_per_{distance_unit}'

    # Emissions per distance unit per day across all users (travel efficiency)
    # Note that the energy plot will be identical to this one since scale factor is divided out
    emissions[rate_col] = emissions[f'Mode_confirm_{weight_unit}_CO2'] / emissions[distance_unit_smooth]
    # 0/0 (no emissions and no distance) yields NaN; treat those days as a zero rate.
    # NOTE(review): a non-zero emission over a zero distance would yield inf and is
    # not handled here — confirm that combination cannot occur.
    emissions[rate_col] = emissions[rate_col].fillna(0)
    plot_data = emissions.groupby(['date_time'])[rate_col].agg(['mean']).reset_index()

    plot_title = plot_title_no_quality + "\n" + quality_text
    ylab = f'Emissions ({weight_unit} CO2/{distance_unit}/day)'
    timeseries_plot(plot_data['date_time'], plot_data['mean'], plot_title, ylab, file_name)
    store_alt_text_timeseries(plot_data, file_name, plot_title)

try:
    if len(dynamic_labels) > 0:
        plot_emissions_per_distance_day(emissions, 'kg', 'km', 'distance_smooth')
    else:
        plot_emissions_per_distance_day(emissions, 'lb', 'mile','distance_miles_smooth')
except Exception:
    # Best effort: any failure (including inputs that were never computed)
    # produces the "missing plot" placeholder. Exception rather than a bare
    # except, so Ctrl-C / kernel interrupts are not swallowed.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Number of active users

In [None]:
plot_title_no_quality = 'Number of Active Users'
file_name = "ts_users%s"%file_suffix

try:
    # Time series of the number of unique users seen on each date
    plot_data = active_users

    plot_title = plot_title_no_quality + "\n" + quality_text
    ylab = 'Unique IDs'
    timeseries_plot(plot_data['date_time'], plot_data['active_users'], plot_title, ylab, file_name)
    alt_text = store_alt_text_timeseries(plot_data, file_name, plot_title)
except Exception:
    # Best effort: any failure (including active_users never having been
    # computed) produces the "missing plot" placeholder. Exception rather than
    # a bare except, so Ctrl-C / kernel interrupts are not swallowed.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Daily Mode share

In [None]:
plot_title_no_quality = 'Daily Aggregate Mode Share (excluding "Other" and "Not a trip")'
file_name = "ts_all_modes%s"%file_suffix

try:
    # Plot of mode share proportions across all users.
    # Consolidate detailed modes into broader groups; the replacements are applied
    # one at a time, in this order, across the whole frame.
    mode_groups = {
        'Bikeshare': 'Shared Micromobility',
        'Scooter share': 'Shared Micromobility',
        'Regular Bike': 'Personal Micromobility',
        'Skate board': 'Personal Micromobility',
        'Train': 'Transit',
        'Free Shuttle': 'Transit',
        'Bus': 'Transit',
        'Taxi/Uber/Lyft': 'Ridehail',
        'Pilot ebike': 'E-Bike',
    }
    plot_data = mode_counts
    for detailed_mode, mode_group in mode_groups.items():
        plot_data = plot_data.replace(detailed_mode, mode_group)

    # Smoothed trip count per day and (consolidated) mode
    plot_data = plot_data.groupby(['date_time','Mode_confirm'], as_index=False)['trip_count_smooth'].sum()
    # Daily total over all modes. Only the count column is summed: summing the
    # whole frame would also try to add up the Mode_confirm labels.
    total_trips = plot_data.groupby(['date_time'], as_index=False)['trip_count_smooth'].sum()
    # The shared column name gets the _x (per mode) / _y (daily total) suffixes
    plot_data = plot_data.merge(total_trips, on='date_time')
    plot_data['trip_proportion'] = plot_data['trip_count_smooth_x'] / plot_data['trip_count_smooth_y']

    # Drop Other and non-trips after the proportions were computed, so they
    # still count towards the daily total. Copy so that the assignment below
    # works on an independent frame rather than a slice.
    plot_data = plot_data[~plot_data['Mode_confirm'].isin(['Not a Trip','Other'])].copy()

    # Re-establish the ordered categorical from whichever label mapping is active
    if len(dynamic_labels) > 0:
        dic_mode_mapping = scaffolding.mapping_labels(dynamic_labels, label_type = "MODE")
        mode_categories = np.unique(list(dic_mode_mapping.values()))
    else:
        mode_categories = np.unique(list(dic_re.values()))
    plot_data['Mode_confirm'] = pd.Categorical(plot_data['Mode_confirm'], ordered=True, categories=mode_categories)

    plot_title = plot_title_no_quality + "\n" + quality_text
    ylab = 'Proportion of All Trips'
    legend_title = 'Confirmed Mode'
    timeseries_multi_plot(plot_data, 'date_time','trip_proportion','Mode_confirm', plot_title, ylab, legend_title, file_name)
    alt_text = store_alt_text_generic('multivariate timeseries', file_name, plot_title)
except Exception:
    # Best effort: any failure (including mode_counts never having been
    # computed) produces the "missing plot" placeholder. Exception rather than
    # a bare except, so Ctrl-C / kernel interrupts are not swallowed.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)