## Generate Static Graphs

These are the input parameters for the notebook. They will be automatically changed when the scripts to generate monthly statistics are run. You can modify them manually to generate multiple plots locally as well.

Pass in `None` to remove the filters and plot all data. This is not recommended for production settings, but might be useful for reports based on data snapshots.

In [None]:
# Notebook input parameters. Per the markdown cell above, the scripts that
# generate the monthly statistics overwrite these values automatically, and
# passing None removes the corresponding filter.
year = 2020  # year forwarded to the scaffolding data loaders
month = 11  # month forwarded to the scaffolding data loaders
program = "default"  # program identifier forwarded to the scaffolding data loaders
study_type = "study"  # forwarded to scaffolding.load_viz_notebook_data
mode_of_interest = "e-bike"  # mode used to build the mode-specific subset, titles and file names
include_test_users = False  # forwarded to the loaders and to the quality-text helpers
dynamic_labels = {}  # when non-empty, used to look up the label for the 'work' purpose
use_imperial = False  # forwarded to scaffolding.get_units to select the unit labels
sensed_algo_prefix = "cleaned"  # forwarded to the sensed-data loader

In [None]:
from collections import defaultdict

import numpy as np
import pandas as pd

from plots import *
import scaffolding

# Configure seaborn styling and render matplotlib figures inline.
# `sns` is not imported explicitly in this cell; presumably it is provided by
# `from plots import *` — TODO confirm.
# NOTE(review): sns.set() runs after sns.set_style("whitegrid"). seaborn's
# set()/set_theme() applies its own default style, so the whitegrid choice may
# be overridden here — confirm the intended order.
sns.set_style("whitegrid")
sns.set()
%matplotlib inline

In [None]:
# Loading mapping dictionaries from mapping_dictionaries notebook
# (%store -r restores variables previously saved with %store)
%store -r dic_re
%store -r dic_pur

# convert a dictionary to a defaultdict
# so that any key missing from the stored mapping resolves to 'Other'
dic_re = defaultdict(lambda: 'Other',dic_re)
dic_pur = defaultdict(lambda: 'Other',dic_pur)

# get metric vs imperial vars
# (unit labels, the name of the distance column and the weight unit; the
# labels and distance_col are used by the plotting cells below)
label_units, short_label, label_units_lower, distance_col, weight_unit = scaffolding.get_units(use_imperial)

## Collect Data From Database for Generic Metrics

In [None]:
# Load the data behind the user-labeled ("generic") charts. The call returns
# the trip frame (expanded_ct), the suffix appended to every output file name,
# the quality text appended to plot titles, and a debug frame that is shown
# when a chart cannot be generated.
expanded_ct, file_suffix, quality_text, debug_df = scaffolding.load_viz_notebook_data(year,
                                                                            month,
                                                                            program,
                                                                            study_type,
                                                                            dynamic_labels,
                                                                            dic_re,
                                                                            dic_pur=dic_pur,
                                                                            include_test_users=include_test_users)

## Collect Data from Database for Sensed Metrics

In [None]:
# Load the data behind the sensed (inferred) charts; the return values mirror
# the generic loader: trip frame, file suffix, quality text and debug frame.
expanded_ct_sensed, file_suffix_sensed, quality_text_sensed, debug_df_sensed = scaffolding.load_viz_notebook_sensor_inference_data(year,
                                                                            month,
                                                                            program,
                                                                            include_test_users,
                                                                            sensed_algo_prefix)

# Create a combined debug df from Generic and Sensed Metrics
# (combine_first keeps debug_df's values and fills its gaps from debug_df_sensed)
merged_debug_df = debug_df.combine_first(debug_df_sensed)

## Metrics for Specific Mode

In [None]:
# Subset of labeled trips whose raw `mode_confirm` equals mode_of_interest.
# When the frame has no `mode_confirm` column the whole frame is used unchanged.
data_eb = expanded_ct.query(f"mode_confirm == '{mode_of_interest}'") if "mode_confirm" in expanded_ct.columns else expanded_ct
# Quality text for the mode-specific subset, appended to mode-specific plot titles.
quality_text_specific = scaffolding.get_quality_text(expanded_ct, data_eb, mode_of_interest, include_test_users)

### 1. 100% Stacked Bar Charts (Based on Number of Trips)

In [None]:
def process_data_frame(data_frame, column_name, title):
    """Count the non-null values of one column and pass them to ``process_trip_data``.

    Args:
        data_frame: frame holding the column to summarise.
        column_name: name of the column whose values are counted.
        title: label forwarded unchanged to ``process_trip_data``.

    Returns:
        Whatever ``process_trip_data`` returns for the (labels, counts, title) triple.
    """
    # value_counts orders by descending count, so labels and counts stay aligned.
    counts = data_frame[column_name].value_counts(dropna=True)
    return process_trip_data(counts.index.tolist(), counts.tolist(), title)

# Stacked bar chart: share of trips per mode, as labeled by users, as sensed,
# and (when available) the modes replaced by the mode of interest.
plot_title_no_quality= "Number of trips for each mode (selected by users)"
file_name = f'ntrips_total{file_suffix}'

try:
    bar_count = 0
    plot_title = plot_title_no_quality + "\n" + "For Labeled and Sensed: "+ quality_text
    df_confirmed_tc = process_data_frame(expanded_ct, 'Mode_confirm', "Labeled by user \n (Based on Confirmed Trips)")
    df_sensed_tc = process_data_frame(expanded_ct_sensed, 'primary_mode', "Sensed by OpenPATH \n (Based on Total Trips)")
    all_data_frames = [df_confirmed_tc,df_sensed_tc]

    # The replaced-mode bar is only added when there are trips for the mode of interest.
    if not data_eb.empty:
        df_replaced_tc = process_data_frame(data_eb, 'Replaced_mode', f"Labeled by user for {mode_of_interest} \n (Based on Replaced mode)" )
        all_data_frames.append(df_replaced_tc)
        if not df_replaced_tc.empty:
            bar_count += 1
            plot_title += "\n" + f"For {mode_of_interest}: " + quality_text_specific

    result_df = merge_dataframes(all_data_frames)

    # bar_count is the number of non-empty bars passed to the plotting helper.
    if not df_confirmed_tc.empty:
        bar_count += 1
    if not df_sensed_tc.empty:
        bar_count += 1

    stacked_bar_chart_generic(plot_title, result_df, file_name, bar_count)
    alt_text, alt_html = store_alt_text_stacked_bar_chart(result_df[result_df['Count'] > 0], file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality,merged_debug_df,file_name)
    alt_text = store_alt_text_missing(merged_debug_df, file_name, plot_title_no_quality)

### 2. 100% Stacked Bar Charts (Trips at or Below the 80th Percentile of Distance)

In [None]:
def process_data_for_cutoff(data_frame, distance_col, mode_col, quality_func, label_title):
    """Summarise the modes of trips at or below the 80th-percentile distance.

    Args:
        data_frame: trip frame; must contain a ``distance`` column plus
            ``distance_col`` and ``mode_col``.
        distance_col: column used only to build the displayed threshold string.
        mode_col: column whose values are counted for the retained trips.
        quality_func: accepted for interface compatibility; not used here.
        label_title: label forwarded unchanged to ``process_trip_data``.

    Returns:
        Tuple ``(processed_data, cutoff, dist_threshold)`` where ``cutoff`` is the
        0.8 quantile of ``distance`` (0 when that quantile is NaN) and
        ``dist_threshold`` is the 0.8 quantile of ``distance_col`` rounded to one
        decimal and converted to a string.
    """
    # Filtering always uses the raw `distance` column; distance_col is for display only.
    cutoff = data_frame.distance.quantile(0.8)
    if pd.isna(cutoff):
        cutoff = 0

    dist_threshold = str(data_frame[distance_col].quantile(0.8).round(1))

    within_cutoff = data_frame.loc[data_frame['distance'] <= cutoff]
    mode_counts = within_cutoff[mode_col].value_counts(dropna=True)
    processed_data = process_trip_data(mode_counts.index.tolist(), mode_counts.tolist(), label_title)

    return processed_data, cutoff, dist_threshold

# Stacked bar chart: share of trips per mode, restricted to trips at or below
# the 80th-percentile distance, for labeled and for sensed trips.
plot_title_no_quality= "Number of trips for 80th percentile for each mode (selected by users):"
file_name = f'ntrips_under80{file_suffix}'

try:
    bar_count = 0
    df_confirmed_tc_u80, cutoff, dist_threshold = process_data_for_cutoff(expanded_ct, distance_col, 'Mode_confirm', scaffolding.get_quality_text, "Labeled by user \n (Based on Confirmed Trips)")
    df_sensed_tc_u80, cutoff_sensed, dist_threshold_sensed = process_data_for_cutoff(expanded_ct_sensed, distance_col, 'primary_mode', scaffolding.get_quality_text_sensed, "Sensed by OpenPATH \n (Based on Total Trips)")

    # Both subsets are selected with `<= cutoff`, so both labels say "<=".
    u80_quality_text = scaffolding.get_quality_text(expanded_ct, expanded_ct[expanded_ct['distance'] <= cutoff], "<= " + dist_threshold + " " + short_label, include_test_users)
    u80_quality_text_sensed = scaffolding.get_quality_text_sensed(expanded_ct_sensed[expanded_ct_sensed['distance'] <= cutoff_sensed], "<= " + dist_threshold_sensed + " " + short_label , include_test_users)
    all_data_frames_u80 = [df_confirmed_tc_u80, df_sensed_tc_u80]

    result_df_u80 = merge_dataframes(all_data_frames_u80)
    plot_title = plot_title_no_quality
    # Each quality text is prefixed with the data set it was computed from:
    # the confirmed (labeled) trips first, then the sensed trips.
    if not df_confirmed_tc_u80.empty:
        plot_title += "\n" + "For Labeled: " + u80_quality_text
        bar_count += 1
    if not df_sensed_tc_u80.empty:
        plot_title += "\n" + "For Sensed: " + u80_quality_text_sensed
        bar_count += 1

    stacked_bar_chart_generic(plot_title, result_df_u80, file_name, bar_count)
    alt_text, alt_html = store_alt_text_stacked_bar_chart(result_df_u80[result_df_u80['Count'] > 0], file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality, merged_debug_df, file_name)
    alt_text = store_alt_text_missing(merged_debug_df, file_name, plot_title_no_quality)

### 3. 100% Stacked Bar Charts (Represents Commute Trips) 

In [None]:
# Stacked bar chart: share of commute (work-purpose) trips per confirmed mode.
plot_title_no_quality= "Number of commute trips for each mode (selected by users)"
file_name = f"ntrips_commute_mode_confirm{file_suffix}"

try:
    bar_count = 0
    # With dynamic labels the purpose column holds the mapped label for 'work';
    # otherwise the default mapping's 'Work' value is used.
    if (len(dynamic_labels)):
        purpose_map_label =  scaffolding.mapping_labels(dynamic_labels, "PURPOSE")
        translation_work = purpose_map_label['work']
        trip_purpose_query = f"Trip_purpose == '{translation_work}'"
    else:
        trip_purpose_query = "Trip_purpose == 'Work'"

    # Evaluate the commute filter once and reuse it for the quality text and the counts.
    commute_trips = expanded_ct.query(trip_purpose_query)

    plot_title = plot_title_no_quality
    commute_quality_text = scaffolding.get_quality_text(expanded_ct, commute_trips, "commute", include_test_users)
    if not expanded_ct.empty:
        bar_count += 1
        plot_title += "\n" + commute_quality_text

    commute_mode_counts = commute_trips.Mode_confirm.value_counts(dropna=True)
    labels_mc_commute = commute_mode_counts.keys().tolist()
    values_mc_commute = commute_mode_counts.tolist()

    df_total_trip_commute = process_trip_data(labels_mc_commute, values_mc_commute, "Labeled by user \n (Based on Confirmed Trips)")

    stacked_bar_chart_generic(plot_title, df_total_trip_commute, file_name, bar_count)
    alt_text, alt_html = store_alt_text_stacked_bar_chart(df_total_trip_commute, file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality, debug_df, file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### 4. 100% Stacked Bar Charts (Represents Distance by Mode)

In [None]:
def process_distance_data(data_frame, group_column, distance_column, label_units_lower, quality_text):
    """Total the distance per group and pass the totals to ``process_trip_data``.

    Args:
        data_frame: trip frame containing ``group_column`` and ``distance_column``.
        group_column: column to group by (for example a mode column).
        distance_column: numeric column that is summed per group.
        label_units_lower: unit name used to build the intermediate column labels.
        quality_text: label forwarded unchanged to ``process_trip_data``.

    Returns:
        Whatever ``process_trip_data`` returns for the groups and their total
        distances, ordered from the largest total to the smallest.
    """
    total_col = 'Total (' + label_units_lower + ')'
    average_col = 'Average (' + label_units_lower + ')'

    summary = data_frame.groupby(group_column).agg({distance_column: ['sum', 'count', 'mean']})
    summary.columns = [total_col, 'Count', average_col]
    summary = summary.reset_index().sort_values(by=[total_col], ascending=False)

    # Only the per-group totals are forwarded; count and mean are not used further.
    totals_by_group = dict(zip(summary[group_column], summary[total_col]))
    return process_trip_data(list(totals_by_group.keys()), list(totals_by_group.values()), quality_text)

# Stacked bar chart: share of total distance per mode for labeled trips, sensed
# trips (overall and land-only) and, when available, the replaced modes.
plot_title_no_quality = label_units + " for each mode"
file_name = 'distance_mode%s' % file_suffix

try:
    bar_count = 0
    df_confirm_dist = process_distance_data(expanded_ct, 'Mode_confirm', distance_col, label_units_lower, "Labeled by user \n Trip distance \n (Based on Confirmed Trips)")
    df_sensed_dist = process_distance_data(expanded_ct_sensed, 'primary_mode', distance_col, label_units_lower, "Sensed by OpenPATH \n Trip distance (Overall) \n (Based on Total Trips)")
    # "Land" variant: same sensed data with AIR_OR_HSR trips removed.
    df_sensed_dist_land = process_distance_data(expanded_ct_sensed[expanded_ct_sensed['primary_mode'] != "AIR_OR_HSR"], 'primary_mode', distance_col, label_units_lower, "Sensed by OpenPATH \n Trips distance (Land) \n (Based on Total Trips)")

    all_data_frames = [df_confirm_dist, df_sensed_dist, df_sensed_dist_land]

    # bar_count is the number of non-empty bars passed to the plotting helper.
    if not df_confirm_dist.empty:
        bar_count += 1
    if not df_sensed_dist.empty:
        bar_count += 1
    if not df_sensed_dist_land.empty:
        bar_count += 1

    plot_title = plot_title_no_quality + "\n" + quality_text

    if not data_eb.empty:
        df_replaced_dist = process_distance_data(data_eb, 'Replaced_mode', distance_col, label_units_lower, f"Labeled by user for {mode_of_interest} \n (Based on Replaced mode distance)")
        all_data_frames.append(df_replaced_dist)
        if not df_replaced_dist.empty:
            bar_count += 1
            # A space follows the colon so the title matches the trip-count chart.
            plot_title += "\n" + f"For {mode_of_interest}: " + quality_text_specific

    result_df = merge_dataframes(all_data_frames)

    stacked_bar_chart_generic(plot_title, result_df, file_name, bar_count)
    alt_text, alt_html = store_alt_text_stacked_bar_chart(result_df[result_df['Count'] > 0], file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality, merged_debug_df, file_name)
    alt_text = store_alt_text_missing(merged_debug_df, file_name, plot_title_no_quality)

### 5. 100% Stacked Bar Charts (Count by Purpose)

In [None]:
def process_trip_purpose_data(data_frame, purpose_column, label_text):
    """Count the non-null values of the purpose column for ``process_trip_data``.

    Args:
        data_frame: frame holding the purpose column.
        purpose_column: name of the column whose values are counted.
        label_text: label forwarded unchanged to ``process_trip_data``.

    Returns:
        Whatever ``process_trip_data`` returns for the (labels, counts, label) triple.
    """
    # A single value_counts call yields both the labels (index) and their counts.
    purpose_counts = data_frame[purpose_column].value_counts(dropna=True)
    return process_trip_data(purpose_counts.index.tolist(), purpose_counts.tolist(), label_text)

# Stacked bar chart: share of trips per purpose, for all labeled trips and for
# the trips made with the mode of interest.
plot_title_no_quality = "Number of trips for each purpose (selected by users)"
file_name = 'ntrips_purpose%s' %(file_suffix)

try:
    bar_count = 0
    df_purpose_trip = process_trip_purpose_data(expanded_ct, 'Trip_purpose', "Labeled by user \n (Based on Confirmed Trips )")
    df_purpose_trip_eb = process_trip_purpose_data(data_eb, 'Trip_purpose', f"Labeled by user for {mode_of_interest} \n (Based on {mode_of_interest} related Purpose Trips)")
    all_tp_data_frames = [df_purpose_trip, df_purpose_trip_eb]
    result_tp_df = merge_dataframes(all_tp_data_frames)

    plot_title = plot_title_no_quality

    # Append a quality line and count a bar only for the non-empty data sets.
    if not df_purpose_trip.empty:
        plot_title += "\n" + quality_text
        bar_count += 1
    if not df_purpose_trip_eb.empty:
        plot_title += "\n" + quality_text_specific
        bar_count += 1

    stacked_bar_chart_generic(plot_title, result_tp_df, file_name, bar_count)

    alt_text, alt_html = store_alt_text_stacked_bar_chart(result_tp_df[result_tp_df['Count'] > 0], file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality, merged_debug_df, file_name)
    alt_text = store_alt_text_missing(merged_debug_df, file_name, plot_title_no_quality)

## Generic Metrics (Bar Charts)

### Average miles per transport mode selected (Mode_confirm)

In [None]:
# Bar chart: average trip distance per user-confirmed mode.
file_name ='average_miles_mode_confirm%s' % file_suffix
plot_title_no_quality="Average "+ label_units+" for each mode with > 3 entries"

try:
    # Per-mode total, count and mean of the distance column.
    dist = expanded_ct.groupby('Mode_confirm').agg({distance_col: ['sum', 'count' , 'mean']})
    dist.columns = ['Total ('+label_units_lower+')', 'Count', 'Average ('+label_units_lower+')']
    dist = dist.reset_index()
    dist =dist.sort_values(by=['Total ('+label_units_lower+')'], ascending=False)

    x='Mode_confirm'
    y='Average ('+label_units_lower+')'
    plot_title= plot_title_no_quality+"\n"+quality_text

    # NOTE(review): this drops modes with Count < 3, i.e. keeps Count >= 3, while
    # the title says "> 3 entries" — confirm which threshold is intended.
    data = dist.drop((dist.query("Count < 3").index)).sort_values(by=['Average ('+label_units_lower+')'], ascending=False)

    barplot_mode(data,x,y,plot_title, expanded_ct['Mode_confirm'].dropna().unique().tolist(), file_name)
    alt_text = store_alt_text_bar(pd.DataFrame(data['Average ('+label_units_lower+')'].values, data['Mode_confirm']), file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Number of trips by day

In [None]:
# Bar chart: number of labeled trips per day of the month.
plot_title_no_quality="Number of trips by day"
file_name ='ntrips_per_day%s' % file_suffix

try:
    # 'count' is the number of trips on each day; the 'sum' aggregate only fills
    # the 'Total' column, which is not plotted.
    fq_days = expanded_ct.groupby(['start_local_dt_day']).agg({'start_local_dt_day': ['sum', 'count']})
    fq_days = fq_days.reset_index()
    fq_days.columns = ['Day of the Month', 'Total', 'Number of Trips']

    data = fq_days
    x = 'Day of the Month'
    y = 'Number of Trips'

    plot_title= plot_title_no_quality+"\n"+quality_text

    barplot_day(data,x,y,plot_title,file_name)
    alt_text = store_alt_text_bar(pd.DataFrame(data['Number of Trips'].values, data['Day of the Month']), file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Number of trips by day of week

In [None]:
# Bar chart: number of labeled trips per weekday.
plot_title_no_quality="Number of trips by weekday"
file_name ='ntrips_per_weekday%s' % file_suffix
try:
    # 'count' is the number of trips on each weekday; the 'sum' aggregate only
    # fills the 'Total' column, which is not plotted.
    fq_weekdays = expanded_ct.groupby(['start_local_dt_weekday']).agg({'start_local_dt_weekday': ['sum', 'count']})
    fq_weekdays = fq_weekdays.reset_index()
    fq_weekdays.columns = ['Weekday', 'Total', 'Number of Trips']
    # Weekday values index into this list (0 -> "Mon" ... 6 -> "Sun").
    weekday_labels = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    fq_weekdays["Weekday"] = fq_weekdays.Weekday.apply(lambda x: weekday_labels[x])

    data = fq_weekdays
    x = 'Weekday'
    y = 'Number of Trips'

    plot_title= plot_title_no_quality+"\n"+quality_text

    barplot_day(data,x,y,plot_title,file_name)
    alt_text = store_alt_text_bar(pd.DataFrame(data['Number of Trips'].values, data['Weekday']), file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

## Sensed Metrics (Bar Charts)

### Average miles per transport mode selected (primary_mode)

In [None]:
# Bar chart: average trip distance per sensed (inferred) primary mode.
# The title uses label_units so it matches the unit of distance_col instead of
# always saying "Miles".
plot_title_no_quality="Average " + label_units + " for each mode with > 3 entries\n(inferred by OpenPATH from phone sensors)"
file_name ='average_miles_sensed_mode%s' % file_suffix

try:
    # Per-mode total, count and mean of the distance column.
    dist = expanded_ct_sensed.groupby('primary_mode').agg({distance_col: ['sum', 'count' , 'mean']})
    dist.columns = ['Total ('+label_units_lower+')', 'Count', 'Average ('+label_units_lower+')']
    dist = dist.reset_index()
    dist =dist.sort_values(by=['Total ('+label_units_lower+')'], ascending=False)

    # NOTE(review): this drops modes with Count < 3, i.e. keeps Count >= 3, while
    # the title says "> 3 entries" — confirm which threshold is intended.
    data = dist.drop((dist.query("Count < 3").index)).sort_values(by=['Average ('+label_units_lower+')'], ascending=False)
    x='primary_mode'
    y='Average ('+label_units_lower+')'

    # The chart is built from the sensed data set, so it carries the sensed quality text.
    plot_title= plot_title_no_quality+"\n"+quality_text_sensed

    barplot_mode(data,x,y,plot_title, expanded_ct_sensed['primary_mode'].dropna().unique().tolist(), file_name)
    alt_text = store_alt_text_bar(pd.DataFrame(data['Average ('+label_units_lower+')'].values, data['primary_mode']), file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality,debug_df_sensed,file_name)
    alt_text = store_alt_text_missing(debug_df_sensed, file_name, plot_title_no_quality)

### Number of trips by day

In [None]:
# Bar chart: number of sensed trips per day of the month.
plot_title_no_quality="Number of trips by day\n(inferred by OpenPATH from phone sensors)"
file_name ='ntrips_sensed_per_day%s' % file_suffix

try:
    # 'count' is the number of trips on each day; the 'sum' aggregate only fills
    # the 'Total' column, which is not plotted.
    fq_days = expanded_ct_sensed.groupby(['start_local_dt_day']).agg({'start_local_dt_day': ['sum', 'count']})
    fq_days = fq_days.reset_index()
    fq_days.columns = ['Day of the Month', 'Total', 'Number of Trips']

    data = fq_days
    x = 'Day of the Month'
    y = 'Number of Trips'

    # The chart is built from the sensed data set, so it carries the sensed quality text.
    plot_title= plot_title_no_quality+"\n"+quality_text_sensed

    barplot_day(data,x,y,plot_title,file_name)
    alt_text = store_alt_text_bar(pd.DataFrame(data['Number of Trips'].values, data['Day of the Month']), file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality,debug_df_sensed,file_name)
    alt_text = store_alt_text_missing(debug_df_sensed, file_name, plot_title_no_quality)

### Number of trips by day of week

In [None]:
# Bar chart: number of sensed trips per weekday.
plot_title_no_quality="Number of trips by weekday\n(inferred by OpenPATH from phone sensors)"
file_name ='ntrips_sensed_per_weekday%s' % file_suffix
try:
    # 'count' is the number of trips on each weekday; the 'sum' aggregate only
    # fills the 'Total' column, which is not plotted.
    fq_weekdays = expanded_ct_sensed.groupby(['start_local_dt_weekday']).agg({'start_local_dt_weekday': ['sum', 'count']})
    fq_weekdays = fq_weekdays.reset_index()
    fq_weekdays.columns = ['Weekday', 'Total', 'Number of Trips']
    # Weekday values index into this list (0 -> "Mon" ... 6 -> "Sun").
    weekday_labels = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    fq_weekdays["Weekday"] = fq_weekdays.Weekday.apply(lambda x: weekday_labels[x])

    data = fq_weekdays
    x = 'Weekday'
    y = 'Number of Trips'

    # The chart is built from the sensed data set, so it carries the sensed quality text.
    plot_title= plot_title_no_quality+"\n"+quality_text_sensed

    barplot_day(data,x,y,plot_title,file_name)
    alt_text = store_alt_text_bar(pd.DataFrame(data['Number of Trips'].values, data['Weekday']), file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality,debug_df_sensed,file_name)
    alt_text = store_alt_text_missing(debug_df_sensed, file_name, plot_title_no_quality)

## Mode Specific (Bar Charts)

### Average miles per trip for specified mode

In [None]:
# Bar chart: average trip distance per replaced mode, for trips made with the
# mode of interest (data_eb).
plot_title_no_quality="Average " + label_units + " for each replaced mode with > 3 entries\n'Other' represents trips with a non-standard or missing replacement"
file_name ='average_miles_replaced_mode%s' % file_suffix

try:
    # Per-replaced-mode total, count and mean of the distance column.
    dg=data_eb.groupby('Replaced_mode').agg({distance_col: ['sum', 'count' , 'mean']},)
    dg.columns = ['Total ('+label_units_lower+')', 'Count' ,'Average ('+label_units_lower+')']
    dg = dg.reset_index()
    dg = dg.sort_values(by=['Total ('+label_units_lower+')'], ascending=False)
    # NOTE(review): this drops modes with Count < 3, i.e. keeps Count >= 3, while
    # the title says "> 3 entries" — confirm which threshold is intended.
    data = dg.drop((dg.query("Count < 3").index)).sort_values(by=['Average ('+label_units_lower+')'], ascending=False)

    x='Replaced_mode'
    y='Average ('+label_units_lower+')'

    # The chart is built from the mode-specific subset, so it carries that subset's quality text.
    plot_title= plot_title_no_quality+"\n"+quality_text_specific
    barplot_mode(data,x,y,plot_title, expanded_ct['Replaced_mode'].dropna().unique().tolist(), file_name)
    alt_text = store_alt_text_bar(pd.DataFrame(data['Average ('+label_units_lower+')'].values, data.Replaced_mode), file_name, plot_title)

except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Number of trips by day for specified mode

In [None]:
# Bar chart: number of trips made with the mode of interest per day of the month.
plot_title_no_quality=f"Number of {mode_of_interest} trips by day"
# Pure f-string: mixing f-string and %-formatting would fail if the mode name contained '%'.
file_name =f'ntrips_{mode_of_interest}_per_day{file_suffix}'

try:
    # 'count' is the number of trips on each day; the 'sum' aggregate only fills
    # the 'Total' column, which is not plotted.
    fq_days = data_eb.groupby(['start_local_dt_day']).agg({'start_local_dt_day': ['sum', 'count']})
    fq_days = fq_days.reset_index()
    fq_days.columns = ['Day of the Month', 'Total', 'Number of Trips']

    data = fq_days
    x = 'Day of the Month'
    y = 'Number of Trips'

    # The chart is built from the mode-specific subset, so it carries that subset's quality text.
    plot_title= plot_title_no_quality+"\n"+quality_text_specific
    barplot_day(data,x,y,plot_title,file_name)
    alt_text = store_alt_text_bar(pd.DataFrame(data['Number of Trips'].values, data['Day of the Month'].values), file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)

### Number of trips by day of week

In [None]:
# Bar chart: number of trips made with the mode of interest per weekday.
plot_title_no_quality=f"Number of {mode_of_interest} trips by weekday"
# Pure f-string: mixing f-string and %-formatting would fail if the mode name contained '%'.
file_name =f'ntrips_{mode_of_interest}_per_weekday{file_suffix}'

try:
    # 'count' is the number of trips on each weekday; the 'sum' aggregate only
    # fills the 'Total' column, which is not plotted.
    fq_weekdays = data_eb.groupby(['start_local_dt_weekday']).agg({'start_local_dt_weekday': ['sum', 'count']})
    fq_weekdays = fq_weekdays.reset_index()
    fq_weekdays.columns = ['Weekday', 'Total', 'Number of Trips']
    # Weekday values index into this list (0 -> "Mon" ... 6 -> "Sun").
    weekday_labels = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    fq_weekdays["Weekday"] = fq_weekdays.Weekday.apply(lambda x: weekday_labels[x])

    data = fq_weekdays
    x = 'Weekday'
    y = 'Number of Trips'

    # The chart is built from the mode-specific subset, so it carries that subset's quality text.
    plot_title= plot_title_no_quality+"\n"+quality_text_specific
    barplot_day(data,x,y,plot_title,file_name)
    alt_text = store_alt_text_bar(pd.DataFrame(data['Number of Trips'].values, data['Weekday'].values), file_name, plot_title)
except Exception:
    # Best-effort fallback: any failure above produces a "missing plot" instead.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    generate_missing_plot(plot_title_no_quality,debug_df,file_name)
    alt_text = store_alt_text_missing(debug_df, file_name, plot_title_no_quality)