In [None]:
# Notebook parameters.
# year/month are handed to scaffolding.get_time_query and scaffolding.get_file_suffix below;
# presumably None means "do not restrict by date" — TODO confirm in scaffolding.
year = None
month = None
# Program name handed to scaffolding.load_all_participant_trips and scaffolding.get_file_suffix
program = "prepilot"
include_replaced_modes_as_valid = True # Flip this when we want to get results versus generate the replaced_mode correction graphs
# NOTE(review): not referenced in the visible part of this notebook
model_with_sensed = False
input_dataset = "ONLY_LABELED" # "ONLY_LABELED", "ONLY_SENSED" or "BEST_AVAILABLE" for sensitivity analysis
# An inferred (label assist) label is only used when its probability is strictly above this value
LABEL_ASSIST_THRESHOLD = 0.3

In [None]:
from collections import defaultdict
import datetime

import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
import biogeme.distributions as dist
from biogeme.expressions import Beta, DefineVariable, RandomVariable, exp, PanelLikelihoodTrajectory, bioDraws, log, MonteCarlo, Integrate
import biogeme.results as res
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import sklearn.metrics
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import emission.core.get_database as edb
import emission.core.wrapper.entry as ecwe
import emission.storage.decorations.analysis_timeseries_queries as esda
import emission.storage.decorations.trip_queries as esdt
import emission.storage.decorations.timeline as esdl
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.timeseries.timequery as estt
import scaffolding
from uuid import UUID

import replacement_models

%matplotlib inline

In [None]:
import importlib
importlib.reload(replacement_models)

## Collect Data From Database

In [None]:
# Loading mapping dictionaries from mapping_dictionaries notebook
# Dictionary map is putting all other replaced modes into other
%store -r df_EI
%store -r dic_re
%store -r dic_pur
%store -r dic_fuel

# convert a dictionary to a defaultdict
# so that any label that is missing from the stored mapping translates to 'Other'
dic_pur = defaultdict(lambda: 'Other',dic_pur)
dic_re = defaultdict(lambda: 'Other',dic_re)

In [None]:
tq = scaffolding.get_time_query(year, month)

In [None]:
# Bucket every registered UUID (as a string) into three lists:
# all users, stage users (email starts with "stage_") and non-stage users
all_uuids = []
stage_uuids = []
non_stage_uuids = []
for ue in edb.get_uuid_db().find():
    uuid_str = str(ue['uuid'])
    all_uuids.append(uuid_str)
    # every user lands in exactly one of the stage / non-stage buckets
    bucket = stage_uuids if ue['user_email'].startswith("stage_") else non_stage_uuids
    bucket.append(uuid_str)
stage_uuids

In [None]:
# Here are the proportions across them
len(stage_uuids), len(non_stage_uuids), len(all_uuids)

In [None]:
# Load all the participant confirmed trips
participant_ct_df = scaffolding.load_all_participant_trips(program, tq)

In [None]:
# wait, we have 219 unique users with trips, which doesn't match any of the numbers
len(participant_ct_df.user_id.unique())

In [None]:
# Let's see if this is because of strings; nope
participant_ct_df["user_id_str"] = participant_ct_df.user_id.apply(lambda u: str(u))
len(participant_ct_df.user_id_str.unique())

In [None]:
# Let's see which category the users without any confirmed trips fit into.
# The second value is the expected count (registered users minus users with trips);
# it is computed from the data instead of being hard-coded so that it stays
# correct when the dataset changes.
missing_uuids = set(all_uuids).difference(set(participant_ct_df.user_id_str))
len(missing_uuids), len(set(all_uuids)) - participant_ct_df.user_id_str.nunique()

In [None]:
set(missing_uuids).intersection(stage_uuids), set(missing_uuids).intersection(non_stage_uuids)

In [None]:
# They seem to be split pretty evenly between stage and non stage
len(set(missing_uuids).intersection(stage_uuids)), len(set(missing_uuids).intersection(non_stage_uuids))

In [None]:
# Remove stage users, comparing string, we find a difference
# The split compares the string form of the user id against the stage UUID strings.
# The three printed lengths are: non-stage trips, stage trips, all trips.
non_stage_ct_df = participant_ct_df[~participant_ct_df['user_id_str'].isin(stage_uuids)]
stage_ct_df = participant_ct_df[participant_ct_df['user_id_str'].isin(stage_uuids)]
print(len(non_stage_ct_df))
print(len(stage_ct_df))
print(len(participant_ct_df))

In [None]:
# since we have missing UUIDs, let's confirm that none of them have trips in the database
# and that we are not missing trips just because of read limits
missing_stage_uuids = set(missing_uuids).intersection(stage_uuids)
missing_non_stage_uuids = set(missing_uuids).intersection(non_stage_uuids)

# Both names are also imported at the top of the notebook; repeated here so that
# this check can be re-run on its own
from uuid import UUID
import emission.core.get_database as edb

# Look the collection up once instead of once per user
analysis_ts_db = edb.get_analysis_timeseries_db()
for uuid_str in missing_stage_uuids.union(missing_non_stage_uuids):
    trip_count = analysis_ts_db.count_documents({"user_id": UUID(uuid_str), "metadata.key": "analysis/confirmed_trip"})
    # single formatting mechanism (f-string) instead of mixing f-string and % formatting
    print(f"For {uuid_str}, found {trip_count} trips in the database")

In [None]:
# labeled_ct = scaffolding.filter_labeled_trips(participant_ct_df)
# expanded_ct = scaffolding.expand_userinputs(labeled_ct)

In [None]:
# Use sensed as well as labeled trips
# NOTE(review): this expanded_ct (all participants) is overwritten in the next cell by the
# non-stage-only version and is not read in between, so the first call looks redundant —
# confirm before removing.
expanded_ct = scaffolding.expand_userinputs(participant_ct_df)
expanded_stage_ct = scaffolding.expand_userinputs(stage_ct_df)
expanded_non_stage_ct = scaffolding.expand_userinputs(non_stage_ct_df)

In [None]:
# Using non-stage users
expanded_ct = scaffolding.data_quality_check(expanded_non_stage_ct)
expanded_ct.shape

In [None]:
# SHANKARI: I am not sure it is OK to fill in the inferred values as `mode_confirm`
# mode confirm should be reserved for user input
# the inferred labels are generated by the label assist algorithm and have accuracy of ~ 50%

def describe_labels(ct_df):
    """Print how many trips carry a user label, an inferred label, or neither.

    ct_df needs a `mode_confirm` column (missing when the user did not label the trip)
    and an `inferred_labels` column holding a sized collection per trip.
    Nothing is returned; the five counts are written to stdout.
    """
    missing_user_label = pd.isna(ct_df.mode_confirm)

    # Overall size and the user-labeled part
    print("Total number of trips", len(ct_df))
    print("Trips with user specified labels", len(ct_df[~missing_user_label]))

    # The unlabeled part, split by whether label assist produced anything
    no_user_label_ct_df = ct_df[missing_user_label]
    print("Trips without user specified labels", len(no_user_label_ct_df))
    no_inference = no_user_label_ct_df.inferred_labels.apply(lambda ll: len(ll) == 0)
    print("Trips without user label but with inferred label", len(no_user_label_ct_df[~no_inference]))
    print("Trips without user label or inferred label", len(no_user_label_ct_df[no_inference]))

describe_labels(expanded_ct)

In [None]:
# no_user_label_ct_df[no_user_label_ct_df.inferred_labels.apply(is_empty_check)][["inferred_labels", "start_fmt_time", "end_fmt_time", "mode_confirm", "purpose_confirm", "user_id"]].head()

In [None]:
# so we really only have 57k labeled trips, not 115k
# we can generate results with label assist and/or primary sensed mode for comparison,
# but I think that the most principled version of the evaluation should use only labeled trips

In [None]:
# small unit test
# mode_split_map = pd.Series({'bicycling': 0.3244066758052265, 'walking': 0.6755933241947736})
# mode_split_map.index[mode_split_map.argmax()]

In [None]:
# Copied from https://github.com/allenmichael099/e-mission-eval-private-data/blob/hybrid_labeling_analysis_Michael/Error_bars/add_new_label_fields.py#L45
# TODO: Need to clean up later instead of copy-paste
def get_primary_sensed_mode(ct_row):
    """Return the sensed mode that accounts for the most section time in a trip.

    ct_row: a confirmed-trip row; `user_id` and `cleaned_trip` are read from it to
        look up the trip's "analysis/inferred_section" entries.
    Returns the mode string with the largest summed section duration, or None when
    the trip has no inferred sections (previously this case raised from argmax()).
    Raises KeyError for a sensed_mode code that is not in the table below.
    """
    # These keys were found in emission/core/wrapper/modeprediction.py:
    # NOTE(review): code 1 maps to "drove_alone" here, while the code this was copied from
    # (see the example below) appears to have used 'walking'/'bicycling' for codes 1/2 —
    # confirm that "drove_alone" is really the intended label for code 1.
    sensed_mode_types = {0: "unknown", 1: "drove_alone",2: "bike",
                     3: "bus", 4: "train", 5: "car", 6: "air_or_hsr",
                     7: "train", 8: "train", 9: "train"}

    # Get the segments for the trip.
    #cleaned_section will only have walk/bike/automotive, inferred_section is the one that has bus/train/car etc
    segments = esdt.get_sections_for_trip(key = "analysis/inferred_section", user_id = ct_row["user_id"], trip_id = ct_row['cleaned_trip'])

    # Sum the section durations per mode
    trip_mode_durations = {}
    for s in segments:
        # the sensed mode is a number in the database, so it is relabeled as a string
        mode = sensed_mode_types[s['data']['sensed_mode']]
        trip_mode_durations[mode] = trip_mode_durations.get(mode, 0) + s['data']['duration']

    # No sections => nothing was sensed for this trip
    if len(trip_mode_durations) == 0:
        return None

    # The mode with the largest total duration is also the one with the largest share, so
    # the durations are compared directly; dividing by the total would only add a
    # division-by-zero failure for trips whose sections all have zero duration.
    # e.g. {'bicycling': 120.0, 'walking': 250.0} -> 'walking'
    mode_durations = pd.Series(trip_mode_durations)
    primary_mode = mode_durations.index[mode_durations.argmax()]
#     print(f"After processing {len(segments)} segments for trip {ct_row['_id']} for user {ct_row['user_id']}, returning {primary_mode}")
    return primary_mode

In [None]:
def get_max_prob_label(inferred_label_list, p_threshold):
    """Return the `labels` entry with the highest probability, if it beats the threshold.

    inferred_label_list: list of records with two fields, "labels" and "p".
    p_threshold: the highest probability must be strictly greater than this.
    Returns the `labels` value of the most probable record, or None (after printing a
    message) when the highest probability does not exceed the threshold.
    """
    # copied from emission/storage/decorations/trip_queries.py
    # Two columns: "labels" and "p"
    label_prob_df = pd.DataFrame(inferred_label_list)
    max_p = label_prob_df.p.max()
    if not max_p > p_threshold:
        print(f"max_p {max_p} < threshold {p_threshold}, returning None")
        return None
    # idxmax gives the row label of the most probable record; look up its labels
    return label_prob_df.loc[label_prob_df.p.idxmax()].labels

def get_best_label_assist_mode(ct_row, p_threshold):
    """Return the label-assist mode for a trip, or None when there is no usable inference.

    ct_row: trip row whose `inferred_labels` holds the label-assist candidates.
    p_threshold: passed through to get_max_prob_label.
    Returns the "mode_confirm" of the most probable candidate, or None when there are
    no candidates or none of them exceeds the threshold.
    """
    # copied and modified from emission/storage/decorations/trip_queries.py line 290-ish
    all_inferred_labels = ct_row.inferred_labels
    if len(all_inferred_labels) == 0:
        return None
    max_p_labels = get_max_prob_label(all_inferred_labels, p_threshold)
    if max_p_labels is None:
        return None
    return max_p_labels["mode_confirm"]
    
def get_best_hybrid_mode(ct_row):
    """Return the best available mode for a trip.

    Precedence: the user's own label, then the label-assist mode (when its probability
    exceeds LABEL_ASSIST_THRESHOLD), then the primary sensed mode.
    ct_row: trip row with `mode_confirm` and the fields the two fallbacks read.
    Returns a single mode value (not the row).
    """
    # Missing user labels show up as NaN in the dataframe, and NaN `is not None`,
    # so the check has to be pd.isna; return the label itself rather than the whole row.
    if not pd.isna(ct_row.mode_confirm):
        return ct_row.mode_confirm
    else:
        best_label_assist_mode = get_best_label_assist_mode(ct_row, LABEL_ASSIST_THRESHOLD)
        if best_label_assist_mode is not None:
            return best_label_assist_mode
        else:
            return get_primary_sensed_mode(ct_row)

In [None]:
# Basic sanity checks
# Run the three labeling strategies on a small fixed sample (random_state keeps it repeatable)
# and print the trips for which each strategy produced no mode.
sanity_test_df = expanded_ct.sample(n=50, random_state=1234)
print("Sanity testing characteristics ")
describe_labels(sanity_test_df)

# Strategy 1: primary sensed mode for every trip
only_sensed_df = sanity_test_df.copy()
only_sensed_df.mode_confirm = sanity_test_df.apply(lambda row: get_primary_sensed_mode(row), axis=1)
# This should be zero
print(only_sensed_df[pd.isna(only_sensed_df.mode_confirm)])

# Strategy 2: label assist only; trips without an inferred label above the threshold stay empty
only_label_assist_df = sanity_test_df.copy()
only_label_assist_df.mode_confirm = sanity_test_df.apply(lambda row: get_best_label_assist_mode(row, LABEL_ASSIST_THRESHOLD), axis=1)
print(only_label_assist_df[pd.isna(only_label_assist_df.mode_confirm)][["start_fmt_time", "end_fmt_time"]])

# Strategy 3: hybrid of user label, label assist and sensed mode (get_best_hybrid_mode)
hybrid_df = sanity_test_df.copy()
hybrid_df.mode_confirm = sanity_test_df.apply(lambda row: get_best_hybrid_mode(row), axis=1)
print(hybrid_df[pd.isna(hybrid_df.mode_confirm)])

In [None]:
# Choose which trips / mode labels feed the rest of the analysis
if input_dataset == "ONLY_LABELED":
    # keep only the trips that the user labeled
    expanded_ct = scaffolding.filter_labeled_trips(expanded_ct)
elif input_dataset == "BEST_AVAILABLE":
    # Keep every trip and every user label; only the trips without a user label are
    # filled in, preferring the label-assist mode over the primary sensed mode.
    # The result goes into the mode_confirm column — the dataframe itself must not be
    # replaced by the output of apply().
    def _fill_in_mode(ct_row):
        assist_mode = get_best_label_assist_mode(ct_row, LABEL_ASSIST_THRESHOLD)
        return assist_mode if assist_mode is not None else get_primary_sensed_mode(ct_row)

    needs_fill = pd.isna(expanded_ct.mode_confirm)
    # apply() on zero rows does not return a Series, so skip it when nothing is missing
    if needs_fill.any():
        expanded_ct.loc[needs_fill, "mode_confirm"] = expanded_ct[needs_fill].apply(_fill_in_mode, axis=1).values
elif input_dataset == "ONLY_SENSED":
    # ignore all labels and use the primary sensed mode for every trip
    expanded_ct["mode_confirm"] = expanded_ct.apply(lambda row: get_primary_sensed_mode(row), axis=1)
else:
    # a typo in the parameter cell would otherwise silently analyse the unfiltered data
    raise ValueError(f"Unknown input_dataset {input_dataset!r}; expected ONLY_LABELED, ONLY_SENSED or BEST_AVAILABLE")

In [None]:
# Mapping new labels with dictionaries
# dic_pur and dic_re were wrapped in defaultdicts earlier, so labels that are not in the
# stored mappings come out as 'Other'
expanded_ct['Trip_purpose'] = expanded_ct['purpose_confirm'].map(dic_pur)
expanded_ct['Mode_confirm'] = expanded_ct['mode_confirm'].map(dic_re)
expanded_ct['Replaced_mode'] = expanded_ct['replaced_mode'].map(dic_re)

# Mapping fuel
# NOTE(review): dic_fuel is used as stored (no default), so modes missing from it
# presumably end up as NaN in the fuel columns — confirm
expanded_ct['Mode_confirm_fuel'] = expanded_ct['Mode_confirm'].map(dic_fuel)
expanded_ct['Replaced_mode_fuel'] = expanded_ct['Replaced_mode'].map(dic_fuel)

In [None]:
# Change meters to miles
scaffolding.unit_conversions(expanded_ct)

In [None]:
file_suffix = scaffolding.get_file_suffix(year, month, program)
quality_text = scaffolding.get_quality_text(participant_ct_df, expanded_ct)

In [None]:
# Calculate energy impact
expanded_ct = scaffolding.energy_intensity(expanded_ct, df_EI, 'distance_miles', 'Replaced_mode', 'Mode_confirm')
expanded_ct = scaffolding.energy_impact_kWH(expanded_ct, 'distance_miles', 'Replaced_mode', 'Mode_confirm')
expanded_ct = scaffolding.CO2_impact_lb(expanded_ct, 'distance_miles', 'Replaced_mode', 'Mode_confirm')

In [None]:
# Join the expanded database data to socioeconomic data
# The survey export is read from the notebook's working directory and its long
# question-text column names are shortened to the variable names used below.
socio_data = pd.read_csv('./Can Do Colorado eBike Program - en.csv')
socio_data.rename(columns={'Unique User ID (auto-filled, do not edit)':'user_id',
                          'Please identify which category represents your total household income, before taxes, for last year.':'HHINC',
                          'How many motor vehicles are owned, leased, or available for regular use by the people who currently live in your household?':'VEH',
                           'In which year were you born?':'AGE',
                          'Including yourself, how many people live in your home?':'HHSIZE',
                          'How many children under age 18 live in your home?':'CHILDREN',
                          'What is your gender?':'GENDER',
                          'If you were unable to use your household vehicle(s), which of the following options would be available to you to get you from place to place?':'available_modes',
                          'Are you a student?':'STUDENT'}, inplace=True)
# Responses without a user id cannot be joined to trips
socio_data = socio_data[~socio_data.user_id.isnull()]

# Deal with people who have multiple responses by using most recent
# NOTE(review): Timestamp is not parsed as a date when the CSV is read, so this sort is
# presumably a plain string sort — confirm that the timestamp format sorts chronologically
socio_data = socio_data.sort_values(by=['user_id', 'Timestamp'])
socio_data.drop_duplicates(subset=['user_id'], keep='last', inplace=True)
# Move the id to a dedicated join column so it does not collide with the trips' user_id
socio_data['user_id_socio'] = socio_data.user_id
socio_data = socio_data.drop(labels='user_id', axis=1)

# Lose some trips due to people with no survey responses
# (the merge keeps only the trips whose dash-less user id appears in the survey)
expanded_ct['user_id_socio'] = expanded_ct.user_id.astype(str)
expanded_ct.user_id_socio = [i.replace('-','') for i in expanded_ct.user_id_socio] # remove all dashes from strings
expanded_ct = expanded_ct.merge(socio_data, on='user_id_socio')

## Data Preprocessing

In [None]:
# Add non-label category
# Trips without a stated replaced mode are marked 'Unlabeled' in both the raw and the
# mapped column so that they can be separated out later
expanded_ct['replaced_mode'] = expanded_ct['replaced_mode'].fillna('Unlabeled')
expanded_ct.loc[expanded_ct['replaced_mode'] == 'Unlabeled', 'Replaced_mode'] = "Unlabeled"

# Select variables of interest from complete OpenPATH data
data = expanded_ct[['Mode_confirm','Replaced_mode','replaced_mode','Trip_purpose','duration','distance_miles','start_local_dt_weekday','available_modes','AGE','HHINC','VEH','HHSIZE','CHILDREN','GENDER','STUDENT','user_id','_id','start_local_dt_year','start_local_dt_month','start_local_dt_day','cleaned_trip']].copy()

# List of variables to keep in data but not turn into categorical number variables
dont_categorize = ['user_id','_id','cleaned_trip']

# Make copy of user_id to be categorized since both versions are needed
data['user_id_int'] = data['user_id']

# Get timestamp from known year/month/day aggregated to days
data.rename(columns={'start_local_dt_year':'year','start_local_dt_month':'month','start_local_dt_day':'day'}, inplace=True)
data['date_time'] = pd.to_datetime(data[['year','month','day']])
data = data.drop(columns=['year','month','day'])

# Fix age
# The survey column holds the year of birth; 2022 is the hard-coded reference year
data['AGE'] = 2022 - data['AGE']

# Get number of workers
# (approximated as household members who are not children)
data['WORKERS'] = data['HHSIZE'] - data['CHILDREN']

# Duration in minutes
# NOTE(review): assumes the stored duration is in seconds — confirm
data['duration'] = data['duration'] / 60

# Filter out some responses to data
data = data[~data['Mode_confirm'].isin(['Not a Trip','Other'])]
data = data[~data['Replaced_mode'].isin(['Not a Trip','Other'])]
data = data[~data['available_modes'].isin(['None', 'Prefer not to say'])]
# data = data[~data['Trip_purpose'].isin(['not_a_trip','Other'])]
# data = data[~data['HHINC'].isin(['Prefer not to say'])]
# data = data[~data['VEH'].isin(['Prefer not to say / Prefiero no decir.'])]

# Combine variable categories
# Detailed mode labels -> coarser alternatives used by the models. The replacement is
# applied to every column of the dataframe, exactly like the individual replace calls.
mode_groups = {
    'Car, drove alone': 'car',
    'Car, with others': 's_car',
    'Bikeshare': 's_micro',
    'Scooter share': 's_micro',
    'Regular Bike': 'p_micro',
    'Skate board': 'p_micro',
    'Train': 'transit',
    'Free Shuttle': 'transit',
    'Bus': 'transit',
    'Walk': 'walk',
    'Taxi/Uber/Lyft': 'ridehail',
    'Pilot ebike': 'ebike',
}
data = data.replace(mode_groups)

# data = data.replace(['Home','School','Work'], 'hbw')
# data = data.replace(['Entertainment/Social','Meal','Personal/Medical','Pick-up/Drop off','Recreation/Exercise','Religious','Shopping','Transit transfer'], 'non_hbw')

# data['start_local_dt_weekday'] = data['start_local_dt_weekday'].replace(['1','2','3','4','5'],'1')
# data['start_local_dt_weekday'] = data['start_local_dt_weekday'].replace(['0','6'],'0')

# data = data.replace(['By hours ','Custodian','Fire Fighter 2 Training',
#  'Graduate','Prefer not to say','Taking prerequisites missing for grad program ',
#  'Yes - Full Time College/University',
#  'Yes - Part-Time College/University',
#  'Yes - Vocation/Technical/Trade School',
#  'taking classes toward early childhood licensure'], 'student')
# data = data.replace('Not a student', 'non_student')

# Calculate travel times for each trip, across every mode
def add_all_mode_tt(data, mode_col, duration_col, dist_col):
    """Add one predicted travel-time column per mode to `data`.

    For every distinct value of data[mode_col], a linear regression of duration on
    distance is fitted on that mode's trips and then evaluated for ALL trips. The
    predictions are stored in a column named 'tt_<mode>'.
    `data` is modified in place and also returned.
    """
    all_distances = data[dist_col].values.reshape(-1,1)

    predicted_times = {}
    for mode in pd.unique(data[mode_col]):
        # Fit on the trips that actually used this mode
        trips_with_mode = data[data[mode_col]==mode]
        regr = linear_model.LinearRegression()
        regr.fit(trips_with_mode[dist_col].values.reshape(-1,1), trips_with_mode[duration_col].values.reshape(-1,1))

        # ...and predict what every trip would have taken with it
        predicted_times['tt_'+mode] = regr.predict(all_distances)

    # Columns are added only after all models are fitted
    for column_name, column_values in predicted_times.items():
        data[column_name] = column_values

    return data

# Calculate all mode travel times and add to dataframe
# (adds one 'tt_<mode>' column per confirmed mode, in minutes like the duration column)
data = add_all_mode_tt(data,'Mode_confirm','duration','distance_miles')

# Calculate vehicle costs based roughly on $/mi from: https://www.vtpi.org/tca/tca0501.pdf
# Modes that are not listed here get no cost column
cost_factors = {'car':0.80,
                's_car':0.40,
                'ridehail':3.00,
                's_micro':1.50,
                'transit':0.40}

def add_all_mode_cost(data, cost_factors, dist_col):
    """Add a 'cost_<mode>' column (factor * distance) for every entry of cost_factors.

    `data` is modified in place and also returned.
    """
    for mode_name, cost_per_distance in cost_factors.items():
        data['cost_'+mode_name] = cost_per_distance * data[dist_col]
    return data

# Calculate all mode travel costs and add to dataframe
# (the function adds the columns in place, so the return value is not needed)
add_all_mode_cost(data, cost_factors, 'distance_miles')

# Labels for modes in the availability survey
# NOTE: 'None' has to stay the LAST entry — add_mode_availability drops the last value
# of this dict when it builds its list of modes
availability_codes = {'Public transportation (bus, subway, light rail, etc.)':'transit',
                      'Get a ride from a friend or family member':'s_car',
                      'Rental car (including Zipcar/ Car2Go)':'car',
                      'Taxi (regular taxi, Uber, Lyft, etc)':'ridehail',
                      'Bicycle':'p_micro',
                      'Shared bicycle or scooter':'s_micro',
                      'Walk/roll':'walk',
                      'Skateboard':'p_micro',
                      'ebike':'ebike',
                      'None':'none'}

def add_mode_availability(data, availability_codes, availability_col, choice_col, replaced_col, is_sp):
    """Add a binary 'av_<mode>' availability column for every mode.

    data: trips; data[availability_col] holds ';'-separated survey answers that are
        translated through availability_codes.
    availability_codes: survey answer -> mode; the last entry is left out of the mode list.
    choice_col / replaced_col: columns with the chosen and the replaced mode.
    is_sp: for stated-preference rows the chosen mode is forced to unavailable; otherwise
        a mode is available when it was chosen, replaced, or stated as available.
    `data` is modified in place and also returned.
    """
    mode_list = np.unique(list(availability_codes.values())[:-1])
    chosen_modes = data[choice_col].values
    replaced_modes = data[replaced_col].values
    for mode in mode_list:
        availability_flags = []
        for row_idx, stated in enumerate(data[availability_col].values):
            stated_modes = [availability_codes[answer] for answer in stated.split(';')]
            is_chosen = mode==chosen_modes[row_idx]
            if is_sp and is_chosen:
                # SP: the chosen mode must not be in the choice set
                usable = False
            else:
                # SP (not chosen): replacement or stated; RP: chosen, replacement or stated
                usable = is_chosen or mode==replaced_modes[row_idx] or mode in stated_modes
            availability_flags.append(1 if usable else 0)
        # One binary column per mode
        data['av_'+mode] = availability_flags

    return data

# Split data into revealed choice and stated replacement choice (2 obs per trip)
# RP rows use the confirmed mode as the choice, SP rows use the stated replaced mode
data_rp = data.copy()
data_sp = data.copy()
data_rp['is_sp'] = False
data_sp['is_sp'] = True
data_rp['mode_choice'] = data_rp['Mode_confirm']
data_sp['mode_choice'] = data_sp['Replaced_mode']

# The SP data cannot include trips where the chosen/replaced modes are stated the same
# We need to mark the chosen mode as unavailable in the SP data, which breaks the model if they're the same
data_sp = data_sp[data_sp.Mode_confirm!=data_sp.Replaced_mode]

# Make sure both chosen and replaced modes are in choice sets
data_rp = add_mode_availability(data_rp, availability_codes, 'available_modes', 'Mode_confirm', 'Replaced_mode', is_sp=False)
data_sp = add_mode_availability(data_sp, availability_codes, 'available_modes', 'Mode_confirm', 'Replaced_mode', is_sp=True)

# Combine RP/SP data. Keep RP data separate with a few additional columns for later analysis.
# Note that the combined frame holds all RP rows first, followed by the SP rows, and that
# only the combined frame (not data_rp) has the 'Unlabeled' / 'No Travel' rows removed.
data = pd.concat([data_rp, data_sp])
data = data[~data['Replaced_mode'].isin(['Unlabeled', 'No Travel'])]

# Handle all variables that are ordinal; otherwise they may not end up in correct order
# data.HHINC = pd.Categorical(data.HHINC,
#                             ordered=True,
#                             categories=['Less than $24,999',
#                                        '$25,000-$49,999',
#                                        '$50,000-$99,999',
#                                        '$100,000 -$149,999',
#                                        '$150,000-$199,999',
#                                        '$200,000 or more'])
# data.VEH = pd.Categorical(data.VEH,
#                             ordered=True,
#                             categories=['0',
#                                        '1',
#                                        '2',
#                                        '3',
#                                        '4+'])

# Make sure that all mode variables align after being converted to numeric variables
mode_list = ['car','s_car','ridehail','transit','p_micro','s_micro','walk','ebike']
data.mode_choice = pd.Categorical(data.mode_choice, ordered=True, categories=mode_list)
data.Mode_confirm = pd.Categorical(data.Mode_confirm, ordered=True, categories=mode_list)
data.Replaced_mode = pd.Categorical(data.Replaced_mode, ordered=True, categories=mode_list)

# Convert categorical variables to numeric
cat_columns = data.select_dtypes(['object','category']).columns
cat_columns = cat_columns.drop(labels=dont_categorize)
# Cast the selected columns to category ONCE; the previous loop re-cast the whole
# frame for every single column just to read one column's categories.
categorical_view = data[cat_columns].astype('category')
# Keep a record of what order the categories are in when converted
# (all_categories[i] belongs to cat_columns[i])
all_categories = [categorical_view.iloc[:,i].cat.categories for i in range(0,len(cat_columns))]
data[cat_columns] = data[cat_columns].apply(lambda x: x.astype('category').cat.codes)

In [None]:
# Show listed categories in their order
cat_code_lookup = dict(zip(cat_columns.values, [list(x.values) for x in all_categories]))
cat_code_lookup

In [None]:
data.isna().sum()

In [None]:
# Setting up dataframes for different analyses throughout the notebook
# NOTE: the df_ebike* frames and df_replaced_trips are built from data_rp, which still
# has string mode labels; df_rp is built from `data`, whose categorical columns have
# already been converted to numeric codes.

# Only ebike, labeled trips
df_ebike = data_rp[data_rp['Mode_confirm'].isin(['ebike'])].copy()
df_ebike = df_ebike[~df_ebike['Replaced_mode'].isin(['Unlabeled','No Travel'])]

# Only ebike, unlabeled trips
df_ebike_unlabeled = data_rp[data_rp['Mode_confirm'].isin(['ebike'])].copy()
df_ebike_unlabeled = df_ebike_unlabeled[df_ebike_unlabeled['Replaced_mode'].isin(['Unlabeled'])].copy()

# Only ebike, new trips
df_ebike_new_travel = data_rp[data_rp['Mode_confirm'].isin(['ebike'])].copy()
df_ebike_new_travel = df_ebike_new_travel[df_ebike_new_travel['Replaced_mode'].isin(['No Travel'])]

# RP data only for basic stats and analysis (Removed unlabeled and no travel)
df_rp = data[data['is_sp']==False]

# For analysis of accurately stated replacements
df_replaced_trips = data_rp[~data_rp['Replaced_mode'].isin(['No Travel','Unlabeled'])].copy()

# Set up K-fold cross validation
# NOTE(review): no shuffle is requested, and `data` holds the RP rows followed by the SP
# rows, so the folds are presumably contiguous chunks rather than a random mix — confirm
# how replacement_models uses kf and whether shuffle=True with a fixed seed is wanted.
kf = KFold(n_splits=3)

# Collect all scores to show at end of modeling
score_results = {}

In [None]:
# Check that mode availability is being set properly
# mode_choice should always be available; in RP it is Mode_confirm in SP it is Replaced_mode
for mode_code, mode in enumerate(cat_code_lookup['mode_choice']):
    print(mode)
    # every observation that chose this mode must have it flagged as available
    chosen_rows = data[data['mode_choice']==mode_code]
    assert sum(chosen_rows[f"av_{mode}"]) == len(chosen_rows)

## Data Stats

In [None]:
# Data stats
# A trip is one RP row; `data` also contains the SP observation of the same trip, so
# "Trips per user" has to be computed from the RP rows, not from all observed choices.
n_trips = len(df_rp)
n_users = len(pd.unique(data.user_id))
print(f"Trips: {n_trips}")
print(f"Observed Choices: {len(data)}")
print(f"Users: {n_users}")
print(f"Trips per user: {n_trips / n_users}")
# Both ebike counts below use df_ebike, i.e. ebike trips WITH a stated replacement
# (unlabeled and new-travel trips are excluded from it)
print(f"New activity: {len(df_ebike_new_travel) / len(df_ebike)}")
print(f"Ebike all trips: {len(df_ebike)}")

In [None]:
# Number of trips per stated trip purpose (RP observations only)
plot_data = df_rp.copy()
# Translate the numeric codes back to their labels. The code -> label mapping is derived
# from the recorded categories, so it keeps working when the number of categories changes
# (hard-coded code lists fail as soon as their length differs from the category list).
plot_data['Trip_purpose'] = plot_data['Trip_purpose'].replace(dict(enumerate(cat_code_lookup['Trip_purpose'])))
plot_data['mode_choice'] = plot_data['mode_choice'].replace(dict(enumerate(cat_code_lookup['mode_choice'])))
# Count the trips per purpose; mode_choice is only used as the column that gets counted
plot_data = plot_data[~plot_data['Trip_purpose'].isin(['not_a_trip'])].groupby(['Trip_purpose']).count()[['mode_choice']].reset_index()
plot_data = plot_data.sort_values('mode_choice', ascending=False)

fig, ax = plt.subplots(figsize=(13,6))
sns.barplot(ax=ax, data=plot_data, x='Trip_purpose', y='mode_choice', color='darkblue').set(title='Trip Purpose', xlabel='Stated Purpose', ylabel='Count')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

In [None]:
# Distribution of trip distance by chosen mode (RP observations shorter than 10 miles)
plot_data = df_rp.copy()
# Translate the numeric codes back to their labels. The code -> label mapping is derived
# from the recorded categories, so it keeps working when the number of categories changes
# (hard-coded code lists fail as soon as their length differs from the category list).
plot_data['Trip_purpose'] = plot_data['Trip_purpose'].replace(dict(enumerate(cat_code_lookup['Trip_purpose'])))
plot_data['mode_choice'] = plot_data['mode_choice'].replace(dict(enumerate(cat_code_lookup['mode_choice'])))
plot_data = plot_data[plot_data['distance_miles']<10]
# Friendlier legend title
plot_data['Mode'] = plot_data['mode_choice']

fig, ax = plt.subplots(figsize=(13,8))
sns.histplot(ax=ax, data=plot_data, x='distance_miles', hue='Mode', kde=True).set(title='Trip Distance', xlabel='Distance (mi)', ylabel='Count')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

# Random Forest Classifier

In [None]:
# Model inputs: predicted travel time per mode (tt_*, from add_all_mode_tt), cost per
# mode (cost_*, from add_all_mode_cost) and availability per mode (av_*, from
# add_mode_availability)
feature_list = ['tt_ebike', 'tt_s_car', 'tt_walk', 'tt_p_micro',
       'tt_car', 'tt_transit', 'tt_s_micro', 'tt_ridehail', 'cost_car',
       'cost_s_car', 'cost_ridehail', 'cost_s_micro', 'cost_transit', 'av_car', 'av_ebike', 'av_p_micro',
       'av_ridehail', 'av_s_car', 'av_s_micro', 'av_transit', 'av_walk']

### Train on All Choices

In [None]:
# Train and test model
# The returned accuracy / f1 / confusion values are averaged below, so they are
# presumably one entry per cross-validation fold — TODO confirm in replacement_models
rf, accuracy, f1, confusion = replacement_models.random_forest(data, 'mode_choice', feature_list, kf)

# Save scores for model comparison
# Key format is <model>_<train set>_<test set>; the score table at the end splits on '_'
score_results['rf_chosen_chosen'] = (np.mean(accuracy), np.mean(f1))
print(f"Accuracy: {np.mean(accuracy)}")
print(f"F1: {np.mean(f1)}")

# Average and plot the confusion matrices
# NOTE(review): all_categories[0] is the category list of the first categorical column,
# presumably Mode_confirm (same labels as mode_list) — cat_code_lookup['mode_choice']
# would state the intent more directly
confusion_mean = np.mean(np.array(confusion), axis=0)
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(confusion_mean, annot=True, fmt='.1%', cmap='YlGnBu', linewidths=.5, xticklabels=all_categories[0].values, yticklabels=all_categories[0].values, cbar=False).set(title='Random Forest Confusion Matrix (Chosen)', xlabel='Predicted', ylabel='Actual')
plt.subplots_adjust(bottom=0.25)

# Keep the model trained on all choices; later cells reuse the name rf
rf_keep = rf

### Train on Chosen Test on Holdout Users

In [None]:
# Train and test model (10 users for now)
accuracy_holdout = []
f1_holdout = []
confusion_holdout = []
for user in list(np.unique(df_rp.user_id))[:10]:

    # Remove user from training data, keep only user in test data.
    # The test rows are copied explicitly because they are modified below.
    labeled_df = data[data['user_id']!=user]
    unlabeled_df = df_rp[df_rp['user_id']==user].copy()

    # Set the chosen mode availability to 0.
    # Written with a single positional .iat on the frame: the chained form
    # frame[col].iat[i] = 0 assigns into a temporary column object and is not
    # guaranteed to reach the frame (it is a no-op under pandas Copy-on-Write).
    for row_pos, mode_code in enumerate(unlabeled_df['Mode_confirm'].values):
        av_col_pos = unlabeled_df.columns.get_loc(f"av_{mode_list[mode_code]}")
        unlabeled_df.iat[row_pos, av_col_pos] = 0

    # Train on all trips by other users
    rf, accuracy, f1, confusion = replacement_models.random_forest(labeled_df, 'mode_choice', feature_list, kf)

    # Test on the stated replacement for holdout user
    X_test = unlabeled_df[feature_list].values
    y_pred = rf.predict(X_test)
    y_test = unlabeled_df['Replaced_mode'].values

    accuracy_holdout.append(sklearn.metrics.accuracy_score(y_test, y_pred))
    f1_holdout.append(sklearn.metrics.f1_score(y_test, y_pred, average='weighted'))
    # labels are the numeric mode codes, in mode_list order
    confusion_holdout.append(sklearn.metrics.confusion_matrix(y_test, y_pred, labels=[0,1,2,3,4,5,6,7], normalize='pred'))

# Save scores for model comparison
score_results['rf_holdout_replaced'] = (np.mean(accuracy_holdout), np.mean(f1_holdout))
print(f"Accuracy: {np.mean(accuracy_holdout)}")
print(f"F1: {np.mean(f1_holdout)}")

# Average and plot the confusion matrices
confusion_mean = np.mean(confusion_holdout, axis=0).reshape(8,8)
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(confusion_mean, annot=True, fmt='.1%', cmap='YlGnBu', linewidths=.5, xticklabels=all_categories[0].values, yticklabels=all_categories[0].values, cbar=False).set(title='Random Forest Confusion Matrix (Holdout)', xlabel='Predicted', ylabel='Actual')
plt.subplots_adjust(bottom=0.25)

### Train on n Samples Chosen Test on Replaced

In [None]:
# Train and test model
samples_min = []
user_min = []
accuracy_min = []
f1_min = []

# Build the test set of each holdout user once; it does not depend on n_samples
holdout_tests = {}
for user in list(np.unique(df_rp.user_id))[:10]:
    # copied explicitly because the rows are modified below
    unlabeled_df = df_rp[df_rp['user_id']==user].copy()

    # Set the chosen mode availability to 0.
    # Written with a single positional .iat on the frame: the chained form
    # frame[col].iat[i] = 0 assigns into a temporary column object and is not
    # guaranteed to reach the frame (it is a no-op under pandas Copy-on-Write).
    for row_pos, mode_code in enumerate(unlabeled_df['Mode_confirm'].values):
        av_col_pos = unlabeled_df.columns.get_loc(f"av_{mode_list[mode_code]}")
        unlabeled_df.iat[row_pos, av_col_pos] = 0

    holdout_tests[user] = (unlabeled_df[feature_list].values, unlabeled_df['Replaced_mode'].values)

for n_samples in range(10,200,10):

    # Sample n training data points from all users (seeded so that a re-run
    # reproduces the same curve)
    labeled_df = data.sample(n_samples, random_state=1234)

    # Construct model (accuracy/f1 don't matter here; we re-test per user below)
    rf, accuracy, f1, confusion = replacement_models.random_forest(labeled_df, 'mode_choice', feature_list, kf)

    # Test on the stated replacement for each holdout user
    for user, (X_test, y_test) in holdout_tests.items():
        y_pred = rf.predict(X_test)

        accuracy_min.append(sklearn.metrics.accuracy_score(y_test, y_pred))
        f1_min.append(sklearn.metrics.f1_score(y_test, y_pred, average='weighted'))
        samples_min.append(n_samples)
        user_min.append(user)

In [None]:
# One row per (training sample size, holdout user) pair collected in the previous cell;
# the box plot shows the spread of holdout accuracy across users for each sample size
plot_data = pd.DataFrame({'samples_min':samples_min, 'user_min':user_min, 'accuracy_min':accuracy_min, 'f1_min':f1_min})

fig, ax = plt.subplots(figsize=(13,6))
sns.boxplot(ax=ax, data=plot_data, x='samples_min', y='accuracy_min', color='purple').set(title='Accuracy on n Samples', xlabel='n Samples', ylabel='Accuracy')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

In [None]:
# Same measurements as the box plot, shown as a bar plot per training sample size
plot_data = pd.DataFrame({'samples_min':samples_min, 'accuracy_min':accuracy_min, 'f1_min':f1_min})

fig, ax = plt.subplots(figsize=(13,6))
sns.barplot(ax=ax, data=plot_data, x='samples_min', y='accuracy_min', color='darkblue').set(title='Accuracy on n Samples', xlabel='n Samples', ylabel='Accuracy')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

# Choice Model

## MXL

### Train on All Choices

In [None]:
import importlib
# Re-import the helper module so edits made to it on disk are picked up
importlib.reload(replacement_models)

# Fit and evaluate the MXL model on the chosen mode
mxl, accuracy, f1, confusion = replacement_models.mxl(data, 'mode_choice')

# Store the mean scores under a <model>_<train set>_<test set> key for the
# model comparison section, and echo them
score_results['mxl_chosen_chosen'] = (np.mean(accuracy), np.mean(f1))
print(f"Accuracy: {np.mean(accuracy)}")
print(f"F1: {np.mean(f1)}")

# Element-wise mean of the returned confusion matrices, shown as a heatmap
confusion_mean = np.array(confusion).mean(axis=0)
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(
    confusion_mean,
    ax=ax,
    annot=True,
    fmt='.1%',
    cmap='YlGnBu',
    linewidths=.5,
    xticklabels=all_categories[0].values,
    yticklabels=all_categories[0].values,
    cbar=False,
)
ax.set(title='MXL Confusion Matrix', xlabel='Predicted', ylabel='Actual')
plt.subplots_adjust(bottom=0.25)

# Keep a handle on the fitted model for later prediction
mxl_keep = mxl

## MNL

### Train on All Choices

In [None]:
# Fit and evaluate the MNL model on the chosen mode using the shared `kf` splitter
mnl, accuracy, f1, confusion = replacement_models.mnl(data, 'mode_choice', kf)

# Store the mean scores under a <model>_<train set>_<test set> key for the
# model comparison section, and echo them
score_results['mnl_chosen_chosen'] = (np.mean(accuracy), np.mean(f1))
print(f"Accuracy: {np.mean(accuracy)}")
print(f"F1: {np.mean(f1)}")

# Element-wise mean of the returned confusion matrices, shown as a heatmap
confusion_mean = np.array(confusion).mean(axis=0)
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(
    confusion_mean,
    ax=ax,
    annot=True,
    fmt='.1%',
    cmap='YlGnBu',
    linewidths=.5,
    xticklabels=all_categories[0].values,
    yticklabels=all_categories[0].values,
    cbar=False,
)
ax.set(title='MNL Confusion Matrix', xlabel='Predicted', ylabel='Actual')
plt.subplots_adjust(bottom=0.25)

# Keep a handle on the fitted model for later use
mnl_keep = mnl

## Model Performance

In [None]:
score_df = pd.DataFrame(score_results.keys(), score_results.values()).reset_index()
score_df.columns = ['Accuracy','F1','Model']
model_types = score_df['Model'].str.split('_', expand=True)
model_types.columns = ['Model Type','Train Set','Test Set']
score_df = pd.concat([score_df, model_types], axis=1)
score_df

In [None]:
# Accuracy per test set, one bar colour per model type, restricted to models
# whose train set is 'chosen'
plot_data = score_df.loc[score_df['Train Set']=='chosen']
fig, ax = plt.subplots(figsize=(13,5))
sns.barplot(ax=ax, data=plot_data, x='Test Set', y='Accuracy', hue='Model Type')
ax.set(title='Model Accuracy Trained on Primary', xlabel='Model', ylabel='Accuracy')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

In [None]:
# F1 per test set, one bar colour per model type, restricted to models
# whose train set is 'chosen'
plot_data = score_df.loc[score_df['Train Set']=='chosen']
fig, ax = plt.subplots(figsize=(13,5))
sns.barplot(ax=ax, data=plot_data, x='Test Set', y='F1', hue='Model Type')
ax.set(title='Model F1 Trained on Primary', xlabel='Model', ylabel='F1')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

In [None]:
# Accuracy of every entry in score_df, one bar per model key
plot_data = score_df
fig, ax = plt.subplots(figsize=(13,5))
sns.barplot(ax=ax, data=plot_data, x='Model', y='Accuracy')
ax.set(title='Model Accuracy', xlabel='Model', ylabel='Accuracy')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

In [None]:
# F1 of every entry in score_df, one bar per model key
plot_data = score_df
fig, ax = plt.subplots(figsize=(13,5))
sns.barplot(ax=ax, data=plot_data, x='Model', y='F1')
ax.set(title='Model F1', xlabel='Model', ylabel='F1')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

## Ebike Substitution Rates and Emissions

In [None]:
# Substitution rate of ebike trips not including new trips:
# count the trips in df_ebike per stated replaced mode and normalise to shares.
plot_data = df_ebike.groupby(['Replaced_mode'])[['Mode_confirm']].count().reset_index()
plot_data['subst_rate'] = plot_data['Mode_confirm'] / plot_data['Mode_confirm'].sum()

fig, ax = plt.subplots(figsize=(13,5))
sns.barplot(ax=ax, data=plot_data, x='Replaced_mode', y='subst_rate')
ax.set(title='Ebike Mode Replacement', xlabel='Replaced Mode', ylabel='Substitution Rate')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)
print(plot_data['subst_rate'])

In [None]:
# Substitution rate of ebike trips not including new trips, after filling in a
# predicted replaced mode for the trips that have no stated one.

# Take an explicit copy before writing: chained indexing (df[col].iat[i] = ...)
# triggers SettingWithCopyWarning and is a silent no-op under pandas copy-on-write.
df_ebike_unlabeled = df_ebike_unlabeled.copy()

# Set the chosen mode availability to 0 so it cannot be predicted as its own replacement
# NOTE(review): the column name is built from the raw Mode_confirm value here,
# without going through mode_list — confirm Mode_confirm holds mode names in this frame.
for row_pos, chosen_mode in enumerate(df_ebike_unlabeled['Mode_confirm']):
    col_pos = df_ebike_unlabeled.columns.get_loc(f"av_{chosen_mode}")
    df_ebike_unlabeled.iat[row_pos, col_pos] = 0

# Predict replaced mode for the unlabeled trips
y_pred = rf_keep.predict(df_ebike_unlabeled[feature_list].values)
df_ebike_unlabeled['Replaced_mode'] = [mode_list[y] for y in y_pred]

# Combine labeled and predicted-unlabeled ebike trips
plot_data = pd.concat([df_ebike, df_ebike_unlabeled])
plot_data = plot_data.groupby(['Replaced_mode']).count()[['Mode_confirm']].reset_index()
plot_data['subst_rate'] = plot_data['Mode_confirm'] / sum(plot_data['Mode_confirm'])

fig, ax = plt.subplots(figsize=(13,5))
sns.barplot(ax=ax, data=plot_data, x='Replaced_mode', y='subst_rate').set(title='Ebike Mode Replacement (w/Labeling)', xlabel='Replaced Mode', ylabel='Substitution Rate')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)
print(plot_data['subst_rate'])

In [None]:
# From df_EI
# Combine variable categories: collapse the detailed df_EI mode labels into the
# mode groups used for Replaced_mode. No replacement value is itself a key, so a
# single mapping gives the same result as replacing one label at a time.
ei_mode_groups = {
    'Car, drove alone': 'car',
    'Car, with others': 's_car',
    'Bikeshare': 's_micro',
    'Scooter share': 's_micro',
    'Regular Bike': 'p_micro',
    'Skate board': 'p_micro',
    'Train': 'transit',
    'Free Shuttle': 'transit',
    'Bus': 'transit',
    'Walk': 'walk',
    'Taxi/Uber/Lyft': 'ridehail',
    'Pilot ebike': 'ebike',
}
df_EI = df_EI.replace(ei_mode_groups)

# Average the two factors per mode group. Select the numeric columns *before*
# mean(): pandas >= 2.0 raises on non-numeric columns instead of dropping them.
emission_rates = (
    df_EI.groupby(['mode'])[['energy_intensity_factor','CO2_factor']]
    .mean()
    .reset_index()
)

# 453.592 is grams per pound.
# NOTE(review): the 1e-6 factor presumably converts a per-million-BTU CO2 factor — confirm units in df_EI.
emission_rates['g_CO2_per_passmi'] = emission_rates.energy_intensity_factor*emission_rates.CO2_factor*0.000001*453.592

# Attach the emission rates to whatever substitution-rate table `plot_data`
# currently holds (inner join on the mode label).
emission_data = plot_data.merge(emission_rates, left_on='Replaced_mode', right_on='mode')

In [None]:
# Display the merged substitution-rate / emission-rate table for inspection
emission_data

In [None]:
# From df_EI
# Pull the per-mode emission rates and substitution shares out as arrays
# (note: this rebinds `emission_rates` from a DataFrame to an ndarray)
emission_rates = emission_data['g_CO2_per_passmi'].values
subst_rates = emission_data['subst_rate'].values

# g-CO2/mi reduction through ebike availability:
# substitution-weighted mean emission rate of the replaced modes, minus a constant.
# NOTE(review): 0.007 is presumably the ebike's own rate from df_EI — confirm value and units.
sum(subst_rates * emission_rates) / sum(subst_rates) - 0.007

In [None]:
# From table in paper
# NOTE(review): these rates are matched to plot_data rows purely by position, so
# they must be listed in the same order (and number) as plot_data's
# Replaced_mode rows — verify whenever the set of modes changes.
emission_rates = [
    343.3,
    18.5,
    343.3,
    (343.3/2),
    39.8,
    123.8,
    0.0,
]
subst_rates = plot_data['subst_rate'].values

# g-CO2/mi reduction through ebike availability:
# substitution-weighted mean emission rate of the replaced modes, minus 39.8
# (presumably the ebike rate from the same table — confirm).
sum(subst_rates * emission_rates) / sum(subst_rates) - 39.8

## Explore Replaced Mode Accuracy

In [None]:
# Integer mode code -> name of that mode's availability column
av = {
    code: f"av_{mode_name}"
    for code, mode_name in enumerate(
        ('car', 's_car', 'ridehail', 'transit', 'p_micro', 's_micro', 'walk', 'ebike')
    )
}

In [None]:
# For each trip, read the availability flag of the mode the user stated as
# replaced (column "av_<Replaced_mode>") at that trip's row position.
replaced_list = [
    df_replaced_trips[f"av_{stated_mode}"].iloc[row_pos]
    for row_pos, stated_mode in enumerate(df_replaced_trips.Replaced_mode)
]
df_replaced_trips['replaced_in_stated'] = replaced_list

# Relabel with original mode names for plotting: any integer mode code in the
# two label columns becomes the text after the "av_" prefix of its av entry.
for mode, av_column in av.items():
    mode_text = str(av_column).partition('_')[2]
    for label_col in ('Mode_confirm', 'Replaced_mode'):
        df_replaced_trips[label_col] = df_replaced_trips[label_col].replace(mode, mode_text)

# Explicit pass for code 7 -> 'ebike', kept from the original cell
for label_col in ('Mode_confirm', 'Replaced_mode'):
    df_replaced_trips[label_col] = df_replaced_trips[label_col].replace(7, 'ebike')

In [None]:
# Accurately stated replacement mode for all users, ebike trips only.
# Per date: sum and count of replaced_in_stated, each smoothed with a rolling
# mean, then their ratio.
plot_data = df_replaced_trips[df_replaced_trips['Mode_confirm']=='ebike']

# Group with date_time as the index so the rolling mean only touches the numeric
# 'sum'/'count' columns, then bring date_time back as a column for plotting.
# (Applying the rolling window column-by-column on a frame that carries
# date_time as a column fails on pandas versions that honour as_index=False.)
# NOTE(review): the window is 14 rows (dates present in the data), not 14
# calendar days — confirm that is the intent.
plot_data = (
    plot_data.groupby('date_time')['replaced_in_stated']
    .agg(['sum','count'])
    .rolling(14, min_periods=1)
    .mean()
    .reset_index()
)
plot_data['proportion'] = plot_data['sum'] / plot_data['count']

fig, ax = plt.subplots(figsize=(13,5))
sns.lineplot(ax=ax, data=plot_data, x='date_time', y='proportion').set(title='Proportion of Daily E-Bike Trips With Correctly Stated Replacement Mode', xlabel='Date', ylabel='Proportion Correct')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

In [None]:
# Share of trips, per confirmed (primary) mode, whose stated replaced mode was
# flagged unavailable: 1 - sum(replaced_in_stated) / count
plot_data = (
    df_replaced_trips.groupby(['Mode_confirm'], as_index=False)['replaced_in_stated']
    .agg(['sum','count'])
    .reset_index()
    .assign(proportion=lambda grouped: 1 - (grouped['sum'] / grouped['count']))
)

fig, ax = plt.subplots(figsize=(13,5))
sns.barplot(ax=ax, data=plot_data, x='Mode_confirm', y='proportion')
ax.set(title='Proportion of Infeasible Replacements by Primary Mode', xlabel='Primary Mode', ylabel='Proportion Incorrect')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

In [None]:
# Share of trips, per stated replaced mode, where that mode was flagged
# unavailable: 1 - sum(replaced_in_stated) / count
plot_data = (
    df_replaced_trips.groupby(['Replaced_mode'], as_index=False)['replaced_in_stated']
    .agg(['sum','count'])
    .reset_index()
    .assign(proportion=lambda grouped: 1 - (grouped['sum'] / grouped['count']))
)

fig, ax = plt.subplots(figsize=(13,5))
sns.barplot(ax=ax, data=plot_data, x='Replaced_mode', y='proportion')
ax.set(title='Proportion of Infeasible Replacements by Replaced Mode', xlabel='Stated Mode Replaced', ylabel='Proportion Incorrect')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

In [None]:
# Per-user share of trips whose stated replaced mode was flagged available,
# users ordered from highest to lowest share.
# user_id is cast to str in place on df_replaced_trips.
df_replaced_trips['user_id'] = df_replaced_trips['user_id'].astype(str)
plot_data = (
    df_replaced_trips.groupby(['user_id'], as_index=False)['replaced_in_stated']
    .agg(['sum','count'])
    .reset_index()
    .assign(proportion=lambda grouped: grouped['sum'] / grouped['count'])
    .sort_values('proportion', ascending=False)
)

fig, ax = plt.subplots(figsize=(20,5))
sns.barplot(ax=ax, data=plot_data, x='user_id', y='proportion', color='darkblue')
ax.set(title='Proportion of Trips With Correctly Stated Replacement Mode', xlabel='User', ylabel='Proportion Correct')
plt.xticks(rotation=90)
plt.subplots_adjust(bottom=0.25)

In [None]:
# Per-user count of trips whose stated replaced mode was flagged unavailable
# (count - sum of replaced_in_stated), users ordered from most to fewest.
# user_id is cast to str in place on df_replaced_trips.
df_replaced_trips['user_id'] = df_replaced_trips['user_id'].astype(str)
plot_data = (
    df_replaced_trips.groupby(['user_id'], as_index=False)['replaced_in_stated']
    .agg(['sum','count'])
    .reset_index()
    .assign(
        incorrect=lambda grouped: grouped['count'] - grouped['sum'],
        # Shorten ids to their last 4 characters for readable tick labels.
        # NOTE(review): two users sharing a 4-character suffix would be merged
        # into one bar by seaborn — confirm suffixes are unique.
        user_id=lambda grouped: grouped['user_id'].astype(str).str[-4:],
    )
    .sort_values('incorrect', ascending=False)
)

fig, ax = plt.subplots(figsize=(20,5))
sns.barplot(ax=ax, data=plot_data, x='user_id', y='incorrect', color='darkblue')
ax.set(title='Trips With Unavailable Stated Replacement Mode', xlabel='User', ylabel='Count Incorrect')
plt.xticks(rotation=90)
plt.subplots_adjust(bottom=0.25)