In [None]:
# Notebook parameters.
# year / month are passed to scaffolding.get_time_query and scaffolding.get_file_suffix
# further down; None for both presumably means "no time restriction" -- TODO confirm.
year = None
month = None
# Program name handed to scaffolding when loading trips and building the file suffix.
program = "prepilot"
# Flip this when we want to get results versus generate the replaced_mode correction graphs.
# When True, a trip's replaced mode also counts as an available mode in add_mode_availability.
include_replaced_modes_as_valid = True
# Which trips feed the analysis: "ONLY_LABELED", "ONLY_SENSED" or "BEST_AVAILABLE"
# (the latter two exist for sensitivity analysis).
input_dataset = "ONLY_LABELED"
# A label-assist inference is used only when its probability is above this value.
LABEL_ASSIST_THRESHOLD = 0.3

In [None]:
from collections import defaultdict
import datetime

import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
import biogeme.distributions as dist
from biogeme.expressions import Beta, DefineVariable, RandomVariable, bioDraws, log, MonteCarlo, Integrate
import biogeme.results as res
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import sklearn.metrics
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import emission.core.get_database as edb
import emission.core.wrapper.entry as ecwe
import emission.storage.decorations.analysis_timeseries_queries as esda
import emission.storage.decorations.trip_queries as esdt
import emission.storage.decorations.timeline as esdl
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.timeseries.timequery as estt
import scaffolding
from uuid import UUID

%matplotlib inline

## Collect Data From Database

In [None]:
# Restore the variables that the mapping_dictionaries notebook persisted with %store.
# That notebook must have been run first, otherwise these names are undefined.
# The dictionary map puts all other replaced modes into "Other".
%store -r df_EI
%store -r dic_re
%store -r dic_pur
%store -r dic_fuel

# Convert the plain dictionaries to defaultdicts so that any label missing from
# the mapping resolves to 'Other' instead of raising / producing NaN.
dic_pur = defaultdict(lambda: 'Other',dic_pur)
dic_re = defaultdict(lambda: 'Other',dic_re)

In [None]:
# Time query for the configured year/month; used when loading the participant trips.
tq = scaffolding.get_time_query(year, month)

In [None]:
# Bucket every registered UUID (as a string) into three lists:
# all users, stage users (email starts with "stage_") and everybody else.
stage_uuids, all_uuids, non_stage_uuids = [], [], []
for ue in edb.get_uuid_db().find():
    uuid_as_str = str(ue['uuid'])
    all_uuids.append(uuid_as_str)
    # exactly one of the two specific buckets receives each user
    bucket = stage_uuids if ue['user_email'].startswith("stage_") else non_stage_uuids
    bucket.append(uuid_as_str)
stage_uuids

In [None]:
# Here are the user counts for each list, in the order (stage, non-stage, all)
len(stage_uuids), len(non_stage_uuids), len(all_uuids)

In [None]:
# Load all the participant confirmed trips for `program`, restricted by the time query `tq`
participant_ct_df = scaffolding.load_all_participant_trips(program, tq)

In [None]:
# Number of distinct users that have at least one loaded trip.
# wait, we have 219 unique users here, which doesn't match any of the UUID list sizes above
len(participant_ct_df.user_id.unique())

In [None]:
# Let's see if this is because of strings; nope
# (adds a string version of user_id so it can be compared with the UUID string lists)
participant_ct_df["user_id_str"] = participant_ct_df.user_id.map(str)
len(participant_ct_df.user_id_str.unique())

In [None]:
# Let's see which category the missing users (registered, but without any loaded trip) fit into
missing_uuids = set(all_uuids).difference(set(participant_ct_df.user_id_str))
# Second value is the expected gap: registered users minus users with trips.
# Computed from the data instead of the previously hard-coded 245 - 219, which
# silently went stale whenever the underlying data changed.
len(missing_uuids), len(set(all_uuids)) - len(participant_ct_df.user_id_str.unique())

In [None]:
# The missing users themselves, split into (stage, non-stage)
missing_uuids & set(stage_uuids), missing_uuids & set(non_stage_uuids)

In [None]:
# They seem to be split pretty evenly between stage and non stage
len(missing_uuids & set(stage_uuids)), len(missing_uuids & set(non_stage_uuids))

In [None]:
# Remove stage users, comparing string, we find a difference
# One boolean mask decides membership; the two frames are exact complements.
from_stage_user = participant_ct_df['user_id_str'].isin(stage_uuids)
non_stage_ct_df = participant_ct_df[~from_stage_user]
stage_ct_df = participant_ct_df[from_stage_user]
# trip counts: non-stage, stage, total
for trip_df in (non_stage_ct_df, stage_ct_df, participant_ct_df):
    print(len(trip_df))

In [None]:
# since we have missing UUIDs, let's confirm that none of them have trips in the database
# and that we are not missing trips just because of read limits
missing_stage_uuids = set(missing_uuids).intersection(stage_uuids)
missing_non_stage_uuids = set(missing_uuids).intersection(non_stage_uuids)

# `UUID` and `edb` come from the import cell at the top of the notebook; the
# duplicate imports that used to live here were removed.
for uuid_str in missing_stage_uuids.union(missing_non_stage_uuids):
    # count confirmed trips stored for this user, bypassing the scaffolding loaders
    n_confirmed_trips = edb.get_analysis_timeseries_db().count_documents(
        {"user_id": UUID(uuid_str), "metadata.key": "analysis/confirmed_trip"})
    print(f"For {uuid_str}, found {n_confirmed_trips} trips in the database")

In [None]:
# labeled_ct = scaffolding.filter_labeled_trips(participant_ct_df)
# expanded_ct = scaffolding.expand_userinputs(labeled_ct)

In [None]:
# Use sensed as well as labeled trips: user inputs are expanded on the unfiltered
# frames (the labeled-only filter is applied later, depending on `input_dataset`).
# NOTE: `expanded_ct` is reassigned in the next cell from the non-stage subset.
expanded_ct = scaffolding.expand_userinputs(participant_ct_df)
expanded_stage_ct = scaffolding.expand_userinputs(stage_ct_df)
expanded_non_stage_ct = scaffolding.expand_userinputs(non_stage_ct_df)

In [None]:
# Using non-stage users only from here on: this replaces the all-participant
# `expanded_ct` computed in the previous cell.
expanded_ct = scaffolding.data_quality_check(expanded_non_stage_ct)
expanded_ct.shape

In [None]:
# SHANKARI: I am not sure it is OK to fill in the inferred values as `mode_confirm`
# mode confirm should be reserved for user input
# the inferred labels are generated by the label assist algorithm and have accuracy of ~ 50%

def describe_labels(ct_df):
    """Print how many trips have user labels, inferred labels, or neither.

    ct_df must have a `mode_confirm` column (missing value = no user label) and
    an `inferred_labels` column holding a sized container per trip (empty = no
    label-assist inference). Nothing is returned; five summary lines are printed.
    """
    user_labeled = ~pd.isna(ct_df.mode_confirm)

    # How much data total, and how much of it carries a user specified mode
    print("Total number of trips", len(ct_df))
    print("Trips with user specified labels", len(ct_df[user_labeled]))

    # The remainder is split by whether label assist produced anything
    unlabeled_df = ct_df[~user_labeled]
    print("Trips without user specified labels", len(unlabeled_df))
    has_no_inference = unlabeled_df.inferred_labels.apply(lambda labels: len(labels) == 0)
    print("Trips without user label but with inferred label", len(unlabeled_df[~has_no_inference]))
    print("Trips without user label or inferred label", len(unlabeled_df[has_no_inference]))

# Label coverage of the quality-checked, non-stage trips
describe_labels(expanded_ct)

In [None]:
# no_user_label_ct_df[no_user_label_ct_df.inferred_labels.apply(is_empty_check)][["inferred_labels", "start_fmt_time", "end_fmt_time", "mode_confirm", "purpose_confirm", "user_id"]].head()

In [None]:
# so we really only have 57k labeled trips, not 115k
# we can generate results with label assist and/or primary sensed mode for comparison,
# but I think that the most principled version of the evaluation should use only labeled trips

In [None]:
# Copied from https://github.com/allenmichael099/e-mission-eval-private-data/blob/hybrid_labeling_analysis_Michael/Error_bars/add_new_label_fields.py#L45
# TODO: Need to clean up later instead of copy-paste
def get_primary_sensed_mode(ct_row):
    """Return the sensed mode that accounts for the most section time of a trip.

    ct_row is one confirmed-trip row; its "user_id", "cleaned_trip" and "_id"
    entries are read. The trip's "analysis/inferred_section" sections are
    fetched, their durations are summed per (relabeled) sensed mode, and the
    mode with the largest total is returned. On a tie the first mode
    encountered wins.

    Returns None (instead of raising) when the trip has no inferred sections,
    so that applying this over a whole dataframe does not abort on such trips.
    Raises KeyError for a sensed-mode code that is not in the table below.
    """
    # These keys were found in emission/core/wrapper/modeprediction.py:
    # NOTE(review): code 1 is relabeled "drove_alone" while code 5 is "car";
    # please confirm against modeprediction.py that 1 is not the walking code.
    sensed_mode_types = {0: "unknown", 1: "drove_alone",2: "bike",
                     3: "bus", 4: "train", 5: "car", 6: "air_or_hsr",
                     7: "train", 8: "train", 9: "train"}

    # Get the segments for the trip.
    #cleaned_section will only have walk/bike/automotive, inferred_section is the one that has bus/train/car etc 
    segments = esdt.get_sections_for_trip(key = "analysis/inferred_section", user_id = ct_row["user_id"], trip_id = ct_row['cleaned_trip'])

    # Total moving time per mode (section time only, since trips include stop times)
    trip_mode_durations = {}
    for s in segments:
        # the sensed mode is a number in the database, so it is relabeled as a string.
        mode = sensed_mode_types[s['data']['sensed_mode']]
        trip_mode_durations[mode] = trip_mode_durations.get(mode, 0) + s['data']['duration']

    if not trip_mode_durations:
        # Previously this case crashed in argmax() on an empty Series.
        print(f"No inferred sections for trip {ct_row['_id']} for user {ct_row['user_id']}, returning None")
        return None

    # The largest total duration is also the largest share of the moving time,
    # so the division by the total (which failed when the total was 0) is not needed.
    mode_duration_map = pd.Series(trip_mode_durations)
    primary_mode = mode_duration_map.index[mode_duration_map.argmax()]
    print(f"After processing {len(segments)} segments for trip {ct_row['_id']} for user {ct_row['user_id']}, returning {primary_mode}")
    return primary_mode

In [None]:
# small unit test
# mode_split_map = pd.Series({'bicycling': 0.3244066758052265, 'walking': 0.6755933241947736})
# mode_split_map.index[mode_split_map.argmax()]

In [None]:
def get_max_prob_label(inferred_label_list, p_threshold):
    """Return the `labels` entry of the most probable inference, or None.

    inferred_label_list is a list of records with two fields, "labels" and "p"
    (logic copied from emission/storage/decorations/trip_queries.py). The
    record with the highest `p` is selected; its labels are returned only when
    that probability is strictly greater than p_threshold. Otherwise a message
    is printed and None is returned.
    """
    candidates = pd.DataFrame(inferred_label_list)
    best_p = candidates.p.max()

    # Guard clause: nothing is confident enough
    if not best_p > p_threshold:
        print(f"max_p {best_p} < threshold {p_threshold}, returning None")
        return None

    # idxmax gives the index label of the row holding the max probability;
    # look up the labels stored on that row
    best_row = candidates.loc[candidates.p.idxmax()]
    return best_row.labels

def get_best_label_assist_mode(ct_row, p_threshold):
    """Return the label-assist `mode_confirm` for a trip, or None.

    Copied and modified from emission/storage/decorations/trip_queries.py.
    None is returned when the row has no inferred labels, or when
    get_max_prob_label finds no inference above p_threshold.
    """
    candidate_labels = ct_row.inferred_labels
    if len(candidate_labels) == 0:
        return None

    top_labels = get_max_prob_label(candidate_labels, p_threshold)
    if top_labels is None:
        return None
    return top_labels["mode_confirm"]
    
def get_best_hybrid_mode(ct_row):
    if ct_row.mode_confirm is not None:
        return ct_row
    else:
        best_label_assist_mode = get_best_label_assist_mode(ct_row, LABEL_ASSIST_THRESHOLD)
        if best_label_assist_mode is not None:
            return best_label_assist_mode
        else:
            return get_primary_sensed_mode(ct_row)

In [None]:
# Basic sanity checks
sanity_test_df = expanded_ct.sample(n=50, random_state=1234)
print("Sanity testing characteristics ")
describe_labels(sanity_test_df)

only_sensed_df = sanity_test_df.copy()
only_sensed_df.mode_confirm = sanity_test_df.apply(lambda row: get_primary_sensed_mode(row), axis=1)
# This should be zero
print(only_sensed_df[pd.isna(only_sensed_df.mode_confirm)])

only_label_assist_df = sanity_test_df.copy()
only_label_assist_df.mode_confirm = sanity_test_df.apply(lambda row: get_best_label_assist_mode(row, LABEL_ASSIST_THRESHOLD), axis=1)
print(only_label_assist_df[pd.isna(only_label_assist_df.mode_confirm)][["start_fmt_time", "end_fmt_time"]])

hybrid_df = sanity_test_df.copy()
hybrid_df.mode_confirm = sanity_test_df.apply(lambda row: get_best_hybrid_mode(row), axis=1)
print(hybrid_df[pd.isna(hybrid_df.mode_confirm)])

In [None]:
if input_dataset == "ONLY_LABELED":
    expanded_ct = scaffolding.filter_labeled_trips(expanded_ct)
elif input_dataset == "BEST_AVAILABLE":
    expanded_ct = expanded_ct.apply(lambda row: get_best_hybrid_mode(row), axis=1)
elif input_dataset == "ONLY_SENSED":
    expanded_ct.mode_confirm = expanded_ct.apply(lambda row: get_primary_sensed_mode(row), axis=1)

In [None]:
# Mapping new labels with dictionaries, then mapping the cleaned modes to fuel.
# Each entry is (new column, source column, mapping); the order matters because
# the fuel columns are derived from the capitalised mode columns created first.
label_mappings = [
    ('Trip_purpose', 'purpose_confirm', dic_pur),
    ('Mode_confirm', 'mode_confirm', dic_re),
    ('Replaced_mode', 'replaced_mode', dic_re),
    ('Mode_confirm_fuel', 'Mode_confirm', dic_fuel),
    ('Replaced_mode_fuel', 'Replaced_mode', dic_fuel),
]
for new_col, source_col, mapping in label_mappings:
    expanded_ct[new_col] = expanded_ct[source_col].map(mapping)

In [None]:
# Change meters to miles. The return value is not used, so this relies on
# scaffolding.unit_conversions modifying expanded_ct in place
# (the `distance_miles` column is read in later cells).
scaffolding.unit_conversions(expanded_ct)

In [None]:
# Output-file suffix and data-quality caption built by scaffolding
# (presumably used when saving/annotating figures -- TODO confirm where they are consumed).
file_suffix = scaffolding.get_file_suffix(year, month, program)
quality_text = scaffolding.get_quality_text(participant_ct_df, expanded_ct)

In [None]:
# Calculate energy impact: energy intensity first (uses the df_EI table restored
# above), then the kWh and CO2 (lb) impacts. Every step takes the trip distance
# in miles plus the replaced / confirmed mode columns and returns the updated frame.
expanded_ct = scaffolding.energy_intensity(expanded_ct, df_EI, 'distance_miles', 'Replaced_mode', 'Mode_confirm')
expanded_ct = scaffolding.energy_impact_kWH(expanded_ct, 'distance_miles', 'Replaced_mode', 'Mode_confirm')
expanded_ct = scaffolding.CO2_impact_lb(expanded_ct, 'distance_miles', 'Replaced_mode', 'Mode_confirm')

In [None]:
# Join the expanded database data to socioeconomic data

# Survey question text -> short variable name
survey_column_names = {
    'Unique User ID (auto-filled, do not edit)':'user_id',
    'Please identify which category represents your total household income, before taxes, for last year.':'HHINC',
    'How many motor vehicles are owned, leased, or available for regular use by the people who currently live in your household?':'VEH',
    'In which year were you born?':'AGE',
    'Including yourself, how many people live in your home?':'HHSIZE',
    'How many children under age 18 live in your home?':'CHILDREN',
    'What is your gender?':'GENDER',
    'If you were unable to use your household vehicle(s), which of the following options would be available to you to get you from place to place?':'available_modes',
    'Are you a student?':'STUDENT',
}

# Read the survey, drop responses without a user id, and deal with people who
# have multiple responses by keeping the last one after sorting on Timestamp.
# NOTE(review): Timestamp is sorted as read from the CSV; if it is a plain
# string the order is lexical rather than chronological -- please confirm.
socio_data = (
    pd.read_csv('./replacement_modeling/Can Do Colorado eBike Program - en.csv')
    .rename(columns=survey_column_names)
    .loc[lambda responses: ~responses.user_id.isnull()]
    .sort_values(by=['user_id', 'Timestamp'])
    .drop_duplicates(subset=['user_id'], keep='last')
)

# The trip user ids are turned into dash-free strings before joining on them.
expanded_ct.user_id = expanded_ct.user_id.astype(str).str.replace('-', '', regex=False)
# Inner join: lose some trips due to people with no survey responses
expanded_ct = expanded_ct.merge(socio_data, on='user_id')

In [None]:
# Columns available after the merge with the survey data
expanded_ct.columns

## Data Preprocessing

In [None]:
# Add non-label category: trips without a replaced mode become 'Unlabeled'
# in both the raw and the cleaned replaced-mode columns
expanded_ct['replaced_mode'] = expanded_ct['replaced_mode'].fillna('Unlabeled')
expanded_ct.loc[expanded_ct['replaced_mode'] == 'Unlabeled', 'Replaced_mode'] = "Unlabeled"

# Select variables of interest from complete OpenPATH data
model_columns = ['Mode_confirm','Replaced_mode','replaced_mode','Trip_purpose',
                 'duration','distance_miles','start_local_dt_weekday','available_modes',
                 'AGE','HHINC','VEH','HHSIZE','CHILDREN','GENDER','STUDENT','user_id']
date_part_columns = {'start_local_dt_year':'year','start_local_dt_month':'month','start_local_dt_day':'day'}
data = expanded_ct[model_columns + list(date_part_columns)].copy()

# Get timestamp from known year/month/day aggregated to days; the three
# component columns are dropped once the timestamp exists
data['date_time'] = pd.to_datetime(data[list(date_part_columns)].rename(columns=date_part_columns))
data = data.drop(columns=list(date_part_columns))

# Fix age: the survey stores the birth year.
# NOTE(review): 2022 is hard-coded as the reference year.
data['AGE'] = 2022 - data['AGE']

# Get number of workers
data['WORKERS'] = data['HHSIZE'] - data['CHILDREN']

# Duration in minutes
data['duration'] = data['duration'] / 60

# Filter out some responses to data
keep_mode = ~data['Mode_confirm'].isin(['Not a Trip','Other'])
data = data[keep_mode]
# data = data[~data['Trip_purpose'].isin(['not_a_trip','Other'])]
# data = data[~data['Replaced_mode'].isin(['Not a Trip','Other'])]
# data = data[~data['HHINC'].isin(['Prefer not to say'])]
# data = data[~data['VEH'].isin(['Prefer not to say / Prefiero no decir.'])]
# Had to add the "prefer not to say" here otherwise I get an KeyError "KeyError: 'Prefer not to say'"
# -- SHANKARI
keep_availability = ~data['available_modes'].isin(['None', 'Prefer not to say'])
data = data[keep_availability]

# Combine variable categories: every survey/label value on the left is replaced,
# wherever it occurs in the frame, by the coarser model category on the right.
# No right-hand value is itself a left-hand key, so one pass is equivalent to
# replacing the values one after another.
mode_category_groups = {
    'Car, drove alone': 'car',
    'Car, with others': 's_car',
    'Bikeshare': 's_micro',
    'Scooter share': 's_micro',
    'Regular Bike': 'p_micro',
    'Skate board': 'p_micro',
    'Train': 'transit',
    'Free Shuttle': 'transit',
    'Bus': 'transit',
    'Walk': 'walk',
    'Taxi/Uber/Lyft': 'ridehail',
    'Pilot ebike': 'ebike',
}
data = data.replace(mode_category_groups)

# data = data.replace(['Home','School','Work'], 'hbw')
# data = data.replace(['Entertainment/Social','Meal','Personal/Medical','Pick-up/Drop off','Recreation/Exercise','Religious','Shopping','Transit transfer'], 'non_hbw')

# data['start_local_dt_weekday'] = data['start_local_dt_weekday'].replace(['1','2','3','4','5'],'1')
# data['start_local_dt_weekday'] = data['start_local_dt_weekday'].replace(['0','6'],'0')

# data = data.replace(['By hours ','Custodian','Fire Fighter 2 Training',
#  'Graduate','Prefer not to say','Taking prerequisites missing for grad program ',
#  'Yes - Full Time College/University',
#  'Yes - Part-Time College/University',
#  'Yes - Vocation/Technical/Trade School',
#  'taking classes toward early childhood licensure'], 'student')
# data = data.replace('Not a student', 'non_student')

# Calculate travel times for each trip, across every mode
def add_all_mode_tt(data, mode_col, duration_col, dist_col):
    """Add a predicted travel-time column for every mode to `data`.

    For each distinct value in `mode_col`, a linear regression of
    `duration_col` on `dist_col` is fitted on that mode's trips only and then
    used to predict a duration for ALL trips. The prediction is stored in a
    new column named 'tt_<mode>'. `data` is modified in place and returned.
    """
    # Distances of every trip, shaped as a single-feature matrix for sklearn
    all_distances = data[dist_col].values.reshape(-1,1)

    predicted_by_column = {}
    for mode in pd.unique(data[mode_col]):
        trips_of_mode = data[data[mode_col]==mode]
        # Linear model for duration based on distance, fitted on this mode only
        model = linear_model.LinearRegression()
        model.fit(trips_of_mode[dist_col].values.reshape(-1,1),
                  trips_of_mode[duration_col].values.reshape(-1,1))
        # ...applied to every trip in the frame
        predicted_by_column['tt_'+mode] = model.predict(all_distances)

    # Columns are added only after all models were fitted
    for column_name, predictions in predicted_by_column.items():
        data[column_name] = predictions

    return data

# Calculate all mode travel times and add to dataframe
# (one 'tt_<mode>' column per confirmed mode present in the data)
data = add_all_mode_tt(data,'Mode_confirm','duration','distance_miles')

# Calculate vehicle costs based roughly on $/mi from: https://www.vtpi.org/tca/tca0501.pdf
# Modes that are not listed here (walk, p_micro, ebike) get no cost column.
cost_factors = {'car':0.80,
                's_car':0.40,
                'ridehail':3.00,
                's_micro':1.50,
                'transit':0.40}

def add_all_mode_cost(data, cost_factors, dist_col):
    """Add a 'cost_<mode>' column (factor * distance) for every entry of cost_factors.

    `cost_factors` maps a mode name to a cost per unit of `dist_col`.
    `data` is modified in place and also returned.
    """
    for mode_name, cost_per_unit in cost_factors.items():
        data['cost_'+mode_name] = cost_per_unit * data[dist_col]
    return data

# Calculate all mode travel costs and add to dataframe
# (add_all_mode_cost returns the same frame it modifies)
data = add_all_mode_cost(data, cost_factors, 'distance_miles')

# Labels for modes in the availability survey, mapped to the model's mode names
availability_codes = {
    'Public transportation (bus, subway, light rail, etc.)': 'transit',
    'Get a ride from a friend or family member': 's_car',
    'Rental car (including Zipcar/ Car2Go)': 'car',
    'Taxi (regular taxi, Uber, Lyft, etc)': 'ridehail',
    'Bicycle': 'p_micro',
    'Shared bicycle or scooter': 's_micro',
    'Walk/roll': 'walk',
    'Skateboard': 'p_micro',
    'ebike': 'ebike',
}

# Create columns for available modes under each trip
def add_mode_availability(data, availability_codes, availability_col, choice_col, replaced_col,
                          include_replaced=None):
    """Add one 0/1 'av_<mode>' column per mode saying whether it was available for a trip.

    Parameters
    ----------
    data : DataFrame of trips; modified in place and returned.
    availability_codes : dict mapping a survey answer to a mode name.
    availability_col : column with the ';'-separated survey answers.
    choice_col : column with the mode that was chosen for the trip.
    replaced_col : column with the mode that the trip replaced.
    include_replaced : whether the replaced mode also counts as available.
        None (the default) falls back to the notebook-level flag
        `include_replaced_modes_as_valid`, which is what the function always
        used before this parameter existed.

    A mode is available when it is among the survey answers, when it is the
    chosen mode (a chosen mode is assumed to be available), or -- if
    include_replaced is true -- when it is the replaced mode. Rows whose
    answer contains 'None' get every mode marked as available.

    Raises KeyError when a survey answer is not a key of availability_codes.
    """
    if include_replaced is None:
        include_replaced = include_replaced_modes_as_valid

    mode_list = np.unique(list(availability_codes.values()))

    # Parse every survey answer once (instead of once per mode).
    # None marks the rows that contain 'None'.
    options_per_trip = []
    for available in data[availability_col].values:
        if 'None' in available:
            options_per_trip.append(None)
        else:
            options_per_trip.append([availability_codes[x] for x in available.split(';')])

    choice_list = data[choice_col].values
    replaced_list = data[replaced_col].values

    for mode in mode_list:
        mode_avail = []
        for options, chosen, replaced in zip(options_per_trip, choice_list, replaced_list):
            if options is None:
                mode_avail.append(1)
            elif mode in options or mode == chosen or (include_replaced and mode == replaced):
                mode_avail.append(1)
            else:
                mode_avail.append(0)
        data['av_'+mode] = mode_avail

    return data

# Add availability variables to data (one 'av_<mode>' column per mode in availability_codes).
# Runs before the categorical conversion below, so the mode columns still hold strings here.
data = add_mode_availability(data, availability_codes, 'available_modes', 'Mode_confirm', 'Replaced_mode')

# Handle all variables that are ordinal; otherwise they may not end up in correct order
# data.HHINC = pd.Categorical(data.HHINC,
#                             ordered=True,
#                             categories=['Less than $24,999',
#                                        '$25,000-$49,999',
#                                        '$50,000-$99,999',
#                                        '$100,000 -$149,999',
#                                        '$150,000-$199,999',
#                                        '$200,000 or more'])
# data.VEH = pd.Categorical(data.VEH,
#                             ordered=True,
#                             categories=['0',
#                                        '1',
#                                        '2',
#                                        '3',
#                                        '4+'])

# Make sure that the confirmed and replaced modes align after being converted to numeric variables:
# both use the same ordered list for the first eight categories, so a given mode
# gets the same integer code in either column (car=0 ... ebike=7).
primary_mode_order = ['car', 's_car', 'ridehail', 'transit', 'p_micro', 's_micro', 'walk', 'ebike']
# Replaced modes additionally allow three non-mode answers (codes 8, 9, 10)
replaced_mode_order = primary_mode_order + ['Other', 'No Travel', 'Unlabeled']

data.Mode_confirm = pd.Categorical(data.Mode_confirm, categories=primary_mode_order, ordered=True)
data.Replaced_mode = pd.Categorical(data.Replaced_mode, categories=replaced_mode_order, ordered=True)

# Convert categorical variables to numeric
cat_columns = data.select_dtypes(['object','category']).columns
# Keep a record of what order the categories are in when converted
# (one entry per column of cat_columns, in the same order)
all_categories = [data[column].astype('category').cat.categories for column in cat_columns]
data[cat_columns] = data[cat_columns].apply(lambda x: x.astype('category').cat.codes)

In [None]:
# Show listed categories in their order: the position of a value in its list
# is the integer code it was given in `data`.
print(cat_columns)
print(all_categories)

In [None]:
# Missing values per column of the model data
data.isna().sum()

In [None]:
# Integer codes follow the category order fixed during the categorical conversion:
# Mode_confirm 7 = ebike; Replaced_mode 7 = ebike, 8 = Other, 9 = No Travel, 10 = Unlabeled.
is_ebike_trip = data['Mode_confirm'].isin([7])
replaced_code = data['Replaced_mode']

# All replaced trips
df_replaced_trips = data[~replaced_code.isin([8,9,10])].copy()

# Only non-ebike
df_non_ebike = data[~is_ebike_trip].copy()

# Only ebike, labeled trips
df_ebike = data[is_ebike_trip & ~replaced_code.isin([7,8,9,10])].copy()

# Only ebike, unlabeled trips
df_ebike_unlabeled = data[is_ebike_trip & replaced_code.isin([10])].copy()

# Only ebike, labeled and unlabeled trips
df_ebike_to_label = data[is_ebike_trip & ~replaced_code.isin([7,8,9])].copy()

# Only ebike, only new trips
df_ebike_new_travel = data[is_ebike_trip & replaced_code.isin([9])].copy()

# Set up K-fold cross validation (10 consecutive folds)
kf = KFold(n_splits=10)

In [None]:
# Data stats
trip_count = len(data)
labeled_ebike_count = len(df_ebike)
unlabeled_trip_count = len(data[data.Replaced_mode==10])

print(f"Trips: {trip_count}")
print(f"Users: {len(np.unique(data.user_id))}")
print(f"Trips per user: {trip_count / len(pd.unique(data.user_id))}")
# The ebike ratios below are relative to the labeled ebike trips (df_ebike)
print(f"New activity: {len(df_ebike_new_travel) / labeled_ebike_count}")
print(f"Unlabeled all trips: {unlabeled_trip_count / trip_count}")
print(f"Unlabeled ebike trips (we model these): {len(df_ebike_unlabeled) / labeled_ebike_count}")

# Random Forest Classifier

In [None]:
# Model features: travel time and availability for seven modes, cost for the
# five modes that have a cost factor. ebike columns are left out.
# The order of the list is the column order of the feature matrix.
_tt_and_av_modes = ['car', 's_car', 'walk', 'p_micro', 'transit', 's_micro', 'ridehail']
_cost_modes = ['car', 's_car', 'ridehail', 's_micro', 'transit']
feature_list = (
    ['tt_' + mode_name for mode_name in _tt_and_av_modes]
    + ['cost_' + mode_name for mode_name in _cost_modes]
    + ['av_' + mode_name for mode_name in _tt_and_av_modes]
)

### Train and Test on Primary

In [None]:
# Test random forest on the primary mode: 10-fold cross validation on the
# non-ebike trips, predicting the confirmed mode from the features.
accuracy, f1, confusion = [], [], []
non_ebike_features = df_non_ebike[feature_list].values
non_ebike_modes = df_non_ebike['Mode_confirm'].values
for train_index, test_index in kf.split(df_non_ebike.values):
    X_train, X_test = non_ebike_features[train_index], non_ebike_features[test_index]
    y_train, y_test = non_ebike_modes[train_index], non_ebike_modes[test_index]

    # Train random forest on non-ebike trip training set
    # (no random_state is set, so results vary slightly between runs)
    rf = RandomForestClassifier(n_estimators=50)
    rf.fit(X_train,y_train)

    # Score the held-out fold
    y_pred = rf.predict(X_test)
    accuracy.append(sum(y_pred==y_test) / len(y_test))
    f1.append(sklearn.metrics.f1_score(y_test, y_pred, average='weighted'))
    # columns (predictions) are normalised; labels 0..6 are the non-ebike modes
    confusion.append(sklearn.metrics.confusion_matrix(y_test, y_pred, labels=[0,1,2,3,4,5,6], normalize='pred'))

print(f"Accuracy: {np.mean(accuracy)}")
print(f"F1: {np.mean(f1)}")

# Get numerical feature importances (of the model from the last fold)
importances = list(rf.feature_importances_)
feature_importances = sorted(
    ((feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)),
    key = lambda x: x[1], reverse = True)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

# Use the parameters from this model for final labeling
# (this is the forest trained in the last fold)
rf_keep = rf

In [None]:
# Average the per-fold confusion matrices and plot them as a heatmap
confusion_mean = np.mean(np.array(confusion), axis=0)

# Tick labels: the Mode_confirm categories without the last one (ebike),
# matching the label codes 0..6 used for the confusion matrices
primary_mode_names = all_categories[0].values[:-1]

fig, ax = plt.subplots(figsize=(6,6))
heatmap_axes = sns.heatmap(confusion_mean, ax=ax, annot=True, fmt='.1%', cmap='YlGnBu', linewidths=.5,
                           xticklabels=primary_mode_names, yticklabels=primary_mode_names, cbar=False)
heatmap_axes.set(title='Random Forest Confusion Matrix (Primary)', xlabel='Predicted', ylabel='Actual')
plt.subplots_adjust(bottom=0.25)

### Train on Primary Test on Replaced

In [None]:
# Test random forest on the replaced mode: train on the primary (confirmed) mode
# of non-ebike trips, then predict the replaced mode of the labeled ebike trips.
accuracy = []
f1 = []
confusion = []

# The evaluation set is the same for every fold
X_test = df_ebike[feature_list].values
y_test = df_ebike['Replaced_mode'].values

for train_index, test_index in kf.split(df_non_ebike.values):
    # Only the training part of each fold is used; the held-out non-ebike trips
    # (test_index) are not needed because testing happens on the ebike trips.
    X_train = df_non_ebike[feature_list].values[train_index]
    y_train = df_non_ebike['Mode_confirm'].values[train_index]

    # Train a fresh forest on this fold's training set. Previously nothing was
    # trained here: every iteration re-scored whatever `rf` was left over from
    # an earlier cell, giving ten identical results that depended on cell
    # execution order.
    rf = RandomForestClassifier(n_estimators=50)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    accuracy.append(sum(y_pred==y_test) / len(y_test))
    f1.append(sklearn.metrics.f1_score(y_test, y_pred, average='weighted'))
    confusion.append(sklearn.metrics.confusion_matrix(y_test, y_pred, labels=[0,1,2,3,4,5,6], normalize='pred'))

print(f"Accuracy: {np.mean(accuracy)}")
print(f"F1: {np.mean(f1)}")

# Get numerical feature importances (of the model from the last fold)
importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# plain loop so the cell does not end by displaying a list of None values
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

In [None]:
# Average the per-fold confusion matrices and plot them as a heatmap
confusion_mean = np.mean(np.array(confusion), axis=0)

# Tick labels: the Mode_confirm categories without the last one (ebike),
# matching the label codes 0..6 used for the confusion matrices
replaced_plot_mode_names = all_categories[0].values[:-1]

fig, ax = plt.subplots(figsize=(6,6))
heatmap_axes = sns.heatmap(confusion_mean, ax=ax, annot=True, fmt='.1%', cmap='YlGnBu', linewidths=.5,
                           xticklabels=replaced_plot_mode_names, yticklabels=replaced_plot_mode_names, cbar=False)
heatmap_axes.set(title='Random Forest Confusion Matrix (Replaced)', xlabel='Predicted', ylabel='Actual')
plt.subplots_adjust(bottom=0.25)

### Train on Replaced Test on Replaced

In [None]:
# Train AND Test random forest on the replaced mode: 10-fold cross validation
# within the labeled ebike trips.
accuracy, f1 = [], []
ebike_features = df_ebike[feature_list].values
ebike_replaced_modes = df_ebike['Replaced_mode'].values
for train_index, test_index in kf.split(df_ebike.values):
    X_train, X_test = ebike_features[train_index], ebike_features[test_index]
    y_train, y_test = ebike_replaced_modes[train_index], ebike_replaced_modes[test_index]

    # Train random forest on the ebike trip training set of this fold
    rf = RandomForestClassifier(n_estimators=50)
    rf.fit(X_train,y_train)

    # Score the held-out fold
    y_pred = rf.predict(X_test)
    accuracy.append(sum(y_pred==y_test) / len(y_test))
    f1.append(sklearn.metrics.f1_score(y_test, y_pred, average='weighted'))

print(f"Accuracy: {np.mean(accuracy)}")
print(f"F1: {np.mean(f1)}")

# Get numerical feature importances (of the model from the last fold)
importances = list(rf.feature_importances_)
feature_importances = sorted(
    ((feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)),
    key = lambda x: x[1], reverse = True)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

# MNL Choice Model

In [None]:
# # The estimation results can be read from the pickle file instead, if desired
# results = res.bioResults(pickleFile='openpath_mxl~05.pickle')

In [None]:
# # Mixed logit model in Biogeme
# # Best so far: openpath_mxl~05

# # Alternative specific constants
# ASC_CAR = Beta('ASC_CAR',0,None,None,1)
# ASC_EBIKE = Beta('ASC_EBIKE',0,None,None,0)
# ASC_P_MICRO = Beta('ASC_P_MICRO',0,None,None,0)
# ASC_RIDEHAIL = Beta('ASC_RIDEHAIL',0,None,None,0)
# ASC_S_MICRO = Beta('ASC_S_MICRO',0,None,None,0)
# ASC_TRANSIT = Beta('ASC_TRANSIT',0,None,None,0)
# ASC_WALK = Beta('ASC_WALK',0,None,None,0)

# # Define a random parameter, normally distributed, designed to be used
# # for Monte-Carlo simulation
# B_TIME = Beta('B_TIME', 0, None, None, 0)
# B_COST = Beta('B_COST', 0, None, None, 0)

# # Alternative specific variables
# B_HHINC = Beta('B_HHINC', 0, None, None, 0)

# # It is advised not to use 0 as starting value for the following parameter.
# B_TIME_S = Beta('B_TIME_S', 1, None, None, 0)
# B_TIME_RND = B_TIME + B_TIME_S * bioDraws('B_TIME_RND', 'NORMAL')

# # Utility functions
# V0 = ASC_CAR + \
# B_TIME_RND * tt_car + \
# B_COST * cost_car

# V1 = ASC_P_MICRO + \
# B_TIME_RND * tt_p_micro

# V2 = ASC_RIDEHAIL + \
# B_TIME_RND * tt_ridehail + \
# B_COST * cost_ridehail

# V3 = ASC_S_MICRO + \
# B_TIME_RND * tt_s_micro + \
# B_COST * cost_s_micro

# V4 = ASC_TRANSIT + \
# B_TIME_RND * tt_transit + \
# B_COST * cost_transit

# V5 = ASC_WALK + \
# B_TIME_RND * tt_walk

# # Map modes to utility functions
# V = {0: V0,
#     1: V1,
#     2: V2,
#     3: V3,
#     4: V4,
#     5: V5}

# # Mode availability
# av = {0: av_car,
#     1: av_p_micro,
#     2: av_ridehail,
#     3: av_s_micro,
#     4: av_transit,
#     5: av_walk}

# # Conditional to B_TIME_RND, we have a logit model (called the kernel)
# prob = models.logit(V, av, Mode_confirm)

# # We integrate over B_TIME_RND using Monte-Carlo
# logprob = log(MonteCarlo(prob))

### Train and Test on Primary

In [None]:
# K-fold cross-validation of the multinomial logit (MNL) model on non-e-bike trips.
# For every fold: estimate the model on the training split, simulate the choice
# probabilities on the held-out split, and score the arg-max prediction against the
# confirmed primary mode (Mode_confirm). Per-fold accuracy, weighted F1 and a
# confusion matrix are collected and averaged after the loop.
accuracy = []
f1 = []
confusion = []
for train_index, test_index in kf.split(df_non_ebike.values):
    X_train, X_test = df_non_ebike.iloc[train_index], df_non_ebike.iloc[test_index]
    # NOTE(review): y_train / y_test are not referenced again in this cell.
    y_train, y_test = df_non_ebike.iloc[train_index]['Mode_confirm'].values, df_non_ebike.iloc[test_index]['Mode_confirm'].values

    # Put the variables in global namespace to make Biogeme happy.
    # The bare names used below (cost_car, tt_car, av_car, Mode_confirm, ...) only
    # resolve because of these globals().update() calls.
    df_train = X_train.drop(columns=['date_time'])
    database_train = db.Database('openpath_train', df_train)
    globals().update(database_train.variables)
    
    # The test database's variables overwrite the same-named globals set just above.
    df_test = X_test.drop(columns=['date_time'])
    database_test = db.Database('openpath_test', df_test)
    globals().update(database_test.variables)

    # Multinomial logit model in Biogeme
    # Alternative specific constants.
    # Beta(name, start value, lower bound, upper bound, status): status 1 keeps the
    # parameter fixed, so ASC_CAR stays at 0 and car acts as the reference alternative.
    ASC_CAR = Beta('ASC_CAR',0,None,None,1)
    ASC_S_CAR = Beta('ASC_S_CAR',0,None,None,0)
    ASC_RIDEHAIL = Beta('ASC_RIDEHAIL',0,None,None,0)
    ASC_TRANSIT = Beta('ASC_TRANSIT',0,None,None,0)
    ASC_P_MICRO = Beta('ASC_P_MICRO',0,None,None,0)
    ASC_S_MICRO = Beta('ASC_S_MICRO',0,None,None,0)
    ASC_WALK = Beta('ASC_WALK',0,None,None,0)
    # NOTE(review): ASC_EBIKE does not appear in any utility function below.
    ASC_EBIKE = Beta('ASC_EBIKE',0,None,None,0)

    # Trip parameters
    B_COST = Beta('B_COST',0,None,None,0)
    # NOTE(review): B_TT does not appear in any utility function below.
    B_TT = Beta('B_TT',0,None,None,0)

    # Mode parameters: one travel-time coefficient shared by car, shared car, ridehail
    # and transit, and another shared by personal/shared micromobility and walking
    B_ASV_TT_MOTOR = Beta('B_ASV_TT_MOTOR',0,None,None,0)
    B_ASV_TT_PHYS = Beta('B_ASV_TT_PHYS',0,None,None,0)

    # Utility functions (p_micro and walk carry no cost term)
    V0 = ASC_CAR + \
    B_COST * cost_car + \
    B_ASV_TT_MOTOR * tt_car

    V1 = ASC_S_CAR + \
    B_COST * cost_s_car + \
    B_ASV_TT_MOTOR * tt_s_car

    V2 = ASC_RIDEHAIL + \
    B_COST * cost_ridehail + \
    B_ASV_TT_MOTOR * tt_ridehail

    V3 = ASC_TRANSIT + \
    B_COST * cost_transit + \
    B_ASV_TT_MOTOR * tt_transit

    V4 = ASC_P_MICRO + \
    B_ASV_TT_PHYS * tt_p_micro

    V5 = ASC_S_MICRO + \
    B_COST * cost_s_micro + \
    B_ASV_TT_PHYS * tt_s_micro

    V6 = ASC_WALK + \
    B_ASV_TT_PHYS * tt_walk

    # Map modes to utility functions (keys are the numeric mode codes 0-6)
    V = {0: V0,
         1: V1,
         2: V2,
         3: V3,
         4: V4,
         5: V5,
         6: V6}

    # Mode availability, keyed by the same mode codes as V
    av = {0: av_car,
          1: av_s_car,
          2: av_ridehail,
          3: av_transit,
          4: av_p_micro,
          5: av_s_micro,
          6: av_walk}
    
    # Train the model parameters on the training fold; the chosen alternative is Mode_confirm
    logprob = models.loglogit(V, av, Mode_confirm)
    biogeme = bio.BIOGEME(database_train, logprob)
    biogeme.modelName = 'openpath_mnl_train'
    biogeme.generateHtml = False
    biogeme.generatePickle = False
    results = biogeme.estimate()
    
    # Assemble the choice probability of each mode for testing
    prob_car = models.logit(V, av, 0)
    prob_s_car = models.logit(V, av, 1)
    prob_ridehail = models.logit(V, av, 2)
    prob_transit = models.logit(V, av, 3)
    prob_p_micro = models.logit(V, av, 4)
    prob_s_micro = models.logit(V, av, 5)
    prob_walk = models.logit(V, av, 6)

    simulate ={'Prob. car': prob_car,
               'Prob. s_car': prob_s_car,
               'Prob. ridehail': prob_ridehail,
               'Prob. transit': prob_transit,
               'Prob. p_micro': prob_p_micro,
               'Prob. s_micro': prob_s_micro,
               'Prob. walk': prob_walk}

    betas = results.getBetaValues()

    # Calculate the choice probabilities for each row in the test database,
    # using the parameters estimated on the training fold
    biogeme = bio.BIOGEME(database_test, simulate)
    biogeme.modelName = 'openpath_mnl_test'
    simulatedValues = biogeme.simulate(betas)

    # Test predicting maximum mode utility as choice
    # Identify the column of highest probability, replace with number corresponding to the mode
    prob_max = simulatedValues.idxmax(axis=1)
    prob_max = prob_max.replace({'Prob. car': 0,
                                 'Prob. s_car': 1,
                                 'Prob. ridehail': 2,
                                 'Prob. transit': 3,
                                 'Prob. p_micro': 4,
                                 'Prob. s_micro': 5,
                                 'Prob. walk': 6})
    data_res = {'y_Actual':df_test['Mode_confirm'], 'y_Predicted': prob_max}
    
#     # Test predicting car every time
#     data_res['y_Predicted_Car'] = np.repeat(0,len(data_res['y_Actual']))

#     # Test predicting probabilistically
#     def probabilistic_mode_choice(probs):
#         return np.random.choice(np.arange(0,len(probs)), p=probs)
#     data_res['y_Predicted_Prob'] = np.apply_along_axis(probabilistic_mode_choice, axis=1, arr=simulatedValues.values)
    
    # Collect the fold's actual vs. predicted labels and score them
    df = pd.DataFrame(data_res, columns=['y_Actual','y_Predicted'])
#     confusion_matrix = pd.crosstab(df['y_Actual'],df['y_Predicted'],rownames=['Actual'],colnames=['Predicted'],normalize=True)
#     print(round(confusion_matrix,2))
    # Accuracy = share of rows whose predicted code equals the actual code
    accuracy.append(len(df[df['y_Actual']==df['y_Predicted']])/len(df))
    f1.append(sklearn.metrics.f1_score(df['y_Actual'], df['y_Predicted'], average='weighted'))
    # normalize='pred' scales every predicted-label column to sum to 1
    confusion.append(sklearn.metrics.confusion_matrix(df['y_Actual'], df['y_Predicted'], labels=[0,1,2,3,4,5,6], normalize='pred'))

print(f"Accuracy: {np.mean(accuracy)}")
print(f"F1: {np.mean(f1)}")
# Average and plot the confusion matrices
confusion_mean = np.mean(np.array(confusion), axis=0)
# NOTE(review): print() here emits the repr of the Axes returned by sns.heatmap in
# addition to drawing the figure; the next cell draws the same matrix with labels.
# Tick labels presumably are the mode names minus the last category — TODO confirm.
print(sns.heatmap(confusion_mean, annot=True, fmt='.1%', cmap='YlGnBu', linewidths=.5, xticklabels=all_categories[0].values[:-1], yticklabels=all_categories[0].values[:-1], cbar=False))

# Use the parameters from this model for final labeling.
# `betas` at this point holds the estimates from the last fold only.
keep_betas = betas

In [None]:
# Element-wise mean of the per-fold confusion matrices
confusion_mean = np.asarray(confusion).mean(axis=0)

# Tick labels come from the first entry of all_categories with its last element dropped
mode_labels = all_categories[0].values[:-1]

fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(
    confusion_mean,
    ax=ax,
    annot=True,
    fmt='.1%',
    cmap='YlGnBu',
    linewidths=.5,
    xticklabels=mode_labels,
    yticklabels=mode_labels,
    cbar=False,
)
ax.set(title='MNL Confusion Matrix (Primary)', xlabel='Predicted', ylabel='Actual')
# Leave room at the bottom for the tick labels
plt.subplots_adjust(bottom=0.25)

### Train on Primary Test on Replaced

In [None]:
# Train the MNL on the primary mode (Mode_confirm) of non-e-bike trips, then test it
# against the stated replaced mode (Replaced_mode) of the e-bike trips.
# Each K-fold iteration fits on that fold's training split of df_non_ebike and is
# scored on the whole of df_ebike; accuracy, weighted F1 and a confusion matrix are
# collected per fold and averaged afterwards.
accuracy = []
f1 = []
confusion = []
for train_index, test_index in kf.split(df_non_ebike.values):
    # NOTE(review): X_test, y_train and y_test are not referenced again in this cell;
    # only the training split of the non-e-bike trips is used.
    X_train, X_test = df_non_ebike.iloc[train_index], df_non_ebike.iloc[test_index]
    y_train, y_test = df_non_ebike.iloc[train_index]['Mode_confirm'].values, df_non_ebike.iloc[test_index]['Mode_confirm'].values

    # Put the variables in global namespace to make Biogeme happy.
    # The bare names used below (cost_car, tt_car, av_car, Mode_confirm, ...) only
    # resolve because of these globals().update() calls.
    df_train = X_train.drop(columns=['date_time'])
    database_train = db.Database('openpath_train', df_train)
    globals().update(database_train.variables)
    
    # Point to the ebike trips dataframe (the same test set in every fold);
    # its variables overwrite the same-named globals set just above
    df_test = df_ebike.drop(columns=['date_time'])
    database_test = db.Database('openpath_test', df_test)
    globals().update(database_test.variables)

    # Multinomial logit model in Biogeme
    # Alternative specific constants.
    # Beta(name, start value, lower bound, upper bound, status): status 1 keeps the
    # parameter fixed, so ASC_CAR stays at 0 and car acts as the reference alternative.
    ASC_CAR = Beta('ASC_CAR',0,None,None,1)
    ASC_S_CAR = Beta('ASC_S_CAR',0,None,None,0)
    ASC_RIDEHAIL = Beta('ASC_RIDEHAIL',0,None,None,0)
    ASC_TRANSIT = Beta('ASC_TRANSIT',0,None,None,0)
    ASC_P_MICRO = Beta('ASC_P_MICRO',0,None,None,0)
    ASC_S_MICRO = Beta('ASC_S_MICRO',0,None,None,0)
    ASC_WALK = Beta('ASC_WALK',0,None,None,0)
    # NOTE(review): ASC_EBIKE does not appear in any utility function below.
    ASC_EBIKE = Beta('ASC_EBIKE',0,None,None,0)

    # Trip parameters
    B_COST = Beta('B_COST',0,None,None,0)
    # NOTE(review): B_TT does not appear in any utility function below.
    B_TT = Beta('B_TT',0,None,None,0)

    # Mode parameters: one travel-time coefficient shared by car, shared car, ridehail
    # and transit, and another shared by personal/shared micromobility and walking
    B_ASV_TT_MOTOR = Beta('B_ASV_TT_MOTOR',0,None,None,0)
    B_ASV_TT_PHYS = Beta('B_ASV_TT_PHYS',0,None,None,0)

    # Utility functions (p_micro and walk carry no cost term)
    V0 = ASC_CAR + \
    B_COST * cost_car + \
    B_ASV_TT_MOTOR * tt_car

    V1 = ASC_S_CAR + \
    B_COST * cost_s_car + \
    B_ASV_TT_MOTOR * tt_s_car

    V2 = ASC_RIDEHAIL + \
    B_COST * cost_ridehail + \
    B_ASV_TT_MOTOR * tt_ridehail

    V3 = ASC_TRANSIT + \
    B_COST * cost_transit + \
    B_ASV_TT_MOTOR * tt_transit

    V4 = ASC_P_MICRO + \
    B_ASV_TT_PHYS * tt_p_micro

    V5 = ASC_S_MICRO + \
    B_COST * cost_s_micro + \
    B_ASV_TT_PHYS * tt_s_micro

    V6 = ASC_WALK + \
    B_ASV_TT_PHYS * tt_walk

    # Map modes to utility functions (keys are the numeric mode codes 0-6)
    V = {0: V0,
         1: V1,
         2: V2,
         3: V3,
         4: V4,
         5: V5,
         6: V6}

    # Mode availability, keyed by the same mode codes as V
    av = {0: av_car,
          1: av_s_car,
          2: av_ridehail,
          3: av_transit,
          4: av_p_micro,
          5: av_s_micro,
          6: av_walk}
    
    # Train the model parameters on the training fold; the chosen alternative is Mode_confirm
    logprob = models.loglogit(V, av, Mode_confirm)
    biogeme = bio.BIOGEME(database_train, logprob)
    biogeme.modelName = 'openpath_mnl_train'
    biogeme.generateHtml = False
    biogeme.generatePickle = False
    results = biogeme.estimate()
    
    # Assemble the choice probability of each mode for testing
    prob_car = models.logit(V, av, 0)
    prob_s_car = models.logit(V, av, 1)
    prob_ridehail = models.logit(V, av, 2)
    prob_transit = models.logit(V, av, 3)
    prob_p_micro = models.logit(V, av, 4)
    prob_s_micro = models.logit(V, av, 5)
    prob_walk = models.logit(V, av, 6)

    simulate ={'Prob. car': prob_car,
               'Prob. s_car': prob_s_car,
               'Prob. ridehail': prob_ridehail,
               'Prob. transit': prob_transit,
               'Prob. p_micro': prob_p_micro,
               'Prob. s_micro': prob_s_micro,
               'Prob. walk': prob_walk}

    betas = results.getBetaValues()

    # Calculate the choice probabilities for each e-bike trip,
    # using the parameters estimated on the training fold
    biogeme = bio.BIOGEME(database_test, simulate)
    biogeme.modelName = 'openpath_mnl_test'
    simulatedValues = biogeme.simulate(betas)

    # Test predicting maximum mode utility as choice
    # Identify the column of highest probability, replace with number corresponding to the mode
    prob_max = simulatedValues.idxmax(axis=1)
    prob_max = prob_max.replace({'Prob. car': 0,
                                 'Prob. s_car': 1,
                                 'Prob. ridehail': 2,
                                 'Prob. transit': 3,
                                 'Prob. p_micro': 4,
                                 'Prob. s_micro': 5,
                                 'Prob. walk': 6})
    # The ground truth here is the stated replaced mode, not the confirmed primary mode
    data_res = {'y_Actual':df_test['Replaced_mode'], 'y_Predicted': prob_max}
    
#     # Test predicting car every time
#     data_res['y_Predicted_Car'] = np.repeat(0,len(data_res['y_Actual']))

#     # Test predicting probabilistically
#     def probabilistic_mode_choice(probs):
#         return np.random.choice(np.arange(0,len(probs)), p=probs)
#     data_res['y_Predicted_Prob'] = np.apply_along_axis(probabilistic_mode_choice, axis=1, arr=simulatedValues.values)
    
    # Collect the actual vs. predicted labels and score them
    df = pd.DataFrame(data_res, columns=['y_Actual','y_Predicted'])
#     confusion_matrix = pd.crosstab(df['y_Actual'],df['y_Predicted'],rownames=['Actual'],colnames=['Predicted'],normalize=True)
#     print(round(confusion_matrix,2))
    # Accuracy = share of rows whose predicted code equals the actual code
    accuracy.append(len(df[df['y_Actual']==df['y_Predicted']])/len(df))
    f1.append(sklearn.metrics.f1_score(df['y_Actual'], df['y_Predicted'], average='weighted'))
    # normalize='pred' scales every predicted-label column to sum to 1
    confusion.append(sklearn.metrics.confusion_matrix(df['y_Actual'], df['y_Predicted'], labels=[0,1,2,3,4,5,6], normalize='pred'))

print(f"Accuracy: {np.mean(accuracy)}")
print(f"F1: {np.mean(f1)}")

In [None]:
# Element-wise mean of the per-fold confusion matrices
confusion_mean = np.asarray(confusion).mean(axis=0)

# Tick labels come from the first entry of all_categories with its last element dropped
mode_labels = all_categories[0].values[:-1]

fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(
    confusion_mean,
    ax=ax,
    annot=True,
    fmt='.1%',
    cmap='YlGnBu',
    linewidths=.5,
    xticklabels=mode_labels,
    yticklabels=mode_labels,
    cbar=False,
)
ax.set(title='MNL Confusion Matrix (Replaced)', xlabel='Predicted', ylabel='Actual')
# Leave room at the bottom for the tick labels
plt.subplots_adjust(bottom=0.25)

### Train on Replaced Test on Replaced

In [None]:
# K-fold cross-validation of the MNL on the stated replaced mode of e-bike trips.
# For every fold: estimate the model on the training split of df_ebike with
# Replaced_mode as the chosen alternative, simulate the choice probabilities on the
# held-out split, and score the arg-max prediction against Replaced_mode.
accuracy = []
f1 = []
for train_index, test_index in kf.split(df_ebike.values):
    X_train, X_test = df_ebike.iloc[train_index], df_ebike.iloc[test_index]
    y_train, y_test = df_ebike.iloc[train_index]['Replaced_mode'].values, df_ebike.iloc[test_index]['Replaced_mode'].values

    # Put the variables in global namespace to make Biogeme happy.
    # The bare names used below (cost_car, tt_car, av_car, Replaced_mode, ...) only
    # resolve because of these globals().update() calls.
    df_train = X_train.drop(columns=['date_time'])
    database_train = db.Database('openpath_train', df_train)
    globals().update(database_train.variables)
    
    # Score on the held-out fold only. The previous version pointed the test database
    # at the whole of df_ebike, so rows the model had just been fitted on were also
    # scored, which leaks training data into the reported accuracy and F1.
    df_test = X_test.drop(columns=['date_time'])
    database_test = db.Database('openpath_test', df_test)
    globals().update(database_test.variables)

    # Multinomial logit model in Biogeme
    # Alternative specific constants.
    # Beta(name, start value, lower bound, upper bound, status): status 1 keeps the
    # parameter fixed, so ASC_CAR stays at 0 and car acts as the reference alternative.
    ASC_CAR = Beta('ASC_CAR',0,None,None,1)
    ASC_S_CAR = Beta('ASC_S_CAR',0,None,None,0)
    ASC_RIDEHAIL = Beta('ASC_RIDEHAIL',0,None,None,0)
    ASC_TRANSIT = Beta('ASC_TRANSIT',0,None,None,0)
    ASC_P_MICRO = Beta('ASC_P_MICRO',0,None,None,0)
    ASC_S_MICRO = Beta('ASC_S_MICRO',0,None,None,0)
    ASC_WALK = Beta('ASC_WALK',0,None,None,0)
    # NOTE(review): ASC_EBIKE does not appear in any utility function below.
    ASC_EBIKE = Beta('ASC_EBIKE',0,None,None,0)

    # Trip parameters
    B_COST = Beta('B_COST',0,None,None,0)
    # NOTE(review): B_TT does not appear in any utility function below.
    B_TT = Beta('B_TT',0,None,None,0)

    # Mode parameters: one travel-time coefficient shared by car, shared car, ridehail
    # and transit, and another shared by personal/shared micromobility and walking
    B_ASV_TT_MOTOR = Beta('B_ASV_TT_MOTOR',0,None,None,0)
    B_ASV_TT_PHYS = Beta('B_ASV_TT_PHYS',0,None,None,0)

    # Utility functions (p_micro and walk carry no cost term)
    V0 = ASC_CAR + \
    B_COST * cost_car + \
    B_ASV_TT_MOTOR * tt_car

    V1 = ASC_S_CAR + \
    B_COST * cost_s_car + \
    B_ASV_TT_MOTOR * tt_s_car

    V2 = ASC_RIDEHAIL + \
    B_COST * cost_ridehail + \
    B_ASV_TT_MOTOR * tt_ridehail

    V3 = ASC_TRANSIT + \
    B_COST * cost_transit + \
    B_ASV_TT_MOTOR * tt_transit

    V4 = ASC_P_MICRO + \
    B_ASV_TT_PHYS * tt_p_micro

    V5 = ASC_S_MICRO + \
    B_COST * cost_s_micro + \
    B_ASV_TT_PHYS * tt_s_micro

    V6 = ASC_WALK + \
    B_ASV_TT_PHYS * tt_walk

    # Map modes to utility functions (keys are the numeric mode codes 0-6)
    V = {0: V0,
         1: V1,
         2: V2,
         3: V3,
         4: V4,
         5: V5,
         6: V6}

    # Mode availability, keyed by the same mode codes as V
    av = {0: av_car,
          1: av_s_car,
          2: av_ridehail,
          3: av_transit,
          4: av_p_micro,
          5: av_s_micro,
          6: av_walk}
    
    # Train the model parameters on the training fold; the chosen alternative is Replaced_mode
    logprob = models.loglogit(V, av, Replaced_mode)
    biogeme = bio.BIOGEME(database_train, logprob)
    biogeme.modelName = 'openpath_mnl_train'
    biogeme.generateHtml = False
    biogeme.generatePickle = False
    results = biogeme.estimate()
    
    # Assemble the choice probability of each mode for testing
    prob_car = models.logit(V, av, 0)
    prob_s_car = models.logit(V, av, 1)
    prob_ridehail = models.logit(V, av, 2)
    prob_transit = models.logit(V, av, 3)
    prob_p_micro = models.logit(V, av, 4)
    prob_s_micro = models.logit(V, av, 5)
    prob_walk = models.logit(V, av, 6)

    simulate ={'Prob. car': prob_car,
               'Prob. s_car': prob_s_car,
               'Prob. ridehail': prob_ridehail,
               'Prob. transit': prob_transit,
               'Prob. p_micro': prob_p_micro,
               'Prob. s_micro': prob_s_micro,
               'Prob. walk': prob_walk}

    betas = results.getBetaValues()

    # Calculate the choice probabilities for each row in the held-out fold,
    # using the parameters estimated on the training fold
    biogeme = bio.BIOGEME(database_test, simulate)
    biogeme.modelName = 'openpath_mnl_test'
    simulatedValues = biogeme.simulate(betas)

    # Test predicting maximum mode utility as choice
    # Identify the column of highest probability, replace with number corresponding to the mode
    prob_max = simulatedValues.idxmax(axis=1)
    prob_max = prob_max.replace({'Prob. car': 0,
                                 'Prob. s_car': 1,
                                 'Prob. ridehail': 2,
                                 'Prob. transit': 3,
                                 'Prob. p_micro': 4,
                                 'Prob. s_micro': 5,
                                 'Prob. walk': 6})
    data_res = {'y_Actual':df_test['Replaced_mode'], 'y_Predicted': prob_max}
    
#     # Test predicting car every time
#     data_res['y_Predicted_Car'] = np.repeat(0,len(data_res['y_Actual']))

#     # Test predicting probabilistically
#     def probabilistic_mode_choice(probs):
#         return np.random.choice(np.arange(0,len(probs)), p=probs)
#     data_res['y_Predicted_Prob'] = np.apply_along_axis(probabilistic_mode_choice, axis=1, arr=simulatedValues.values)
    
    # Collect the fold's actual vs. predicted labels and score them
    df = pd.DataFrame(data_res, columns=['y_Actual','y_Predicted'])
#     confusion_matrix = pd.crosstab(df['y_Actual'],df['y_Predicted'],rownames=['Actual'],colnames=['Predicted'],normalize=True)
#     print(round(confusion_matrix,2))
    # Accuracy = share of rows whose predicted code equals the actual code
    accuracy.append(len(df[df['y_Actual']==df['y_Predicted']])/len(df))
    f1.append(sklearn.metrics.f1_score(df['y_Actual'], df['y_Predicted'], average='weighted'))

print(f"Accuracy: {np.mean(accuracy)}")
print(f"F1: {np.mean(f1)}")

## Ebike Substitution Rates and Emissions

In [None]:
# Work on a copy so the relabeling below does not touch df_ebike
df_analysis = df_ebike.copy()

# Swap each numeric replaced-mode code for a readable name, obtained by dropping
# everything up to the first underscore of the availability entry (av_car -> car)
for mode_code, availability in av.items():
    mode_name = str(availability).partition('_')[2]
    df_analysis['Replaced_mode'] = df_analysis['Replaced_mode'].replace(mode_code, mode_name)

# Substitution rate of ebike trips not including new trips:
# trips per stated replaced mode divided by the total number of counted trips
plot_data = df_analysis.groupby(['Replaced_mode'])[['Mode_confirm']].count().reset_index()
total_trips = plot_data['Mode_confirm'].sum()
plot_data['subst_rate'] = plot_data['Mode_confirm'] / total_trips

fig, ax = plt.subplots(figsize=(13,5))
sns.barplot(ax=ax, data=plot_data, x='Replaced_mode', y='subst_rate')
ax.set(title='Ebike Mode Replacement', xlabel='Replaced Mode', ylabel='Substitution Rate')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)
print(plot_data['subst_rate'])

In [None]:
# Predict a replaced mode for every row of df_ebike_to_label with the kept classifier
y_pred = rf_keep.predict(df_ebike_to_label[feature_list].values)

# Keep the user's own answer wherever one exists: any Replaced_mode other than
# code 10 overrides the prediction for that row
# (code 10 presumably marks "no stated replaced mode" — TODO confirm)
for row_pos, (_, trip) in enumerate(df_ebike_to_label.iterrows()):
    stated_mode = trip['Replaced_mode']
    if stated_mode != 10:
        y_pred[row_pos] = stated_mode

# Store the combined stated/predicted labels on a fresh copy of the frame
df_ebike_to_label = df_ebike_to_label.copy()
df_ebike_to_label['Replaced_mode_new'] = y_pred

# Generate figure again
# Swap each numeric mode code for a readable name (av_car -> car)
for mode_code, availability in av.items():
    mode_name = str(availability).partition('_')[2]
    df_ebike_to_label['Replaced_mode_new'] = df_ebike_to_label['Replaced_mode_new'].replace(mode_code, mode_name)

# Substitution rate of ebike trips not including new trips
plot_data = df_ebike_to_label.groupby(['Replaced_mode_new'])[['Mode_confirm']].count().reset_index()
total_trips = plot_data['Mode_confirm'].sum()
plot_data['subst_rate'] = plot_data['Mode_confirm'] / total_trips

fig, ax = plt.subplots(figsize=(13,5))
sns.barplot(ax=ax, data=plot_data, x='Replaced_mode_new', y='subst_rate')
ax.set(title='Ebike Mode Replacement (w/Labeling)', xlabel='Replaced Mode', ylabel='Substitution Rate')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)
print(plot_data['subst_rate'])

In [None]:
# From df_EI
# Collapse the detailed mode labels of the energy-intensity table into the mode
# categories used by the choice model, average the factors per category, and join
# the result onto the substitution rates in plot_data.
ei_mode_groups = {
    'Car, drove alone': 'car',
    'Car, with others': 's_car',
    'Bikeshare': 's_micro',
    'Scooter share': 's_micro',
    'Regular Bike': 'p_micro',
    'Skate board': 'p_micro',
    'Train': 'transit',
    'Free Shuttle': 'transit',
    'Bus': 'transit',
    'Walk': 'walk',
    'Taxi/Uber/Lyft': 'ridehail',
    'Pilot ebike': 'ebike',
}
# None of the new labels is itself a key, so one mapping-based replace gives the same
# result as replacing the labels one at a time (and is safe to re-run).
df_EI = df_EI.replace(ei_mode_groups)

# Select the two numeric factor columns BEFORE averaging: on pandas >= 2.0,
# groupby(...).mean() over the whole frame raises TypeError if the frame contains
# non-numeric columns, instead of silently dropping them.
emission_rates = (
    df_EI.groupby(['mode'])[['energy_intensity_factor', 'CO2_factor']]
    .mean()
    .reset_index()
)
# 453.592 is grams per pound; the 0.000001 factor presumably converts BTU to MMBTU so
# that the product is g CO2 per passenger-mile — TODO confirm the units in df_EI.
emission_rates['g_CO2_per_passmi'] = emission_rates.energy_intensity_factor*emission_rates.CO2_factor*0.000001*453.592
# Inner join: only modes present in both tables survive
emission_data = plot_data.merge(emission_rates, left_on='Replaced_mode_new', right_on='mode')

In [None]:
# Display the merged table of substitution rates and per-mode emission rates
emission_data

In [None]:
# From df_EI
# Per-mode emission rates and substitution rates, in the row order of emission_data
emission_rates = emission_data['g_CO2_per_passmi'].values
subst_rates = emission_data['subst_rate'].values

# g-CO2/mi reduction through ebike availability:
# substitution-weighted mean emission rate of the replaced modes, minus 0.007
# (0.007 is presumably the e-bike's own emission rate in the same units — TODO confirm)
weighted_replaced_rate = sum(emission_rates * subst_rates) / sum(subst_rates)
weighted_replaced_rate - 0.007

In [None]:
# From table in paper
# NOTE(review): the seven values are matched to the rows of plot_data purely by
# position, so plot_data must hold exactly seven rows in the intended mode order.
emission_rates = [343.3, 18.5, 343.3, (343.3/2), 39.8, 123.8, 0.0]
subst_rates = plot_data['subst_rate'].values

# g-CO2/mi reduction through ebike availability:
# substitution-weighted mean emission rate of the replaced modes, minus 39.8
# (39.8 is presumably the e-bike rate from the same table — TODO confirm)
weighted_replaced_rate = sum(emission_rates * subst_rates) / sum(subst_rates)
weighted_replaced_rate - 39.8

## Explore Replaced Mode Accuracy

In [None]:
    # Numeric mode code -> name of the availability column for that mode.
    # Codes follow the position in this tuple (0 = car ... 7 = ebike).
    av = dict(enumerate((
        'av_car',
        'av_s_car',
        'av_ridehail',
        'av_transit',
        'av_p_micro',
        'av_s_micro',
        'av_walk',
        'av_ebike',
    )))

In [None]:
# For every trip, look up the value of the availability column that belongs to the
# trip's stated replaced mode (av maps the mode code to that column's name)
replaced_list = []
for row_pos, replaced_code in enumerate(df_replaced_trips['Replaced_mode']):
    availability_column = df_replaced_trips[av[replaced_code]]
    replaced_list.append(availability_column.iloc[row_pos])
df_replaced_trips['replaced_in_stated'] = replaced_list

# Relabel with original mode names for plotting (av_car -> car, ...).
# NOTE(review): this overwrites the numeric codes in place, so the lookup above
# cannot be re-run on the relabeled frame.
for mode_code, availability in av.items():
    mode_name = str(availability).partition('_')[2]
    for label_column in ('Mode_confirm', 'Replaced_mode'):
        df_replaced_trips[label_column] = df_replaced_trips[label_column].replace(mode_code, mode_name)

# Make sure code 7 ends up as 'ebike' even if av carries no entry for it
for label_column in ('Mode_confirm', 'Replaced_mode'):
    df_replaced_trips[label_column] = df_replaced_trips[label_column].replace(7, 'ebike')

In [None]:
# Accurately stated replacement mode for all users (e-bike trips only)
ebike_trips = df_replaced_trips[df_replaced_trips['Mode_confirm']=='ebike']

# Per date: sum and count of replaced_in_stated, each smoothed with a rolling mean
# over up to 14 rows (a single row is enough to produce a value)
daily_totals = ebike_trips.groupby(['date_time'], as_index=False)['replaced_in_stated'].agg(['sum','count'])
plot_data = daily_totals.apply(lambda col: col.rolling(14,1).mean())
plot_data['proportion'] = plot_data['sum'] / plot_data['count']

fig, ax = plt.subplots(figsize=(13,5))
sns.lineplot(ax=ax, data=plot_data, x='date_time', y='proportion')
ax.set(title='Proportion of Daily E-Bike Trips With Correctly Stated Replacement Mode', xlabel='Date', ylabel='Proportion Correct')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

In [None]:
# Share of trips, per confirmed primary mode, whose replaced_in_stated flag is not set
by_primary_mode = df_replaced_trips.groupby(['Mode_confirm'], as_index=False)['replaced_in_stated']
plot_data = by_primary_mode.agg(['sum','count']).reset_index()
share_flagged = plot_data['sum'] / plot_data['count']
plot_data['proportion'] = 1 - share_flagged

fig, ax = plt.subplots(figsize=(13,5))
sns.barplot(ax=ax, data=plot_data, x='Mode_confirm', y='proportion')
ax.set(title='Proportion of Infeasible Replacements by Primary Mode', xlabel='Primary Mode', ylabel='Proportion Incorrect')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

In [None]:
# Share of trips, per stated replaced mode, whose replaced_in_stated flag is not set
by_replaced_mode = df_replaced_trips.groupby(['Replaced_mode'], as_index=False)['replaced_in_stated']
plot_data = by_replaced_mode.agg(['sum','count']).reset_index()
share_flagged = plot_data['sum'] / plot_data['count']
plot_data['proportion'] = 1 - share_flagged

fig, ax = plt.subplots(figsize=(13,5))
sns.barplot(ax=ax, data=plot_data, x='Replaced_mode', y='proportion')
ax.set(title='Proportion of Infeasible Replacements by Replaced Mode', xlabel='Stated Mode Replaced', ylabel='Proportion Incorrect')
plt.xticks(rotation=45)
plt.subplots_adjust(bottom=0.25)

In [None]:
# Per-user share of trips whose replaced_in_stated flag is set, highest share first
# User ids are turned into strings first so they can be used as bar labels
df_replaced_trips['user_id'] = df_replaced_trips['user_id'].astype(str)
by_user = df_replaced_trips.groupby(['user_id'], as_index=False)['replaced_in_stated']
plot_data = by_user.agg(['sum','count']).reset_index()
plot_data['proportion'] = plot_data['sum'] / plot_data['count']
plot_data = plot_data.sort_values(by='proportion', ascending=False)

fig, ax = plt.subplots(figsize=(20,5))
sns.barplot(ax=ax, data=plot_data, x='user_id', y='proportion', color='darkblue')
ax.set(title='Proportion of Trips With Correctly Stated Replacement Mode', xlabel='User', ylabel='Proportion Correct')
plt.xticks(rotation=90)
plt.subplots_adjust(bottom=0.25)

In [None]:
# Per-user number of trips whose replaced_in_stated flag is not set, largest first
df_replaced_trips['user_id'] = df_replaced_trips['user_id'].astype(str)
by_user = df_replaced_trips.groupby(['user_id'], as_index=False)['replaced_in_stated']
plot_data = by_user.agg(['sum','count']).reset_index()
# Trips without the flag = all trips minus flagged trips
plot_data['incorrect'] = plot_data['count'] - plot_data['sum']
# Show only the last four characters of each user id on the axis
plot_data['user_id'] = plot_data['user_id'].astype(str).str.slice(-4)
plot_data = plot_data.sort_values(by='incorrect', ascending=False)

fig, ax = plt.subplots(figsize=(20,5))
sns.barplot(ax=ax, data=plot_data, x='user_id', y='incorrect', color='darkblue')
ax.set(title='Trips With Unavailable Stated Replacement Mode', xlabel='User', ylabel='Count Incorrect')
plt.xticks(rotation=90)
plt.subplots_adjust(bottom=0.25)