In [1]:
year = None
month = None
program = "prepilot"
study_type = "program"
mode_of_interest = "pilot_ebike"

In [2]:
SAVE_DIR = '/plots/'

In [3]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model

from plots import *
import scaffolding

sns.set_style("whitegrid")
sns.set()
%matplotlib inline

URL not formatted, defaulting to "Stage_database"
Connecting to database URL db


In [4]:
# Settings and imports specific to this notebook

include_replaced_modes_as_valid = True # Flip this when we want to get results versus generate the replaced_mode correction graphs
model_with_sensed = False
input_dataset = "ONLY_LABELED" # "ONLY_LABELED", "ONLY_SENSED" or "BEST_AVAILABLE" for sensitivity analysis
LABEL_ASSIST_THRESHOLD = 0.3

# For reloading modules from Jupyter
import importlib
importlib.reload(scaffolding)

import datetime
import pickle
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import emission.core.get_database as edb
import emission.core.wrapper.entry as ecwe
import emission.storage.decorations.analysis_timeseries_queries as esda
import emission.storage.decorations.trip_queries as esdt
import emission.storage.decorations.timeline as esdl
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.timeseries.timequery as estt
from uuid import UUID

In [5]:
# Do not run this notebook at all unless it is for a program; nbclient will run up through this cell
if study_type != "program":
    raise Exception("The plots in this notebook are only relevant to programs")

In [6]:
# Loading mapping dictionaries from mapping_dictionaries notebook
%store -r dic_re
%store -r dic_pur

# convert a dictionary to a defaultdict
dic_re = defaultdict(lambda: 'Other',dic_re)
dic_pur = defaultdict(lambda: 'Other',dic_pur)

## Get UUIDs by Program

In [7]:
# Split UUIDs by program
program_uuid_map = {}
for ue in edb.get_uuid_db().find():
    program = ue['user_email'].split("_")[0]
    if program in program_uuid_map.keys():
        program_uuid_map[program].append(str(ue['uuid']))
    else:
        print(f"Found new program {program}, creating new list")
        program_uuid_map[program] = []
        program_uuid_map[program].append(str(ue['uuid']))

uuid_program_list = []
for ue in edb.get_uuid_db().find():
    program = ue['user_email'].split("_")[0]
    uuid_program_list.append({"program": program, "opcode": ue["user_email"], "user_id_str": str(ue['uuid'])})

Found new program 4c, creating new list
Found new program pc, creating new list
Found new program sc, creating new list
Found new program vail, creating new list
Found new program stage, creating new list
Found new program cc, creating new list
Found new program fc, creating new list


In [8]:
uuid_program_df = pd.DataFrame.from_dict(uuid_program_list)
uuid_program_df.head()

Unnamed: 0,program,opcode,user_id_str
0,4c,4c_Ze5Y_li7r4MOsbqj,863e9c6c-8ec0-48c4-b765-3f73d839c85b
1,4c,4c_AGgrHoywg5gYmyL8,c6e4db31-c18b-4355-b02a-7dd97deca70b
2,4c,4c_Ri332Xj3DKaCrdwl,c6807997-194c-4c52-8a8f-a8c1f6ee1595
3,4c,4c_6gzfgbVUODrbTdc_,e9e479e9-5c3f-4345-a885-dadb7999b312
4,4c,4c_eJEIg2mWQYU3ISXU,6656c04c-6cba-4c18-9fed-805eaa529741


## Collect Data From Database

In [9]:
expanded_ct, file_suffix, quality_text, debug_df = scaffolding.load_viz_notebook_data(year,
                                                                            month,
                                                                            program,
                                                                            study_type,
                                                                            dic_re,
                                                                            dic_pur=dic_pur)

0          4c_Ze5Y_li7r4MOsbqj
1          4c_AGgrHoywg5gYmyL8
2          4c_Ri332Xj3DKaCrdwl
3          4c_6gzfgbVUODrbTdc_
4          4c_eJEIg2mWQYU3ISXU
                ...           
240    fc_csu_TMUSjqGBWELwW5mZ
241    fc_n2n__1f8tgZOmeizn0pJ
242    fc_n2n_xV9QUA9nMEVPYq1B
243    fc_csu_OZ_EXMB-w436dbCh
244    fc_n2n_gIVaKBYQtpjA7s5-
Name: user_email, Length: 245, dtype: object

Loaded all confirmed trips of length 135471


Unnamed: 0,source,end_ts,end_fmt_time,end_loc,raw_trip,start_ts,start_fmt_time,start_loc,duration,distance,...,end_local_dt_month,end_local_dt_day,end_local_dt_hour,end_local_dt_minute,end_local_dt_second,end_local_dt_weekday,end_local_dt_timezone,_id,user_id,metadata_write_ts
0,DwellSegmentationTimeFilter,1626885000.0,2021-07-21T10:31:16-06:00,"{'type': 'Point', 'coordinates': [-107.8599774...",60f85d2453f7233da0b1a28c,1626884000.0,2021-07-21T10:13:12.769000-06:00,"{'type': 'Point', 'coordinates': [-107.8579336...",1083.231,9413.891182,...,7,21,10,31,16,2,America/Denver,612089e1cb13df1d4d22cfd9,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0
1,DwellSegmentationTimeFilter,1626920000.0,2021-07-21T20:05:41.808000-06:00,"{'type': 'Point', 'coordinates': [-107.8643886...",60f8dbb173697eb577aaff42,1626918000.0,2021-07-21T19:38:56.389000-06:00,"{'type': 'Point', 'coordinates': [-107.7964539...",1605.419,13124.15228,...,7,21,20,5,41,2,America/Denver,612089e1cb13df1d4d22cfda,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0
2,DwellSegmentationTimeFilter,1626971000.0,2021-07-22T10:15:32.259000-06:00,"{'type': 'Point', 'coordinates': [-107.860199,...",60f9aea3f399df93fdc1ab41,1626969000.0,2021-07-22T09:51:21.159484-06:00,"{'type': 'Point', 'coordinates': [-107.8643886...",1451.099516,10587.088615,...,7,22,10,15,32,3,America/Denver,612089e1cb13df1d4d22cfdb,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0
3,DwellSegmentationTimeFilter,1626989000.0,2021-07-22T15:21:31-06:00,"{'type': 'Point', 'coordinates': [-107.8743952...",60f9f4f122abd49191e21f5c,1626988000.0,2021-07-22T15:14:35.056996-06:00,"{'type': 'Point', 'coordinates': [-107.860199,...",415.943004,3362.028245,...,7,22,15,21,31,3,America/Denver,612089e1cb13df1d4d22cfdc,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0
4,DwellSegmentationTimeFilter,1626990000.0,2021-07-22T15:32:52.960000-06:00,"{'type': 'Point', 'coordinates': [-107.8601932...",60f9f4f122abd49191e21f5e,1626989000.0,2021-07-22T15:24:53.289799-06:00,"{'type': 'Point', 'coordinates': [-107.8743952...",479.670201,4084.31156,...,7,22,15,32,52,3,America/Denver,612089e2cb13df1d4d22cfdd,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0


After filtering, found 135471 participant trips 


Unnamed: 0,source,end_ts,end_fmt_time,end_loc,raw_trip,start_ts,start_fmt_time,start_loc,duration,distance,...,end_local_dt_month,end_local_dt_day,end_local_dt_hour,end_local_dt_minute,end_local_dt_second,end_local_dt_weekday,end_local_dt_timezone,_id,user_id,metadata_write_ts
0,DwellSegmentationTimeFilter,1626885000.0,2021-07-21T10:31:16-06:00,"{'type': 'Point', 'coordinates': [-107.8599774...",60f85d2453f7233da0b1a28c,1626884000.0,2021-07-21T10:13:12.769000-06:00,"{'type': 'Point', 'coordinates': [-107.8579336...",1083.231,9413.891182,...,7,21,10,31,16,2,America/Denver,612089e1cb13df1d4d22cfd9,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0
1,DwellSegmentationTimeFilter,1626920000.0,2021-07-21T20:05:41.808000-06:00,"{'type': 'Point', 'coordinates': [-107.8643886...",60f8dbb173697eb577aaff42,1626918000.0,2021-07-21T19:38:56.389000-06:00,"{'type': 'Point', 'coordinates': [-107.7964539...",1605.419,13124.15228,...,7,21,20,5,41,2,America/Denver,612089e1cb13df1d4d22cfda,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0
2,DwellSegmentationTimeFilter,1626971000.0,2021-07-22T10:15:32.259000-06:00,"{'type': 'Point', 'coordinates': [-107.860199,...",60f9aea3f399df93fdc1ab41,1626969000.0,2021-07-22T09:51:21.159484-06:00,"{'type': 'Point', 'coordinates': [-107.8643886...",1451.099516,10587.088615,...,7,22,10,15,32,3,America/Denver,612089e1cb13df1d4d22cfdb,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0
3,DwellSegmentationTimeFilter,1626989000.0,2021-07-22T15:21:31-06:00,"{'type': 'Point', 'coordinates': [-107.8743952...",60f9f4f122abd49191e21f5c,1626988000.0,2021-07-22T15:14:35.056996-06:00,"{'type': 'Point', 'coordinates': [-107.860199,...",415.943004,3362.028245,...,7,22,15,21,31,3,America/Denver,612089e1cb13df1d4d22cfdc,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0
4,DwellSegmentationTimeFilter,1626990000.0,2021-07-22T15:32:52.960000-06:00,"{'type': 'Point', 'coordinates': [-107.8601932...",60f9f4f122abd49191e21f5e,1626989000.0,2021-07-22T15:24:53.289799-06:00,"{'type': 'Point', 'coordinates': [-107.8743952...",479.670201,4084.31156,...,7,22,15,32,52,3,America/Denver,612089e2cb13df1d4d22cfdd,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0


After filtering, found 57729 labeled trips


Unnamed: 0,source,end_ts,end_fmt_time,end_loc,raw_trip,start_ts,start_fmt_time,start_loc,duration,distance,...,end_local_dt_month,end_local_dt_day,end_local_dt_hour,end_local_dt_minute,end_local_dt_second,end_local_dt_weekday,end_local_dt_timezone,_id,user_id,metadata_write_ts
0,DwellSegmentationTimeFilter,1626885000.0,2021-07-21T10:31:16-06:00,"{'type': 'Point', 'coordinates': [-107.8599774...",60f85d2453f7233da0b1a28c,1626884000.0,2021-07-21T10:13:12.769000-06:00,"{'type': 'Point', 'coordinates': [-107.8579336...",1083.231,9413.891182,...,7,21,10,31,16,2,America/Denver,612089e1cb13df1d4d22cfd9,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0
1,DwellSegmentationTimeFilter,1626920000.0,2021-07-21T20:05:41.808000-06:00,"{'type': 'Point', 'coordinates': [-107.8643886...",60f8dbb173697eb577aaff42,1626918000.0,2021-07-21T19:38:56.389000-06:00,"{'type': 'Point', 'coordinates': [-107.7964539...",1605.419,13124.15228,...,7,21,20,5,41,2,America/Denver,612089e1cb13df1d4d22cfda,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0
2,DwellSegmentationTimeFilter,1626971000.0,2021-07-22T10:15:32.259000-06:00,"{'type': 'Point', 'coordinates': [-107.860199,...",60f9aea3f399df93fdc1ab41,1626969000.0,2021-07-22T09:51:21.159484-06:00,"{'type': 'Point', 'coordinates': [-107.8643886...",1451.099516,10587.088615,...,7,22,10,15,32,3,America/Denver,612089e1cb13df1d4d22cfdb,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0
3,DwellSegmentationTimeFilter,1626989000.0,2021-07-22T15:21:31-06:00,"{'type': 'Point', 'coordinates': [-107.8743952...",60f9f4f122abd49191e21f5c,1626988000.0,2021-07-22T15:14:35.056996-06:00,"{'type': 'Point', 'coordinates': [-107.860199,...",415.943004,3362.028245,...,7,22,15,21,31,3,America/Denver,612089e1cb13df1d4d22cfdc,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0
4,DwellSegmentationTimeFilter,1626990000.0,2021-07-22T15:32:52.960000-06:00,"{'type': 'Point', 'coordinates': [-107.8601932...",60f9f4f122abd49191e21f5e,1626989000.0,2021-07-22T15:24:53.289799-06:00,"{'type': 'Point', 'coordinates': [-107.8743952...",479.670201,4084.31156,...,7,22,15,32,52,3,America/Denver,612089e2cb13df1d4d22cfdd,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0


Unnamed: 0,mode_confirm,purpose_confirm,replaced_mode
0,drove_alone,work,
1,drove_alone,home,
2,drove_alone,work,
3,drove_alone,meal,
4,drove_alone,meal,


Found Index(['mode_confirm', 'purpose_confirm', 'replaced_mode'], dtype='object') columns of length 3
After expanding, columns went from 38 -> 41


Unnamed: 0,source,end_ts,end_fmt_time,end_loc,raw_trip,start_ts,start_fmt_time,start_loc,duration,distance,...,end_local_dt_minute,end_local_dt_second,end_local_dt_weekday,end_local_dt_timezone,_id,user_id,metadata_write_ts,mode_confirm,purpose_confirm,replaced_mode
0,DwellSegmentationTimeFilter,1626885000.0,2021-07-21T10:31:16-06:00,"{'type': 'Point', 'coordinates': [-107.8599774...",60f85d2453f7233da0b1a28c,1626884000.0,2021-07-21T10:13:12.769000-06:00,"{'type': 'Point', 'coordinates': [-107.8579336...",1083.231,9413.891182,...,31,16,2,America/Denver,612089e1cb13df1d4d22cfd9,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0,drove_alone,work,
1,DwellSegmentationTimeFilter,1626920000.0,2021-07-21T20:05:41.808000-06:00,"{'type': 'Point', 'coordinates': [-107.8643886...",60f8dbb173697eb577aaff42,1626918000.0,2021-07-21T19:38:56.389000-06:00,"{'type': 'Point', 'coordinates': [-107.7964539...",1605.419,13124.15228,...,5,41,2,America/Denver,612089e1cb13df1d4d22cfda,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0,drove_alone,home,
2,DwellSegmentationTimeFilter,1626971000.0,2021-07-22T10:15:32.259000-06:00,"{'type': 'Point', 'coordinates': [-107.860199,...",60f9aea3f399df93fdc1ab41,1626969000.0,2021-07-22T09:51:21.159484-06:00,"{'type': 'Point', 'coordinates': [-107.8643886...",1451.099516,10587.088615,...,15,32,3,America/Denver,612089e1cb13df1d4d22cfdb,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0,drove_alone,work,
3,DwellSegmentationTimeFilter,1626989000.0,2021-07-22T15:21:31-06:00,"{'type': 'Point', 'coordinates': [-107.8743952...",60f9f4f122abd49191e21f5c,1626988000.0,2021-07-22T15:14:35.056996-06:00,"{'type': 'Point', 'coordinates': [-107.860199,...",415.943004,3362.028245,...,21,31,3,America/Denver,612089e1cb13df1d4d22cfdc,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0,drove_alone,meal,
4,DwellSegmentationTimeFilter,1626990000.0,2021-07-22T15:32:52.960000-06:00,"{'type': 'Point', 'coordinates': [-107.8601932...",60f9f4f122abd49191e21f5e,1626989000.0,2021-07-22T15:24:53.289799-06:00,"{'type': 'Point', 'coordinates': [-107.8743952...",479.670201,4084.31156,...,32,52,3,America/Denver,612089e2cb13df1d4d22cfdd,e9e479e9-5c3f-4345-a885-dadb7999b312,1629522000.0,drove_alone,meal,


_fc
Based on 57729 confirmed trips from 200 users
of 135471 total  trips from 219 users (42.61%)


0          4c_Ze5Y_li7r4MOsbqj
1          4c_AGgrHoywg5gYmyL8
2          4c_Ri332Xj3DKaCrdwl
3          4c_6gzfgbVUODrbTdc_
4          4c_eJEIg2mWQYU3ISXU
                ...           
240    fc_csu_TMUSjqGBWELwW5mZ
241    fc_n2n__1f8tgZOmeizn0pJ
242    fc_n2n_xV9QUA9nMEVPYq1B
243    fc_csu_OZ_EXMB-w436dbCh
244    fc_n2n_gIVaKBYQtpjA7s5-
Name: user_email, Length: 245, dtype: object

In [10]:
# Add the sensed labels to the data
label_list = expanded_ct.inferred_labels.to_list()
mode_confirm_sensed = []
for x in label_list:
    if len(x) > 0 and 'mode_confirm' in x[0]['labels'].keys():
        mode_confirm_sensed.append(x[0]['labels']['mode_confirm'])
    else:
        mode_confirm_sensed.append("Unknown")
expanded_ct['mode_confirm_sensed'] = mode_confirm_sensed

In [11]:
expanded_ct['mode_confirm_sensed']

0         drove_alone
1         drove_alone
2         pilot_ebike
3         drove_alone
4         drove_alone
             ...     
135358        Unknown
135373        Unknown
135384    drove_alone
135395    drove_alone
135398    drove_alone
Name: mode_confirm_sensed, Length: 57729, dtype: object

In [12]:
# Add non-label category
expanded_ct['replaced_mode'] = expanded_ct['replaced_mode'].fillna('Unlabeled')
expanded_ct.loc[expanded_ct['replaced_mode'] == 'Unlabeled', 'Replaced_mode'] = "Unlabeled"

In [13]:
# Join the expanded database data to socioeconomic data
socio_data = pd.read_csv('./Can Do Colorado eBike Program - en.csv')
socio_data.rename(columns={'Unique User ID (auto-filled, do not edit)':'user_id',
                          'Please identify which category represents your total household income, before taxes, for last year.':'HHINC',
                          'How many motor vehicles are owned, leased, or available for regular use by the people who currently live in your household?':'VEH',
                           'In which year were you born?':'AGE',
                          'Including yourself, how many people live in your home?':'HHSIZE',
                          'How many children under age 18 live in your home?':'CHILDREN',
                          'What is your gender?':'GENDER',
                          'If you were unable to use your household vehicle(s), which of the following options would be available to you to get you from place to place?':'available_modes',
                          'Are you a student?':'STUDENT',
                          "Including yourself, how many people have a driver's license in your household?":'DRIVERS'}, inplace=True)
socio_data = socio_data[~socio_data.user_id.isnull()]

# Deal with people who have multiple responses by using most recent
socio_data = socio_data.sort_values(by=['user_id', 'Timestamp'])
socio_data.drop_duplicates(subset=['user_id'], keep='last', inplace=True)
socio_data['user_id_socio'] = socio_data.user_id
socio_data = socio_data.drop(labels='user_id', axis=1)

# Lose some trips due to people with no survey responses
expanded_ct['user_id_socio'] = expanded_ct.user_id.astype(str)
expanded_ct.user_id_socio = [i.replace('-','') for i in expanded_ct.user_id_socio] # remove all dashes from strings
expanded_ct = expanded_ct.merge(socio_data, on='user_id_socio')

## Data Preprocessing

In [14]:
## Filter database to variables of modeling interest
data = expanded_ct[['Mode_confirm','mode_confirm_sensed','Replaced_mode','replaced_mode','Trip_purpose','duration','distance_miles','start_local_dt_weekday','available_modes','AGE','HHINC','VEH','HHSIZE','CHILDREN','GENDER','STUDENT','DRIVERS','user_id','_id','start_local_dt_year','start_local_dt_month','start_local_dt_day','cleaned_trip','start_fmt_time','start_loc','end_loc']].copy()


## Pre-filter round of variable creation
# Make copy of user_id to be categorized since both versions are needed
data['user_id_int'] = data['user_id']

# Get timestamp from known year/month/day aggregated to days
data.rename(columns={'start_local_dt_year':'year','start_local_dt_month':'month','start_local_dt_day':'day'}, inplace=True)
data['date_time'] = pd.to_datetime(data[['year','month','day']])
data = data.drop(columns=['year','day'])

# Get time of day
data['hour'] = [int(x[1][:2]) for x in expanded_ct.start_fmt_time.str.split('T')]

# Fix age
data['AGE'] = 2022 - data['AGE']

# Number of workers
data['WORKERS'] = (data['HHSIZE'] - data['CHILDREN']).astype(int)

# Vehicles per driver
data['VEH'] = data['VEH'].replace('4+', '4')

# Recoded Cyclical Time of Day
hours_in_day = 24
months_in_year = 12
data['sin_time'] = np.sin(2*np.pi*data.hour/hours_in_day)
data['cos_time'] = np.cos(2*np.pi*data.hour/hours_in_day)
data['sin_month'] = np.sin(2*np.pi*data.month/months_in_year)
data['cos_month'] = np.cos(2*np.pi*data.month/months_in_year)

# Duration in minutes
data['duration'] = data['duration'] / 60

# Add coordinates to the data
z = pd.json_normalize(data.start_loc)['coordinates']
olon = [str(x[0]) for x in z]
olat = [str(x[1]) for x in z]
data['olat'] = olat
data['olon'] = olon
z = pd.json_normalize(data.end_loc)['coordinates']
dlon = [str(x[0]) for x in z]
dlat = [str(x[1]) for x in z]
data['dlat'] = dlat
data['dlon'] = dlon

# Recode variables
data.Mode_confirm = data.Mode_confirm.replace(
    ['Gas Car, drove alone',
    'Gas Car, with others',
    'Bikeshare',
    'Scooter share',
    'Regular Bike',
    'Skate board',
    'Train',
    'Free Shuttle',
    'Bus',
    'Walk',
    'Taxi/Uber/Lyft',
    'E-bike'],
    ['car',
    's_car',
    's_micro',
    's_micro',
    'p_micro',
    'p_micro',
    'transit',
    'transit',
    'transit',
    'walk',
    'ridehail',
    'ebike']
)
data.Replaced_mode = data.Replaced_mode.replace(
    ['Gas Car, drove alone',
    'Gas Car, with others',
    'Bikeshare',
    'Scooter share',
    'Regular Bike',
    'Skate board',
    'Train',
    'Free Shuttle',
    'Bus',
    'Walk',
    'Taxi/Uber/Lyft',
    'E-bike',
    'No Travel'],
    ['car',
    's_car',
    's_micro',
    's_micro',
    'p_micro',
    'p_micro',
    'transit',
    'transit',
    'transit',
    'walk',
    'ridehail',
    'ebike',
    'no_travel']
)
data.Trip_purpose = data.Trip_purpose.replace(
    ['Work',
    'School',
    'Recreation/Exercise',
    'Transit transfer',
    'Meal',
    'Entertainment/Social',
    'Shopping',
    'Personal/Medical',
    'Religious',
    'Pick-up/Drop off'],
    ['commute',
    'commute',
    'recreation',
    'transit_transfer',
    'discretionary',
    'discretionary',
    'discretionary',
    'discretionary',
    'discretionary',
    'pudo']
)
data['is_weekend'] = 0
data.loc[data['start_local_dt_weekday'].isin(['0','6']), 'is_weekend'] = 1
data['is_male'] = 0
data.loc[data['GENDER'].isin(['Man']), 'is_male'] = 1

## Filter data
# Filter out responses to data that are not workable
data = data[~data['Mode_confirm'].isin(['Not a Trip','Other'])]
data = data[~data['Replaced_mode'].isin(['Not a Trip','Other','Unlabeled'])]
data = data[~data['available_modes'].isin(['None', 'Prefer not to say'])]
data = data[~data['Trip_purpose'].isin(['not_a_trip','Other'])]
data = data[~data['HHINC'].isin(['Prefer not to say','$100,000 -$149,999','$150,000','$150,000-$199,999','$200,000 or more'])] # Side note why is 150k (n=7) its own bin?
data = data[~data['VEH'].isin(['Prefer not to say / Prefiero no decir.'])]
data = data[data['distance_miles']<50]
data = data[data['AGE']<100]
data = data[data['HHSIZE']<10]
data = data[data['HHSIZE']>data['CHILDREN']]


## Post-filter round of variable creation
# OHE any categorical, non-ordinal variables
ohe_vars = ['Trip_purpose','HHINC','STUDENT']
ohe_prefixes = ['purp','hhinc','student']
data = pd.get_dummies(data, columns=ohe_vars, prefix=ohe_prefixes)

# Calculate travel times for each trip, across every mode
def add_all_mode_tt(data, mode_col, duration_col, dist_col):
    wait_times_init = {'car':0.00,
                      's_car':0.00,
                      'ridehail':5.00,
                      's_micro':5.00,
                      'p_micro':0.00,
                      'transit':7.00,
                      'walk':5.00,
                      'ebike':0.00}
    mode_travel_times = {}
    for mode in pd.unique(data[mode_col]):

        # Linear model for duration based on distance for trips belonging to each mode
        mode_data = data[data[mode_col]==mode]
        regr = linear_model.LinearRegression()
        regr.fit(np.log(mode_data[dist_col].values.reshape(-1,1)), np.log(mode_data[duration_col].values.reshape(-1,1)))
        
        # Make prediction for ALL trips, reverse the log transform to get mins
        mode_duration_pred = regr.predict(np.log(data[dist_col].values.reshape(-1,1)))
        mode_travel_times['tt_'+mode] = np.exp(mode_duration_pred) + wait_times_init[mode]

    # Apply for each mode existing in the dataframe
    for mode in mode_travel_times:
        data[mode] = mode_travel_times[mode]

    return regr, data

# Calculate all mode travel times and add to dataframe
regr, data = add_all_mode_tt(data,'Mode_confirm','duration','distance_miles')

# Calculate vehicle costs
cost_factors_init = {'car':0.00,
                    's_car':0.00,
                    'ridehail':5.00,
                    's_micro':1.00,
                    'p_micro':0.00,
                    'transit':3.50,
                    'ebike':0.00,
                    'walk':0.00}
cost_factors = {'car':0.62,
                's_car':0.31,
                'ridehail':0.80,
                's_micro':0.90,
                'p_micro':0.00,
                'transit':0.00,
                'ebike':0.11,
                'walk':0.00}

def add_all_mode_cost(data, cost_factors, dist_col):
    for factor in cost_factors:
        data['cost_'+factor] = cost_factors_init[factor] + (cost_factors[factor] * data[dist_col])
    return data

# Calculate all mode travel costs and add to dataframe
add_all_mode_cost(data, cost_factors, 'distance_miles')

# Labels for modes in the availability survey
availability_codes = {'Public transportation (bus, subway, light rail, etc.)':'transit',
                      'Get a ride from a friend or family member':'s_car',
                      'Rental car (including Zipcar/ Car2Go)':'car',
                      'Taxi (regular taxi, Uber, Lyft, etc)':'ridehail',
                      'Bicycle':'p_micro',
                      'Shared bicycle or scooter':'s_micro',
                      'Walk/roll':'walk',
                      'Skateboard':'p_micro',
                      'ebike':'ebike',
                      'None':'none'}

def add_mode_availability(data, availability_codes, availability_col, choice_col, replaced_col, is_sp):
    mode_list = np.unique(list(availability_codes.values())[:-1])
    choice_list = data[choice_col].values
    replaced_list = data[replaced_col].values
    for mode in mode_list:
        mode_avail = []
        for i, available in enumerate(data[availability_col].values):
            available_modes = [availability_codes[x] for x in available.split(';')]
            # For SP: Replacement/stated available should be 1, chosen should be 0
            if is_sp:
                if mode==choice_list[i]:
                    mode_check = False
                else:
                    mode_check = mode==replaced_list[i] or mode in available_modes
            # For RP: Chosen/replacement/stated available should be 1
            else:
                mode_check = mode==choice_list[i] or mode==replaced_list[i] or mode in available_modes
            # Keep binary list of which trips the mode was available for
            if mode_check:
                mode_avail.append(1)
            else:
                mode_avail.append(0)
        # For each mode add a column with binary availability
        data['av_'+mode] = mode_avail
    return data

# Add mode availability according to survey responses
data = add_mode_availability(data, availability_codes, 'available_modes', 'Mode_confirm', 'Replaced_mode', is_sp=False)

# # Add mode availability as all available
mode_list = ['car','s_car','ridehail','transit','p_micro','s_micro','walk','ebike','no_travel']
# for mode in mode_list:
#     data[f"av_{mode}"] = 1

# Handle all variables that are ordinal; otherwise they may not end up in correct order
# Make sure that all mode variables align after being converted to numeric variables
data.Mode_confirm = pd.Categorical(data.Mode_confirm, ordered=True, categories=mode_list)
data.mode_confirm_sensed = pd.Categorical(data.mode_confirm_sensed, ordered=True, categories=pd.unique(data.mode_confirm_sensed))
data.Replaced_mode = pd.Categorical(data.Replaced_mode, ordered=True, categories=mode_list)
data['Mode_confirm_num'] = data.Mode_confirm.cat.codes
data['mode_confirm_sensed_num'] = data.mode_confirm_sensed.cat.codes
data['Replaced_mode_num'] = data.Replaced_mode.cat.codes
data = data[data['Mode_confirm_num']!=data['Replaced_mode_num']]

## Save cleaned data to be used in modeling
data.to_csv("processed_replacement_modeling_data.csv")

## Data Checks

In [15]:
# Data stats before cleaning
print(f"Trips: {len(expanded_ct)}")
print(f"Users: {len(np.unique(expanded_ct.user_id))}")
print(f"Trips per user: {len(expanded_ct) / len(pd.unique(expanded_ct.user_id))}")

Trips: 52536
Users: 163
Trips per user: 322.30674846625766


In [16]:
# Data stats after cleaning + columns available
print(f"Trips: {len(data)}")
print(f"Users: {len(np.unique(data.user_id))}")
print(f"Trips per user: {len(data) / len(pd.unique(data.user_id))}\n")
print(f"Columns: \n{data.columns.values}")

Trips: 24588
Users: 118
Trips per user: 208.3728813559322

Columns: 
['Mode_confirm' 'mode_confirm_sensed' 'Replaced_mode' 'replaced_mode'
 'duration' 'distance_miles' 'start_local_dt_weekday' 'available_modes'
 'AGE' 'VEH' 'HHSIZE' 'CHILDREN' 'GENDER' 'DRIVERS' 'user_id' '_id'
 'month' 'cleaned_trip' 'start_fmt_time' 'start_loc' 'end_loc'
 'user_id_int' 'date_time' 'hour' 'WORKERS' 'sin_time' 'cos_time'
 'sin_month' 'cos_month' 'olat' 'olon' 'dlat' 'dlon' 'is_weekend'
 'is_male' 'purp_Home' 'purp_commute' 'purp_discretionary' 'purp_pudo'
 'purp_recreation' 'purp_transit_transfer' 'hhinc_$25,000-$49,999'
 'hhinc_$50,000-$99,999' 'hhinc_Less than $24,999' 'student_Custodian'
 'student_Fire Fighter 2 Training' 'student_Not a student'
 'student_Taking prerequisites missing for grad program '
 'student_Work at csu' 'student_Yes - Full Time College/University'
 'student_Yes - Part-Time College/University'
 'student_Yes - Vocation/Technical/Trade School'
 'student_taking classes toward early

In [17]:
# Check for NAs
data.isna().sum()

Mode_confirm               0
mode_confirm_sensed        0
Replaced_mode              0
replaced_mode              0
duration                   0
                          ..
av_transit                 0
av_walk                    0
Mode_confirm_num           0
mode_confirm_sensed_num    0
Replaced_mode_num          0
Length: 80, dtype: int64

In [18]:
# Check # obs for each class
print(f"Mode_confirm:\n{pd.value_counts(data.Mode_confirm)}\n")
print(f"Replaced_mode:\n{pd.value_counts(data.Replaced_mode)}")

Mode_confirm:
ebike        10815
s_car         5558
car           4172
walk          2761
transit        758
p_micro        414
ridehail        74
s_micro         36
no_travel        0
Name: Mode_confirm, dtype: int64

Replaced_mode:
no_travel    8348
car          6296
p_micro      3190
s_car        2001
walk         1943
transit      1681
ridehail     1028
s_micro        51
ebike          50
Name: Replaced_mode, dtype: int64
