In [3]:
import os
import sys
import pickle
import importlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from pathlib import Path
from uuid import UUID
from collections import defaultdict

%matplotlib inline

In [4]:
emission_path = Path(os.getcwd()).parent.parent / 'my_emission_server' / 'e-mission-server'
sys.path.append(str(emission_path))

# Also add the home (viz_scripts) to the path
sys.path.append('../viz_scripts')

In [10]:
import scaffolding
import emission.core.get_database as edb

In [11]:
importlib.reload(scaffolding)

<module 'scaffolding' from '/Users/rkulhall/em-public-dashboard/rm_src/../viz_scripts/scaffolding.py'>

In [12]:
def r(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with the rows of ``df`` renumbered 0..n-1.

    The previous index is discarded (not kept as a column) and ``df`` itself
    is left untouched.
    """
    renumbered = df.reset_index(drop=True)
    return renumbered

In [13]:
# Load the two pickled dictionaries that live next to the viz scripts.
# NOTE: unpickling can execute arbitrary code -- only load these files from a trusted checkout.
with open('../viz_scripts/auxiliary_files/dic_re.pkl', 'rb') as re_file:
    dic_re = pickle.load(re_file)

with open('../viz_scripts/auxiliary_files/dic_pur.pkl', 'rb') as pur_file:
    dic_pur = pickle.load(pur_file)

# Wrap both mappings in a defaultdict so that looking up an unknown key yields 'Other'
# instead of raising KeyError.
dic_re = defaultdict(lambda: 'Other', dic_re)
dic_pur = defaultdict(lambda: 'Other', dic_pur)

In [14]:
# Split UUIDs by program, and collect one {program, opcode, user_id} record per user.
# Both structures are filled in a single pass over the UUID collection, so the database
# is only queried once.
program_uuid_map = {}
uuid_program_list = []
for ue in edb.get_uuid_db().find():
    uuid = str(ue['uuid'])
    # The program name is the part of 'user_email' before the first underscore.
    program = ue['user_email'].split("_")[0]
    if program not in program_uuid_map:
        print(f"Found new program {program}, creating new list")
        program_uuid_map[program] = []
    program_uuid_map[program].append(uuid)
    uuid_program_list.append({"program": program, "opcode": ue["user_email"], "user_id": uuid})

Found new program stage, creating new list
Found new program 4c, creating new list
Found new program cc, creating new list
Found new program fc, creating new list
Found new program pc, creating new list
Found new program sc, creating new list
Found new program vail, creating new list
Found new program prepilot, creating new list


In [15]:
uuid_program_df = pd.DataFrame(uuid_program_list)

In [16]:
# %%capture

# for program in uuid_program_df.program.unique():
# Load the trip data through the shared scaffolding helper.
# NOTE(review): the two leading None arguments and the 'prepilot' / 'program' strings are
# passed positionally; confirm their meaning against scaffolding.load_viz_notebook_data.
expanded_ct, file_suffix, quality_text, debug_df = scaffolding.load_viz_notebook_data(
    None,
    None,
    'prepilot',
    'program',
    dic_re,
    dic_pur=dic_pur,
)

Loaded all confirmed trips of length 241123
After filtering, found 241123 participant trips 
After filtering, found 92446 labeled trips
Found Index(['mode_confirm', 'purpose_confirm', 'replaced_mode'], dtype='object') columns of length 3
After expanding, columns went from 41 -> 44
_prepilot
Based on 92395 confirmed trips from 235 users
of 241123 total  trips from 261 users (38.32%)


In [17]:
# Join to the program df to get each user's program.
# Keep the untouched id in its own column before the join key is rewritten.
expanded_ct['original_user_id'] = expanded_ct['user_id'].copy()
# The program table stores ids as strings, so stringify before merging.
expanded_ct['user_id'] = expanded_ct['user_id'].map(str)
expanded_ct = expanded_ct.merge(uuid_program_df, on='user_id')
# After the join, strip the dashes out of the id string.
expanded_ct['user_id'] = expanded_ct['user_id'].map(lambda uid: str(uid).replace("-", ""))

In [18]:
expanded_ct.columns

Index(['source', 'end_ts', 'end_fmt_time', 'end_loc', 'raw_trip', 'start_ts',
       'start_fmt_time', 'start_loc', 'duration', 'distance', 'start_place',
       'end_place', 'cleaned_trip', 'inferred_labels', 'inferred_trip',
       'expectation', 'confidence_threshold', 'expected_trip', 'user_input',
       'section_modes', 'section_distances', 'start_local_dt_year',
       'start_local_dt_month', 'start_local_dt_day', 'start_local_dt_hour',
       'start_local_dt_minute', 'start_local_dt_second',
       'start_local_dt_weekday', 'start_local_dt_timezone',
       'end_local_dt_year', 'end_local_dt_month', 'end_local_dt_day',
       'end_local_dt_hour', 'end_local_dt_minute', 'end_local_dt_second',
       'end_local_dt_weekday', 'end_local_dt_timezone', '_id', 'user_id',
       'metadata_write_ts', 'additions', 'mode_confirm', 'purpose_confirm',
       'replaced_mode', 'distance_miles', 'Mode_confirm', 'Replaced_mode',
       'Trip_purpose', 'original_user_id', 'program', 'opcode'],
 

In [19]:
# Drop the lowercase 'replaced_mode' column; the capitalised 'Replaced_mode' column is the
# one kept as the target. (This is a drop, not a rename, and `columns=` already selects the
# axis, so no `axis` argument is needed.) errors='ignore' keeps the cell safe to re-run
# once the column is gone.
expanded_ct = expanded_ct.drop(columns=['replaced_mode'], errors='ignore')
# Trips with no replaced-mode value get an explicit 'Unlabeled' category.
expanded_ct['Replaced_mode'] = expanded_ct['Replaced_mode'].fillna('Unlabeled')

In [20]:
# We only wish to focus on Denver data for now: keep the trips whose start AND end are
# both recorded in the America/Denver timezone, then renumber the rows.
starts_in_denver = expanded_ct.start_local_dt_timezone == "America/Denver"
ends_in_denver = expanded_ct.end_local_dt_timezone == "America/Denver"

denver_data = r(expanded_ct.loc[starts_in_denver & ends_in_denver, :])

In [21]:
# Parse the start timestamps as UTC instants, then express them in Denver local time.
start_utc = pd.to_datetime(denver_data['start_fmt_time'], utc=True)
denver_data['start_fmt_time'] = start_utc.dt.tz_convert('America/Denver')

In [22]:
print(denver_data.start_fmt_time.min(), denver_data.start_fmt_time.max())

2020-09-22 17:13:55.883513-06:00 2022-12-30 23:33:27.147785-07:00


In [None]:
# Parse the datetime to Denver time.
# denver_data['start_fmt_time'] = pd.to_datetime(denver_data['start_fmt_time'], utc=True).dt.tz_convert('America/Denver')

# Express the end timestamps in Denver local time ('start_fmt_time' is expected to have
# been converted already by an earlier cell).
denver_data['end_fmt_time'] = pd.to_datetime(denver_data['end_fmt_time'], utc=True).dt.tz_convert('America/Denver')

# Re-compute the calendar components for both endpoints from the Denver-local timestamps.
for endpoint in ('start', 'end'):
    local_time = denver_data[f'{endpoint}_fmt_time'].dt
    for component in ('year', 'month', 'day', 'hour', 'weekday'):
        denver_data[f'{endpoint}_local_dt_{component}'] = getattr(local_time, component)

Read the Demographic data

In [None]:
# Read the Denver demographic info (survey responses exported as CSV).
survey_data = pd.read_csv('../viz_scripts/Can Do Colorado eBike Program - en.csv')

In [None]:
# column renaming here!

# Map each verbose survey question (the CSV header) to a short snake_case column name.
survey_column_names = {
    "Unique User ID (auto-filled, do not edit)": "user_id",
    "In which year were you born?": "birth_year",
    "What is your gender?": "gender",
    "Do you have a valid driver's license?": "has_drivers_license",
    "Are you a student?": "is_student",
    "What is the highest grade or degree that you have completed?": "highest_education",
    "Do you work for either pay or profit?": "is_paid",
    "Do you have more than one job?": "has_multiple_jobs",
    "Do you work full-time or part-time at your primary job?": "primary_job_type",
    "Which best describes your primary job?": "primary_job_description",
    "How did you usually get to your primary job last week? ": "primary_job_commute_mode",
    "Thinking about your daily commute to work last week, how many minutes did it usually take to get from home to the primary job/work place?": "primary_job_commute_time",
    "At your primary job, do you have the ability to set or change your own start time?": "is_primary_job_flexible",
    "Do you have the option of working from home or an alternate location instead of going into your primary work place?": "primary_job_can_wfh",
    "How many days per week do you usually work from home or an alternate location?": "wfh_days",
    "Do you own or rent your place of residence?": "residence_ownership_type",
    "What is your home type?": "residence_type",
    "Please identify which category represents your total household income, before taxes, for last year.": "income_category",
    "Including yourself, how many people live in your home?": "n_residence_members",
    "How many children under age 18 live in your home?": "n_residents_u18",
    "Including yourself, how many people have a driver's license in your household?": "n_residents_with_license",
    "How many motor vehicles are owned, leased, or available for regular use by the people who currently live in your household?": "n_motor_vehicles",
    "If you were unable to use your household vehicle(s), which of the following options would be available to you to get you from place to place?": "available_modes",
    "Do you have a medical condition that makes it difficult to travel outside of the home?": "has_medical_condition",
    "How long have you had this condition?": "medical_condition_duration",
}

# Headers that are not listed above keep their original name.
survey_data = survey_data.rename(columns=survey_column_names)

In [None]:
# Now, if we have duplicate users, we'd like to retain the last chronological entry.
# First discard responses that carry no user id at all.
has_user_id = ~(survey_data.user_id.isna() | (survey_data.user_id == ""))
# .copy() makes the filtered frame independent, so the column assignments below do not
# operate on a slice of the original (avoids SettingWithCopyWarning).
survey_data = survey_data.loc[has_user_id, :].copy()

# Timezone-aware parsing: the raw stamps end in a 'PDT' / 'PST' abbreviation. Rather than
# discarding it and treating the Pacific wall-clock time as Denver time (one hour off, and
# an error for stamps that are ambiguous around a DST change), translate the abbreviation
# into its UTC offset, parse to UTC instants and then express them in Denver time.
# NOTE(review): assumes the remainder of the stamp is a format pandas can parse together
# with a trailing numeric offset -- confirm against the CSV.
survey_data['Timestamp'] = (
    survey_data['Timestamp']
    .str.replace('PDT', '-0700', regex=False)
    .str.replace('PST', '-0800', regex=False)
)
survey_data['Timestamp'] = pd.to_datetime(survey_data['Timestamp'], utc=True).dt.tz_convert('America/Denver')

# Sort by user_id and time, then drop everything but the last entry of each user.
survey_data = (
    survey_data
    .sort_values(by=['user_id', 'Timestamp'], ascending=True)
    .drop_duplicates(['user_id'], keep='last')
)

In [None]:
# Merge the trip data with the survey data.
# This is an inner join on the shared 'user_id' key, so trips whose user has no survey
# response (and responses without trips) are not carried over.
merged_data = denver_data.merge(survey_data, on='user_id')

In [None]:
# Let's start choosing features for modeling.

base_time_features = ['fmt_time', 'local_dt_year', 'local_dt_month', 'local_dt_day', 'local_dt_hour', 'local_dt_weekday']
time_features = ['start_' + x for x in base_time_features] + ['end_' + x for x in base_time_features]

demographic_features = ['available_modes',
    'birth_year', 'income_category', 'n_motor_vehicles', 'n_residence_members', 'n_residents_u18', 'gender', 
    'is_student', 'n_residents_with_license']

sensed_features = ['duration', 'distance_miles', 'cleaned_trip', 'start_loc', 'end_loc', 'section_modes', 'section_distances']

id_and_label_columns = ['user_id', '_id', 'original_user_id', 'cleaned_trip', 'Replaced_mode', 'Mode_confirm']

# 'cleaned_trip' is listed both among the id columns and in sensed_features. Selecting it
# twice would give modeling_data two identically named columns, so repeats are removed
# here; dict.fromkeys keeps the first occurrence and preserves the order.
selected_columns = list(dict.fromkeys(
    id_and_label_columns + time_features + demographic_features + sensed_features
))
modeling_data = merged_data[selected_columns].copy()

# Shorten the calendar column names to the '<endpoint>:<component>' convention.
# (The '*_local_dt_weekday' and '*_fmt_time' columns keep their original names.)
modeling_data = modeling_data.rename(columns={
    'start_local_dt_year': 'start:year', 'start_local_dt_month': 'start:month', 'start_local_dt_day': 'start:day', 'start_local_dt_hour': 'start:hour',
    'end_local_dt_year': 'end:year', 'end_local_dt_month': 'end:month', 'end_local_dt_day': 'end:day', 'end_local_dt_hour': 'end:hour'
    })


In [None]:
modeling_data.columns

In [None]:
from calendar import monthrange

# Number of days in a month: monthrange takes (yyyy, mm) and returns
# (weekday of the first day, number of days); leap years are handled.
def get_num_days_in_month(yyyy, mm):
    """Return how many days the month ``mm`` (1-12) of year ``yyyy`` has."""
    _first_weekday, n_days = monthrange(yyyy, mm)
    return n_days

def is_overnight_trip(start_date, end_date):
    """Return 1 if the trip ends on a later calendar date than it starts, else 0.

    The earlier check, ``(end_date - start_date).days > 0``, was only true for
    trips lasting 24 hours or more, so a trip from 23:00 to 01:00 the next day
    was not flagged. Comparing calendar dates flags every trip that crosses
    midnight (and still flags multi-day trips).

    Both arguments must provide ``.date()`` (``datetime.datetime`` or
    ``pandas.Timestamp``); timezone-aware values should share one timezone so
    that the dates are comparable.
    """
    return int(end_date.date() > start_date.date())

# get the number of days for the start and end times.
# (zipping the two columns avoids a row-wise DataFrame.apply, which is far slower)
modeling_data['start:n_days_in_month'] = [
    get_num_days_in_month(year, month)
    for year, month in zip(modeling_data['start:year'], modeling_data['start:month'])
]
modeling_data['end:n_days_in_month'] = [
    get_num_days_in_month(year, month)
    for year, month in zip(modeling_data['end:year'], modeling_data['end:month'])
]

# age = reference year - year of birth
# NOTE(review): 2023 is a hard-coded reference year; confirm it is still the intended one.
modeling_data['age'] = 2023 - modeling_data['birth_year']

# overnight trips may be more likely taken by car.
modeling_data['is_overnight_trip'] = [
    is_overnight_trip(start, end)
    for start, end in zip(modeling_data['start_fmt_time'], modeling_data['end_fmt_time'])
]

# Number of working individuals in the household = number of individuals in the house - number of children.
modeling_data['n_working_residents'] = (modeling_data['n_residence_members'] - modeling_data['n_residents_u18']).astype(int)

# Create a binary indicator: 1 for "Male", 0 for anything else (including missing values).
modeling_data['is_male'] = (modeling_data['gender'] == "Male").astype(int)

# Bin the number of vehicles owned.
# Drop the observations with (Prefer not to say). The explicit .copy() makes the filtered
# frame independent, so the assignments below are not chained assignments on a slice.
declined_to_answer = modeling_data['n_motor_vehicles'].isin(['Prefer not to say / Prefiero no decir.'])
modeling_data = modeling_data.loc[~declined_to_answer, :].copy()
# The open-ended '4+' bucket is capped at 4 so the column can be cast to int.
modeling_data.loc[modeling_data['n_motor_vehicles'].isin(['4+']), 'n_motor_vehicles'] = 4
modeling_data['n_motor_vehicles'] = modeling_data['n_motor_vehicles'].astype(int)

# Convert the total duration of the trip into minutes.
# NOTE: this rescales the column in place, so running this cell twice divides by 60 twice.
modeling_data['duration'] = modeling_data['duration'] / 60

# Extract start and end latitudes and longitudes.
# 'coordinates' is indexed as [longitude, latitude].
modeling_data['start_lat'] = modeling_data['start_loc'].map(lambda loc: loc['coordinates'][1])
modeling_data['start_lng'] = modeling_data['start_loc'].map(lambda loc: loc['coordinates'][0])

modeling_data['end_lat'] = modeling_data['end_loc'].map(lambda loc: loc['coordinates'][1])
modeling_data['end_lng'] = modeling_data['end_loc'].map(lambda loc: loc['coordinates'][0])

In [None]:
# NOTE: this cell used to be a verbatim copy of the feature-engineering cell directly
# above it. Running both re-defined the same helpers and, more importantly, divided
# 'duration' by 60 a second time, leaving the "minutes" column 60x too small.
# The copy is replaced by a sanity check on the columns that cell is expected to produce.
engineered_columns = [
    'start:n_days_in_month', 'end:n_days_in_month', 'age', 'is_overnight_trip',
    'n_working_residents', 'is_male', 'n_motor_vehicles', 'duration',
    'start_lat', 'start_lng', 'end_lat', 'end_lng',
]
missing_columns = [col for col in engineered_columns if col not in modeling_data.columns]
assert not missing_columns, f"Feature engineering did not produce: {missing_columns}"

In [None]:
# Time-related feature engineering:
'''
HOD: hour of day
DOM: day of month
MOY: month of year
'''

def get_HOD(hour, how='sin'):
    """Cyclically encode an hour of the day on a 24-hour period.

    Returns ``sin(2*pi*hour/24)`` when ``how == 'sin'``; any other value of
    ``how`` yields the cosine of the same angle.
    """
    angle = 2 * np.pi * (hour/24)
    trig = np.sin if how == 'sin' else np.cos
    return trig(angle)

def get_DOM(day, n_days, how='sin'):
    """Cyclically encode a day of the month on a period of ``n_days`` days.

    Returns ``sin(2*pi*day/n_days)`` when ``how == 'sin'``; any other value of
    ``how`` yields the cosine of the same angle.
    """
    angle = 2 * np.pi * (day/n_days)
    trig = np.sin if how == 'sin' else np.cos
    return trig(angle)

def get_MOY(month, how='sin'):
    """Cyclically encode a month of the year on a 12-month period.

    Returns ``sin(2*pi*month/12)`` when ``how == 'sin'``; any other value of
    ``how`` yields the cosine of the same angle.
    """
    angle = 2 * np.pi * (month/12)
    trig = np.sin if how == 'sin' else np.cos
    return trig(angle)

# Cyclical (sin / cos) encodings of hour-of-day, day-of-month and month-of-year for both
# trip endpoints, created in the order: start-sin, start-cos, end-sin, end-cos.
#
# FIX: the month-of-year features were previously computed from the ':year' column, which
# made them a function of the calendar year; they now use the ':month' column.
#
# The helpers are called on whole columns at once (they only use numpy ufuncs and
# arithmetic), which avoids twelve row-wise DataFrame.apply passes.
# NOTE(review): assumes the ':hour', ':day', ':month' and ':n_days_in_month' columns are
# numeric -- confirm.
for endpoint in ('start', 'end'):
    for how in ('sin', 'cos'):
        modeling_data[f'{endpoint}:{how}_HOD'] = get_HOD(modeling_data[f'{endpoint}:hour'], how=how)
        modeling_data[f'{endpoint}:{how}_DOM'] = get_DOM(
            modeling_data[f'{endpoint}:day'], modeling_data[f'{endpoint}:n_days_in_month'], how=how
        )
        modeling_data[f'{endpoint}:{how}_MOY'] = get_MOY(modeling_data[f'{endpoint}:month'], how=how)

In [None]:
modeling_data.head()

Now, for every trip, we have the corresponding section mode that covered the longest distance for the trip.

Using this as well as the `available_modes` column:

```python

    # unique available modes:
    {'Bicycle',
    'Do not have vehicle ',
    'Get a ride from a friend or family member',
    'None',
    'Public transportation (bus, subway, light rail, etc.)',
    'Rental car (including Zipcar/ Car2Go)',
    'Shared bicycle or scooter',
    'Skateboard',
    'Taxi (regular taxi, Uber, Lyft, etc)',
    'Walk/roll'}

    # unique section modes:
    {'bicycling', 'bus', 'car', 'no_sensed', 'train', 'walking'}

    
```

What mapping can we establish here? 

In [None]:
def remove_air_or_hsr(df):
    """Return the rows of ``df`` whose 'section_modes' entry does not contain 'air_or_hsr'.

    The result is a new frame with the same columns and a fresh 0..n-1 index.
    Unlike the earlier loop-based version, the input frame is not modified
    (no temporary 'mark' column is written into it) and the membership test
    runs in one pass over the column instead of through ``iterrows``.

    Parameters
    ----------
    df : pd.DataFrame
        Must have a 'section_modes' column whose values support the ``in``
        operator.

    Returns
    -------
    pd.DataFrame
        The filtered, re-indexed copy.
    """
    # astype(bool) keeps the mask boolean even when the frame is empty.
    has_air_or_hsr = df['section_modes'].map(lambda sections: 'air_or_hsr' in sections).astype(bool)
    return df.loc[~has_air_or_hsr, :].reset_index(drop=True)

In [None]:
modeling_data = remove_air_or_hsr(modeling_data)

In [None]:
modeling_data.columns

In [None]:
importlib.reload(scaffolding)

In [None]:
results = list()

In [None]:
# Keep only the first occurrence of every column name, so that selecting a column by name
# yields a single Series rather than several same-named columns.
modeling_data = modeling_data.loc[:,~modeling_data.columns.duplicated()].copy()

In [None]:
# | [a, b, c] | start_time | end_time |
# -> | [a, b, c] | [s1, s2, s3] |

In [None]:
# print(modeling_data.shape[0])

In [None]:
# I manually split the modeling data into chunks of 10000 points at one time.
# This significantly expedited the processing time. Each chunk takes ~26 minutes to finish.
# split = modeling_data.loc[70000:, :]

In [None]:
# display(split[['original_user_id', 'cleaned_trip']].head())
# print(split.shape[0])

In [None]:
# from time import perf_counter

# now = perf_counter()
# result = scaffolding.get_section_durations(split)
# end = perf_counter() - now

# print(f"Took {end/60} minutes to complete")
# results.append(result)

In [None]:
# print(len(results))

In [None]:
# final_df = pd.concat(results, axis=0)

In [None]:
# print(modeling_data.shape[0], final_df.shape[0])

In [None]:
# modeling_data.to_csv('../data/modeling_data.csv', index=False)
# final_df.to_csv('../data/modeling_w_duration.csv', index=False)

In [None]:
# from time import perf_counter
# importlib.reload(scaffolding)

In [None]:
# Read the previously saved frame back from disk and discard fully identical rows
# (the first occurrence of each is kept).
df = pd.read_csv('../data/modeling_w_duration.csv').drop_duplicates()

In [None]:
# df.shape, modeling_data.shape

In [None]:
# Take every row from position 50000 onwards; this slice is what gets passed to
# scaffolding.get_section_coordinates in the next cell.
# NOTE(review): 50000 is a hard-coded offset -- presumably the start of the chunk that is
# currently being processed by hand; confirm before re-running.
dummy = modeling_data.iloc[50000:, :]

print(dummy.shape)

In [None]:
# now = perf_counter()
df_modded = scaffolding.get_section_coordinates(dummy)
# end = perf_counter() - now

# print(f"Fetched sections in {end/60} minutes")

In [None]:
def verify(locations):
    """Return True when ``locations`` is not None and holds at least one element."""
    if locations is None:
        return False
    return len(locations) > 0


# Run verify() on the 'section_locations_argmax' value of every row of df_modded;
# the printed value is True only if all rows pass.
bools = df_modded.apply(lambda x: verify(x.section_locations_argmax), axis=1)
print(bools.all())

In [None]:
results.append(df_modded)

In [None]:
print(len(results))

In [None]:
final_results = pd.concat(results, axis=0)

In [None]:
# Attach the 'section_locations_argmax' column to the frame that was read back from CSV.
# NOTE(review): concat(axis=1) aligns on index labels, not on row position; labels that are
# present in only one of the two inputs produce NaN-filled cells. Confirm that `df` and
# `final_results` share the same index before relying on the result.
final_df = pd.concat([df, final_results['section_locations_argmax']], axis=1)

In [None]:
final_df[['section_modes', 'section_distances', 'section_durations', 'section_locations_argmax']].head()

In [None]:
final_df.to_csv('../data/final_modeling_data.csv', index=False)

In [None]:
final_df.section_locations_argmax[0]