In [1]:
import os
import sys
import pickle
import importlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from pathlib import Path
from uuid import UUID
from collections import defaultdict

%matplotlib inline

In [2]:
INCLUDE_TEST_USERS = False

In [3]:
emission_path = Path(os.getcwd()).parent.parent / 'my_emission_server' / 'e-mission-server'
sys.path.append(str(emission_path))

# Also add the home (viz_scripts) to the path
sys.path.append('../viz_scripts')

In [4]:
DB_SOURCE = [
    "Stage_database",
    "openpath_prod_durham",
    "openpath_prod_mm_masscec",
    "openpath_prod_ride2own",
    # No replaced mode!
    "openpath_prod_uprm_civic",
    "openpath_prod_uprm_nicr"
]

In [5]:
CURRENT_DB = DB_SOURCE[0]

assert CURRENT_DB in DB_SOURCE

In [6]:
# ['p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown']

REPLACED_MODE_DICT = {
    "Stage_database": {
        'no_travel': 'no_trip',
        'Unknown': 'unknown',
        'bus': 'transit',
        'drove_alone': 'car',
        'bike': 'p_micro',
        'shared_ride': 's_car',
        'walk': 'walk',
        'train': 'transit',
        'bikeshare': 's_micro',
        'not_a trip': 'no_trip',
        'pilot_ebike': 'p_micro',
        'electric_car': 'car',
        'taxi': 'ridehail',
        'not_a_trip': 'no_trip',
        'run': 'walk',
        'scootershare': 's_micro',
        'tramway': 'transit',
        'free_shuttle': 'transit',
        'e-bike': 'p_micro',
        'rental_car': 'car',
        'train_+ bus': 'transit',
        'skateboard': 'p_micro',
        'snowboarding': 'p_micro',
        'e_bike': 'p_micro',
        'golf_cart': 'unknown',
        'emergency_vehicle with others': 's_car',
        'call_friend': 's_car',
        'no_replacement': 'no_travel',
        'doing_nothing': 'no_trip',
        'na': 'no_trip',
        'ebike': 'p_micro',
        'hiking': 'walk',
        'n/a': 'no_trip',
        'testing': 'unknown',
        'home': 'no_trip',
        'must_walk 3-5 mi a day for back': 'walk',
        'family': 's_car',
        'car': 'car',
        'pilot_e-bike': 'p_micro',
        'pilot_bike': 'p_micro',
        'time_spent on the clock at amazon': 'no_trip',
        'working': 'no_trip',
        'walk_at work': 'walk',
        'sitting_on my butt doing nothing': 'no_trip',
        'nothing._delivered food for work': 'no_trip',
        'train,_bus and walk': 'transit',
        'work_vehicle': 'car',
        'friend_picked me up': 's_car',
        'ski': 'p_micro',
        'not_accurate': 'unknown',
        'stolen_ebike': 'p_micro'
    },
    "openpath_prod_durham": {
        'Unknown': 'unknown',
        'bike': 'p_micro',
        'shared_ride': 's_car',
        'drove_alone': 'car',
        'bus': 'transit',
        'no_travel': 'no_trip',
        'scootershare': 's_micro',
        'walk': 'walk',
        'taxi': 'ridehail',
        'e_car_drove_alone': 'car',
        'bikeshare': 's_micro',
        'ebike': 'p_micro',
        'train': 'transit',
        'e_car_shared_ride': 's_car'
    },
    "openpath_prod_mm_masscec": {
        'Unknown': 'unknown',
        'drove_alone': 'car',
        'walk': 'walk',
        'shared_ride': 's_car',
        'bike': 'p_micro',
        'bikeshare': 's_micro',
        'no_travel': 'no_trip',
        'taxi': 'ridehail',
        'bus': 'transit',
        'scootershare': 's_micro',
        'train': 'transit',
        'walking': 'walk',
        'e_car_drove_alone': 'car'
    },
    "openpath_prod_ride2own": {
        'Unknown': 'unknown',
        'drove_alone': 'car',
        'walk': 'walk',
        'shared_ride': 's_car',
        'bike': 'p_micro',
        'no_travel': 'no_trip',
        'taxi': 'ridehail',
        'bus': 'transit',
        'train': 'transit',
        'e_car_drove_alone': 'car',
        'e_car_shared_ride': 's_car'
    },
    "openpath_prod_uprm_nicr": {
        'Unknown': 'unknown',
        'walk': 'walk',
        'drove_alone': 'car'
    }
}

SURVEY_DATA_DICT = {
    "Stage_database": {
        "Unique User ID (auto-filled, do not edit)": "user_id",
        "In which year were you born?": "birth_year",
        "What is your gender?": "gender",
        "Do you have a valid driver's license?": "has_drivers_license",
        "Are you a student?": "is_student",
        "What is the highest grade or degree that you have completed?": "highest_education",
        "Do you work for either pay or profit?": "is_paid",
        "Do you have more than one job?": "has_multiple_jobs",
        "Do you work full-time or part-time at your primary job?": "primary_job_type",
        "Which best describes your primary job?": "primary_job_description",
        "How did you usually get to your primary job last week? ": "primary_job_commute_mode",
        "Thinking about your daily commute to work last week, how many minutes did it usually take to get from home to the primary job/work place?": "primary_job_commute_time",
        "At your primary job, do you have the ability to set or change your own start time?": "is_primary_job_flexible",
        "Do you have the option of working from home or an alternate location instead of going into your primary work place?": "primary_job_can_wfh",
        "How many days per week do you usually work from home or an alternate location?": "wfh_days",
        "Do you own or rent your place of residence?": "residence_ownership_type",
        "What is your home type?": "residence_type",
        "Please identify which category represents your total household income, before taxes, for last year.": "income_category",
        "Including yourself, how many people live in your home?": "n_residence_members",
        "How many children under age 18 live in your home?": "n_residents_u18",
        "Including yourself, how many people have a driver's license in your household?": "n_residents_with_license",
        "How many motor vehicles are owned, leased, or available for regular use by the people who currently live in your household?": "n_motor_vehicles",
        "If you were unable to use your household vehicle(s), which of the following options would be available to you to get you from place to place?": "available_modes",
        "Do you have a medical condition that makes it difficult to travel outside of the home?": "has_medical_condition",
        "How long have you had this condition?": "medical_condition_duration"
    },
    # Retrieved from: e-mission-phone/survey-resources/data-xls/demo-survey-v1.xlsx
    "openpath_prod_durham": {
        "At_your_primary_job_do_you_ha": "is_primary_job_flexible",
        "Which_best_describes_your_prim": "primary_job_description",
        "Do_you_work_full_time_or_part_": "primary_job_type",
        "Do_you_have_the_option_of_work": "primary_job_can_wfh",
        "Please_describe_your_primary_job": "primary_job_description",
        "Do_you_have_more_than_one_job": "has_multiple_jobs",
        # Two columns: how many days/week do you work & what days of the week do you work. 
        # the latter has only 4 NA values, the former has 45 NA values.
        "What_days_of_the_week_do_you_t": "wfh_days",
        "How_many_days_do_you_usually_w_001": "n_wfh_days",
        # All these are NAs.
        "Which_one_below_describe_you_b": "description",
        "What_is_your_race_ethnicity": "race_or_ethnicity",
        "Are_you_a_student": "is_student",
        "What_is_the_highest_grade_or_d": "highest_education",
        "do_you_consider_yourself_to_be": "is_transgender",
        "What_is_your_gender": "gender",
        "How_old_are_you": "age",
        "Are_you_a_paid_worker": "is_paid",
        "Do_you_have_a_driver_license": "has_drivers_license",
        "How_long_you_had_this_conditio": "medical_condition_duration",
        "Including_yourself_how_many_w_001": "n_residents_u18",
        "Including_yourself_how_many_p": "n_residence_members",
        "Do_you_own_or_rent_your_home": "residence_ownership_type",
        "Please_identify_which_category": "income_category",
        "If_you_were_unable_to_use_your": "available_modes",
        "Including_yourself_how_many_p_001": "n_residents_with_license",
        "Including_yourself_how_many_w": "n_working_residents",
        "What_is_your_home_type": "residence_type",
        "How_many_motor_vehicles_are_ow": "n_motor_vehicles",
        "Do_you_have_a_condition_or_han": "has_medical_condition"
    },
    "openpath_prod_mm_masscec": {
        # Same questions as Durham.
        "At_your_primary_job_do_you_ha": "is_primary_job_flexible",
        "Which_best_describes_your_prim": "primary_job_description",
        "Do_you_work_full_time_or_part_": "primary_job_type",
        "Do_you_have_the_option_of_work": "primary_job_can_wfh",
        "Please_describe_your_primary_job": "primary_job_description",
        "Do_you_have_more_than_one_job": "has_multiple_jobs",
        # Two columns: how many days/week do you work & what days of the week do you work. 
        # the latter has only 4 NA values, the former has 45 NA values.
        "What_days_of_the_week_do_you_t": "wfh_days",
        "How_many_days_do_you_usually_w_001": "n_wfh_days",
        # All these are NAs.
        "Which_one_below_describe_you_b": "description",
        "What_is_your_race_ethnicity": "race_or_ethnicity",
        "Are_you_a_student": "is_student",
        "What_is_the_highest_grade_or_d": "highest_education",
        "do_you_consider_yourself_to_be": "is_transgender",
        "What_is_your_gender": "gender",
        "How_old_are_you": "age",
        "Are_you_a_paid_worker": "is_paid",
        "Do_you_have_a_driver_license": "has_drivers_license",
        "How_long_you_had_this_conditio": "medical_condition_duration",
        "Including_yourself_how_many_w_001": "n_residents_u18",
        "Including_yourself_how_many_p": "n_residence_members",
        "Do_you_own_or_rent_your_home": "residence_ownership_type",
        "Please_identify_which_category": "income_category",
        "If_you_were_unable_to_use_your": "available_modes",
        "Including_yourself_how_many_p_001": "n_residents_with_license",
        "Including_yourself_how_many_w": "n_working_residents",
        "What_is_your_home_type": "residence_type",
        "How_many_motor_vehicles_are_ow": "n_motor_vehicles",
        "Do_you_have_a_condition_or_han": "has_medical_condition"
    },
    "openpath_prod_ride2own": {
        # Same questions as Durham.
        "How_old_are_you": "age",
        "What_is_your_gender": "gender",
        "do_you_consider_yourself_to_be": "is_transgender",
        "What_is_your_race_ethnicity": "race_or_ethnicity",
        "Do_you_have_a_driver_license": "has_drivers_license",
        "Are_you_a_student": "is_student",
        "What_is_the_highest_grade_or_d": "highest_education",
        "Are_you_a_paid_worker": "is_paid",
        "Which_one_below_describe_you_b": "description",
        "Do_you_own_or_rent_your_home": "residence_ownership_type",
        "What_is_your_home_type": "residence_type",
        "Please_identify_which_category": "income_category",
        "Including_yourself_how_many_p": "n_residence_members",
        "Including_yourself_how_many_w": "n_working_residents",
        "Including_yourself_how_many_p_001": "n_residents_with_license",
        "Including_yourself_how_many_w_001": "n_residents_u18",
        "How_many_motor_vehicles_are_ow": "n_motor_vehicles",
        "If_you_were_unable_to_use_your": "available_modes",
        "Do_you_have_a_condition_or_han": "has_medical_condition",
        "How_long_you_had_this_conditio": "medical_condition_duration",
        "Do_you_have_more_than_one_job": "has_multiple_jobs",
        "Do_you_work_full_time_or_part_": "primary_job_type",
        "Which_best_describes_your_prim": "primary_job_description",
        "Please_describe_your_primary_job": "primary_job_description",
        "At_your_primary_job_do_you_ha": "is_primary_job_flexible",
        "Do_you_have_the_option_of_work": "primary_job_can_wfh",
        "How_many_days_do_you_usually_w_001": "n_wfh_days",
        "What_days_of_the_week_do_you_t": "wfh_days"
    },
    "openpath_prod_uprm_nicr": {
        # Same as Durham!
        "At_your_primary_job_do_you_ha": "is_primary_job_flexible",
        "Which_best_describes_your_prim": "primary_job_description",
        "Do_you_work_full_time_or_part_": "primary_job_type",
        "Do_you_have_the_option_of_work": "primary_job_can_wfh",
        "Please_describe_your_primary_job": "primary_job_description",
        "Do_you_have_more_than_one_job": "has_multiple_jobs",
        # Two columns: how many days/week do you work & what days of the week do you work. 
        # the latter has only 4 NA values, the former has 45 NA values.
        "What_days_of_the_week_do_you_t": "wfh_days",
        "How_many_days_do_you_usually_w_001": "n_wfh_days",
        # All these are NAs.
        "Which_one_below_describe_you_b": "description",
        "What_is_your_race_ethnicity": "race_or_ethnicity",
        "Are_you_a_student": "is_student",
        "What_is_the_highest_grade_or_d": "highest_education",
        "do_you_consider_yourself_to_be": "is_transgender",
        "What_is_your_gender": "gender",
        "How_old_are_you": "age",
        "Are_you_a_paid_worker": "is_paid",
        "Do_you_have_a_driver_license": "has_drivers_license",
        "How_long_you_had_this_conditio": "medical_condition_duration",
        "Including_yourself_how_many_w_001": "n_residents_u18",
        "Including_yourself_how_many_p": "n_residence_members",
        "Do_you_own_or_rent_your_home": "residence_ownership_type",
        "Please_identify_which_category": "income_category",
        "If_you_were_unable_to_use_your": "available_modes",
        "Including_yourself_how_many_p_001": "n_residents_with_license",
        "Including_yourself_how_many_w": "n_working_residents",
        "What_is_your_home_type": "residence_type",
        "How_many_motor_vehicles_are_ow": "n_motor_vehicles",
        "Do_you_have_a_condition_or_han": "has_medical_condition"
    }
}

In [7]:
set(SURVEY_DATA_DICT["Stage_database"].values()).difference(
    set(SURVEY_DATA_DICT["openpath_prod_durham"].values())
)

{'birth_year',
 'primary_job_commute_mode',
 'primary_job_commute_time',
 'user_id'}

In [8]:
## Source: db_utils.py in op-admin-dashboard.

BINARY_DEMOGRAPHICS_COLS = [
    'user_id',
    '_id',
]

EXCLUDED_DEMOGRAPHICS_COLS = [
    'data.xmlResponse', 
    'data.name',
    'data.version',
    'data.label',
    'xmlns:jr',
    'xmlns:orx',
    'id',
    'start',
    'end',
    'attrxmlns:jr',
    'attrxmlns:orx',
    'attrid',
    '__version__',
    'attrversion',
    'instanceID',
]

<b>Ensure that you point the database to the appropriate name</b>

In [9]:
import scaffolding
import emission.core.get_database as edb
import emission.storage.timeseries.abstract_timeseries as esta

storage not configured, falling back to sample, default configuration
URL not formatted, defaulting to Stage_database
Connecting to database URL localhost


In [10]:
## Source: scaffolding.py

def expand_userinputs(labeled_ct):
    '''
    param: labeled_ct: a dataframe of confirmed trips, some of which have labels
    params: labels_per_trip: the number of labels for each trip.
        Currently, this is 2 for studies and 3 for programs, and should be 
        passed in by the notebook based on the input config.
        If used with a trip-level survey, it could be even larger.
    '''
    # CASE 1 of https://github.com/e-mission/em-public-dashboard/issues/69#issuecomment-1256835867
    if len(labeled_ct) == 0:
        return labeled_ct
    label_only = pd.DataFrame(labeled_ct.user_input.to_list(), index=labeled_ct.index)
    # disp.display(label_only.head())
    labels_per_trip = len(label_only.columns)
    print("Found %s columns of length %d" % (label_only.columns, labels_per_trip))
    expanded_ct = pd.concat([labeled_ct, label_only], axis=1)
    assert len(expanded_ct) == len(labeled_ct), \
        ("Mismatch after expanding labels, expanded_ct.rows = %s != labeled_ct.rows %s" %
            (len(expanded_ct), len(labeled_ct)))
    print("After expanding, columns went from %s -> %s" %
        (len(labeled_ct.columns), len(expanded_ct.columns)))
    assert len(expanded_ct.columns) == len(labeled_ct.columns) + labels_per_trip, \
        ("Mismatch after expanding labels, expanded_ct.columns = %s != labeled_ct.columns %s" %
            (len(expanded_ct.columns), len(labeled_ct.columns)))
    # disp.display(expanded_ct.head())
    return expanded_ct

In [11]:
## Source: scaffolding.py

def data_quality_check(expanded_ct):
    '''1. Delete rows where the mode_confirm was pilot_ebike and repalced_mode was pilot_ebike.
       2. Delete rows where the mode_confirm was pilot_ebike and repalced_mode was same_mode.
       3. Replace same_mode for the mode_confirm for Energy Impact Calcualtion.'''

    # TODO: This is only really required for the initial data collection around the minipilot
    # in subsequent deployes, we removed "same mode" and "pilot_ebike" from the options, so the
    # dataset did not contain of these data quality issues

    if 'replaced_mode' in expanded_ct.columns:
        expanded_ct.drop(expanded_ct[(expanded_ct['mode_confirm'] == 'pilot_ebike') & (expanded_ct['replaced_mode'] == 'pilot_ebike')].index, inplace=True)
        expanded_ct.drop(expanded_ct[(expanded_ct['mode_confirm'] == 'pilot_ebike') & (expanded_ct['replaced_mode'] == 'same_mode')].index, inplace=True)
        expanded_ct['replaced_mode'] = np.where(expanded_ct['replaced_mode'] == 'same_mode',expanded_ct['mode_confirm'], expanded_ct['replaced_mode'])
    
    return expanded_ct

In [12]:
## Source: scaffolding.py

uuid_df = pd.json_normalize(list(edb.get_uuid_db().find()))

if not INCLUDE_TEST_USERS:
    uuid_df = uuid_df.loc[~uuid_df.user_email.str.contains('_test_'), :]

filtered = uuid_df.uuid.unique()

agg = esta.TimeSeries.get_aggregate_time_series()
all_ct = agg.get_data_df("analysis/confirmed_trip", None)

print(f"Before filtering, length={len(all_ct)}")
participant_ct_df = all_ct.loc[all_ct.user_id.isin(filtered), :]
print(f"After filtering, length={len(participant_ct_df)}")

expanded_ct = expand_userinputs(participant_ct_df)
expanded_ct = data_quality_check(expanded_ct)
print(expanded_ct.columns.tolist())
expanded_ct['replaced_mode'] = expanded_ct['replaced_mode'].fillna('Unknown')

Before filtering, length=241123
After filtering, length=241123
Found Index(['mode_confirm', 'purpose_confirm', 'replaced_mode'], dtype='object') columns of length 3
After expanding, columns went from 41 -> 44
['source', 'end_ts', 'end_fmt_time', 'end_loc', 'raw_trip', 'start_ts', 'start_fmt_time', 'start_loc', 'duration', 'distance', 'start_place', 'end_place', 'cleaned_trip', 'inferred_labels', 'inferred_trip', 'expectation', 'confidence_threshold', 'expected_trip', 'user_input', 'section_modes', 'section_distances', 'start_local_dt_year', 'start_local_dt_month', 'start_local_dt_day', 'start_local_dt_hour', 'start_local_dt_minute', 'start_local_dt_second', 'start_local_dt_weekday', 'start_local_dt_timezone', 'end_local_dt_year', 'end_local_dt_month', 'end_local_dt_day', 'end_local_dt_hour', 'end_local_dt_minute', 'end_local_dt_second', 'end_local_dt_weekday', 'end_local_dt_timezone', '_id', 'user_id', 'metadata_write_ts', 'additions', 'mode_confirm', 'purpose_confirm', 'replaced_mode'

In [13]:
# # Additional preprocessing for replaced mode (if any)

mode_counts = expanded_ct['replaced_mode'].value_counts()
drop_modes = mode_counts[mode_counts == 1].index.tolist()

expanded_ct.drop(
    index=expanded_ct.loc[expanded_ct.replaced_mode.isin(drop_modes)].index,
    inplace=True
)

# Additional modes to drop.
expanded_ct.drop(
    index=expanded_ct.loc[expanded_ct.replaced_mode.isin(
        # Remove all rows with air, boat, or weird answers.
        ['houseboat', 'gondola', 'airline_flight', 'aircraft', 'zoo', 'air',
         'airplane', 'boat', 'flight', 'plane', 'meal', 'lunch']
    )].index,
    inplace=True
)

In [14]:
print(expanded_ct.replaced_mode.unique())

['no_travel' 'Unknown' 'bus' 'drove_alone' 'bike' 'shared_ride' 'walk'
 'train' 'bikeshare' 'not_a trip' 'pilot_ebike' 'electric_car' 'taxi'
 'not_a_trip' 'run' 'scootershare' 'tramway' 'free_shuttle' 'e-bike'
 'rental_car' 'train_+ bus' 'skateboard' 'e_bike' 'golf_cart'
 'emergency_vehicle with others' 'call_friend' 'no_replacement'
 'doing_nothing' 'na' 'ebike' 'hiking' 'ski' 'not_accurate' 'pilot_bike'
 'snowboarding' 'stolen_ebike' 'n/a' 'testing' 'home'
 'must_walk 3-5 mi a day for back' 'family' 'car' 'pilot_e-bike'
 'time_spent on the clock at amazon' 'working' 'walk_at work'
 'sitting_on my butt doing nothing' 'nothing._delivered food for work'
 'train,_bus and walk' 'work_vehicle' 'friend_picked me up']


In [15]:
expanded_ct.replaced_mode = expanded_ct.replaced_mode.apply(lambda x: REPLACED_MODE_DICT[CURRENT_DB][x])

In [16]:
print(expanded_ct.replaced_mode.unique())

['no_trip' 'unknown' 'transit' 'car' 'p_micro' 's_car' 'walk' 's_micro'
 'ridehail' 'no_travel']


In [17]:
# Demographics

if CURRENT_DB != "Stage_database":

    decoded_uuids = [str(x) for x in filtered]

    ## Source: query_demographics() in op-admin-dashboard.
    ts = esta.TimeSeries.get_aggregate_time_series()
    entries = list(ts.find_entries(["manual/demographic_survey"]))

    available_key = {}
    for entry in entries:
        survey_key = list(entry['data']['jsonDocResponse'].keys())[0]
        if survey_key not in available_key:
            available_key[survey_key] = []

        # Minor modification: Added user_id check to filter users.
        if str(entry['user_id']) in decoded_uuids:
            available_key[survey_key].append(entry)

    dataframes = {}
    for key, json_object in available_key.items():
        df = pd.json_normalize(json_object)
        dataframes[key] = df

    for key, df in dataframes.items():
        if not df.empty:
            for col in BINARY_DEMOGRAPHICS_COLS:
                if col in df.columns:
                    df[col] = df[col].apply(str) 
            columns_to_drop = [col for col in df.columns if col.startswith("metadata")]
            df.drop(columns= columns_to_drop, inplace=True) 
            df.columns=[col.rsplit('.',1)[-1] if col.startswith('data.jsonDocResponse.') else col for col in df.columns]
            for col in EXCLUDED_DEMOGRAPHICS_COLS:
                if col in df.columns:
                    df.drop(columns= [col], inplace=True)

    survey_data = pd.DataFrame()                
    for v in dataframes.values():
        survey_data = pd.concat([survey_data, v], axis=0, ignore_index=True)
else:
    # Read the demographics.
    survey_data = pd.read_csv('../viz_scripts/Can Do Colorado eBike Program - en.csv')
    survey_data.rename(columns={'Unique User ID (auto-filled, do not edit)': 'user_id'}, inplace=True)

In [18]:
print(len(survey_data.user_id.unique()), len(expanded_ct.user_id.unique()))

203 261


In [19]:
survey_data.rename(SURVEY_DATA_DICT[CURRENT_DB], axis='columns', inplace=True)

In [20]:
display(survey_data[['user_id']].head())
print(type(survey_data['user_id'][0]))

Unnamed: 0,user_id
0,a2d48b05d5454d428c0841432c7467b6
1,f2799dc202bc4249b42a4fda8770d1b6
2,b2bbe715b6a14fd19f751cae8adf6b4e
3,6373dfb8cb9b47e88e8f76adcfadde20
4,93c6e0f156a44e07b920ded664419dc6


<class 'str'>


In [21]:
display(expanded_ct[['user_id']].head())
print(type(expanded_ct['user_id'][0]))

Unnamed: 0,user_id
0,8a0473ca-e53d-4720-a99c-0696cc1fb407
1,0a093cbd-b536-43af-b03d-293425e84c76
2,0a093cbd-b536-43af-b03d-293425e84c76
3,0a093cbd-b536-43af-b03d-293425e84c76
4,0a093cbd-b536-43af-b03d-293425e84c76


<class 'uuid.UUID'>


In [22]:
# Additional preprocessing to filter unwanted users from sensed trips data.
ct_users = expanded_ct['user_id'].apply(lambda x: str(x).replace('-', ''))
survey_users = survey_data['user_id'].apply(lambda x: str(x).replace('-', ''))

common = set(ct_users.unique()).intersection(set(survey_users.unique()))

filtered_trips = expanded_ct.loc[ct_users.isin(common), :].reset_index(drop=True)
filtered_survey = survey_data.loc[survey_users.isin(common), :].reset_index(drop=True)

In [23]:
print(f"[trip data] After filtering, size goes from {expanded_ct.shape[0]} -> {filtered_trips.shape[0]}")
print(f"[trip data] Unique users after filtering: {len(filtered_trips.user_id.unique())}")

print(f"[survey data] After filtering, size goes from {survey_users.shape[0]} -> {filtered_survey.shape[0]}")
print(f"[survey data] Unique users after filtering: {len(filtered_survey.user_id.unique())}")

[trip data] After filtering, size goes from 240962 -> 205388
[trip data] Unique users after filtering: 184
[survey data] After filtering, size goes from 310 -> 286
[survey data] Unique users after filtering: 184


In [24]:
type(filtered_trips.user_id[0])

uuid.UUID

In [None]:
# raw data dump.
expanded_ct.to_csv(f'../data/raw_data/trips__{CURRENT_DB}.csv', index=False)
survey_data.to_csv(f'../data/raw_data/survey__{CURRENT_DB}.csv', index=False)

# filtered data dump.
filtered_trips.to_csv(f'../data/filtered_data/trips__{CURRENT_DB}.csv', index=False)
filtered_survey.to_csv(f'../data/filtered_data/survey__{CURRENT_DB}.csv', index=False)

In [None]:
# durham has section modes and section distances.
# masscec does not have section distance and section modes.
# ride2own does not have section distances and modes.
# uprm nicr also does not have section distances and modes.