In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import geopandas as gpd
from traceback import print_exception

%matplotlib inline

# Sections

### 0. Refer to the mnl_exploration.ipynb notebook for specifics about user-level preprocessing
### 1. Creating the dummy variables for the available modes
### 2. Creating trip-specific variables for each available mode (cost and time)
### 3. Saving the preprocessed data to `../data/preprocessed_data.csv`

In [None]:
# Data loading. Refer to the mnl_exploration.ipynb file for details on how user-level features are generated.
# The CSV path is relative, so it is resolved against the kernel's working directory.

df = pd.read_csv('../data/modeling_w_duration.csv')

In [None]:
# Discard fully duplicated rows; the surviving rows keep their original index labels.
df = df.drop_duplicates()

### Creating the dummy variables for the available modes

In [None]:
def compute_argmax(df: pd.DataFrame):
    """
    For every row, find the section with the largest distance and record that
    section's distance, duration and mode.

    The columns 'section_distances', 'section_durations' and 'section_modes' are
    expected to hold Python list literals (parsed with ast.literal_eval).

    Columns present in the returned frame in addition to the input columns:
        section_distance_argmax     largest section distance multiplied by 0.0006213712
                                    (metres-to-miles factor; assumes raw distances are metres)
        section_duration_argmax     duration of that section divided by 60
                                    (to minutes; assumes raw durations are seconds)
        section_mode_argmax         mode of that section
        section_coordinates_argmax  placeholder, always None (nothing fills it in here)
        mark                        True when the row's lists could not be parsed/indexed

    The input frame is NOT modified; a new frame with a fresh 0..n-1 index is returned.

    Raises:
        KeyError: if one of the three section_* columns is missing. This is a
            schema problem, not a bad row, so it is reported instead of
            silently marking every row.
    """
    argmax_columns = [
        'section_distance_argmax', 'section_duration_argmax',
        'section_mode_argmax', 'section_coordinates_argmax',
    ]

    # Errors that mean "this particular row is unusable": literal_eval on NaN or
    # malformed text (ValueError/SyntaxError), argmax of an empty list (ValueError),
    # non-numeric entries (TypeError), lists of different lengths (IndexError),
    # or a parsed value that is not a sequence (TypeError/KeyError).
    bad_row_errors = (ValueError, SyntaxError, TypeError, IndexError, KeyError)

    rows = list()

    for _, row in df.iterrows():
        row_dict = row.to_dict()

        # Start every output column at None so all rows share the same keys.
        row_dict.update(dict.fromkeys(argmax_columns))

        # Read the raw cells outside the try block: a missing column must fail loudly.
        raw_distances = row_dict['section_distances']
        raw_durations = row_dict['section_durations']
        raw_modes = row_dict['section_modes']

        try:
            parsed_distances = ast.literal_eval(raw_distances)
            parsed_durations = ast.literal_eval(raw_durations)
            parsed_modes = ast.literal_eval(raw_modes)

            argmax_ix = np.argmax(parsed_distances)

            row_dict['section_distance_argmax'] = parsed_distances[argmax_ix] * 0.0006213712        # Pick the argmax and scale to miles.
            row_dict['section_duration_argmax'] = parsed_durations[argmax_ix] / 60                  # Pick the argmax and scale to minutes.
            row_dict['section_mode_argmax'] = parsed_modes[argmax_ix]

            row_dict['mark'] = False

        except bad_row_errors:
            row_dict['mark'] = True

        rows.append(row_dict)

    if not rows:
        # pd.DataFrame([]) would have no columns at all; keep the schema so that
        # callers can still reference 'mark' and the *_argmax columns.
        empty = df.copy()
        for name in argmax_columns:
            empty[name] = None
        empty['mark'] = False
        return empty

    return pd.DataFrame(rows)

In [None]:
# Derive, for every trip, the distance/duration/mode of its longest section.
# Rows that could not be parsed come back flagged in the 'mark' column.
df_modded = compute_argmax(df=df)

In [None]:
# Keep only the rows that were not flagged by compute_argmax, then discard the
# helper flag and renumber the index. Note: this cell consumes the 'mark' column,
# so it cannot be re-run without re-running the compute_argmax cell first.
df_modded = (
    df_modded[~df_modded.mark]
    .drop(columns='mark')
    .reset_index(drop=True)
)

Intercept and coefficient values for determining the duration from distance (Obtained from time_distance_estimation.ipynb):

```
Format:
-> mode train_r2 test_r2
-> intercept: x coeff: y


walking 0.24312136039586707 0.5572752052765471
intercept:  14.09698152519682 coeff:  19.153802211061134
bicycling 0.5986205637411164 0.646267142326254
intercept:  4.753590860550791 coeff:  4.2723965916811935
car 0.45228118668156203 0.6040302860034135
intercept:  10.515751350707365 coeff:  1.112111276629702
no_sensed 0.5188696686147352 0.5028191556675814
intercept:  9.423026094931313 coeff:  2.640429444466706
public_transport 0.47377939742617625 0.5916737429017003
intercept:  7.5000351741492395 coeff:  1.839562017986485
```

In [None]:
# Flag rows whose available_modes text contains 'Do not have vehicle' or 'None'
# (plain substring match on the raw string).
df_modded['no_trip'] = df_modded.apply(
    lambda row: any(
        option in row.available_modes
        for option in ('Do not have vehicle', 'None')
    ),
    axis=1,
)

In [None]:
# Share of rows whose longest section has the mode 'no_sensed'.
no_sensed = df_modded['section_mode_argmax'] == 'no_sensed'

n, d = int(no_sensed.sum()), len(no_sensed)
print(n, d)
print((n / d) * 100.)

In [None]:
# Share of rows flagged as no_trip.
n = int(df_modded['no_trip'].sum())
d = len(df_modded)
print(n, d)
print((n / d) * 100.)

In [None]:
def generate_available_features(df: pd.DataFrame, available_mode_map: dict, section_mode_map: dict, ordinal_mapper: dict):
    """
    Build the wide-form 0/1 availability indicators and the ordinal 'chosen' label.

    A mode is flagged as available for a row when either
      * it is what available_mode_map gives for one of the ';'-separated entries
        of the row's available_modes, or
      * it is one of the modes section_mode_map lists for the row's section_mode_argmax.

    'chosen' is the ordinal_mapper code of the FIRST mode that section_mode_map lists
    for the row's section_mode_argmax. Every row also gets primary_mode = 1.
    Indicator columns whose name is a key of ordinal_mapper come back prefixed with 'av_'.

    The caller's frame is not modified: the work happens on a re-indexed copy.
    A KeyError is raised for an available_modes entry or section mode missing from the maps.
    """
    frame = df.reset_index(drop=True)

    # One indicator per mapped mode plus 'unknown', and the label column, all starting at 0.
    indicator_columns = [*available_mode_map.values(), 'unknown']
    frame[indicator_columns + ['chosen']] = 0

    records = []

    for _, series in frame.iterrows():
        record = series.to_dict()

        # Modes the row itself reports as available.
        for label in str(record['available_modes']).split(';'):
            record[available_mode_map[label.strip()]] = 1

        # The sensed mode stands in for the confirmed mode.
        sensed_alternatives = section_mode_map[record['section_mode_argmax']]
        record['chosen'] = ordinal_mapper[sensed_alternatives[0]]
        record['primary_mode'] = 1

        # Whatever was sensed must have been available as well.
        record.update(dict.fromkeys(sensed_alternatives, 1))

        records.append(record)

    wide = pd.DataFrame(records)
    return wide.rename(columns={mode: 'av_' + mode for mode in ordinal_mapper})

In [None]:
## Define the mapping strategy

# Survey answer (entry of available_modes) -> modelled alternative.
# The key order matters: it fixes the column order of the availability indicators.
available = {
    'Bicycle': 'p_micro',
    'Do not have vehicle': 'no_trip',
    'Get a ride from a friend or family member': 's_car',
    'None': 'no_trip',
    'Public transportation (bus, subway, light rail, etc.)': 'transit',
    'Rental car (including Zipcar/ Car2Go)': 'car',
    'Shared bicycle or scooter': 's_micro',
    'Skateboard': 'p_micro',
    'Taxi (regular taxi, Uber, Lyft, etc)': 'ridehail',
    'Walk/roll': 'walk',
}

# Map each sensed mode to the binary indicators.
# The first entry of each list is the one used as the 'chosen' alternative.
section_mode_mapping = {
    'bicycling': ['p_micro', 's_micro'],
    'bus': ['transit'],
    'car': ['s_car', 'car', 'ridehail'],
    'no_sensed': ['unknown'],
    'train': ['transit'],
    'walking': ['walk'],
}

# These values were obtained using a simple linear regression model. Specifics can be found in time_distance_estimation.ipynb
# Stored once per regressed mode as (intercept, coef) and shared by the alternatives that reuse the fit.
_regression_fits = {
    'walking': (14.09698152519682, 19.153802211061134),
    'bicycling': (4.753590860550791, 4.2723965916811935),
    'car': (10.515751350707365, 1.112111276629702),
    'public_transport': (7.5000351741492395, 1.839562017986485),
    'no_sensed': (9.423026094931313, 2.640429444466706),
}

# Which regression fit each modelled alternative borrows.
_fit_for_alternative = {
    'walk': 'walking',
    'p_micro': 'bicycling',
    's_micro': 'bicycling',
    's_car': 'car',
    'car': 'car',
    'ridehail': 'car',
    'transit': 'public_transport',
    'unknown': 'no_sensed',
}

duration_dict = {
    alternative: dict(zip(('intercept', 'coef'), _regression_fits[fit_name]))
    for alternative, fit_name in _fit_for_alternative.items()
}
# Not travelling takes no time.
duration_dict['no_trip'] = {'intercept': 0., 'coef': 0.}

# Integer code of every alternative (1-based, in this order) and the inverse lookup.
ordinal_mapping = {
    alternative: code
    for code, alternative in enumerate(
        ['p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown'],
        start=1,
    )
}
reverse_ordinal_mapping = dict(zip(ordinal_mapping.values(), ordinal_mapping.keys()))

print(ordinal_mapping)

In [None]:
# Hand only the columns the feature builders read (as an independent copy) to
# generate_available_features, which returns the frame extended with the
# availability indicators and the 'chosen' label.
av_df = generate_available_features(
    df_modded[['user_id', 'section_distance_argmax', 'section_duration_argmax', 'section_mode_argmax', 'available_modes']].copy(),
    available_mode_map=available,
    section_mode_map=section_mode_mapping,
    ordinal_mapper=ordinal_mapping,
)

In [None]:
# Preview the first 10 rows of the availability features.
av_df.head(10)

In [None]:
def compute_alt_durations(df: pd.DataFrame, reverse_mapper: dict, duration_mapper: dict):
    '''
    Add one travel-time column ('tt_<mode>') per mode to every row.

    For the row's chosen mode (reverse_mapper[row['chosen']]) the observed
    section_duration_argmax is used. For every other mode the time is estimated
    from the regression parameters as

        av_<mode> * (intercept + coef * section_distance_argmax)

    so modes that are not available to the row (av_<mode> == 0) get 0.

    Parameters:
        df: frame holding 'chosen', 'section_distance_argmax',
            'section_duration_argmax' and an 'av_<mode>' column for every mode.
        reverse_mapper: ordinal code -> mode name; its values define the modes.
        duration_mapper: mode name -> {'intercept': ..., 'coef': ...}.

    Returns a new frame with a fresh 0..n-1 index; the input frame is not modified.
    '''

    column_names = list(reverse_mapper.values())

    rows = list()
    for _, row in df.iterrows():

        # Convert to a dict
        row_dict = row.to_dict()

        # Both are constant for the row, so look them up once.
        chosen_mode = reverse_mapper[row_dict['chosen']]
        distance = row_dict['section_distance_argmax']

        for mode in column_names:
            if mode == chosen_mode:
                # The chosen mode keeps its observed duration. (This is a
                # travel-time column, so the distance must not be stored here.)
                row_dict[mode] = row_dict['section_duration_argmax']
                continue

            mode_params = duration_mapper[mode]
            # use availability as a mask.
            row_dict[mode] = row_dict['av_' + mode] * (mode_params['intercept'] + (mode_params['coef'] * distance))

        rows.append(row_dict)

    return pd.DataFrame(rows).rename(columns={c: 'tt_' + c for c in column_names})

In [None]:
def compute_durations_using_data(df: pd.DataFrame, section_mapper: dict, reverse_ordinal_mapping: dict):
    """
    Function to perform imputation using historical averages. Instead of estimating mode duration from mode distance, we simply
    impute it using the mean of the particular mode's historical duration.

    How the historical mean of a mode is obtained:
      * rows are grouped by section_mode_argmax and mapped to modes through section_mapper;
      * when several sensed modes map to the same mode (e.g. two sensed modes that both
        map to one alternative), their trips are pooled, i.e. the mean is taken over all
        of those trips rather than letting the last group overwrite the others;
      * 'no_trip' is fixed at 0;
      * a mode without any historical trips has no mean and is imputed as NaN.

    How each row is filled:
      * modes mapped from the row's own section_mode_argmax get the observed
        section_duration_argmax when it is > 0, otherwise the historical mean;
      * every other mode in reverse_ordinal_mapping gets the historical mean.

    Returns a new frame (fresh 0..n-1 index) in which the per-mode columns are named
    'tt_<mode>'. The input frame is not modified.
    """
    missing = float('nan')

    # First, using the history, accumulate per-mode duration totals and trip counts.
    # sum() and count() both skip missing durations.
    durations = df.groupby('section_mode_argmax')['section_duration_argmax']
    group_totals = durations.sum().to_dict()
    group_counts = durations.count().to_dict()

    pooled_totals, pooled_counts = dict(), dict()
    for section_mode, total in group_totals.items():
        for mode in section_mapper[section_mode]:
            pooled_totals[mode] = pooled_totals.get(mode, 0.) + total
            pooled_counts[mode] = pooled_counts.get(mode, 0) + group_counts[section_mode]

    extended_dict = {
        mode: (pooled_totals[mode] / pooled_counts[mode]) if pooled_counts[mode] else missing
        for mode in pooled_totals
    }

    # Manually add one more.
    extended_dict['no_trip'] = 0.

    print(extended_dict)

    # NOW, we iterate over the dataframe.
    new_columns = list(reverse_ordinal_mapping.values())

    # Create and init to 0. on a copy, so the caller's frame does not grow extra columns.
    frame = df.copy()
    frame[new_columns] = 0.

    rows = []

    for _, row in frame.iterrows():
        row_dict = row.to_dict()

        # First, look at the section_mode_argmax
        mapped_modes = section_mapper[row_dict['section_mode_argmax']]
        section_duration = row_dict['section_duration_argmax']

        for mode in mapped_modes:
            # A missing (NaN) or non-positive observation falls back to the mean.
            if section_duration > 0:
                row_dict[mode] = section_duration
            else:
                row_dict[mode] = extended_dict.get(mode, missing)

        # for the remaining modes, estimate their duration through the dict.
        for mode in new_columns:
            if mode in mapped_modes:
                continue
            # If you'd like to use a mask here, use the av_ dummy values to mask-out irrelevant data.
            row_dict[mode] = extended_dict.get(mode, missing)

        rows.append(row_dict)

    return_df = pd.DataFrame(rows)
    return return_df.rename(columns={c: 'tt_' + c for c in new_columns})

In [None]:
# Attach the per-mode travel-time (tt_*) columns, imputed from historical mean durations.
av_time_df = compute_durations_using_data(
    df=av_df,
    section_mapper=section_mode_mapping,
    reverse_ordinal_mapping=reverse_ordinal_mapping,
)

In [None]:
# Preview the first rows of the frame that now carries the tt_* columns.
av_time_df.head()

## Cost estimation

```
{'p_micro': 1, 'no_trip': 2, 's_car': 3, 'transit': 4, 'car': 5, 's_micro': 6, 'ridehail': 7, 'walk': 8, 'unknown': 9}
```

In [None]:
# Per-mile cost of every alternative. All values are taken from VTPI.
mode_cost_per_mile = dict(
    p_micro=0.,     # bicycle/skateboard
    no_trip=0.,
    s_car=0.3,      # Shared car would be half the cost of regular car, which is $0.6/mile.
    car=0.6,        # Rental car.
    transit=0.6,    # Average of bus and train taken.
    # Shared bicycle or scooter - values taken from https://nacto.org/shared-micromobility-2020-2021/ and
    # https://www.mckinsey.com/industries/automotive-and-assembly/our-insights/how-sharing-the-road-is-likely-to-transform-american-mobility
    s_micro=0.3,
    ridehail=2.,    # uber/taxi/lyft
    walk=0.,
    unknown=0.,
)

# Fixed cost added to every trip of a mode, independent of distance. All 0 for now.
mode_init_cost = dict.fromkeys(mode_cost_per_mile, 0)

In [None]:
# Inspect the column names of av_time_df.
av_time_df.columns

In [None]:
def compute_cost_estimates(df: pd.DataFrame, cost_mapper: dict, init_cost_mapper: dict, ordinal_mapper: dict):
    """
    Add a cost estimate for every mode to every row.

    For each key `mode` of ordinal_mapper a column 'cost_<mode>' is produced as

        init_cost_mapper[mode] + cost_mapper[mode] * section_distance_argmax

    The av_ availability flags are deliberately NOT used as a mask: a cost is
    computed for every mode whether or not it is available to the row.

    Parameters:
        df: frame with a numeric 'section_distance_argmax' column
            (in miles, per the upstream conversion).
        cost_mapper: mode -> cost per unit of distance.
        init_cost_mapper: mode -> fixed cost added to every trip.
        ordinal_mapper: its keys define which modes get a cost column, and in which order.

    Returns a new frame with a fresh 0..n-1 index and the cost columns appended.
    The input frame is not modified, and an empty input keeps its columns.
    """
    # Access the section_distance_argmax attribute for the distance. Note that this is now in miles.
    distance = df['section_distance_argmax']

    # Plain column arithmetic: one vectorised expression per mode instead of a Python loop over rows.
    cost_columns = {
        'cost_' + mode: init_cost_mapper[mode] + (cost_mapper[mode] * distance)
        for mode in ordinal_mapper
    }

    return df.assign(**cost_columns).reset_index(drop=True)

In [None]:
# Attach the per-mode cost (cost_*) columns to the frame that already holds av_* and tt_*.
cost_df = compute_cost_estimates(
    av_time_df,
    mode_cost_per_mile,
    mode_init_cost,
    ordinal_mapping,
)

In [None]:
# Preview user_id together with every column whose name contains one of the feature tags.
feature_columns = [
    name for name in cost_df.columns
    if any(tag in name for tag in ('av_', 'cost_', 'tt_'))
]
cost_df[['user_id'] + feature_columns].head()

In [None]:
# Write the final feature table to CSV; index=False leaves the row index out of the file.
cost_df.to_csv('../data/preprocessed_data.csv', index=False)