In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from traceback import print_exception

%matplotlib inline

In [2]:
# Data loading. Refer to the mnl_exploration.ipynb file for details on how user-level features are generated.

df = pd.read_csv('../data/final_modeling_data_02142024.csv')

  df = pd.read_csv('../data/final_modeling_data_02142024.csv')


In [3]:
print("Shape: ", df.shape)

Shape:  (80691, 103)


In [4]:
# Discard rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

### Add weather info

In [5]:
# Parse the trip start timestamps as UTC, then express them in Denver local time.
start_utc = pd.to_datetime(df['start_fmt_time'], utc=True)
df['start_fmt_time'] = start_utc.dt.tz_convert('America/Denver')

In [6]:
# We now need weather data. To get that, we download data from OpenMeteo for the given dates.
df.start_fmt_time.min(), df.start_fmt_time.max()

(Timestamp('2020-11-02 18:48:13.450000-0700', tz='America/Denver'),
 Timestamp('2022-12-30 23:33:27.147785-0700', tz='America/Denver'))

In [7]:
# Downloaded using OpenMeteo API.
weather_df = pd.read_csv('../data/denver_weather_data.csv')

In [8]:
# Parse the weather timestamps as UTC, then express them in Denver local time
# (same treatment as the trip start times).
weather_utc = pd.to_datetime(weather_df['time'], utc=True)
weather_df['time'] = weather_utc.dt.tz_convert('America/Denver')

In [9]:
# Remember the row count before the weather join so the change can be reported later.
original_shape = len(df)

# Snap every trip start to the nearest whole hour; this becomes the join key
# against the weather timestamps.
rounded_start = df['start_fmt_time'].dt.round(
    freq='H', ambiguous='infer', nonexistent='shift_backward'
)
df['start_time_rounded'] = rounded_start

In [10]:
# Left join on the rounded start hour: every trip row is kept, and trips with no
# matching weather record end up with NaN in the weather columns.
df = pd.merge(
    df,
    weather_df,
    how='left',
    left_on='start_time_rounded',
    right_on='time',
)

In [11]:
# Keep only the trips for which every weather measurement is present.
weather_columns = [
    'temperature_2m (°F)',
    'relative_humidity_2m (%)',
    'dew_point_2m (°F)',
    'rain (inch)',
    'snowfall (inch)',
    'wind_speed_10m (mp/h)',
    'wind_gusts_10m (mp/h)',
]
df = df.dropna(subset=weather_columns)

In [12]:
print(f"Old shape: {original_shape}. new shape: {df.shape[0]}")

Old shape: 80691. new shape: 80689


In [13]:
# The join key and the weather timestamp are no longer needed after the merge.
df = df.drop(columns=['start_time_rounded', 'time'])

### Creating the dummy variables for the available modes

In [14]:
df.isna().sum().to_dict()

{'source': 0,
 'end_ts': 0,
 'end_fmt_time': 0,
 'end_loc': 0,
 'raw_trip': 0,
 'start_ts': 0,
 'start_fmt_time': 0,
 'start_loc': 0,
 'duration': 0,
 'distance': 0,
 'start_place': 0,
 'end_place': 0,
 'cleaned_trip': 0,
 'inferred_labels': 217,
 'inferred_trip': 217,
 'expectation': 217,
 'confidence_threshold': 217,
 'expected_trip': 217,
 'user_input': 0,
 'section_modes': 0,
 'section_distances': 0,
 'start:year': 0,
 'start:month': 0,
 'start:day': 0,
 'start:hour': 0,
 'start_local_dt_minute': 0,
 'start_local_dt_second': 0,
 'start_local_dt_weekday': 0,
 'start_local_dt_timezone': 0,
 'end:year': 0,
 'end:month': 0,
 'end:day': 0,
 'end:hour': 0,
 'end_local_dt_minute': 0,
 'end_local_dt_second': 0,
 'end_local_dt_weekday': 0,
 'end_local_dt_timezone': 0,
 '_id': 0,
 'user_id': 0,
 'metadata_write_ts': 0,
 'additions': 80687,
 'mode_confirm': 337,
 'purpose_confirm': 527,
 'distance_miles': 0,
 'Mode_confirm': 0,
 'Replaced_mode': 0,
 'Trip_purpose': 0,
 'original_user_id': 0,


In [15]:
# | [d1, d2, d3] | [t1, t2, t3] | [m1, m2, m3] |


def compute_argmax(df: pd.DataFrame):
    """
    Reduce every trip to its longest section.

    The columns `section_distances`, `section_durations` and `section_modes` hold
    string-encoded Python lists (one entry per section). For each row the section
    with the largest distance is selected and the following columns are filled:

    - `section_distance_argmax`: distance of that section, scaled by 0.0006213712
      (meters -> miles).
    - `section_duration_argmax`: duration of that section divided by 60
      (scaled to minutes).
    - `section_mode_argmax`: mode of that section.
    - `section_coordinates_argmax`: created for schema compatibility but never
      populated here (always None).
    - `mark`: True when the row could not be parsed/indexed and should be dropped
      by the caller, False otherwise.

    The input frame is NOT modified; a new frame with a fresh RangeIndex is returned.

    Raises:
        KeyError: if one of the three required section columns is missing. This is
            a schema problem, so it is reported instead of marking every row bad.
    """
    required = ('section_distances', 'section_durations', 'section_modes')
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(f"compute_argmax: missing required column(s): {missing}")

    argmax_columns = [
        'section_distance_argmax', 'section_duration_argmax',
        'section_mode_argmax', 'section_coordinates_argmax'
    ]

    rows = list()

    for _, row in df.iterrows():
        row_dict = row.to_dict()

        # Start every derived column at None without touching the caller's frame.
        row_dict.update(dict.fromkeys(argmax_columns))

        try:
            parsed_distances = ast.literal_eval(row_dict['section_distances'])
            parsed_durations = ast.literal_eval(row_dict['section_durations'])
            parsed_modes = ast.literal_eval(row_dict['section_modes'])

            argmax_ix = np.argmax(parsed_distances)

            # Resolve all three values first so a failure leaves the row untouched.
            distance_miles = parsed_distances[argmax_ix] * 0.0006213712
            duration_minutes = parsed_durations[argmax_ix] / 60
            mode = parsed_modes[argmax_ix]

        except (ValueError, SyntaxError, TypeError, LookupError):
            # Per-row data problems only: unparsable/NaN cells, empty lists,
            # lists of unequal length, non-numeric entries.
            row_dict['mark'] = True

        else:
            row_dict['section_distance_argmax'] = distance_miles
            row_dict['section_duration_argmax'] = duration_minutes
            row_dict['section_mode_argmax'] = mode
            row_dict['mark'] = False

        rows.append(row_dict)

    if not rows:
        # Keep the expected schema even when there is nothing to process.
        out_columns = list(dict.fromkeys([*df.columns, *argmax_columns, 'mark']))
        return pd.DataFrame(columns=out_columns)

    return pd.DataFrame(rows)

In [16]:
# Reduce every trip to its longest section: compute_argmax adds the section_*_argmax
# columns plus a boolean `mark` column flagging rows that could not be processed.
df = compute_argmax(df)

In [17]:
# The raw per-section lists have been summarised by compute_argmax; drop them.
raw_section_columns = ['section_distances', 'section_durations', 'section_modes']
df = df.drop(columns=raw_section_columns)

In [18]:
# Keep only the rows that compute_argmax processed successfully, then discard the flag.
is_usable = ~df['mark']
df = df[is_usable].reset_index(drop=True).drop(columns=['mark'])

In [19]:
# Drop instances where duration/distance is unusable (non-positive).
# The previous form chained `drop(inplace=False)` with `reset_index(inplace=True)`,
# which modified a temporary copy and left `df` unchanged; assign the result back.
unusable = (df.section_distance_argmax <= 0) | (df.section_duration_argmax <= 0)
df = df.loc[~unusable, :].reset_index(drop=True)

In [20]:
# bus, train, bicycling, walking, car
# split-apply-combine
def drop_outliers(df: pd.DataFrame, low=0.1, high=0.9) -> pd.DataFrame:
    """
    Remove per-mode distance and duration outliers.

    Within each `section_mode_argmax` group, a row is kept only when both
    `section_distance_argmax` and `section_duration_argmax` lie inside the
    [low, high] quantile range of that group (bounds inclusive; both ranges are
    computed on the full group). Rows whose mode is missing are dropped.

    The quantile bounds are computed with a grouped aggregation and mapped back
    onto the rows, rather than with `groupby(...).apply(...)`: `apply` operating
    on the grouping column is deprecated (pandas 2.2) and later versions exclude
    that column from the frame given to the callback, which would silently remove
    `section_mode_argmax` from the result.

    Args:
        df: frame with `section_mode_argmax`, `section_distance_argmax` and
            `section_duration_argmax` columns. Not modified.
        low: lower quantile in [0, 1].
        high: upper quantile in [0, 1].

    Returns:
        A new frame with a fresh RangeIndex, ordered by mode (sorted) and, within
        each mode, in the original row order.
    """
    mode = df['section_mode_argmax']

    def within_group_quantiles(column: str):
        # Boolean array: is each row's value inside its own mode's quantile range?
        grouped = df.groupby('section_mode_argmax')[column]
        lower = mode.map(grouped.quantile(low))
        upper = mode.map(grouped.quantile(high))
        # Comparisons against NaN bounds (missing mode) evaluate to False.
        return ((df[column] >= lower) & (df[column] <= upper)).to_numpy()

    keep = (
        within_group_quantiles('section_distance_argmax')
        & within_group_quantiles('section_duration_argmax')
    )

    # A stable sort reproduces the "groups in key order, rows in original order"
    # layout of the former split-apply-combine result.
    return (
        df.loc[keep]
        .sort_values('section_mode_argmax', kind='stable')
        .reset_index(drop=True)
    )

In [21]:
# Trim the extreme 1% tails of distance and duration within every mode.
filtered_df = drop_outliers(df, low=0.01, high=0.99)

# Speed in miles per hour: distance (miles) * 60 / duration (minutes).
scaled_distance = filtered_df['section_distance_argmax'] * 60.
filtered_df['mph'] = scaled_distance / filtered_df['section_duration_argmax']

In [22]:
def filter_mph(df: pd.DataFrame, low=0.1, high=0.9) -> pd.DataFrame:
    """
    Remove rows with implausible speeds, mode by mode.

    Modes listed in MPH_THRESHOLDS are filtered with a fixed upper bound
    (`mph <= threshold`); `low`/`high` are ignored for them. Every other mode
    keeps the rows whose `mph` lies inside that mode's [low, high] quantile range
    (bounds inclusive). Rows whose mode is missing are dropped.

    The selection is expressed as a vectorised mask instead of
    `groupby(...).apply(...)`, because `apply` operating on the grouping column
    is deprecated (pandas 2.2) and later versions exclude that column from the
    callback's frame, which would remove `section_mode_argmax` from the result.

    Args:
        df: frame with `section_mode_argmax` and `mph` columns. Not modified.
        low: lower quantile in [0, 1] for modes without a fixed threshold.
        high: upper quantile in [0, 1] for modes without a fixed threshold.

    Returns:
        A new frame with a fresh RangeIndex, ordered by mode (sorted) and, within
        each mode, in the original row order.
    """

    MPH_THRESHOLDS = {
        # https://www.sciencedirect.com/science/article/pii/S2210670718304682
        'bicycling': 15.,
        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7806575/
        'walking': 2.93
    }

    mode = df['section_mode_argmax']
    mph = df['mph']

    # Per-mode quantile bounds, broadcast back onto the rows.
    grouped = df.groupby('section_mode_argmax')['mph']
    lower = mode.map(grouped.quantile(low))
    upper = mode.map(grouped.quantile(high))

    # Fixed cap for the modes that have one (NaN elsewhere).
    cap = mode.map(MPH_THRESHOLDS)
    has_cap = mode.isin(list(MPH_THRESHOLDS))

    keep = (has_cap & (mph <= cap)) | (~has_cap & (mph >= lower) & (mph <= upper))

    # A stable sort reproduces the "groups in key order, rows in original order"
    # layout of the former split-apply-combine result.
    return (
        df.loc[keep.to_numpy()]
        .sort_values('section_mode_argmax', kind='stable')
        .reset_index(drop=True)
    )

In [23]:
filtered_df = filter_mph(filtered_df, low=0.01, high=0.99)

In [24]:
filtered_df.groupby('section_mode_argmax')[['section_distance_argmax', 'section_duration_argmax']].describe()

Unnamed: 0_level_0,section_distance_argmax,section_distance_argmax,section_distance_argmax,section_distance_argmax,section_distance_argmax,section_distance_argmax,section_distance_argmax,section_distance_argmax,section_duration_argmax,section_duration_argmax,section_duration_argmax,section_duration_argmax,section_duration_argmax,section_duration_argmax,section_duration_argmax,section_duration_argmax
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
section_mode_argmax,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
bicycling,9330.0,2.344544,1.591889,0.237542,1.356889,1.944232,2.849511,11.063928,9330.0,15.473286,10.290371,1.944501,9.035766,12.605268,18.452003,73.866648
bus,478.0,1.55223,1.41328,0.090773,0.646264,1.294477,1.960493,11.592445,478.0,10.017439,7.30211,0.518257,5.801983,8.16453,12.269039,59.1474
car,40885.0,5.940542,7.984688,0.196141,1.713557,3.34606,6.762013,74.464352,40885.0,16.717427,13.274385,2.089567,8.291839,12.638901,20.47257,110.945333
no_sensed,2730.0,4.63538,7.477224,0.103854,0.751998,1.992592,4.91712,71.856971,2730.0,21.321379,26.638677,0.262941,5.851082,12.440902,26.501255,215.273083
subway,9.0,22.422637,12.175141,3.259611,14.510151,23.587655,27.756583,42.891905,9.0,43.065269,23.009173,13.103616,27.562759,34.4,58.714,81.366026
train,96.0,14.303936,13.079218,0.886342,8.647429,10.572576,15.51667,67.987009,96.0,30.687195,17.134854,6.332442,20.763549,25.695106,40.190158,88.009998
walking,16608.0,0.545524,0.719691,0.071237,0.17301,0.309777,0.604884,8.706443,16608.0,27.730449,35.489695,2.283617,8.551447,15.590376,31.206353,295.377052


In [25]:
filtered_df.groupby('section_mode_argmax')[['mph']].describe()

Unnamed: 0_level_0,mph,mph,mph,mph,mph,mph,mph,mph
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
section_mode_argmax,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
bicycling,9330.0,9.247391,2.596876,2.579201,7.363136,9.29994,11.149174,14.998472
bus,478.0,9.802104,4.985744,0.541139,5.87176,9.732367,13.059672,28.627234
car,40885.0,18.588257,10.342023,1.644719,11.099615,16.53874,24.238533,55.344377
no_sensed,2730.0,13.042674,9.209485,3.891109,6.289566,10.042121,16.658858,53.570005
subway,9.0,31.47696,12.524113,14.925398,25.308404,27.726875,35.93073,59.570373
train,96.0,25.33505,10.249756,4.6823,21.237739,24.781257,28.723128,62.081
walking,16608.0,1.494737,0.807007,0.02255,0.804196,1.475082,2.192542,2.929953


In [26]:
print(f"Dropped {df.shape[0] - filtered_df.shape[0]} rows.")

Dropped 10460 rows.


In [2]:
# Fraction of rows removed by the outlier and speed filters. Computed from the
# frames themselves so the figure cannot go stale when the data or filters change.
(df.shape[0] - filtered_df.shape[0]) / df.shape[0]

0.12963353121243293

In [27]:
filtered_df.Replaced_mode.unique()

array(['Regular Bike', 'Other', 'No Travel', 'Gas Car, drove alone',
       'Gas Car, with others', 'Walk', 'Taxi/Uber/Lyft', 'Train',
       'Bikeshare', 'Bus', 'Skate board', 'Free Shuttle', 'Scooter share',
       'E-bike', 'Not a Trip'], dtype=object)

In [28]:
## Define the mapping strategy

"""
p_micro: Personal micromobility
s_micro: Shared micromobility
s_car: Shared car
car: Car/rental car
transit: train + bus
no_trip: No vehicle / None
ridehail: Uber/Lyft
"""

# Maps each label that can appear in the ';'-separated `available_modes` column
# to the mode category whose availability indicator it switches on
# (consumed as the `av` argument of generate_available_features).
available = {
    'Bicycle': 'p_micro',
    'Do not have vehicle': 'unknown',
    'Get a ride from a friend or family member': 's_car',
    'None': 'no_trip',
    'Public transportation (bus, subway, light rail, etc.)': 'transit',
    'Rental car (including Zipcar/ Car2Go)': 'car',
    'Shared bicycle or scooter': 's_micro',
    'Skateboard': 'p_micro',
    'Taxi (regular taxi, Uber, Lyft, etc)': 'ridehail',
    'Walk/roll': 'walk',
    'Prefer not to say': 'unknown'
}

# Map each sensed mode to the binary indicators.
# Keys are `section_mode_argmax` values; every category in the list is flagged as
# available for a trip sensed with that mode (the `sm` argument of
# generate_available_features).
section_mode_mapping = {
    'bicycling': ['p_micro', 's_micro'],
    'bus': ['transit'],
    'car': ['s_car', 'car', 'ridehail'],
    'no_sensed': ['unknown'],
    'train': ['transit'],
    'walking': ['walk'],
    'subway': ['transit']
}

# For target - mode_confirm
# target_mapping = {
#     'Regular Bike': 'p_micro',
#     'Walk': 'walk',
#     'Gas Car, with others': 's_car',
#     'Gas Car, drove alone': 'car',
#     'Bikeshare': 's_micro',
#     'Other': 'unknown',
#     'Bus': 'transit',
#     'Not a Trip': 'no_trip',
#     'E-bike': 'p_micro',
#     'Train': 'transit',
#     'Taxi/Uber/Lyft': 'ridehail',
#     'Free Shuttle': 'transit',
#     'Scooter share': 's_micro',
#     'Skate board': 'p_micro'
# }

# For target - replaced_mode
# Maps every `Replaced_mode` label to its mode category; passed to
# generate_target_variable to build the `target` column.
target_mapping = {
    'Regular Bike': 'p_micro',
    'Walk': 'walk',
    'No Travel': 'no_trip',
    'Other': 'unknown',
    'Gas Car, with others': 's_car',
    'Gas Car, drove alone': 'car',
    'Train': 'transit',
    'Scooter share': 's_micro',
    'Taxi/Uber/Lyft': 'ridehail',
    'Free Shuttle': 'transit',
    'Skate board': 'p_micro',
    'Bikeshare': 's_micro',
    'Bus': 'transit',
    # Both 'No Travel' and 'Not a Trip' collapse into the same category.
    'Not a Trip': 'no_trip',
    'E-bike': 'p_micro'
}

# # These values were obtained using a simple linear regression model. Specifics can be found in time_distance_estimation.ipynb
# duration_dict = {
#     'walk': {
#         'intercept': 14.09698152519682,
#         'coef':  19.153802211061134
#     },
#     'p_micro': {
#         'intercept':  4.753590860550791,
#         'coef':  4.2723965916811935
#     },
#     's_micro': {
#         'intercept':  4.753590860550791,
#         'coef':  4.2723965916811935
#     },
#     's_car': {
#         'intercept':  10.515751350707365, 
#         'coef':  1.112111276629702
#     },
#     'car': {
#         'intercept':  10.515751350707365, 
#         'coef':  1.112111276629702
#     },
#     'ridehail': {
#         'intercept':  10.515751350707365, 
#         'coef':  1.112111276629702
#     },
#     'transit': {
#         'intercept':  7.5000351741492395,
#         'coef':  1.839562017986485
#     },
#     'unknown': {
#         'intercept':  9.423026094931313,
#         'coef':  2.640429444466706
#     },
#     'no_trip': {
#         'intercept': 0.,
#         'coef': 0.
#     }
# }

# Integer code (starting at 1) for every mode category, plus the inverse lookup.
ordinal_mapping = {
    mode: code
    for code, mode in enumerate(
        ['p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown'],
        start=1,
    )
}
reverse_ordinal_mapping = dict(zip(ordinal_mapping.values(), ordinal_mapping.keys()))

print(ordinal_mapping)

{'p_micro': 1, 'no_trip': 2, 's_car': 3, 'transit': 4, 'car': 5, 's_micro': 6, 'ridehail': 7, 'walk': 8, 'unknown': 9}


In [29]:
def generate_available_features(
    df: pd.DataFrame, av: dict, sm: dict, target_mapper: dict
):
    """
    This method generates the wide-form dummy features for available modes.
    The current implementation uses (section_mode_argmax OR available_modes) to indicate mode availability.

    For every row, an indicator column is set to 1 when the mode category is
    either listed in the row's ';'-separated `available_modes` value (translated
    through `av`) or associated with the row's `section_mode_argmax` (through
    `sm`); all other indicators are 0. The row's `target` value is replaced by
    `target_mapper[target]`.

    Args:
        df: frame with `available_modes`, `target` and `section_mode_argmax`
            columns. Not modified.
        av: maps an `available_modes` label to a mode category.
        sm: maps a `section_mode_argmax` value to a list of mode categories.
        target_mapper: maps the current `target` value to its replacement. Its
            keys also select which columns receive the `av_` prefix.

    Returns:
        A new frame with a fresh RangeIndex. Indicator columns are appended in a
        deterministic order: categories in first-seen order of `av`, then
        'unknown', then any category reachable only through `sm`.

    Raises:
        KeyError: if a label in `available_modes`, a `section_mode_argmax` value
            or a `target` value is absent from the corresponding mapping.
    """

    # Reset indices.
    df = df.reset_index(drop=True, inplace=False)

    # De-duplicate while preserving order. A plain set() made the column order
    # depend on string hashing, i.e. it could differ from run to run.
    columns = list(dict.fromkeys(
        [*av.values(), 'unknown', *(mode for modes in sm.values() for mode in modes)]
    ))

    # Create the indicator columns, all switched off.
    df[columns] = 0

    row_dicts = list()

    for _, row in df.iterrows():
        row_dict = row.to_dict()

        # Access the available modes. Split on ; and strip all the elements.
        available_modes = [x.strip() for x in str(row_dict['available_modes']).split(';')]

        row_dict['target'] = target_mapper[row_dict['target']]

        # Switch on every mode the user reported as available.
        for available_mode in available_modes:
            row_dict[av[available_mode]] = 1

        # The sensed mode is used as a proxy for the confirmed mode: whatever was
        # actually used must have been available.
        for chosen_mode in sm[row_dict['section_mode_argmax']]:
            row_dict[chosen_mode] = 1

        row_dicts.append(row_dict)

    # With no rows, fall back to the (empty) frame so the schema is preserved.
    constructed = pd.DataFrame(row_dicts) if row_dicts else df

    return constructed.rename(columns={c: 'av_' + c for c in target_mapper.keys()})

```language=python
{'p_micro': 1, 'no_trip': 2, 's_car': 3, 'transit': 4, 'car': 5, 's_micro': 6, 'ridehail': 7, 'walk': 8, 'unknown': 9}
```

In [30]:
def generate_target_variable(df: pd.DataFrame, mapper: dict):
    """
    Build the `target` column from `Replaced_mode`.

    Every `Replaced_mode` value is looked up in `mapper`; the result is stored in
    `target` and the `Replaced_mode` column is removed.

    The input frame is left untouched and a new frame is returned, so the cell
    calling this function can be re-run against the same input.

    Args:
        df: frame containing a `Replaced_mode` column.
        mapper: maps each `Replaced_mode` label to its target value.

    Returns:
        A new frame without `Replaced_mode` and with `target` added.

    Raises:
        KeyError: if `Replaced_mode` is missing or contains a label absent from
            `mapper`.
    """
    target = df['Replaced_mode'].apply(lambda label: mapper[label])
    return df.assign(target=target).drop(columns=['Replaced_mode'])

filtered_df = generate_target_variable(filtered_df, target_mapping)

In [31]:
# Build the wide-format availability indicators. The ordinal mapping is supplied as
# the target mapper, so `target` comes back as an integer code.
av_df = generate_available_features(
    df=filtered_df,
    av=available,
    sm=section_mode_mapping,
    target_mapper=ordinal_mapping,
)

# Collapse bus, train and subway into a single 'transit' sensed mode.
is_transit = av_df['section_mode_argmax'].isin(['bus', 'train', 'subway'])
av_df.loc[is_transit, 'section_mode_argmax'] = 'transit'

In [32]:
av_df.section_mode_argmax.unique()

array(['bicycling', 'transit', 'car', 'no_sensed', 'walking'],
      dtype=object)

In [33]:
display(av_df.head())

Unnamed: 0,source,end_ts,end_fmt_time,end_loc,raw_trip,start_ts,start_fmt_time,start_loc,duration,distance,...,target,av_s_micro,av_ridehail,av_unknown,av_car,av_transit,av_walk,av_s_car,av_no_trip,av_p_micro
0,DwellSegmentationTimeFilter,1619140000.0,2021-04-22 18:59:39-06:00,"{'type': 'Point', 'coordinates': [-105.1002447...",6082e47f27dcf1f393b70dd8,1619139000.0,2021-04-22 18:48:48.365291-06:00,"{'type': 'Point', 'coordinates': [-105.0772567...",10.843912,1995.541857,...,1,1,0,0,0,0,0,0,0,1
1,DwellSegmentationTimeFilter,1619197000.0,2021-04-23 11:04:53-06:00,"{'type': 'Point', 'coordinates': [-105.0776493...",60830eac2b9762e085f774e8,1619196000.0,2021-04-23 10:45:48.219950-06:00,"{'type': 'Point', 'coordinates': [-105.1228294...",19.079667,4764.133777,...,1,1,0,0,0,0,0,0,0,1
2,DwellSegmentationTimeFilter,1619662000.0,2021-04-28 19:58:51-06:00,"{'type': 'Point', 'coordinates': [-105.0854639...",608adb94b706ddbc669dabb5,1619658000.0,2021-04-28 19:07:47.510844-06:00,"{'type': 'Point', 'coordinates': [-105.1227172...",51.058153,9049.744908,...,1,1,0,0,0,0,0,0,0,1
3,DwellSegmentationTimeFilter,1619728000.0,2021-04-29 14:23:25-06:00,"{'type': 'Point', 'coordinates': [-105.0776309...",608b21db654c4a5ab2fa435a,1619727000.0,2021-04-29 14:05:50.979267-06:00,"{'type': 'Point', 'coordinates': [-105.1228283...",17.567012,4731.604069,...,9,1,0,0,0,0,0,0,0,1
4,DwellSegmentationTimeFilter,1619794000.0,2021-04-30 08:54:03-06:00,"{'type': 'Point', 'coordinates': [-105.0775022...",608c1f002f548f5db71d16ef,1619793000.0,2021-04-30 08:34:49.162054-06:00,"{'type': 'Point', 'coordinates': [-105.1226652...",19.230632,4675.329725,...,9,1,0,0,0,0,0,0,0,1


In [34]:
# def compute_alt_durations(df: pd.DataFrame, reverse_mapper: dict, duration_mapper: dict):
#     '''
#     We re-iterate over the generated available mode df and populate the modes.
#     We are only concerned with the rows that have primary_mode = 0, which indicates that they are alternate mode rows.
#     '''

#     column_names = list(reverse_mapper.values())

#     # Start with a 0 cost for everyone.
#     df[column_names] = 0

#     rows = list()
#     for ix, row in df.iterrows():
        
#         # Convert to a dict
#         row_dict = row.to_dict()

#         for mode in column_names:
#             if mode == reverse_mapper[row['chosen']]:
#                 row_dict[mode] = row_dict['section_distance_argmax']
#                 continue
            
#             mode_params = duration_mapper[mode]
#             # use availability as a mask.
#             row_dict[mode] = mode_params['intercept'] + (mode_params['coef'] * row_dict['section_distance_argmax'])
        
#         rows.append(row_dict)
    
#     return pd.DataFrame(rows).rename(columns=dict([(c, 'tt_'+c) for c in column_names]), inplace=False)

In [35]:
# av_time_df = compute_alt_durations(av_df, reverse_ordinal_mapping, duration_dict)

## Cost estimation

```
{'p_micro': 1, 'no_trip': 2, 's_car': 3, 'transit': 4, 'car': 5, 's_micro': 6, 'ridehail': 7, 'walk': 8, 'unknown': 9}
```

In [36]:
# All values are taken from VTPI.
# https://www.vtpi.org/tca/tca0501.pdf
# Variable cost per mile for every mode category; multiplied by the trip distance
# in compute_cost_estimates.
mode_cost_per_mile = {
    # bicycle/skateboard
    'p_micro': 0.,
    'no_trip': 0.,
    # Shared car is half the cost of regular car, which is $0.6/mile.
    's_car': 0.3,
    # Rental car.
    'car': 0.6,
    # Average of bus and train taken.
    'transit': 0.5,
    # Shared bicycle or scooter - values taken from https://nacto.org/shared-micromobility-2020-2021/ and
    # https://www.mckinsey.com/industries/automotive-and-assembly/our-insights/how-sharing-the-road-is-likely-to-transform-american-mobility
    's_micro': 0.3,
    # uber/taxi/lyft
    'ridehail': 2.,
    'walk': 0.,
    'unknown': 0.
}

# Assumptions.
# Fixed cost added once per trip, independent of distance (see compute_cost_estimates).
mode_init_cost = {
    'p_micro': 0.,
    'no_trip': 0.,
    # Shared car: no fixed cost assumed.
    's_car': 0.,
    # Rental car: no fixed cost assumed.
    'car': 0.,
    # Transit: no fixed cost assumed.
    'transit': 0.,
    # $1 unlocking cost.
    's_micro': 1.,
    # uber/taxi/lyft
    'ridehail': 1.5,
    'walk': 0.,
    'unknown': 0.
}

In [37]:
def compute_cost_estimates(df: pd.DataFrame, cost_mapper: dict, init_cost_mapper: dict, ordinal_mapper: dict):
    """
    Add one `cost_<mode>` column per mode in `ordinal_mapper`.

    For every mode the estimate is

        av_<mode> * (init_cost_mapper[mode] + cost_mapper[mode] * section_distance_argmax)

    so the availability indicator acts as a mask: unavailable modes cost 0.
    `section_distance_argmax` is expected to be in miles at this point.

    The computation is vectorised and the input frame is left untouched (no
    scratch columns are added to it).

    Args:
        df: frame with `section_distance_argmax` and an `av_<mode>` column for
            every key of `ordinal_mapper`.
        cost_mapper: per-mile cost for each mode.
        init_cost_mapper: fixed per-trip cost for each mode.
        ordinal_mapper: only its keys are used; they name the modes to cost.

    Returns:
        A new frame with a fresh RangeIndex and the `cost_<mode>` columns appended
        in the key order of `ordinal_mapper`.

    Raises:
        KeyError: if a required column or mapping entry is missing.
    """
    frame = df.reset_index(drop=True)

    # Access the section_distance_argmax attribute for the distance. Note that this is now in miles.
    distance = frame['section_distance_argmax']

    # Mask using availability.
    cost_columns = {
        'cost_' + mode: frame['av_' + mode] * (init_cost_mapper[mode] + (cost_mapper[mode] * distance))
        for mode in ordinal_mapper.keys()
    }

    return frame.assign(**cost_columns)

In [38]:
cost_df = compute_cost_estimates(
    av_df, cost_mapper=mode_cost_per_mile, 
    init_cost_mapper=mode_init_cost, 
    ordinal_mapper=ordinal_mapping
)

In [39]:
cost_df[[c for c in cost_df.columns if 'cost_' in c]].describe()

Unnamed: 0,cost_p_micro,cost_no_trip,cost_s_car,cost_transit,cost_car,cost_s_micro,cost_ridehail,cost_walk,cost_unknown
count,70136.0,70136.0,70136.0,70136.0,70136.0,70136.0,70136.0,70136.0,70136.0
mean,0.0,0.0,1.141106,1.377155,2.105648,0.424135,8.499317,0.0,0.0
std,0.0,0.0,2.036475,2.882227,4.072574,1.092019,13.763483,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.072439,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.507384,0.414311,0.784733,0.0,4.668714,0.0,0.0
75%,0.0,0.0,1.254198,1.419553,2.388235,0.0,9.727618,0.0,0.0
max,0.0,0.0,22.339306,37.232176,44.678611,23.26293,150.428703,0.0,0.0


In [40]:
# student_status = {
#     'Not a student': 0,
#     'Yes - Full Time College/University': 1,
#     'Yes - Vocation/Technical/Trade School': 1,
#     'Yes - Part-Time College/University': 1,
#     'Fire Fighter 2 Training': 0,
#     'Taking prerequisites missing for grad program ': 1,
#     'Work': 0,
#     'Graduate': 1,
#     'Work at csu': 0,
#     'Custodian': 0, 
#     'taking classes toward early childhood licensure': 1,
#     'Prefer not to say': 0
# }


# cost_df['is_student'] = cost_df['is_student'].apply(lambda x: student_status[x])

In [41]:
# cost_df['age'] = cost_df['age'].apply(lambda x: x if x < 100 else 2024 - x)
# cost_df['n_working_residents'] = cost_df['n_working_residents'].apply(lambda x: 0 if x < 0 else x)
# cost_df.rename(
#     columns={'start_local_dt_weekday': 'start:DOW', 'end_local_dt_weekday': 'end:DOW'},
#     inplace=True
# )

In [42]:
cost_df.to_csv('../data/ReplacedMode_Fix_02142024.csv', index=False)