In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from traceback import print_exception

%matplotlib inline

In [2]:
# Data loading. Refer to the mnl_exploration.ipynb file for details on how user-level features are generated.
# The path is relative to the directory the notebook is run from.

df = pd.read_csv('../data/final_modeling_data.csv')

In [3]:
# Report (rows, columns) of the table as loaded, before any cleaning.
print("Shape: ", df.shape)

Shape:  (74631, 57)


In [4]:
# Discard exact duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates()

### Add weather info

In [5]:
# Weather observations for Denver, downloaded using the OpenMeteo API.
# The `time` column is parsed and used as the join key in the cells below.
weather_df = pd.read_csv('../data/denver_weather_data.csv')

In [6]:
# Parse the weather timestamps as UTC, then express them in Denver local time.
weather_df['time'] = pd.to_datetime(weather_df['time'], utc=True).dt.tz_convert('America/Denver')

In [7]:
# Summary statistics of the numeric weather columns (sanity check of ranges/units).
weather_df.describe()

Unnamed: 0,temperature_2m (°F),relative_humidity_2m (%),dew_point_2m (°F),rain (inch),snowfall (inch),cloud_cover (%),wind_speed_10m (mp/h),wind_gusts_10m (mp/h)
count,64728.0,64728.0,64728.0,64728.0,64728.0,64728.0,64728.0,64728.0
mean,49.363325,52.405358,29.105254,0.001158,0.004958,29.364541,5.538084,11.855783
std,21.502112,23.370179,15.578672,0.008632,0.03559,33.181215,3.167199,5.973471
min,-22.4,5.0,-27.8,0.0,0.0,0.0,0.0,1.1
25%,32.5,33.0,18.0,0.0,0.0,0.0,3.5,7.6
50%,49.2,53.0,28.9,0.0,0.0,18.0,4.9,10.5
75%,65.6,71.0,41.5,0.0,0.0,46.0,6.8,15.0
max,100.9,100.0,68.8,0.449,1.378,100.0,45.9,77.6


In [8]:
# Parse the trip start timestamps as UTC first, then express them in America/Denver.
start_time_utc = pd.to_datetime(df['start_fmt_time'], utc=True)
df['start_fmt_time'] = start_time_utc.dt.tz_convert('America/Denver')

# Row count before the weather join, used to report how many rows the join drops.
original_shape = len(df)

In [9]:
# Round start time to nearest hour so it can be joined against the weather `time` column.
# Lowercase 'h' is the supported hourly alias; the uppercase 'H' spelling is deprecated in recent pandas.
# `ambiguous` / `nonexistent` control how wall-clock times around DST transitions are resolved.
df['start_time_rounded'] = df.start_fmt_time.dt.round(
    'h', ambiguous='infer', nonexistent='shift_backward'
)

In [10]:
# Inner join: keep only trips whose rounded start hour has a matching weather record.
df = pd.merge(
    df,
    weather_df,
    how='inner',
    left_on='start_time_rounded',
    right_on='time',
)

In [11]:
# Compare row counts before and after the weather join to see how many trips were dropped.
print(f"Old shape: {original_shape}. new shape: {df.shape[0]}")

Old shape: 74631. new shape: 74622


In [12]:
# The join keys are no longer needed once the weather columns are attached.
df = df.drop(columns=['start_time_rounded', 'time'])

### Creating the dummy variables for the available modes

In [13]:
# We want this to be ordinal because 2 > 1 implies that it has higher associated value.
# Missing and 'Prefer not to say' responses both collapse to 0.

income_ordinal_mapping = {
    np.nan: 0,
    'Prefer not to say': 0,
    'Less than $24,999': 1,
    '$25,000-$49,999': 2,
    '$50,000-$99,999': 3,
    '$100,000 -$149,999': 4,
    '$150,000-$199,999': 5
}

# NaN never compares equal to itself, so looking it up as a dict key only works when the
# value happens to be the very same `np.nan` object. Test for missing values explicitly
# instead; any unexpected label still raises KeyError.
df['income_category'] = df['income_category'].apply(
    lambda x: 0 if pd.isna(x) else income_ordinal_mapping[x]
)

In [14]:
# Each trip row stores three parallel, stringified per-section lists:
# | [d1, d2, d3] | [t1, t2, t3] | [m1, m2, m3] |


def compute_argmax(df: pd.DataFrame):
    """
    Pick, for every trip, the section with the largest distance and expose its
    distance, duration and mode as scalar columns.

    The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the string columns 'section_distances', 'section_durations'
        and 'section_modes', each holding a Python-literal list.

    Returns
    -------
    pd.DataFrame
        A new frame (fresh 0..n-1 index) with the input columns plus:
        - 'section_distance_argmax': longest section distance, multiplied by 0.0006213712 (meters -> miles).
        - 'section_duration_argmax': that section's duration divided by 60 (-> minutes).
        - 'section_mode_argmax': that section's mode.
        - 'section_coordinates_argmax': placeholder, always None (never populated here).
        - 'mark': True when the row's lists could not be parsed/indexed, else False.
    """
    argmax_columns = [
        'section_distance_argmax', 'section_duration_argmax',
        'section_mode_argmax', 'section_coordinates_argmax'
    ]
    rows = list()

    for _, row in df.iterrows():
        row_dict = row.to_dict()

        # Every row starts with empty argmax fields; they are only filled when parsing succeeds.
        for column in argmax_columns:
            row_dict[column] = None

        try:
            parsed_distances = ast.literal_eval(row_dict['section_distances'])
            parsed_durations = ast.literal_eval(row_dict['section_durations'])
            parsed_modes = ast.literal_eval(row_dict['section_modes'])

            argmax_ix = int(np.argmax(parsed_distances))

            # Pick the argmax and scale to miles. (meters -> miles)
            distance = parsed_distances[argmax_ix] * 0.0006213712
            # Pick the argmax and scale to minutes.
            duration = parsed_durations[argmax_ix] / 60
            # Pick the argmax mode.
            mode = parsed_modes[argmax_ix]

        except (ValueError, SyntaxError, TypeError, IndexError, KeyError):
            # Missing/malformed literals, empty lists or lists of mismatched length:
            # flag the row so the caller can drop it.
            row_dict['mark'] = True
        else:
            row_dict['section_distance_argmax'] = distance
            row_dict['section_duration_argmax'] = duration
            row_dict['section_mode_argmax'] = mode
            row_dict['mark'] = False

        rows.append(row_dict)

    # Spell out the columns so an empty input still yields the full schema.
    output_columns = list(df.columns) + [
        column for column in argmax_columns + ['mark'] if column not in df.columns
    ]
    return pd.DataFrame(rows, columns=output_columns)

In [15]:
# Derive the longest-section distance/duration/mode columns (plus the `mark` flag) for every trip.
df = compute_argmax(df)

In [16]:
# Keep only rows whose section lists parsed cleanly, then discard the helper flag.
df = df[~df['mark']].drop(columns='mark').reset_index(drop=True)

In [17]:
# Drop instances where duration/distance is unusable (zero or negative).
# The filtered frame has to be assigned back to `df`: `drop(..., inplace=False)` returns a
# new frame, so chaining an in-place `reset_index` onto it would leave `df` untouched.
unusable_rows = (df.section_distance_argmax <= 0) | (df.section_duration_argmax <= 0)
df = df.loc[~unusable_rows, :].reset_index(drop=True)

In [18]:
# Per-mode outlier removal (modes include bus, train, bicycling, walking, car).
def drop_outliers(df: pd.DataFrame, lower: float = 0.1, upper: float = 0.9) -> pd.DataFrame:
    """
    Keep, within each 'section_mode_argmax' group, only the rows whose distance AND
    duration both lie inside that group's [lower, upper] quantile band (inclusive).

    Both bands are computed on the full group, i.e. the duration band is not
    recomputed after the distance filter.

    Implemented with `groupby(...).transform` rather than `groupby(...).apply`, so the
    grouping column is always retained in the result regardless of how the installed
    pandas version treats grouping columns inside `apply`.

    Parameters
    ----------
    df : pd.DataFrame
        Needs 'section_mode_argmax', 'section_distance_argmax', 'section_duration_argmax'.
    lower, upper : float
        Quantile bounds in [0, 1]; defaults 0.1 and 0.9.

    Returns
    -------
    pd.DataFrame
        Surviving rows ordered by mode (original order within a mode), index reset.
        Rows whose mode is missing are dropped.
    """
    if df.empty:
        return df.reset_index(drop=True)

    mode_column = 'section_mode_argmax'
    by_mode = df.groupby(mode_column)

    def group_quantile(column: str, q: float) -> pd.Series:
        # Broadcast each group's quantile back onto its member rows.
        return by_mode[column].transform(lambda values: values.quantile(q))

    distance_ok = df['section_distance_argmax'].between(
        group_quantile('section_distance_argmax', lower),
        group_quantile('section_distance_argmax', upper),
    )
    duration_ok = df['section_duration_argmax'].between(
        group_quantile('section_duration_argmax', lower),
        group_quantile('section_duration_argmax', upper),
    )

    # Stable sort keeps the original row order inside each mode.
    return (
        df[distance_ok & duration_ok]
        .sort_values(mode_column, kind='mergesort')
        .reset_index(drop=True)
    )

In [19]:
# Remove per-mode distance/duration outliers.
filtered_df = drop_outliers(df)

# Average speed in miles per hour: miles * 60 / minutes.
filtered_df['mph'] = (
    filtered_df['section_distance_argmax']
    .mul(60.)
    .div(filtered_df['section_duration_argmax'])
)

In [20]:
def filter_mph(df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove rows with implausible speeds, mode by mode.

    Modes listed in MPH_THRESHOLDS keep rows with `mph` at or below the threshold
    (no lower bound is applied). Every other mode keeps rows whose `mph` lies within
    that mode's own 10th-90th percentile band (inclusive).

    Groups are iterated explicitly instead of going through `groupby(...).apply`, so
    the grouping column is always present in the result and the logic does not rely
    on the `name` attribute pandas attaches to groups inside `apply`.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'section_mode_argmax' and 'mph'.

    Returns
    -------
    pd.DataFrame
        Surviving rows ordered by mode (original order within a mode), index reset.
        Rows whose mode is missing are dropped.
    """

    MPH_THRESHOLDS = {
        # https://www.sciencedirect.com/science/article/pii/S2210670718304682
        'bicycling': 15.,
        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7806575/
        'walking': 2.93
    }

    kept_groups = []
    for mode, group in df.groupby('section_mode_argmax'):
        if mode in MPH_THRESHOLDS:
            # Manually specified upper speed limit for this mode.
            kept_groups.append(group[group['mph'] <= MPH_THRESHOLDS[mode]])
        else:
            mph_low = group['mph'].quantile(0.1)
            mph_high = group['mph'].quantile(0.9)
            kept_groups.append(group[(group['mph'] >= mph_low) & (group['mph'] <= mph_high)])

    if not kept_groups:
        # Nothing to concatenate (empty input or no valid mode labels): keep the schema.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(kept_groups).reset_index(drop=True)

In [21]:
# Drop rows whose average speed is implausible for their sensed mode.
filtered_df = filter_mph(filtered_df)

In [22]:
# Per-mode summary of the filtered distances and durations.
filtered_df.groupby('section_mode_argmax')[['section_distance_argmax', 'section_duration_argmax']].describe()

Unnamed: 0_level_0,section_distance_argmax,section_distance_argmax,section_distance_argmax,section_distance_argmax,section_distance_argmax,section_distance_argmax,section_distance_argmax,section_distance_argmax,section_duration_argmax,section_duration_argmax,section_duration_argmax,section_duration_argmax,section_duration_argmax,section_duration_argmax,section_duration_argmax,section_duration_argmax
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
section_mode_argmax,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
bicycling,6894.0,2.139411,0.86315,0.816301,1.508245,1.954151,2.570408,4.863086,6894.0,13.605943,4.973791,5.883333,9.915279,12.605268,16.441494,28.536488
bus,262.0,1.41524,0.597587,0.372875,0.922474,1.455352,1.914197,2.7909,262.0,8.870938,3.495783,2.963177,6.559803,8.240974,10.53729,19.882077
car,22816.0,3.971898,2.465458,0.902574,2.141305,3.331898,5.101229,14.183176,22816.0,13.625199,6.457826,5.613349,8.624635,11.845664,16.993311,34.438154
no_sensed,1537.0,2.594322,2.28607,0.175934,1.019584,1.924904,3.369579,13.154217,1537.0,15.280348,11.299653,1.045078,7.129649,12.035167,20.788176,52.847837
train,40.0,11.08754,2.284191,6.352325,10.18673,10.384223,12.414722,15.743037,40.0,26.469555,5.940263,18.180953,21.977135,24.722844,29.0457,40.568931
walking,11032.0,0.478625,0.374218,0.119507,0.220222,0.347397,0.598698,2.251371,11032.0,20.024429,13.372401,5.284233,9.434756,15.743495,26.810285,61.655195


In [23]:
# Per-mode summary of the filtered speeds.
filtered_df.groupby('section_mode_argmax')[['mph']].describe()

Unnamed: 0_level_0,mph,mph,mph,mph,mph,mph,mph,mph
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
section_mode_argmax,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
bicycling,6894.0,9.620581,2.353776,2.81758,7.928498,9.656866,11.284167,14.998472
bus,262.0,9.841872,3.153465,4.233195,7.215855,9.818024,12.487953,15.579753
car,22816.0,17.154212,5.456196,8.510055,12.61861,16.421485,21.203608,29.536059
no_sensed,1537.0,10.387926,4.575956,4.608177,6.59696,9.28998,13.206031,22.796647
train,40.0,25.415228,3.468611,17.513938,23.234048,25.795934,27.932657,30.865389
walking,11032.0,1.605326,0.74749,0.131592,0.983049,1.591385,2.247452,2.929953


In [24]:
# Total rows removed by the outlier and speed filters combined.
print(f"Dropped {df.shape[0] - filtered_df.shape[0]} rows.")

Dropped 32041 rows.


In [25]:
## Define the mapping strategy
#
# Mode categories used by the mappings in this cell:
#   p_micro:  Personal micromobility
#   s_micro:  Shared micromobility
#   s_car:    Shared car
#   car:      Car/rental car
#   transit:  train + bus
#   no_trip:  No vehicle / None
#   ridehail: Uber/Lyft
#   walk:     Walk/roll

# Survey "available modes" label -> mode category.
available = dict([
    ('Bicycle', 'p_micro'),
    ('Do not have vehicle', 'no_trip'),
    ('Get a ride from a friend or family member', 's_car'),
    ('None', 'no_trip'),
    ('Public transportation (bus, subway, light rail, etc.)', 'transit'),
    ('Rental car (including Zipcar/ Car2Go)', 'car'),
    ('Shared bicycle or scooter', 's_micro'),
    ('Skateboard', 'p_micro'),
    ('Taxi (regular taxi, Uber, Lyft, etc)', 'ridehail'),
    ('Walk/roll', 'walk'),
])

# Map each sensed mode to the binary indicators.
# A sensed mode switches on every category listed for it, e.g. a sensed 'car' trip
# marks s_car, car and ridehail as available.
section_mode_mapping = {
    'bicycling': ['p_micro', 's_micro'],
    'bus': ['transit'],
    'car': ['s_car', 'car', 'ridehail'],
    'no_sensed': ['unknown'],
    'train': ['transit'],
    'walking': ['walk']
}

# For target - mode_confirm
# target_mapping = {
#     'Regular Bike': 'p_micro',
#     'Walk': 'walk',
#     'Gas Car, with others': 's_car',
#     'Gas Car, drove alone': 'car',
#     'Bikeshare': 's_micro',
#     'Other': 'unknown',
#     'Bus': 'transit',
#     'Not a Trip': 'no_trip',
#     'E-bike': 'p_micro',
#     'Train': 'transit',
#     'Taxi/Uber/Lyft': 'ridehail',
#     'Free Shuttle': 'transit',
#     'Scooter share': 's_micro',
#     'Skate board': 'p_micro'
# }

# For target - replaced_mode
# Maps each label of the `Replaced_mode` column to one of the mode categories;
# several labels share a category (e.g. 'Bus', 'Train' and 'Free Shuttle' -> 'transit').
target_mapping = {
    'Regular Bike': 'p_micro', 
    'Walk': 'walk', 
    'No Travel': 'no_trip', 
    'Other': 'unknown',
    'Gas Car, with others': 's_car',
    'Gas Car, drove alone': 'car',
    'Train': 'transit',
    'Scooter share': 's_micro', 
    'Taxi/Uber/Lyft': 'ridehail', 
    'Free Shuttle': 'transit', 
    'Skate board': 'p_micro',
    'Bikeshare': 's_micro',
    'Bus': 'transit',
    'Not a Trip': 'no_trip',
    'E-bike': 'p_micro'
}

# # These values were obtained using a simple linear regression model. Specifics can be found in time_distance_estimation.ipynb
# duration_dict = {
#     'walk': {
#         'intercept': 14.09698152519682,
#         'coef':  19.153802211061134
#     },
#     'p_micro': {
#         'intercept':  4.753590860550791,
#         'coef':  4.2723965916811935
#     },
#     's_micro': {
#         'intercept':  4.753590860550791,
#         'coef':  4.2723965916811935
#     },
#     's_car': {
#         'intercept':  10.515751350707365, 
#         'coef':  1.112111276629702
#     },
#     'car': {
#         'intercept':  10.515751350707365, 
#         'coef':  1.112111276629702
#     },
#     'ridehail': {
#         'intercept':  10.515751350707365, 
#         'coef':  1.112111276629702
#     },
#     'transit': {
#         'intercept':  7.5000351741492395,
#         'coef':  1.839562017986485
#     },
#     'unknown': {
#         'intercept':  9.423026094931313,
#         'coef':  2.640429444466706
#     },
#     'no_trip': {
#         'intercept': 0.,
#         'coef': 0.
#     }
# }

# Integer code for each mode category (codes start at 1, in the order listed),
# plus the inverse lookup from code back to category name.
ordinal_mapping = {
    category: code
    for code, category in enumerate(
        ['p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown'],
        start=1,
    )
}
reverse_ordinal_mapping = dict(zip(ordinal_mapping.values(), ordinal_mapping.keys()))

print(ordinal_mapping)

{'p_micro': 1, 'no_trip': 2, 's_car': 3, 'transit': 4, 'car': 5, 's_micro': 6, 'ridehail': 7, 'walk': 8, 'unknown': 9}


In [26]:
def generate_available_features(
    df: pd.DataFrame, av: dict, sm: dict, target_mapper: dict
):
    """
    This method generates the wide-form dummy features for available modes.
    The current implementation uses (section_mode_argmax OR available_modes) to indicate mode availability.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'available_modes' (';'-separated labels), 'chosen' and
        'section_mode_argmax'. The caller's frame is not modified.
    av : dict
        Available-mode label -> mode category.
    sm : dict
        Sensed mode -> list of mode categories to switch on.
    target_mapper : dict
        Applied to 'chosen' (value lookup). Its keys also determine which columns
        receive the 'av_' prefix in the output.

    Returns
    -------
    pd.DataFrame
        Input columns with 'chosen' remapped, plus one 0/1 indicator per category,
        prefixed with 'av_' for every category that is a key of `target_mapper`.

    Raises
    ------
    KeyError
        If a label is absent from `av`, `sm` or `target_mapper`. A missing
        'available_modes' value is stringified to 'nan' and fails the same way
        unless `av` has a 'nan' entry.
    """

    # reset_index returns a new frame, so the additions below never touch the caller's data.
    df = df.reset_index(drop=True, inplace=False)

    # One indicator column per category, in order of first appearance in `av`, followed
    # by 'unknown'. Going through a dict (rather than a set) keeps the column order
    # identical from run to run.
    columns = list(dict.fromkeys(list(av.values()) + ['unknown']))

    # Every mode starts out as unavailable.
    df[columns] = 0

    row_dicts = list()

    for _, row in df.iterrows():
        row_dict = row.to_dict()

        # Access the available modes. Split on ; and strip all the elements.
        available_modes = [x.strip() for x in str(row_dict['available_modes']).split(';')]

        # Set all the available modes associated with the chosen modes too.
        # Toggle this flag off if you don't want the target to be considered as an available mode.
        # row_dict[row_dict['chosen']] = 1

        row_dict['chosen'] = target_mapper[row_dict['chosen']]

        # Switch on every self-reported available mode.
        for available_mode in available_modes:
            row_dict[av[available_mode]] = 1

        # The sensed mode is used as a proxy for the confirmed mode: everything it
        # maps to is considered available as well.
        for sensed_category in sm[row_dict['section_mode_argmax']]:
            row_dict[sensed_category] = 1

        row_dicts.append(row_dict)

    # With no rows there is nothing to rebuild; keep the prepared (empty) frame so the schema survives.
    constructed = pd.DataFrame(row_dicts) if row_dicts else df
    constructed = constructed.rename(columns={c: 'av_' + c for c in target_mapper.keys()})

    return constructed

```python
{'p_micro': 1, 'no_trip': 2, 's_car': 3, 'transit': 4, 'car': 5, 's_micro': 6, 'ridehail': 7, 'walk': 8, 'unknown': 9}
```

In [27]:
def generate_target_variable(df: pd.DataFrame, mapper: dict):
    """
    Replace the 'Replaced_mode' column with a 'chosen' column holding `mapper[label]`.

    The frame is modified in place and the same object is returned. A label that is
    not a key of `mapper` raises KeyError.
    """
    # Translate first, so a failed lookup leaves the frame exactly as it was.
    chosen = df['Replaced_mode'].apply(mapper.__getitem__)
    df['chosen'] = chosen
    del df['Replaced_mode']
    return df

# Build the `chosen` target from `Replaced_mode` using the replaced-mode label mapping.
filtered_df = generate_target_variable(filtered_df, target_mapping)

In [28]:
# Generate wide-format availability indicators. `ordinal_mapping` is passed as the
# target mapper, so `chosen` is converted from a category name to its integer code.
av_df = generate_available_features(
    df=filtered_df,
    av=available,
    sm=section_mode_mapping,
    target_mapper=ordinal_mapping,
)

# Collapse train and bus into 'transit'.
is_transit = av_df['section_mode_argmax'].isin(['bus', 'train'])
av_df.loc[is_transit, 'section_mode_argmax'] = 'transit'

In [None]:
# Sanity check: 'bus' and 'train' should no longer appear after the collapse into 'transit'.
av_df.section_mode_argmax.unique()

In [29]:
# Preview the first rows, including the new av_* indicator columns.
display(av_df.head())

Unnamed: 0,user_id,_id,original_user_id,cleaned_trip,Mode_confirm,start_fmt_time,start:year,start:month,start:day,start:hour,...,chosen,av_s_car,av_walk,av_car,av_ridehail,av_p_micro,av_no_trip,av_s_micro,av_transit,av_unknown
0,6373dfb8cb9b47e88e8f76adcfadde20,611fdd7cbc8ce4a9e3911adb,6373dfb8-cb9b-47e8-8e8f-76adcfadde20,6082e48127dcf1f393b70ea8,Regular Bike,2021-04-22 18:48:48.365291-06:00,2021.0,4.0,22.0,18.0,...,1,0,0,0,0,1,0,1,0,0
1,6373dfb8cb9b47e88e8f76adcfadde20,611fdd7cbc8ce4a9e3911add,6373dfb8-cb9b-47e8-8e8f-76adcfadde20,60830ead2b9762e085f774f2,Regular Bike,2021-04-23 10:45:48.219950-06:00,2021.0,4.0,23.0,10.0,...,1,0,0,0,0,1,0,1,0,0
2,6373dfb8cb9b47e88e8f76adcfadde20,611fdd7ebc8ce4a9e3911af4,6373dfb8-cb9b-47e8-8e8f-76adcfadde20,608b21db654c4a5ab2fa4364,Other,2021-04-29 14:05:50.979267-06:00,2021.0,4.0,29.0,14.0,...,9,0,0,0,0,1,0,1,0,0
3,6373dfb8cb9b47e88e8f76adcfadde20,611fdd7fbc8ce4a9e3911af9,6373dfb8-cb9b-47e8-8e8f-76adcfadde20,608c1f002f548f5db71d16f9,Other,2021-04-30 08:34:49.162054-06:00,2021.0,4.0,30.0,8.0,...,9,0,0,0,0,1,0,1,0,0
4,6373dfb8cb9b47e88e8f76adcfadde20,611fdd7fbc8ce4a9e3911afb,6373dfb8-cb9b-47e8-8e8f-76adcfadde20,608c8f7fb2d3b88f6947766b,Regular Bike,2021-04-30 11:58:07.194775-06:00,2021.0,4.0,30.0,11.0,...,1,0,0,0,0,1,0,1,0,0


In [30]:
# def compute_alt_durations(df: pd.DataFrame, reverse_mapper: dict, duration_mapper: dict):
#     '''
#     We re-iterate over the generated available mode df and populate the modes.
#     We are only concerned with the rows that have primary_mode = 0, which indicates that they are alternate mode rows.
#     '''

#     column_names = list(reverse_mapper.values())

#     # Start with a 0 cost for everyone.
#     df[column_names] = 0

#     rows = list()
#     for ix, row in df.iterrows():
        
#         # Convert to a dict
#         row_dict = row.to_dict()

#         for mode in column_names:
#             if mode == reverse_mapper[row['chosen']]:
#                 row_dict[mode] = row_dict['section_distance_argmax']
#                 continue
            
#             mode_params = duration_mapper[mode]
#             # use availability as a mask.
#             row_dict[mode] = mode_params['intercept'] + (mode_params['coef'] * row_dict['section_distance_argmax'])
        
#         rows.append(row_dict)
    
#     return pd.DataFrame(rows).rename(columns=dict([(c, 'tt_'+c) for c in column_names]), inplace=False)

In [31]:
# av_time_df = compute_alt_durations(av_df, reverse_ordinal_mapping, duration_dict)

## Cost estimation

```
{'p_micro': 1, 'no_trip': 2, 's_car': 3, 'transit': 4, 'car': 5, 's_micro': 6, 'ridehail': 7, 'walk': 8, 'unknown': 9}
```

In [32]:
# Per-mile cost of each mode category, multiplied by trip distance in miles downstream.
# All values are taken from VTPI.
# https://www.vtpi.org/tca/tca0501.pdf
mode_cost_per_mile = {
    # bicycle/skateboard
    'p_micro': 0.,
    'no_trip': 0.,
    # Shared car is half the cost of regular car, which is $0.6/mile.
    's_car': 0.3,
    # Rental car.
    'car': 0.6,
    # Average of bus and train taken.
    'transit': 0.6,
    # Shared bicycle or scooter - values taken from https://nacto.org/shared-micromobility-2020-2021/ and
    # https://www.mckinsey.com/industries/automotive-and-assembly/our-insights/how-sharing-the-road-is-likely-to-transform-american-mobility
    's_micro': 0.3,
    # uber/taxi/lyft
    'ridehail': 2.,
    'walk': 0.,
    'unknown': 0.
}

# Fixed per-trip cost of each mode category, added on top of the per-mile cost.
# All 0 for now, except the unlocking fee for shared micromobility.
mode_init_cost = {
    # bicycle/skateboard
    'p_micro': 0.,
    'no_trip': 0.,
    # Shared car: no fixed cost assumed.
    's_car': 0.,
    # Rental car: no fixed cost assumed.
    'car': 0.,
    # Bus/train: no fixed cost assumed.
    'transit': 0.,
    # $1 unlocking cost.
    's_micro': 1.,
    # uber/taxi/lyft: no fixed cost assumed.
    'ridehail': 0.,
    'walk': 0.,
    'unknown': 0.
}

In [33]:
def compute_cost_estimates(df: pd.DataFrame, cost_mapper: dict, init_cost_mapper: dict, ordinal_mapper: dict):
    """
    Add one `cost_<mode>` column per mode, masked by that mode's availability.

    For every key `mode` of `ordinal_mapper`:

        cost_<mode> = av_<mode> * (init_cost_mapper[mode] + cost_mapper[mode] * distance)

    where `distance` is the 'section_distance_argmax' column (miles). Only the keys
    of `ordinal_mapper` are used; its values are ignored.

    The computation is column-wise and the caller's frame is left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Needs 'section_distance_argmax' and an 'av_<mode>' column for every mode.
    cost_mapper : dict
        Mode -> cost per mile.
    init_cost_mapper : dict
        Mode -> fixed per-trip cost.
    ordinal_mapper : dict
        Its keys define the modes (and the order of the new columns).

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh 0..n-1 index: the input columns followed by the
        `cost_<mode>` columns.

    Raises
    ------
    KeyError
        If a required column or mapper entry is missing.
    """
    modes = list(ordinal_mapper.keys())

    # reset_index hands back a new frame; the columns added below stay off the input.
    new_df = df.reset_index(drop=True)

    # Access the section_distance_argmax attribute for the distance. Note that this is now in miles.
    distance = new_df['section_distance_argmax']

    # Mask using availability: an unavailable mode (av_<mode> == 0) gets a cost of 0.
    for mode in modes:
        new_df[mode] = new_df['av_' + mode] * (init_cost_mapper[mode] + (cost_mapper[mode] * distance))

    return new_df.rename(columns={mode: 'cost_' + mode for mode in modes})

In [34]:
# Attach the availability-masked cost estimate for every mode category.
cost_df = compute_cost_estimates(av_df, mode_cost_per_mile, mode_init_cost, ordinal_mapping)

In [35]:
# Summary statistics for every column whose name contains 'cost_'.
cost_df.filter(like='cost_').describe()

Unnamed: 0,cost_p_micro,cost_no_trip,cost_s_car,cost_transit,cost_car,cost_s_micro,cost_ridehail,cost_walk,cost_unknown
count,42581.0,42581.0,42581.0,42581.0,42581.0,42581.0,42581.0,42581.0,42581.0
mean,0.0,0.0,0.730677,1.099012,1.29223,0.413521,4.696148,0.0,0.0
std,0.0,0.0,0.766783,1.410388,1.602841,0.774587,5.179642,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.06405,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.5569,0.660925,0.862317,0.0,3.510922,0.0,0.0
75%,0.0,0.0,1.07903,1.632999,2.10671,0.0,7.14146,0.0,0.0
max,0.0,0.0,4.254953,9.445822,8.509905,5.253764,28.366351,0.0,0.0


In [None]:
# Binary gender indicator: 1 when the reported gender is 'Man', otherwise 0 (including missing).
# Bracket assignment is used because assigning to a DataFrame attribute does not create
# a new column; it only updates a column that already exists.
cost_df['is_male'] = (cost_df['gender'] == 'Man').astype('int64')

In [None]:
# Survey response (including free-text answers) -> binary student flag.
student_status = {
    'Not a student': 0,
    'Yes - Full Time College/University': 1,
    'Yes - Vocation/Technical/Trade School': 1,
    'Yes - Part-Time College/University': 1,
    'Fire Fighter 2 Training': 0,
    'Taking prerequisites missing for grad program ': 1,
    'Work': 0,
    'Graduate': 1,
    'Work at csu': 0,
    'Custodian': 0,
    'taking classes toward early childhood licensure': 1,
    'Prefer not to say': 0,
}

# Strict lookup: a response missing from the table raises KeyError.
cost_df['is_student'] = cost_df['is_student'].map(student_status.__getitem__)

In [None]:
# Values of 100 or more are replaced by (2024 - value); values below 100 are kept as-is.
# NOTE(review): presumably the large values are birth years - confirm, and confirm 2024 as the reference year.
cost_df['age'] = cost_df['age'].where(cost_df['age'] < 100, 2024 - cost_df['age'])

# Negative worker counts are clamped to 0.
cost_df['n_working_residents'] = cost_df['n_working_residents'].mask(
    cost_df['n_working_residents'] < 0, 0
)

# Use the same 'start:' / 'end:' naming as the other temporal columns.
cost_df = cost_df.rename(
    columns={'start_local_dt_weekday': 'start:DOW', 'end_local_dt_weekday': 'end:DOW'}
)

In [None]:
# Persist the fully preprocessed table (row index not written).
# Earlier output name, kept for reference:
# cost_df.to_csv('../data/FULL_preprocessed_data_RM_weather.csv', index=False)
cost_df.to_csv('../data/ReplacedMode_Fix_02072024.csv', index=False)