### All experiments are logged in Notion [here](https://www.notion.so/Replacement-mode-modeling-257c2f460377498d921e6b167f465945)

In [1]:
from enum import Enum
import random

# Math and graphing.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# sklearn imports.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, r2_score, ConfusionMatrixDisplay

%matplotlib inline

In [2]:
# Experiment-wide constants shared by the cells below.
SEED = 19348

# The nine mode names; get_duration_estimate uses the same names for its
# av_* / tt_* columns.
TARGETS = [
    'p_micro',
    'no_trip',
    's_car',
    'transit',
    'car',
    's_micro',
    'ridehail',
    'walk',
    'unknown',
]

# Seed NumPy's global generator as well as the stdlib one.
np.random.seed(SEED)
random.seed(SEED)

In [3]:
class SPLIT_TYPE(Enum):
    """Splitting strategies selectable through ``get_train_test_splits(how=...)``."""

    # Each user's trips are divided between the folds,
    # e.g. a user with 5 trips contributes 4 to train and 1 to test.
    INTRA_USER = 0

    # Whole users are assigned to one fold,
    # e.g. out of 5 users, 4 go to train and 1 goes to test.
    INTER_USER = 1

    # Rows are stratified on the target column (``chosen``).
    TARGET = 2

    # Declared, but get_train_test_splits has no branch for it and raises
    # NotImplementedError when it is requested.
    MIXED = 3
    

class SPLIT(Enum):
    """Marks which fold a frame belongs to when fitting or applying a transform."""

    # Fold on which models/scalers are fitted.
    TRAIN = 0
    # Fold on which already-fitted models/scalers are only applied.
    TEST = 1

def get_splits(count_df: pd.DataFrame, n:int, test_size=0.2):
    """Pick group ids whose row counts add up to roughly ``n * test_size``.

    This is a subset-sum search: every group (row of ``count_df``) is either
    taken as a whole or left out, and each group is used at most once.

    Parameters
    ----------
    count_df:
        Frame with an ``'index'`` column (the group id) and a ``'count'``
        column (number of rows belonging to that group).
    n:
        Total number of rows being split.
    test_size:
        Requested fraction of rows in the selected groups.

    Returns
    -------
    list
        Ids whose counts sum to a value within +/-5% of ``int(n * test_size)``.
        Among all sums inside that band, the one closest to the target wins.
        An empty list is returned when no combination lands inside the band.
    """
    maxsize = int(n * test_size)

    # Accept any total within +/-5% of the requested size.
    max_threshold = int(maxsize * 1.05)
    min_threshold = int(maxsize * 0.95)

    print(f"{min_threshold}, {max_threshold}")

    # reachable[total] is one list of ids whose counts add up to `total`.
    # Totals above the upper bound are never stored, which bounds the table
    # by max_threshold + 1 entries.
    reachable = {0: []}
    for group_id, group_count in zip(count_df['index'], count_df['count']):
        group_count = int(group_count)
        # Iterate over a snapshot so the current group is added at most once.
        for total, members in list(reachable.items()):
            new_total = total + group_count
            if new_total > max_threshold or new_total in reachable:
                continue
            reachable[new_total] = members + [group_id]

    feasible = [t for t in reachable if min_threshold <= t <= max_threshold]
    if not feasible:
        return []

    # Prefer the total closest to the target; break ties towards the smaller one.
    best_total = min(feasible, key=lambda t: (abs(t - maxsize), t))
    return reachable[best_total]


def get_train_test_splits(data: pd.DataFrame, how=SPLIT_TYPE, test_ratio=0.2, shuffle=True):
    """Split ``data`` into a train and a test frame using the strategy ``how``.

    Parameters
    ----------
    data:
        Frame with a ``user_id`` column (used by the user-based strategies)
        and a ``chosen`` column (used by ``SPLIT_TYPE.TARGET``).
    how:
        A ``SPLIT_TYPE`` member. The default is the enum class itself, which
        matches no branch, so a member has to be passed explicitly.
    test_ratio:
        Requested fraction of rows in the test fold.
    shuffle:
        When true, rows are shuffled (seeded with ``SEED``) before splitting.
        The flag is also forwarded to ``train_test_split``.
        NOTE(review): scikit-learn rejects a stratified split with
        ``shuffle=False``, so INTRA_USER/TARGET effectively need ``shuffle=True``.

    Returns
    -------
    tuple of (train, test) DataFrames.

    Raises
    ------
    NotImplementedError
        If ``how`` is not INTER_USER, INTRA_USER or TARGET.
    """
    if shuffle:
        data = data.sample(data.shape[0], random_state=SEED).reset_index(drop=True)

    if how == SPLIT_TYPE.INTER_USER:
        # Every user must end up in exactly one fold so that no user leaks
        # from train into test.
        # rename_axis pins the id column to 'index' (the name get_splits
        # reads) regardless of how this pandas version names the
        # value_counts result.
        counts = (
            data.user_id.value_counts()
            .rename_axis('index')
            .reset_index(name='count')
        )

        # Choose whole users whose trips add up to roughly the test size.
        # The result may be empty when no combination fits, in which case the
        # test fold is empty too.
        test_ids = get_splits(counts, data.shape[0], test_size=test_ratio)

        in_test = data.user_id.isin(test_ids)
        return data.loc[~in_test, :], data.loc[in_test, :]

    elif how == SPLIT_TYPE.INTRA_USER:
        # Users with a single observation cannot be stratified on, so they are
        # set aside and assigned to train or test at random.
        value_counts = data.user_id.value_counts()
        single_count_ids = value_counts[value_counts == 1].index
        is_single = data.user_id.isin(single_count_ids)

        data_filtered = data.loc[~is_single, :].reset_index(drop=True)
        data_single_counts = data.loc[is_single, :].reset_index(drop=True)

        X_tr, X_te = train_test_split(
            data_filtered, test_size=test_ratio, shuffle=shuffle,
            stratify=data_filtered.user_id, random_state=SEED
        )

        # Drawn from NumPy's global generator.
        assigned = np.random.choice(['train', 'test'], len(data_single_counts))

        X_tr_merged = pd.concat(
            [X_tr, data_single_counts.loc[assigned == 'train', :]],
            ignore_index=True, axis=0
        )
        X_te_merged = pd.concat(
            [X_te, data_single_counts.loc[assigned == 'test', :]],
            ignore_index=True, axis=0
        )

        return X_tr_merged, X_te_merged

    elif how == SPLIT_TYPE.TARGET:
        # Keep the class proportions of `chosen` in both folds.
        X_tr, X_te = train_test_split(
            data, test_size=test_ratio, shuffle=shuffle, stratify=data.chosen,
            random_state=SEED
        )

        return X_tr, X_te

    raise NotImplementedError("Unknown split type")

## Modeling

In [4]:
# Read the data.
# Only the last read is active; the commented-out paths are other input files.
# data = pd.read_csv('../data/FULL_preprocessed_data_RM_weather.csv')
# data = pd.read_csv('../data/ReplacedMode_Fix.csv')
data = pd.read_csv('../data/ReplacedMode_Fix_02072024.csv')

In [5]:
# Remove fully duplicated rows from `data` in place.
data.drop_duplicates(inplace=True)

In [6]:
def plot_hist(df, features=None):
    """Draw one histogram per feature on a grid that is six plots wide.

    Parameters
    ----------
    df:
        Frame holding the data to plot.
    features:
        Column names to plot. ``None`` (or an empty sequence) plots every
        column of ``df``.

    Grid cells beyond the last feature are hidden. Nothing is drawn when
    there are no features to plot.
    """
    if features is None or len(features) == 0:
        # All features.
        features = df.columns.tolist()

    n_features = len(features)
    if n_features == 0:
        # A grid with zero rows cannot be created; there is nothing to show.
        return

    ncols = 6
    nrows = -(-n_features // ncols)  # ceiling division

    # squeeze=False keeps `axes` two-dimensional whatever the grid shape is.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 10), squeeze=False)
    for ix, ax in enumerate(axes.flatten()):

        if ix >= n_features:
            # Spare cell in the last row: hide it rather than index past the
            # end of `features`.
            ax.set_visible(False)
            continue

        df[features[ix]].hist(ax=ax)
        ax.set(title=features[ix])

    plt.tight_layout()
    plt.show()

In [7]:
# Replace every raw user id by an integer code: codes start at 1 and follow
# the order returned by unique().
USERS = list(data.user_id.unique())

USER_MAP = dict(zip(USERS, range(1, len(USERS) + 1)))

data['user_id'] = data['user_id'].apply(USER_MAP.__getitem__)

# data.rename(
#     columns={'start_local_dt_weekday': 'start:DOW', 'end_local_dt_weekday': 'end:DOW'},
#     inplace=True
# )

# Drop the samples with chosen == no trip or chosen == unknown
# data.drop(index=data.loc[data.chosen.isin([2, 9])].index, inplace=True)

# data.n_working_residents = data.n_working_residents.apply(lambda x: 0 if x < 0 else x)

# Fix some age preprocessing issues.
# data.age = data.age.apply(lambda x: x if x < 100 else 2024-x)

# Collapse 'train' and 'bus' into 'transit'
# data.loc[
#     data.section_mode_argmax.isin(['train', 'bus']), 'section_mode_argmax'
# ] = 'transit'

In [8]:
# display(data.section_mode_argmax.value_counts())

In [9]:
# transit = data.loc[data.section_mode_argmax == 'transit', :].copy()
# transit['section_duration_argmax'] /= 60.

# transit['mph'] = transit['section_distance_argmax']/transit['section_duration_argmax']

# display(transit[['section_duration_argmax', 'section_distance_argmax', 'mph']].describe())

In [10]:
# import plotly.express as px

# sp = data.loc[data.section_mode_argmax.isin(['car', 'transit', 'walking']), :]
# fig = px.line(sp, y='section_distance_argmax', color='section_mode_argmax')
# fig.show()

In [11]:
# Close the figure above.
# plt.close()

In [12]:
def get_duration_estimate(df: pd.DataFrame, dset: SPLIT, model_dict: dict):
    """Fit or evaluate per-mode duration regressions and add ``tt_*`` columns.

    One ``LinearRegression`` per value of ``section_mode_argmax`` predicts
    ``section_duration_argmax`` from ``section_distance_argmax``, ``age`` and,
    when the column exists, ``mph``. The predicted duration is multiplied by
    each ``av_<mode>`` column to produce the matching ``tt_<mode>`` column.

    Parameters
    ----------
    df:
        Frame with the columns named above plus one ``av_<mode>`` column per
        mode. It is modified in place (the ``tt_*`` columns are added) and
        also returned.
    dset:
        ``SPLIT.TRAIN`` fits a model for every mode that is not yet in
        ``model_dict``; ``SPLIT.TEST`` only applies existing models.
    model_dict:
        Mapping ``mode -> {'model': fitted regressor}``. ``None`` is allowed
        for training only, in which case a new dict is created.

    Returns
    -------
    tuple of (model_dict, df).

    Raises
    ------
    AttributeError
        If ``dset`` is ``SPLIT.TEST`` and ``model_dict`` is ``None``.
    KeyError
        If ``dset`` is ``SPLIT.TEST`` and ``df`` contains a mode without a
        trained model.
    """
    X_features = ['section_distance_argmax', 'age']

    # NOTE(review): a commented-out cell in this notebook computes mph as
    # distance / duration. If the 'mph' column is derived the same way, it
    # leaks the regression target into the features - confirm.
    if 'mph' in df.columns:
        X_features += ['mph']

    if model_dict is None:
        if dset == SPLIT.TEST:
            raise AttributeError("Expected model dict for testing.")
        if dset == SPLIT.TRAIN:
            model_dict = dict()

    section_modes = df.section_mode_argmax.unique()

    if dset == SPLIT.TEST:
        # Fail up front with the offending modes instead of part-way through.
        missing = [mode for mode in section_modes if mode not in model_dict]
        if missing:
            raise KeyError(f"No duration model was trained for section mode(s): {missing}")

    # Float buffer for the predicted duration of every row.
    predicted = np.zeros(len(df), dtype=float)

    for section_mode in section_modes:
        mask = (df.section_mode_argmax == section_mode).to_numpy()
        X = df.loc[mask, X_features]
        y_true = df.loc[mask, 'section_duration_argmax'].to_numpy()

        report_as = None
        if dset == SPLIT.TRAIN and section_mode not in model_dict:
            model = LinearRegression(fit_intercept=True)
            model.fit(X, y_true)
            model_dict[section_mode] = {'model': model}
            report_as = "Train"
        elif dset == SPLIT.TEST:
            report_as = "Test"

        # Predict once; the result feeds both the R2 report and the tt_* columns.
        y_pred = model_dict[section_mode]['model'].predict(X)
        if report_as is not None:
            r2 = r2_score(y_pred=y_pred, y_true=y_true)
            print(f"{report_as} R2 for {section_mode}: {r2}")

        predicted[mask] = y_pred

    # Create the new columns for the duration.
    new_columns = ['p_micro','no_trip','s_car','transit','car','s_micro','ridehail','walk','unknown']
    for c in new_columns:
        df['tt_' + c] = df['av_' + c] * predicted

    return model_dict, df

In [13]:
# Now, we split the data (either inter-user or intra-user split)
# Only one of the three calls below is active at a time; the intra-user
# split is the one currently in use.

# train_data, test_data = get_train_test_splits(data=data, how=SPLIT_TYPE.INTER_USER, shuffle=True)

train_data, test_data = get_train_test_splits(data=data, how=SPLIT_TYPE.INTRA_USER, shuffle=True)

# train_data, test_data = get_train_test_splits(data=data, how=SPLIT_TYPE.TARGET, shuffle=True)

In [14]:
print(train_data.columns)

Index(['user_id', '_id', 'original_user_id', 'cleaned_trip', 'Mode_confirm',
       'start_fmt_time', 'start:year', 'start:month', 'start:day',
       'start:hour', 'start:DOW', 'end_fmt_time', 'end:year', 'end:month',
       'end:day', 'end:hour', 'end:DOW', 'available_modes', 'birth_year',
       'income_category', 'n_motor_vehicles', 'n_residence_members',
       'n_residents_u18', 'gender', 'is_student', 'n_residents_with_license',
       'duration', 'distance_miles', 'start_loc', 'end_loc', 'section_modes',
       'section_distances', 'start:n_days_in_month', 'end:n_days_in_month',
       'age', 'is_overnight_trip', 'n_working_residents', 'is_male',
       'start_lat', 'start_lng', 'end_lat', 'end_lng', 'start:sin_HOD',
       'start:sin_DOM', 'start:sin_MOY', 'start:cos_HOD', 'start:cos_DOM',
       'start:cos_MOY', 'end:sin_HOD', 'end:sin_DOM', 'end:sin_MOY',
       'end:cos_HOD', 'end:cos_DOM', 'end:cos_MOY', 'section_durations',
       'section_locations_argmax', 'temperature_

In [14]:
# Fit the per-mode duration models on the training fold, then reuse the
# fitted models (params) on the test fold.
params, train_data = get_duration_estimate(df=train_data, dset=SPLIT.TRAIN, model_dict=None)
print("-" * 10)
_, test_data = get_duration_estimate(df=test_data, dset=SPLIT.TEST, model_dict=params)

Train R2 for car: 0.9107819633844028
Train R2 for bicycling: 0.9381499933467025
Train R2 for walking: 0.7840120837242898
Train R2 for no_sensed: 0.838164213315293
Train R2 for transit: 0.9167190695089265
----------
Test R2 for car: 0.9115083509175145
Test R2 for walking: 0.7836715824022498
Test R2 for no_sensed: 0.8457638150514823
Test R2 for bicycling: 0.94215202813422
Test R2 for transit: 0.9130004787209818


In [15]:
train_data.shape, test_data.shape

((34064, 97), (8517, 97))

In [16]:
# Some helper functions that will help ease redundancy in the code.

def drop_columns(df: pd.DataFrame):
    """Return a copy of ``df`` without the columns listed below.

    The input frame is left untouched. Every listed column must be present;
    pandas raises ``KeyError`` otherwise.
    """
    unused = [
        # Time columns.
        'start_fmt_time', 'start:year', 'start:month', 'start:day',
        'start:hour', 'end_fmt_time', 'end:year',
        'end:month', 'end:day', 'end:hour', 'end:n_days_in_month',
        'start:sin_DOM', 'start:sin_MOY', 'start:cos_MOY', 'start:cos_DOM',
        'end:sin_DOM', 'end:sin_MOY', 'end:cos_DOM', 'end:cos_MOY', 'start:n_days_in_month',
        # User columns ('user_id' is deliberately kept).
        '_id', 'original_user_id', 'gender', 'birth_year',
        # Trip columns (the start/end coordinates are deliberately kept).
        'cleaned_trip', 'Mode_confirm', 'available_modes', 'duration', 'start_loc',
        'end_loc', 'section_modes', 'section_distances', 'section_durations',
        'section_locations_argmax', 'section_mode_argmax', 'section_coordinates_argmax',
    ]

    trimmed = df.drop(columns=unused)
    return trimmed


def scale_values(df: pd.DataFrame, split: SPLIT, scalers=None):
    """Standardise the cost, time and distance columns of ``df``.

    Columns are grouped by name: ``'cost_'`` marks a cost column, ``'tt_'``
    or ``'duration'`` a time column and ``'distance'`` a distance column.
    Each group gets its own ``StandardScaler``.

    Parameters
    ----------
    df:
        Frame to scale. It is not modified; a new frame is returned.
    split:
        ``SPLIT.TRAIN`` fits new scalers (``scalers`` must be ``None``);
        ``SPLIT.TEST`` applies the scalers passed in.
    scalers:
        ``(cost_scaler, tt_scaler, dist_scaler)`` as returned by a previous
        training call. Required for ``SPLIT.TEST``.

    Returns
    -------
    tuple of (scaled frame, (cost_scaler, tt_scaler, dist_scaler)). In the
    returned frame the unscaled columns come first, followed by the scaled
    cost, time and distance columns.

    Raises
    ------
    NotImplementedError
        For any other combination of ``split`` and ``scalers``.
    """
    costs = df[[c for c in df.columns if 'cost_' in c]]
    times = df[[c for c in df.columns if 'tt_' in c or 'duration' in c]]
    distances = df[[c for c in df.columns if 'distance' in c]]
    groups = (costs, times, distances)

    print(
        "Cost columns to be scaled: ", costs.columns,"\nTime columns to be scaled: ", times.columns, \
        "\nDistance columns to be scaled: ", distances.columns
    )

    if split == SPLIT.TRAIN and scalers is None:
        cost_scaler, tt_scaler, dist_scaler = (
            StandardScaler().fit(group) for group in groups
        )
    elif split == SPLIT.TEST and scalers is not None:
        cost_scaler, tt_scaler, dist_scaler = scalers
    else:
        raise NotImplementedError(
            "Unknown split: expected SPLIT.TRAIN without scalers or SPLIT.TEST with scalers"
        )

    fitted = (cost_scaler, tt_scaler, dist_scaler)
    scaled = [
        pd.DataFrame(scaler.transform(group), columns=group.columns, index=group.index)
        for scaler, group in zip(fitted, groups)
    ]

    # Drop the original columns from a copy so the caller's frame stays intact.
    scaled_names = [name for group in groups for name in group.columns]
    unscaled = df.drop(columns=scaled_names)

    # All pieces share df's index, so a column-wise concat lines the rows up
    # one-to-one even when index labels repeat.
    result = pd.concat([unscaled] + scaled, axis=1)

    return result, fitted

In [17]:
# First, drop columns.
# drop_columns returns a new frame, hence the reassignment.

train_data = drop_columns(train_data)

# Scale cost.
# Scaling is currently switched off; the features are used unscaled.
# train_data, scalers = scale_values(train_data, SPLIT.TRAIN, None)

In [18]:
# Apply the same column removal to the test fold.
test_data = drop_columns(test_data)

# Scale cost.
# Switched off together with the training-side scaling.
# test_data, _ = scale_values(test_data, SPLIT.TEST, scalers)

In [19]:
train_data.columns

Index(['user_id', 'start:DOW', 'end:DOW', 'income_category',
       'n_motor_vehicles', 'n_residence_members', 'n_residents_u18',
       'is_student', 'n_residents_with_license', 'distance_miles', 'age',
       'is_overnight_trip', 'n_working_residents', 'is_male', 'start_lat',
       'start_lng', 'end_lat', 'end_lng', 'start:sin_HOD', 'start:cos_HOD',
       'end:sin_HOD', 'end:cos_HOD', 'temperature_2m (°F)',
       'relative_humidity_2m (%)', 'dew_point_2m (°F)', 'rain (inch)',
       'snowfall (inch)', 'cloud_cover (%)', 'wind_speed_10m (mp/h)',
       'wind_gusts_10m (mp/h)', 'section_distance_argmax',
       'section_duration_argmax', 'mph', 'chosen', 'av_car', 'av_s_car',
       'av_no_trip', 'av_walk', 'av_transit', 'av_s_micro', 'av_p_micro',
       'av_ridehail', 'av_unknown', 'cost_p_micro', 'cost_no_trip',
       'cost_s_car', 'cost_transit', 'cost_car', 'cost_s_micro',
       'cost_ridehail', 'cost_walk', 'cost_unknown', 'tt_p_micro',
       'tt_no_trip', 'tt_s_car', 'tt_t

In [20]:
len(train_data.chosen.unique())

9

In [21]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from pprint import pprint
from sklearn.inspection import permutation_importance
from time import perf_counter

## Random Forest classifier

In [22]:
# True: pick the random forest via grid search; False: fit one forest with
# fixed hyper-parameters.
CV = True

In [23]:
from sklearn.ensemble import RandomForestClassifier

# exp question - compute sample weights using user_id.

# The label, the raw start/end coordinates and the user id are withheld from
# the model inputs of both folds.
rf_train, rf_test = (
    frame.drop(columns=['chosen', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'user_id'])
    for frame in (train_data, test_data)
)

if not CV:
    # Single forest with fixed hyper-parameters.
    best_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_split=3,
        bootstrap=True,
        class_weight='balanced_subsample',
        random_state=SEED,
        n_jobs=-1
    ).fit(rf_train, train_data.chosen.values.ravel())
else:
    # Grid search over bootstrapped forests, scored by weighted F1 on three
    # stratified folds.
    model = RandomForestClassifier(random_state=SEED)

    param_set2 = {
        'n_estimators': [150, 200, 250],
        'min_samples_split': [2, 3],
        'class_weight': ['balanced_subsample'],
        # None uses every feature at each split; 'sqrt' samples a subset.
        'max_features': [None, 'sqrt'],
        'bootstrap': [True]
    }

    cv_set2 = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

    clf_set2 = GridSearchCV(
        estimator=model,
        param_grid=param_set2,
        cv=cv_set2,
        n_jobs=-1,
        scoring='f1_weighted',
        verbose=1
    )

    start = perf_counter()
    clf_set2.fit(rf_train, train_data.chosen.values.ravel())
    # Wall-clock duration of the search, in minutes.
    time_req = (perf_counter() - start)/60.

    best_model = clf_set2.best_estimator_

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [24]:
# Weighted F1 of the selected forest on the rows it was fitted on.
tr_f1_set2 = f1_score(
    train_data.chosen.values,
    best_model.predict(rf_train),
    average='weighted'
)

In [25]:
# Weighted F1 of the selected forest on the held-out rows.
te_f1_set2 = f1_score(
    test_data.chosen.values,
    best_model.predict(rf_test),
    average='weighted'
)

In [26]:
# Report train and test weighted F1 of the bootstrapped forest side by side.
# print(f"[NON BOOTSTRAPPED] | Train F1: {tr_f1_set1}, Test F1: {te_f1_set1}")
print(f"[BOOTSTRAPPED] | Train F1: {tr_f1_set2}, Test F1: {te_f1_set2}")

[BOOTSTRAPPED] | Train F1: 1.0, Test F1: 0.7344136324607913


In [27]:
# Impurity-based feature importances of the forest, largest first.

pprint(
    sorted(
        zip(best_model.feature_names_in_, best_model.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True
    )
)

[('age', 0.13052971325233453),
 ('income_category', 0.05563529282438796),
 ('n_motor_vehicles', 0.05117069898186447),
 ('mph', 0.04901096852140811),
 ('dew_point_2m (°F)', 0.046445523590839706),
 ('temperature_2m (°F)', 0.04332851959366878),
 ('n_residents_u18', 0.04079428459138862),
 ('cost_transit', 0.03599739479849181),
 ('distance_miles', 0.03410054518532979),
 ('wind_gusts_10m (mp/h)', 0.02664916084517161),
 ('relative_humidity_2m (%)', 0.026557460924608728),
 ('wind_speed_10m (mp/h)', 0.02621237413168378),
 ('n_residence_members', 0.024503799663918274),
 ('section_duration_argmax', 0.024244750804804545),
 ('n_working_residents', 0.023773443113737733),
 ('n_residents_with_license', 0.023628029234229537),
 ('cloud_cover (%)', 0.02246590194480327),
 ('cost_s_micro', 0.022458268111427003),
 ('tt_p_micro', 0.02038772534963909),
 ('start:cos_HOD', 0.019229222189042564),
 ('is_male', 0.01922412856640586),
 ('end:cos_HOD', 0.019203897735717918),
 ('section_distance_argmax', 0.01525389631

In [15]:
# Permutation importance of each rf_test column on the held-out fold,
# measured as the change in weighted F1 over 5 shuffles per feature.
# NOTE(review): the NameError recorded under this cell presumably comes from a
# run in which the cell importing permutation_importance had not been
# executed yet - confirm by re-running the notebook top to bottom.
importance = permutation_importance(
    best_model,
    rf_test,
    test_data.chosen.values,
    n_repeats=5,
    random_state=SEED,
    n_jobs=-1,
    scoring='f1_weighted'
)

NameError: name 'permutation_importance' is not defined

In [None]:
# Tabulate the 20 largest permutation importances.
# The scores hold one entry per column of rf_test (the frame handed to
# permutation_importance), so the names are taken from rf_test as well.
# test_data still carries the label, coordinate and user_id columns and
# would not line up with the scores.
pd.DataFrame(
    {
        'feature names': rf_test.columns,
        'imp_mean': importance.importances_mean, 
        'imp_std': importance.importances_std
    }
).sort_values(by=['imp_mean'], axis='rows', ascending=False).head(20)

In [None]:
# Pair the forest's test-fold predictions with the true labels; the
# classification report further down reads from this frame.
# fig, ax = plt.subplots(nrows=1, ncols=2)
y_pred = best_model.predict(rf_test)
pred_df = pd.DataFrame(
    {
        'y_pred': y_pred.ravel(),
        'y_true': test_data.chosen.values.ravel()
    }
)

# pred_df.y_pred.hist(ax=ax[0])
# pred_df.y_true.hist(ax=ax[1])

# ax[0].set(
#     xlabel="Label",
#     ylabel="Count",
#     title="Prediction"
# )

# ax[1].set(
#     xlabel="Label",
#     ylabel="Count",
#     title="GT"
# )

# plt.tight_layout()
# plt.show()

In [None]:
# Confusion matrix of best_model on the held-out fold, drawn on a 7x7 inch axis.
fig, ax = plt.subplots(figsize=(7, 7))
cm = ConfusionMatrixDisplay.from_estimator(
    best_model,
    X=rf_test,
    y=test_data[['chosen']],
    ax=ax
)
# Tick labels are left as the numeric class codes; the mode names are disabled.
# ax.set_xticklabels(TARGETS, rotation=45)
# ax.set_yticklabels(TARGETS)
fig.tight_layout()
plt.show()

In [None]:
print(classification_report(y_true=pred_df.y_true, y_pred=pred_df.y_pred))

## XGBoost

In [None]:
# from sklearn.utils.class_weight import compute_sample_weight

# sample_weights = compute_sample_weight(class_weight='balanced', y=train_data.user_id.values.ravel())

In [None]:
from xgboost import XGBClassifier

# Training labels are shifted down by one; y_test is left unshifted, and the
# prediction cell adds the one back before comparing.
# NOTE(review): presumably `chosen` is coded 1..9, so the shifted labels run
# 0..8 and match num_class=9 - confirm.
y_train = train_data.chosen.values.ravel() - 1
y_test = test_data.chosen.values.ravel()

# weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_pred), y_pred)

# NOTE(review): no random_state is passed here, whereas the random forest is
# seeded with SEED - confirm whether that is intended.
xgm = XGBClassifier(
    n_estimators=250,
    max_depth=None,
    tree_method='hist',
    objective='multi:softmax',
    num_class=9
).fit(rf_train, y_train)

In [None]:
# Undo the -1 shift applied to the training labels so the predictions are on
# the same scale as y_test.
preds = xgm.predict(rf_test) + 1

print(classification_report(y_true=y_test, y_pred=preds))

In [None]:
# import pickle

# # RF_RM.pkl = 0.8625 on test.
# # RF_RM_1.pkl = 0.77 on test.
# with open('../models/RF_RM_1.pkl', 'wb') as f:
#     f.write(pickle.dumps(model))

## TODO:


- Explain why location might not be a good feature to add (plot start and end on map and explain how model might just overfit to the raw coordinates)
- Merge `unknown` and `no_trip` into one category and validate against models trained on (a) separate labels (b) dropped labels
- Explore more of the abnormal `walking` trips