### Some important points to remember:



In [3]:
# import warnings

# warnings.simplefilter('ignore', Warning)

In [27]:
import pandas as pd
import numpy as np
import random
import os
import ast
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, f1_score, log_loss
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from enum import Enum
from scipy.stats import uniform
from typing import List

pd.set_option('display.max_columns', None) 

In [32]:
SEED = 13210

np.random.seed(SEED)
random.seed(SEED)

SimilarityMetric = Enum('SimilarityMetric', ['COSINE', 'EUCLIDEAN', 'KNN', 'KMEANS'])

In [6]:
df = pd.read_csv('../data/ReplacedMode_Fix_02142024.csv')

  df = pd.read_csv('../data/ReplacedMode_Fix_02142024.csv')


In [7]:
print(df.columns.tolist())

['source', 'end_ts', 'end_fmt_time', 'end_loc', 'raw_trip', 'start_ts', 'start_fmt_time', 'start_loc', 'duration', 'distance', 'start_place', 'end_place', 'cleaned_trip', 'inferred_labels', 'inferred_trip', 'expectation', 'confidence_threshold', 'expected_trip', 'user_input', 'start:year', 'start:month', 'start:day', 'start:hour', 'start_local_dt_minute', 'start_local_dt_second', 'start_local_dt_weekday', 'start_local_dt_timezone', 'end:year', 'end:month', 'end:day', 'end:hour', 'end_local_dt_minute', 'end_local_dt_second', 'end_local_dt_weekday', 'end_local_dt_timezone', '_id', 'user_id', 'metadata_write_ts', 'additions', 'mode_confirm', 'purpose_confirm', 'distance_miles', 'Mode_confirm', 'Trip_purpose', 'original_user_id', 'program', 'opcode', 'Timestamp', 'birth_year', 'primary_job_commute_time', 'income_category', 'n_residence_members', 'n_residents_u18', 'n_residents_with_license', 'n_motor_vehicles', 'available_modes', 'age', 'gender_Man', 'gender_Man;Nonbinary/genderqueer/gende

### We want to experiment with two types of models:


1. have one row per user, so that when predicting modes for a new user, we pick the "similar user" or users and determine the replaced mode
    - In this, the traditional approach would only use demographics for the user features, we may experiment with some summaries of the trip data that will function as some level of "fingerprint" for the user. Ideally we would be able to show that this performs better than demographics alone
    - Note also that the original method that you had outlined where the training set is a list of trips (O()) is a third approach which we will be comparing these two against

In [8]:
def encode_availability(df: pd.DataFrame):
    
    # Borrowed directly from the cost_time_avl_preprocessing notebook.
    available = {
        'Bicycle': 'p_micro',
        'Do not have vehicle': 'unknown',
        'Get a ride from a friend or family member': 's_car',
        'None': 'no_trip',
        'Public transportation (bus, subway, light rail, etc.)': 'transit',
        'Rental car (including Zipcar/ Car2Go)': 'car',
        'Shared bicycle or scooter': 's_micro',
        'Skateboard': 'p_micro',
        'Taxi (regular taxi, Uber, Lyft, etc)': 'ridehail',
        'Walk/roll': 'walk',
        'Prefer not to say': 'unknown'
    }
    
    newcols = list(available.values())
    
    # Start by creating dummy indicators.
    df[newcols] = 0
    
    for i, row in df.iterrows():
        modes = [e.strip() for e in row.available_modes.split(';')]
        mapped_modes = [available[mode] for mode in modes]
        df.loc[i, mapped_modes] = 1
    
    df.drop(columns=['available_modes'], inplace=True)
    df.columns = ['av_' + str(c) if c in newcols else str(c) for c in df.columns]
    
    return df

In [9]:
def get_mode_coverage(df: pd.DataFrame):
    
    coverage_df = df.groupby(['user_id', 'section_mode_argmax']).size().unstack(fill_value=0)
    coverage_df.columns = ['coverage_' + str(c) for c in coverage_df.columns]
    
    # As a preventative measure.
    coverage_df.fillna(0, inplace=True)
    
    # Normalize over rows.
    coverage_df.iloc[:, 1:] = coverage_df.iloc[:, 1:].div(coverage_df.iloc[:, 1:].sum(axis=1), axis=0)
    
    return coverage_df

In [10]:
def get_trip_summaries(df: pd.DataFrame, group_key: str, feature_list: List[str]):
    
    def get_feature_summaries(trip_feature: str, is_ordinal: bool = False):
        
        if not is_ordinal:
            # A mean of 0 is an actual value.
            mean = df.groupby(['user_id', group_key])[trip_feature].mean().unstack(level=-1, fill_value=-1.)
            mean.columns = [f'{trip_feature}_mean_' + str(c) for c in mean.columns]
            
            # Same with percentiles - 0 is an actual value.
            median = df.groupby(['user_id', group_key])[trip_feature].median().unstack(level=-1, fill_value=-1.)
            median.columns = [f'{trip_feature}_median_' + str(c) for c in median.columns]
            
            iqr_temp = df.groupby(['user_id', group_key])[trip_feature].quantile([0.25, 0.75]).unstack(level=-1)
            iqr = (iqr_temp[0.75] - iqr_temp[0.25]).unstack(level=-1)
            iqr.fillna(-1., inplace=True)
            iqr.columns = [f'{trip_feature}_iqr_' + str(c) for c in iqr.columns]

            # Now merge.
            merged = mean.copy()
            merged = merged.merge(right=median, left_index=True, right_index=True)
            merged = merged.merge(right=iqr, left_index=True, right_index=True)

            return merged
        
        # 0 is OK to indicate NaN values.
        f_mode = df.groupby(['user_id', group_key])[trip_feature].apply(
            lambda x: x.value_counts().idxmax()
        ).unstack(fill_value=0.)
        
        f_mode.columns = [f'{trip_feature}_mode_' + str(c) for c in f_mode.columns]
        
        return f_mode
    
    # This will be the dataframe that all subsequent features will join to.
    feature_df = None
    
    for ix, feature in enumerate(feature_list):
        is_ordinal = feature == 'start:hour' or feature == 'end:hour'
        if ix == 0:
            feature_df = get_feature_summaries(feature, is_ordinal)
        else:
            next_feature_df = get_feature_summaries(feature, is_ordinal)
            feature_df = feature_df.merge(right=next_feature_df, left_index=True, right_index=True)
    
    return feature_df

In [11]:
def get_demographic_data(df: pd.DataFrame, **trip_kwargs):
    
    '''
    A method that returns a U x (D + t) matrix, where U = number of users,
    D = number of demographic features, t (optional) = number of trip summary features.
    
    When use_trip_summaries=True, the 'available_modes' column is dropped in favor of
    the already-preprocessed av_ columns. This is because we want to incorporate trip-level
    information into the data. When the argument is False, we want to SOLELY use demographics.
    '''
    
    trip_features_to_use = trip_kwargs.pop('trip_features', None)
    trip_group_key = trip_kwargs.pop('trip_group_key', 'section_mode_argmax')
    
    demographics = [
        'user_id', 'primary_job_commute_time', 'income_category', 'n_residence_members', 'n_residents_u18', 
        'n_residents_with_license', 'n_motor_vehicles', 'available_modes', 'age', 'gender_Man', 
        'gender_Man;Nonbinary/genderqueer/genderfluid', 'gender_Nonbinary/genderqueer/genderfluid', 
        'gender_Prefer not to say', 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid', 
        'has_drivers_license_No', 'has_drivers_license_Prefer not to say', 'has_drivers_license_Yes', 
        'has_multiple_jobs_No', 'has_multiple_jobs_Prefer not to say', 'has_multiple_jobs_Yes', 
        "highest_education_Bachelor's degree", 'highest_education_Graduate degree or professional degree', 
        'highest_education_High school graduate or GED', 'highest_education_Less than a high school graduate', 
        'highest_education_Prefer not to say', 'highest_education_Some college or associates degree', 
        'primary_job_type_Full-time', 'primary_job_type_Part-time', 'primary_job_type_Prefer not to say', 
        'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial', 
        'primary_job_description_Education', 'primary_job_description_Food service', 
        'primary_job_description_Manufacturing, construction, maintenance, or farming', 
        'primary_job_description_Medical/healthcare', 'primary_job_description_Other', 
        'primary_job_description_Professional, managerial, or technical', 
        'primary_job_description_Sales or service', 'primary_job_commute_mode_Active transport', 
        'primary_job_commute_mode_Car transport', 'primary_job_commute_mode_Hybrid', 
        'primary_job_commute_mode_Public transport', 'primary_job_commute_mode_Unknown', 
        'primary_job_commute_mode_WFH', 'is_overnight_trip', 'n_working_residents'
    ]
    
    # Retain only the first instance of each user and subset the columns.
    filtered = df.groupby('user_id').first().reset_index(drop=False)[demographics]
    
    # Get the targets.
    targets = df.groupby('user_id')['target'].apply(lambda x: x.value_counts().idxmax()).reset_index(drop=False)
    
    filtered = filtered.merge(right=targets, left_on='user_id', right_on='user_id')
    
    if not trip_features_to_use or len(trip_features_to_use) == 0:
        # Use the available modes as indicators.
        return encode_availability(filtered)
    
    # -----------------------------------------------------------
    # Reaching here means that we need to include trip summaries
    # -----------------------------------------------------------
    
    # If trip summaries are to be used, then re-use the preprocessed availability features.
    availability = df[['user_id'] + [c for c in df.columns if 'av_' in c]]
    availability = availability.groupby('user_id').first()
    
    # For every user, generate the global trip-level summaries.
    global_aggs = df.groupby('user_id').agg({'duration': 'mean', 'distance': 'mean'})
    
    # coverage.
    coverage = get_mode_coverage(df)
    
    # Trip-level features.
    trip_features = get_trip_summaries(
        df=df, 
        group_key=trip_group_key, 
        feature_list=trip_features_to_use
    )
    
    targets = df.groupby('user_id')['target'].apply(lambda x: x.value_counts().idxmax())
    
    trip_features = trip_features.merge(right=coverage, left_index=True, right_index=True)
    trip_features = trip_features.merge(right=global_aggs, left_index=True, right_index=True)
    
    # Finally, join with availability indicators and targets.
    trip_features = trip_features.merge(right=availability, left_index=True, right_on='user_id')
    trip_features = trip_features.merge(right=targets, left_index=True, right_index=True)
    
    return trip_features.reset_index(drop=False)

## Experiment 1: Only demographics

In [12]:
demo_df = get_demographic_data(df)

In [13]:
# No stratification, pure random.
train, test = train_test_split(demo_df, test_size=0.2, random_state=SEED)

In [14]:
print(train.shape[0], test.shape[0])

132 33


In [40]:
def evaluate_using_similarity(test_df, train_df, metric=SimilarityMetric.COSINE, **metric_kwargs):
    
    '''
    This method treats each user row as a 'fingerprint' (embedding vector). We assume that we
    have no idea about the test set labels. To find which replaced mode is most likely for the test
    users, we compute the cosine similarity of each test user against the users in the training set.
    For the most similar user, we use their target as a proxy for the test user's replaced mode.
    This operates on the following intuition: If User A and User B are similar, then their replaced
    modes are also similar.
    '''
    
    tr_targets = train_df.target.values
    tr = train_df.drop(columns=['target', 'user_id'], inplace=False).reset_index(drop=True, inplace=False)
    
    te_targets = test_df.target.values
    te = test_df.drop(columns=['target', 'user_id'], inplace=False).reset_index(drop=True, inplace=False)
    
    if metric == SimilarityMetric.COSINE:
        # Use cosine similarity to determine which element in the train set this user is closest to.
        # Offset the columns from the second entry to exclude the user_id column.
        # Returns a (n_te, n_tr) matrix.
        sim = cosine_similarity(te.values, tr.values)
        
        # Compute the argmax across the train set.
        argmax = np.argmax(sim, axis=1)

        # Index into the training targets to retrieve predicted label.
        y_test_pred = tr_targets[argmax]
        
    elif metric == SimilarityMetric.EUCLIDEAN:
        
        # Here, we choose the embedding with the smallest L2 distance.
        distances = euclidean_distances(te.values, tr.values)
        
        # We choose argmin
        argmin = np.argmin(distances, axis=1)
        
        # Index into the targets.
        y_test_pred = tr_targets[argmin]
    
    elif metric == SimilarityMetric.KNN:
        
        # Build the KNN classifier. By default, let it be 3.
        knn = KNeighborsClassifier(
            n_neighbors=metric_kwargs.pop('n_neighbors', 3),
            weights='distance',
            metric=metric_kwargs.pop('knn_metric', 'cosine'),
            n_jobs=os.cpu_count()
        )
        
        # Fit the data to the KNN model
        knn.fit(tr, tr_targets)
        
        y_test_pred = knn.predict(te)
    
    elif metric == SimilarityMetric.KMEANS:
        
        # Build the model.
        kmeans = KMeans(
            n_clusters=metric_kwargs.pop('n_clusters', 8),
            max_iter=metric_kwargs.pop('max_iter', 300),
            n_init='auto'
        )
        
        # Fit the clustering model
        kmeans.fit(tr)
        
        # Construct the auxiliary df and merge with the training set.
        label_df = pd.DataFrame({'label': kmeans.labels_, 'target': tr_targets}, index=tr.index)
        
        # Now, perform an inference on the test set.
        predicted_labels = kmeans.predict(te)
        
        y_test_pred = []
        for prediction in predicted_labels:
            most_likely = label_df.loc[label_df.label == prediction, 'target'].value_counts().idxmax()
            y_test_pred.append(most_likely)
        
    else:
        raise NotImplementedError("Unknown similarity metric")
    
    
    f1 = f1_score(y_true=te_targets, y_pred=y_test_pred, average='weighted')
    print(f"Test F1 score using {metric.name} = {f1}")

In [29]:
evaluate_using_similarity(test, train, SimilarityMetric.COSINE)

Test F1 score using COSINE = 0.32806324110671936


In [30]:
evaluate_using_similarity(test, train, SimilarityMetric.EUCLIDEAN)

Test F1 score using EUCLIDEAN = 0.2742577288031834


In [31]:
evaluate_using_similarity(test, train, SimilarityMetric.KNN)

Test F1 score using KNN = 0.35950413223140487


In [48]:
evaluate_using_similarity(test, train, SimilarityMetric.KMEANS, n_clusters=3)

Test F1 score using KMEANS = 0.33347902097902093


Not bad - using just a simple random split gives us the following results:

$F1_{cosine} = 0.32$, $F1_{euclidean} = 0.27$, $F1_{KNN} = 0.36$, $F1_{kmeans} = 0.33$

In [23]:
def custom_nll_scorer(clf, X, y):
    y_pred = clf.predict_proba(X)
    return -log_loss(y_true=y, y_pred=y_pred, labels=sorted(np.unique(y)))

In [24]:
def estimate_using_model(train, test):
    
    # Define the train-val splitter.
    splitter = StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED)
    
    params = {
        'n_estimators': np.arange(100, 1001, 50),
        'max_depth': [i for i in range(5, 101, 5)],
        'ccp_alpha': np.linspace(0, 1, 10),
        'class_weight': ['balanced', 'balanced_subsample', None],
        'min_samples_split': np.arange(2, 25, 2),
        'min_samples_leaf': np.arange(1, 25)
    }
    
    rf = RandomForestClassifier(random_state=SEED)
    
    # Search over hparams to minimize negative log likelihood.  
    clf = RandomizedSearchCV(
        rf, params, n_iter=500, scoring=custom_nll_scorer, 
        n_jobs=os.cpu_count(), cv=splitter, random_state=SEED,
        verbose=0
    )
    
    X_tr = train.drop(columns=['user_id', 'target'])
    y_tr = train.target.values.ravel()
    
    scorer = clf.fit(X_tr, y_tr)
    
    best_model = scorer.best_estimator_
    
    print(f"Best val score = {scorer.best_score_}")
    
    X_te = test.drop(columns=['user_id', 'target'])
    
    # Use the best model to compute F1 on the test set.
    test_f1 = f1_score(y_true=test.target.values, y_pred=best_model.predict(X_te), average='weighted')
    
    print(f"Test F1 = {test_f1}")

In [20]:
estimate_using_model(train, test)

Best val score = -1.7985827701200345
Test F1 = 0.32794612794612793


Interesting! The model is slightly on par with the cosine similarity, but actually worse than the KNN model!

To recap, $F1_{cosine} = 0.32806$, $F1_{euclidean} = 0.27$, $F1_{KNN} = 0.36$, $F1_{kmeans} = 0.33$, $F1_{RF} = 0.328$

## Experiment 2: Demographics with trip summaries

Now that we've performed experiments with solely demographic data, let's expand the feature set by including 
trip summary statistics. We would like this approach to do better than the aforementioned baselines.

In [49]:
demo_plus_trips = get_demographic_data(
    df, 
    trip_features=['mph', 'section_duration_argmax', 'section_distance_argmax', 'start:hour', 'end:hour']
)

In [50]:
train, test = train_test_split(demo_plus_trips, test_size=0.2, random_state=SEED)

print(train.shape[0], test.shape[0])

132 33


In [51]:
evaluate_using_similarity(test, train, SimilarityMetric.COSINE)

Test F1 score using COSINE = 0.3686868686868687


In [52]:
evaluate_using_similarity(test, train, SimilarityMetric.EUCLIDEAN)

Test F1 score using EUCLIDEAN = 0.3338758428272495


In [53]:
evaluate_using_similarity(test, train, SimilarityMetric.KNN)

Test F1 score using KNN = 0.30201171377641967


In [54]:
evaluate_using_similarity(test, train, SimilarityMetric.KMEANS, n_clusters=3)

Test F1 score using KMEANS = 0.36007130124777187


Great! Marked improvement from last time. Simply incorporating trip-level features seems to increase the
accuracy of finding similar users. Now, $F1_{cosine} = 0.37$, $F1_{euclidean} = 0.33$, $F1_{knn} = 0.3$, $F1_{kmeans} = 0.36$

In [25]:
# Now, we try with the model
estimate_using_model(train, test)

Best val score = -1.8201049789158268
Test F1 = 0.42158426368952684


Great! Compared to the previous model, we see definite improvements! I'm sure we can squeeze some more juice out of the models using fancy optimization, but as a baseline, these are good enough.


So, to recap:
$F1_{cosine} = 0.37$, $F1_{euclidean} = 0.33$, $F1_{knn} = 0.3$, $F1_{kmeans} = 0.36$, $F1_{RF} = 0.4215$

### Next objectives:

1. Try grouping by other features, such as duration or distance
2. For similarity search, use other techniques such as clustering or KNN

# Multi-level modeling

In this approach, we want to piece together the similarity search and modeling processes. Here's a rough sketch of how it should be implemented:

1. For every user in the training set, build a model using their entire trip history.
2. Consolidate these user-level models in data structure, preferably a dictionary.
3. Now, when we want to perform inference on a new user with no prior trips, we use the similarity search to get the user ID in the training set who is the most similar to the user in question.
4. We retrieve the model for this corresponding user and perform an inference. The hypothesis is that since the two users are similar, their trip substitution patterns are also similar.

In [None]:
# def construct_model_dictionary(train: pd.DataFrame):
    
#     def train_on_user(user_id: str):
#         '''
#         Given the training set and the user ID to query, filter the dataset and
#         retain only the relevant trips. Then, create folds and optimize a model for this user.
#         Return the trained model instance.
#         '''
        
#         user_data = train.loc[train.user_id == user_id, :].reset_index(drop=True)
        
        