## Some important points to remember:

### We want to experiment with two types of models:


1. have one row per user, so that when predicting modes for a new user, we pick the "similar user" or users and determine the replaced mode
    - Here, the traditional approach would use only demographics as the user features. We may also experiment with summaries of the trip data that function as a kind of "fingerprint" for the user. Ideally, we would be able to show that this performs better than demographics alone.
    - Note also that the original method that you had outlined, where the training set is a list of trips (i.e., one row per trip rather than one row per user), is a third approach against which we will compare these two.

Target order:

```
['p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown']
```

In [None]:
import pandas as pd
import numpy as np
import random
import os
import pickle
import ast
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, f1_score, log_loss
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from enum import Enum
from scipy.stats import uniform
from typing import List, Dict, Union
from pandas.api.types import is_numeric_dtype
from sklearn.manifold import TSNE
from multiprocessing import cpu_count

pd.set_option('display.max_columns', 100)

In [None]:
# Single seed shared by every source of randomness in this notebook.
SEED = 13210

# Seed both the stdlib and the NumPy global generators.
random.seed(SEED)
np.random.seed(SEED)

# Strategies for matching a test user against the training users.
SimilarityMetric = Enum('SimilarityMetric', 'COSINE EUCLIDEAN KNN KMEANS')

# Grouping flavours: GROUPBY or CUT.
GroupType = Enum('GroupType', 'GROUPBY CUT')

In [None]:
# (csv path, short dataset key) pairs. The entry chosen via DB_NUMBER supplies
# PATH (element 0) and CURRENT_DB (element 1).
DATA_SOURCE = [
    ('./data/filtered_data/preprocessed_data_Stage_database.csv', 'allceo'),
    ('./data/filtered_data/preprocessed_data_openpath_prod_durham.csv', 'durham'),
    ('./data/filtered_data/preprocessed_data_openpath_prod_mm_masscec.csv', 'masscec'),
    ('./data/filtered_data/preprocessed_data_openpath_prod_ride2own.csv', 'ride2own'),
    ('./data/filtered_data/preprocessed_data_openpath_prod_uprm_nicr.csv', 'nicr')
]

In [None]:
## CHANGE THE DB INDEX HERE.
DB_NUMBER = 0

# Each DATA_SOURCE entry is a (csv path, dataset key) pair.
PATH, CURRENT_DB = DATA_SOURCE[DB_NUMBER]

In [None]:
# Load the preprocessed CSV for the selected dataset.
df = pd.read_csv(PATH)

In [None]:
# Columns we do not need downstream. Any that are absent from this dataset are skipped.
not_needed = ['deprecatedID', 'data.key']

df.drop(columns=not_needed, errors='ignore', inplace=True)

In [None]:
# List the columns that remain after the drop above.
print(df.columns.tolist())

In [None]:
def generate_tsne_plots(df: pd.DataFrame, **kwargs):
    '''
    Embed the feature columns of `df` in two dimensions with t-SNE and draw a
    scatter plot of the embedding, coloured by the `target` column.

    The input is copied first, so the caller's frame is not modified. `user_id`
    (whether it is the index or a column) and `target` are removed before fitting.

    Optional kwargs:
        perplexity: t-SNE perplexity (default 5).
        n_iter: number of optimisation iterations (default 2000).
        metric: distance metric (default 'cosine').

    Returns the array of 2-D coordinates produced by `fit_transform`.
    '''

    frame = df.copy()

    # Important - seaborn treats a numeric hue as continuous, so cast it to a category.
    frame.target = frame.target.astype('category')

    # user_id may arrive as the index or as a column. It is an identifier, not a feature.
    if frame.index.name == 'user_id':
        frame = frame.reset_index(drop=False)
    if 'user_id' in frame.columns:
        frame = frame.drop(columns=['user_id'])

    hue = frame.target.values
    features = frame.drop(columns=['target'])

    # According to the docs, consider choosing a perplexity between 5 and 50.
    embedder = TSNE(
        n_components=2,
        perplexity=kwargs.pop('perplexity', 5),
        n_iter=kwargs.pop('n_iter', 2000),
        metric=kwargs.pop('metric', 'cosine'),
        random_state=SEED,
        n_jobs=os.cpu_count()
    )
    embedding = embedder.fit_transform(features)

    figure, axis = plt.subplots()
    sns.scatterplot(x=embedding[:, 0], y=embedding[:, 1], hue=hue, ax=axis)
    axis.set(xlabel='Embedding dimension 1', ylabel='Embedding dimension 2', title='t-SNE plot for data')
    plt.show()

    return embedding

In [None]:
def get_mode_coverage(df: pd.DataFrame):
    '''
    Compute, for every user, the fraction of their trips made with each mode.

    Parameters:
        df: trip-level frame with (at least) the columns `user_id` and
            `section_mode_argmax`.

    Returns:
        A frame indexed by `user_id` with one `coverage_<mode>` column per distinct
        value of `section_mode_argmax`. Each row sums to 1. A row whose total count
        is zero is returned as all zeros rather than NaN.
    '''

    # Trip counts per (user, mode). Combinations that never occur become 0.
    counts = df.groupby(['user_id', 'section_mode_argmax']).size().unstack(fill_value=0)
    counts.columns = ['coverage_' + str(c) for c in counts.columns]

    # user_id is the index here, so EVERY column is a mode count and all of them
    # must take part in the row normalisation.
    coverage_df = counts.div(counts.sum(axis=1), axis=0)

    # As a preventative measure: a zero row total would yield 0/0 = NaN.
    return coverage_df.fillna(0.)

In [None]:
def get_trip_summaries(df: pd.DataFrame, group_key: str, feature_list: List[str], **kwargs):
    '''
    Build per-user summaries of trip features, broken down by `group_key`.

    For every feature in `feature_list` the trips are grouped by
    (`user_id`, `group_key`) and summarised:
      - ordinal features: the most frequent value per group (`<feature>_mode_<group>`),
        with 0 standing in for missing groups;
      - all other features: mean, median and inter-quartile range per group
        (`<feature>_mean_<group>`, `<feature>_median_<group>`, `<feature>_iqr_<group>`),
        with -1 standing in for missing groups because 0 is a legitimate value.

    Parameters:
        df: trip-level frame containing `user_id`, `group_key` and every feature.
        group_key: column to group by. A numeric column is binned first.
        feature_list: features to summarise. Must not contain `group_key`.

    Optional kwargs:
        n_cuts: number of bins for a numeric `group_key` (default 3).
        use_qcut: bin with `pd.qcut` (roughly equal-sized bins) instead of
            `pd.cut` (equal-width bins). Default False.
        ordinal_features: names of the features to treat as ordinal. Defaults to
            ('start_local_dt_hour', 'end_local_dt_hour').

    Returns:
        A frame indexed by `user_id` holding the merged summaries of all features,
        or None when `feature_list` is empty.
    '''

    def make_grouper(trip_feature: str):
        # Numeric group keys are binned; anything else is grouped on directly.
        if is_numeric_dtype(df[group_key]):
            binner = pd.qcut if use_qcut else pd.cut
            return df.groupby(['user_id', binner(df[group_key], n_cuts)])[trip_feature]
        return df.groupby(['user_id', group_key])[trip_feature]

    def get_feature_summaries(trip_feature: str, is_ordinal: bool = False):

        grouper = make_grouper(trip_feature)

        if is_ordinal:
            # 0 is OK to indicate NaN values.
            f_mode = grouper.apply(
                lambda x: x.value_counts().idxmax()
            ).unstack(fill_value=0.)

            f_mode.columns = [f'{trip_feature}_mode_' + str(c) for c in f_mode.columns]
            f_mode.fillna(0., inplace=True)

            return f_mode

        # A mean of 0 is an actual value, so -1 marks a missing group.
        mean = grouper.mean().unstack(level=-1, fill_value=-1.)
        mean.columns = [f'{trip_feature}_mean_' + str(c) for c in mean.columns]

        # Same with percentiles - 0 is an actual value.
        median = grouper.median().unstack(level=-1, fill_value=-1.)
        median.columns = [f'{trip_feature}_median_' + str(c) for c in median.columns]

        # The first unstack spreads the two quantiles into columns. The second
        # spreads the groups once the IQR has been taken.
        iqr_df = grouper.quantile([0.25, 0.75]).unstack(level=-1)
        iqr = (iqr_df[0.75] - iqr_df[0.25]).unstack(level=-1)
        iqr.fillna(-1., inplace=True)
        iqr.columns = [f'{trip_feature}_iqr_' + str(c) for c in iqr.columns]

        # Now merge.
        merged = mean.copy()
        merged = merged.merge(right=median, left_index=True, right_index=True)
        merged = merged.merge(right=iqr, left_index=True, right_index=True)

        merged.fillna(-1., inplace=True)

        return merged

    assert group_key not in feature_list, "Cannot perform grouping and summarization of the same feature."

    # Optional kwarg for number of cuts for numeric dtype grouping.
    # Default is 3: short, medium, long trip types.
    n_cuts = kwargs.pop('n_cuts', 3)
    use_qcut = kwargs.pop('use_qcut', False)

    # Features summarised by their most frequent value instead of mean/median/IQR.
    ordinal_features = set(
        kwargs.pop('ordinal_features', ('start_local_dt_hour', 'end_local_dt_hour'))
    )

    # This will be the dataframe that all subsequent features will join to.
    feature_df = None

    for feature in feature_list:
        summary = get_feature_summaries(feature, feature in ordinal_features)
        if feature_df is None:
            feature_df = summary
        else:
            feature_df = feature_df.merge(right=summary, left_index=True, right_index=True)

    return feature_df

In [None]:
def get_demographic_data(df: pd.DataFrame, **trip_kwargs):

    '''
    A method that returns a U x (D + t) matrix, where U = number of users,
    D = number of demographic features, t (optional) = number of trip summary features.

    The demographic columns are selected per dataset using the module-level
    CURRENT_DB key. The per-user target is the most frequent `target` value
    across that user's trips.

    Optional kwargs:
        trip_features: list of trip features to summarise. When it is None or empty,
            SOLELY the demographics (plus the target) are returned.
        trip_grouping: column used to group trips for the summaries
            (default 'section_mode_argmax').
        use_qcut: forwarded to get_trip_summaries (default False).

    Returns:
        - Without trip features: a frame indexed by `user_id` with the demographic
          columns and `target`.
        - With trip features: a frame with `user_id` as a regular column, followed by
          the demographic columns, the per-group trip summaries, the mode coverage,
          the mean `duration` and `distance`, and finally `target`.
    '''

    trip_features_to_use = trip_kwargs.pop('trip_features', None)
    trip_group_key = trip_kwargs.pop('trip_grouping', 'section_mode_argmax')

    demographics = {
        'allceo': [
            'has_drivers_license', 'is_student', 'is_paid', 'income_category',
            'n_residence_members', 'n_residents_u18', 'n_residents_with_license',
            'n_motor_vehicles', 'has_medical_condition',
            'ft_job', 'multiple_jobs', 'n_working_residents',
            "highest_education_Bachelor's degree",
            'highest_education_Graduate degree or professional degree',
            'highest_education_High school graduate or GED',
            'highest_education_Less than a high school graduate',
            'highest_education_Prefer not to say',
            'highest_education_Some college or associates degree',
            'primary_job_description_Clerical or administrative support',
            'primary_job_description_Custodial',
            'primary_job_description_Education',
            'primary_job_description_Food service',
            'primary_job_description_Linecook',
            'primary_job_description_Manufacturing, construction, maintenance, or farming',
            'primary_job_description_Medical/healthcare',
            'primary_job_description_Non-profit program manager',
            'primary_job_description_Other',
            'primary_job_description_Professional, managerial, or technical',
            'primary_job_description_Sales or service',
            'primary_job_description_Self employed',
            'primary_job_description_food service', 'gender_Man',
            'gender_Nonbinary/genderqueer/genderfluid', 'gender_Prefer not to say',
            'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid',
            'age_16___20_years_old', 'age_21___25_years_old',
            'age_26___30_years_old', 'age_31___35_years_old',
            'age_36___40_years_old', 'age_41___45_years_old',
            'age_46___50_years_old', 'age_51___55_years_old',
            'age_56___60_years_old', 'age_61___65_years_old', 'age___65_years_old',
            'av_transit', 'av_no_trip', 'av_p_micro', 'av_s_micro', 'av_ridehail',
            'av_unknown', 'av_walk', 'av_car', 'av_s_car'
        ],
        'durham': [
            'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18',
            'n_residence_members', 'income_category',
            'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles',
            'has_medical_condition', 'ft_job', 'multiple_jobs',
            'highest_education_bachelor_s_degree',
            'highest_education_graduate_degree_or_professional_degree',
            'highest_education_high_school_graduate_or_ged',
            'highest_education_less_than_a_high_school_graduate',
            'highest_education_some_college_or_associates_degree',
            'primary_job_description_Clerical or administrative support',
            'primary_job_description_Manufacturing, construction, maintenance, or farming',
            'primary_job_description_Other',
            'primary_job_description_Professional, Manegerial, or Technical',
            'primary_job_description_Sales or service', 'gender_man',
            'gender_non_binary_genderqueer_gender_non_confor', 'gender_woman',
            'age_16___20_years_old', 'age_21___25_years_old',
            'age_26___30_years_old', 'age_31___35_years_old',
            'age_36___40_years_old', 'age_41___45_years_old',
            'age_51___55_years_old', 'age_56___60_years_old', 'av_walk',
            'av_unknown', 'av_no_trip', 'av_p_micro', 'av_transit', 'av_car',
            'av_ridehail', 'av_s_micro', 'av_s_car'
        ],
        'nicr': [
            'is_student', 'is_paid',
            'has_drivers_license', 'n_residents_u18', 'n_residence_members',
            'income_category', 'n_residents_with_license',
            'n_working_residents', 'n_motor_vehicles', 'has_medical_condition',
            'ft_job', 'multiple_jobs',
            'highest_education_high_school_graduate_or_ged',
            'highest_education_prefer_not_to_say', 'primary_job_description_Other',
            'gender_man', 'gender_woman', 'age_16___20_years_old', 'av_p_micro',
            'av_car', 'av_transit', 'av_ridehail', 'av_no_trip', 'av_s_car',
            'av_s_micro', 'av_unknown', 'av_walk'
        ],
        'masscec': [
            'is_student', 'is_paid',
            'has_drivers_license', 'n_residents_u18', 'n_residence_members',
            'income_category', 'n_residents_with_license',
            'n_working_residents', 'n_motor_vehicles', 'has_medical_condition',
            'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree',
            'highest_education_graduate_degree_or_professional_degree',
            'highest_education_high_school_graduate_or_ged',
            'highest_education_less_than_a_high_school_graduate',
            'highest_education_prefer_not_to_say',
            'highest_education_some_college_or_associates_degree',
            'primary_job_description_Clerical or administrative support',
            'primary_job_description_Manufacturing, construction, maintenance, or farming',
            'primary_job_description_Other',
            'primary_job_description_Prefer not to say',
            'primary_job_description_Professional, Manegerial, or Technical',
            'primary_job_description_Sales or service', 'gender_man',
            'gender_prefer_not_to_say', 'gender_woman', 'age_16___20_years_old',
            'age_21___25_years_old', 'age_26___30_years_old',
            'age_31___35_years_old', 'age_36___40_years_old',
            'age_41___45_years_old', 'age_46___50_years_old',
            'age_51___55_years_old', 'age_56___60_years_old',
            'age_61___65_years_old', 'age___65_years_old', 'av_p_micro', 'av_s_car',
            'av_s_micro', 'av_transit', 'av_car', 'av_no_trip', 'av_unknown',
            'av_ridehail', 'av_walk'
        ],
        'ride2own': [
            'has_drivers_license', 'is_student',
            'is_paid', 'income_category', 'n_residence_members',
            'n_working_residents', 'n_residents_u18', 'n_residents_with_license',
            'n_motor_vehicles', 'has_medical_condition',
            'ft_job', 'multiple_jobs',
            'highest_education_bachelor_s_degree',
            'highest_education_high_school_graduate_or_ged',
            'highest_education_less_than_a_high_school_graduate',
            'highest_education_some_college_or_associates_degree',
            'primary_job_description_Other',
            'primary_job_description_Professional, Manegerial, or Technical',
            'gender_man', 'gender_woman', 'age_31___35_years_old',
            'age_36___40_years_old', 'age_41___45_years_old',
            'age_51___55_years_old', 'av_no_trip', 'av_s_micro', 'av_transit',
            'av_car', 'av_ridehail', 'av_p_micro', 'av_s_car', 'av_walk',
            'av_unknown'
        ]
    }

    # Retain only the first instance of each user and subset the columns.
    demographic_rows = df.groupby('user_id').first()[demographics[CURRENT_DB]]

    # Get the targets: the most frequent target across each user's trips.
    targets = df.groupby('user_id')['target'].apply(lambda x: x.value_counts().idxmax())

    if trip_features_to_use is None or len(trip_features_to_use) == 0:
        # Demographics only: the av_* columns act as the availability indicators.
        return demographic_rows.merge(right=targets, left_index=True, right_index=True)

    # -----------------------------------------------------------
    # Reaching here means that we need to include trip summaries
    # -----------------------------------------------------------

    # For every user, generate the global trip-level summaries.
    global_aggs = df.groupby('user_id').agg({'duration': 'mean', 'distance': 'mean'})

    # coverage.
    coverage = get_mode_coverage(df)

    # Trip-level features.
    trip_features = get_trip_summaries(
        df=df,
        group_key=trip_group_key,
        feature_list=trip_features_to_use,
        use_qcut=trip_kwargs.pop('use_qcut', False)
    )

    # Start from the demographics so the result really is U x (D + t), then
    # attach every block of trip-derived features on the shared user_id index.
    combined = demographic_rows.merge(right=trip_features, left_index=True, right_index=True)
    combined = combined.merge(right=coverage, left_index=True, right_index=True)
    combined = combined.merge(right=global_aggs, left_index=True, right_index=True)

    # Finally, join with the targets so that `target` stays the last column.
    combined = combined.merge(right=targets, left_index=True, right_index=True)

    return combined.reset_index(drop=False)

## Experiment 1: Only demographics

In [None]:
## Example persona: educated suburban woman ->
# An embedding where:
# "highest_education_Bachelor's degree" == 1 or 'highest_education_Graduate degree or professional degree' == 1
# income_category >= 4 ( + more features that define 'suburban-ness')
# gender_Woman == 1

# No trip_features are passed, so this is the demographics-only frame (indexed by user_id).
demo_df = get_demographic_data(df)

In [None]:
# Peek at the first few user rows.
display(demo_df.head())

In [None]:
# Cap the perplexity below the number of users so small datasets still work.
tsne_kwargs = dict(
    perplexity=min(len(demo_df) - 1, 6),
    n_iter=7500,
    metric='cosine',
)

# ## PLOT BY THE WAY IN WHICH PEOPLE USE THE SAME REPLACED MODE AND CHECK THE SIMILARITY.

projections = generate_tsne_plots(demo_df, **tsne_kwargs)

In [None]:
# No stratification, pure random.
# user_id moves from the index to a column so that it survives the split.
demo_df = demo_df.reset_index(drop=False)
train, test = train_test_split(demo_df, test_size=0.2, random_state=SEED)

# Remember which users went where.
TRAIN_USERS = train['user_id'].unique().tolist()
TEST_USERS = test['user_id'].unique().tolist()

In [None]:
# Number of rows in the train and test splits.
print(train.shape[0], test.shape[0])

In [None]:
# Ensuring that no user information is leaked across sets: the split sizes must add up
# to the number of users, and no user id may appear on both sides.
assert train.shape[0] + test.shape[0] == len(df.user_id.unique())
assert set(TRAIN_USERS).isdisjoint(TEST_USERS), "A user appears in both the train and test sets."

In [None]:
def evaluate_using_similarity(test_df, train_df, metric=SimilarityMetric.COSINE, **metric_kwargs):

    '''
    This method treats each user row as a 'fingerprint' (embedding vector). We assume that we
    have no idea about the test set labels. To find which replaced mode is most likely for the test
    users, we compare each test user against the users in the training set and use the target of
    the most similar training user(s) as a proxy for the test user's replaced mode.
    This operates on the following intuition: If User A and User B are similar, then their replaced
    modes are also similar.

    Parameters:
        test_df, train_df: one row per user, each with `user_id` and `target` columns.
            Every remaining column is used as a feature.
        metric: a SimilarityMetric member.
            COSINE    - target of the training user with the highest cosine similarity.
            EUCLIDEAN - target of the training user with the smallest L2 distance.
            KNN       - distance-weighted KNeighborsClassifier vote.
            KMEANS    - majority training target of the cluster the test user falls in.

    Optional kwargs:
        n_neighbors (KNN, default 3), knn_metric (KNN, default 'cosine'),
        n_clusters (KMEANS, default 8), max_iter (KMEANS, default 300).
        Kwargs that do not apply to the chosen metric are ignored.

    Returns:
        The weighted test F1 score (also printed), or -1. when the training set has
        no more rows than the requested n_neighbors / n_clusters.

    Raises:
        NotImplementedError: for an unknown metric.
    '''

    tr_targets = train_df.target.values
    tr = train_df.drop(columns=['target', 'user_id'], inplace=False).reset_index(drop=True, inplace=False)

    te_targets = test_df.target.values
    te = test_df.drop(columns=['target', 'user_id'], inplace=False).reset_index(drop=True, inplace=False)

    if metric == SimilarityMetric.COSINE:
        # Use cosine similarity to determine which element in the train set this user is closest to.
        # Returns a (n_te, n_tr) matrix.
        sim = cosine_similarity(te.values, tr.values)

        # Compute the argmax across the train set.
        argmax = np.argmax(sim, axis=1)

        # Index into the training targets to retrieve predicted label.
        y_test_pred = tr_targets[argmax]

    elif metric == SimilarityMetric.EUCLIDEAN:

        # Here, we choose the embedding with the smallest L2 distance.
        distances = euclidean_distances(te.values, tr.values)

        # We choose argmin
        argmin = np.argmin(distances, axis=1)

        # Index into the targets.
        y_test_pred = tr_targets[argmin]

    elif metric == SimilarityMetric.KNN:

        n_neighbors = metric_kwargs.pop('n_neighbors', 3)

        if n_neighbors >= len(tr):
            return -1.

        # Build the KNN classifier. By default, let it be 3.
        knn = KNeighborsClassifier(
            n_neighbors=n_neighbors,
            weights='distance',
            metric=metric_kwargs.pop('knn_metric', 'cosine'),
            n_jobs=os.cpu_count()
        )

        # Fit the data to the KNN model
        knn.fit(tr, tr_targets)

        y_test_pred = knn.predict(te)

    elif metric == SimilarityMetric.KMEANS:

        n_clusters = metric_kwargs.pop('n_clusters', 8)

        if n_clusters >= len(tr):
            return -1.

        # Build the model.
        kmeans = KMeans(
            n_clusters=n_clusters,
            max_iter=metric_kwargs.pop('max_iter', 300),
            n_init='auto',
            random_state=SEED
        )

        # Fit the clustering model
        kmeans.fit(tr)

        # Pair every training user's cluster with their target.
        label_df = pd.DataFrame({'label': kmeans.labels_, 'target': tr_targets}, index=tr.index)

        # The majority target of each cluster does not depend on the test row,
        # so compute it once per cluster instead of once per prediction.
        majority_by_cluster = {
            cluster: members.value_counts().idxmax()
            for cluster, members in label_df.groupby('label')['target']
        }

        # Now, perform an inference on the test set.
        predicted_labels = kmeans.predict(te)

        y_test_pred = [majority_by_cluster[cluster] for cluster in predicted_labels]

    else:
        raise NotImplementedError("Unknown similarity metric")

    f1 = f1_score(y_true=te_targets, y_pred=y_test_pred, average='weighted')
    print(f"Test F1 score using {metric.name} = {f1}")

    return f1

In [None]:
# Iterating the Enum visits COSINE, EUCLIDEAN, KNN, KMEANS in definition order.
# n_clusters is only consumed by the KMEANS branch.
for metric in SimilarityMetric:
    evaluate_using_similarity(test, train, metric, n_clusters=3)

Not bad - using just a simple random split gives us the following results:

$allCEO$:

```
Test F1 score using COSINE = 0.42692939244663386
Test F1 score using EUCLIDEAN = 0.4126984126984127
Test F1 score using KNN = 0.4393241167434716
Test F1 score using KMEANS = 0.4733893557422969
```

In [None]:
def custom_nll_scorer(clf, X, y):
    '''
    Scorer with the (estimator, X, y) signature that returns the negative log loss
    of `clf` on (X, y), so that larger is better.

    Parameters:
        clf: a fitted classifier exposing `predict_proba` and `classes_`.
        X: features to score on.
        y: true labels for X.
    '''

    # One row of class probabilities per sample: [[yp1, yp2, yp3, ...], [yp1, yp2, ...]]
    y_pred = clf.predict_proba(X)

    # The probability columns follow clf.classes_. The labels present in `y` may be a
    # subset of those classes (e.g. a CV fold missing a rare class), which would
    # misalign the columns, so the label order must come from the classifier.
    return -log_loss(y_true=y, y_pred=y_pred, labels=clf.classes_)

In [None]:
def estimate_using_model(train, test, **model_kwargs):
    '''
    Tune a RandomForestClassifier with a randomized hyper-parameter search on `train`
    and report the weighted F1 of the best estimator on `test`.

    Both frames must carry `user_id` and `target` columns; everything else is a feature.

    Optional kwargs:
        cv: a cross-validation splitter. Defaults to a shuffled, seeded KFold.
        n_splits: number of folds for the default KFold (default 5).
        n_iter: number of sampled hyper-parameter settings (default 500).

    Returns the best estimator found by the search.
    '''

    n_iter = model_kwargs.pop('n_iter', 500)
    n_splits = model_kwargs.pop('n_splits', 5)
    cv = model_kwargs.pop('cv', None)

    # Fall back to a seeded, shuffled K-fold when the caller supplies no splitter.
    if cv is None:
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    search_space = {
        'n_estimators': np.arange(100, 1001, 50),
        'max_depth': list(range(5, 101, 5)),
        'ccp_alpha': np.linspace(0, 1, 10),
        'class_weight': ['balanced', 'balanced_subsample', None],
        'min_samples_split': np.arange(2, 25, 2),
        'min_samples_leaf': np.arange(1, 25)
    }

    # The search maximises weighted F1. custom_nll_scorer is an alternative `scoring`
    # callable if the search should optimise log likelihood instead.
    search = RandomizedSearchCV(
        RandomForestClassifier(random_state=SEED),
        search_space,
        n_iter=n_iter,
        scoring='f1_weighted',
        n_jobs=cpu_count(),
        cv=cv,
        random_state=SEED,
        verbose=0
    )

    non_features = ['user_id', 'target']

    search.fit(train.drop(columns=non_features), train.target.values.ravel())

    print(f"Best val score = {search.best_score_}")

    best_model = search.best_estimator_

    # Use the best model to compute F1 on the test set.
    test_predictions = best_model.predict(test.drop(columns=non_features))
    test_f1 = f1_score(y_true=test.target.values, y_pred=test_predictions, average='weighted')

    print(f"Test F1 = {test_f1}")

    return best_model

### Uncomment to run the model 

In [None]:
# model = estimate_using_model(train, test)

Interesting! The model is roughly on par with K-Means!

## Experiment 2: Demographics with trip summaries

Now that we've performed experiments with solely demographic data, let's expand the feature set by including 
trip summary statistics. We would like this approach to do better than the aforementioned baselines.

In [None]:
# Build the user matrix with trip summaries, then zero-fill any remaining gaps.
demo_plus_trips = get_demographic_data(
    df,
    trip_features=[
        'mph',
        'section_duration_argmax',
        'section_distance_argmax',
        'start_local_dt_hour',
        'end_local_dt_hour',
    ],
).fillna(0.)

In [None]:
# Peek at the first few rows of the trip-augmented user matrix.
demo_plus_trips.head()

In [None]:
# Reuse the user split from Experiment 1 so both experiments are scored on the same users.
train = demo_plus_trips[demo_plus_trips['user_id'].isin(TRAIN_USERS)]
test = demo_plus_trips[demo_plus_trips['user_id'].isin(TEST_USERS)]

print(len(train), len(test))

In [None]:
# Iterating the Enum visits COSINE, EUCLIDEAN, KNN, KMEANS in definition order.
# n_clusters is only consumed by the KMEANS branch.
for metric in SimilarityMetric:
    evaluate_using_similarity(test, train, metric, n_clusters=4)

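Mixed results: in the recorded allCEO run below, only K-Means improves on the demographics-only baseline; cosine, Euclidean, and KNN all score lower than before.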

$allCEO$
```
Test F1 score using COSINE = 0.32098765432098775
Test F1 score using EUCLIDEAN = 0.36684303350970027
Test F1 score using KNN = 0.41269841269841273
Test F1 score using KMEANS = 0.4877344877344878
```

### Uncomment this to run the model

In [None]:
# Now, we try with the model
# estimate_using_model(train, test)

Great! Compared to the previous model, we see definite improvements! I'm sure we can squeeze some more juice out of the models using fancy optimization, but as a baseline, these are good enough.


So, to recap:
$F1_{cosine} = 0.37$, $F1_{euclidean} = 0.33$, $F1_{knn} = 0.3$, $F1_{kmeans} = 0.36$, $F1_{RF} = 0.4215$

### Different groupings.

In [None]:
# trip_features = ['mph', 'section_duration_argmax', 'section_distance_argmax', 'start:hour', 'end:hour']

# for group_mode in ['section_mode_argmax', 'section_distance_argmax', 'section_duration_argmax', 'duration', 'distance']:
    
#     if group_mode in trip_features:
#         _ = trip_features.pop(trip_features.index(group_mode))
    
#     exp_df = get_demographic_data(
#         df, 
#         trip_grouping=group_mode,
#         trip_features=trip_features,
#         use_qcut=True
#     )
    
#     train, test = train_test_split(exp_df, test_size=0.2, random_state=SEED)
    
#     for sim in [
#         SimilarityMetric.COSINE, SimilarityMetric.EUCLIDEAN, SimilarityMetric.KNN, SimilarityMetric.KMEANS
#     ]:
#         evaluate_using_similarity(test, train, sim, n_clusters=3)
    
#     # estimate_using_model(train, test, n_iter=200)
    
#     print(50*'=')

In [None]:
# Plot the trip-augmented user matrix. The returned projection is not needed here.
# The perplexity is capped below the number of users.
_ = generate_tsne_plots(
    demo_plus_trips, 
    perplexity=min(len(demo_plus_trips)-1, 6), 
    n_iter=7500
)

# (Experimental) Multi-level modeling

## The code below onwards is not tested.

In this approach, we want to piece together the similarity search and modeling processes. Here's a rough sketch of how it should be implemented:

1. For every user in the training set, build a model using their entire trip history.
2. Consolidate these user-level models in a data structure, preferably a dictionary.
3. Now, when we want to perform inference on a new user with no prior trips, we use the similarity search to get the user ID in the training set who is the most similar to the user in question.
4. We retrieve the model for this corresponding user and perform an inference. The hypothesis is that since the two users are similar, their trip substitution patterns are also similar.

In [None]:
# def drop_columns(df: pd.DataFrame):
#     to_drop = [
#         'source', 'end_ts', 'end_fmt_time', 'end_loc', 'raw_trip', 'start_ts', 
#         'start_fmt_time', 'start_loc', 'duration', 'distance', 'start_place', 
#         'end_place', 'cleaned_trip', 'inferred_labels', 'inferred_trip', 'expectation',
#         'confidence_threshold', 'expected_trip', 'user_input', 'start:year', 'start:month', 
#         'start:day', 'start_local_dt_minute', 'start_local_dt_second', 
#         'start_local_dt_weekday', 'start_local_dt_timezone', 'end:year', 'end:month', 'end:day', 
#         'end_local_dt_minute', 'end_local_dt_second', 'end_local_dt_weekday', 
#         'end_local_dt_timezone', '_id', 'metadata_write_ts', 'additions', 
#         'mode_confirm', 'purpose_confirm', 'Mode_confirm', 'Trip_purpose', 
#         'original_user_id', 'program', 'opcode', 'Timestamp', 'birth_year', 
#         'available_modes', 'section_coordinates_argmax', 'section_mode_argmax'
#     ]
    
#     # Drop section_mode_argmax and available_modes.
#     return df.drop(
#         columns=to_drop, 
#         inplace=False
#     )

In [None]:
# def construct_model_dictionary(train: pd.DataFrame):
    
#     def train_on_user(user_id: str):
#         '''
#         Given the training set and the user ID to query, filter the dataset and
#         retain only the relevant trips. Then, create folds and optimize a model for this user.
#         Return the trained model instance.
#         '''
        
#         user_data = train.loc[train.user_id == user_id, :].reset_index(drop=True)
        
#         # Split user trips into train-test folds.
#         u_train, u_test = train_test_split(user_data, test_size=0.2, shuffle=True, random_state=SEED)
        
#         user_model = estimate_using_model(
#             u_train, u_test, 
#             n_iter=100
#         )
        
#         return user_model
    
#     for user in train.user_id.unique():
#         MODEL_DICT[user]['warm_start'] = train_on_user(user)
#         print(50*'=')
    
#     print("\nDone!")

## Warm start:

If the queried user has prior trips, we know that we can harness the additional information. So if we encounter such a user, we will first find the most similar user (using only demographics). Once the most similar user is found, we query the trip model for that user and run inference through it.

## Cold start:

If the queried user has no prior trips, we will use the demo-only model. We first perform a similarity search and then run user inference through the demo-only model.

In [None]:
# class MultiLevelModel:
#     def __init__(self, model_dict: Dict, train: pd.DataFrame, test: pd.DataFrame, **model_kwargs):
        
#         self._demographics = [
#             'primary_job_commute_time', 'income_category', 'n_residence_members', 'n_residents_u18', 
#             'n_residents_with_license', 'n_motor_vehicles', 'available_modes', 'age', 'gender_Man', 
#             'gender_Man;Nonbinary/genderqueer/genderfluid', 'gender_Nonbinary/genderqueer/genderfluid', 
#             'gender_Prefer not to say', 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid', 
#             'has_drivers_license_No', 'has_drivers_license_Prefer not to say', 'has_drivers_license_Yes', 
#             'has_multiple_jobs_No', 'has_multiple_jobs_Prefer not to say', 'has_multiple_jobs_Yes', 
#             "highest_education_Bachelor's degree", 'highest_education_Graduate degree or professional degree', 
#             'highest_education_High school graduate or GED', 'highest_education_Less than a high school graduate', 
#             'highest_education_Prefer not to say', 'highest_education_Some college or associates degree', 
#             'primary_job_type_Full-time', 'primary_job_type_Part-time', 'primary_job_type_Prefer not to say', 
#             'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial', 
#             'primary_job_description_Education', 'primary_job_description_Food service', 
#             'primary_job_description_Manufacturing, construction, maintenance, or farming', 
#             'primary_job_description_Medical/healthcare', 'primary_job_description_Other', 
#             'primary_job_description_Professional, managerial, or technical', 
#             'primary_job_description_Sales or service', 'primary_job_commute_mode_Active transport', 
#             'primary_job_commute_mode_Car transport', 'primary_job_commute_mode_Hybrid', 
#             'primary_job_commute_mode_Public transport', 'primary_job_commute_mode_Unknown', 
#             'primary_job_commute_mode_WFH', 'is_overnight_trip', 'n_working_residents'
#         ]
        
#         assert all([c in test.columns for c in self._demographics]), "[test] Demographic features are missing!"
#         assert all([c in train.columns for c in self._demographics]), "[train] Demographic features are missing!"
        
#         self._mdict = model_dict
#         self._train = train
#         self._test = test
#         self.metric = model_kwargs.pop('metric', SimilarityMetric.COSINE)
        
    
#     def _phase1(self):
        
#         tr = self._train.copy()
#         te = self._test.copy()
        
#         if tr.columns.isin(['user_id', 'target']).sum() == 2:
#             tr = tr.drop(columns=['user_id', 'target']).reset_index(drop=True)
        
#         if te.columns.isin(['user_id', 'target']).sum() == 2:
#             te = te.drop(columns=['user_id', 'target']).reset_index(drop=True)

#         te_users = self._test.user_id.tolist()

#         if self.metric == SimilarityMetric.COSINE:

#             sim = cosine_similarity(te.values, tr.values)

#             # Compute the argmax across the train set.
#             argmax = np.argmax(sim, axis=1)

#             # Retrieve the user_id at these indices.
#             train_users = self._train.loc[argmax, 'user_id']

#         elif self.metric == SimilarityMetric.EUCLIDEAN:

#             sim = euclidean_distances(te.values, tr.values)

#             # Compute the argmin here!
#             argmin = np.argmin(sim, axis=1)

#             # Retrieve the train user_ids.
#             train_users = self._train.loc[argmin, 'user_id']

#         return pd.DataFrame({'test_user_id': te_users, 'train_user_id': train_users})
    
    
#     def _phase2(self, sim_df: pd.DataFrame, cold_start: bool):
        
#         prediction_df = list()
        
#         # Now, we use the sim_df to run inference based on whether 
#         for ix, row in sim_df.iterrows():
#             train_user = row['train_user_id']
            
#             # Retrieve the appropriate model.
#             user_models = self._mdict.get(train_user, None)
            
#             start_type = 'cold_start' if cold_start else 'warm_start'
            
#             # which specific model?
#             sp_model = user_models.get(start_type, None)
            
#             # Now get the test user data.
#             test_user = row['test_user_id']
            
#             if cold_start:
#                 test_data = self._test.loc[self._test.user_id == test_user, self._demographics]
#                 test_data = test_data.iloc[0, :]
#             else:
#                 test_data = self._test.loc[self._test.user_id == test_user, :]
            
#             predictions = sp_model.predict(test_data)
            
#             print(f"test: [{test_user}], predictions: {predictions}")
    
    
#     def execute_pipeline(self, cold_start: bool = False):
#         # For each test user, get the most similar train user.
#         sim_df = self._phase1()
        
#         predictions = self._phase2(sim_df, cold_start)

In [None]:
# # FULL DATA.
# train = df.loc[df.user_id.isin(TRAIN_USERS), :]
# test = df.loc[df.user_id.isin(TEST_USERS), :]

# train_counts = train.user_id.value_counts()

In [None]:
# ## We only want to train on users who have a good number of trips.
# good_users = train_counts[train_counts >= 100].index

# bad_users = train_counts[train_counts < 100].index

# print(f"Number of users filtered out of training: {len(bad_users)}")

# filtered_train = train.loc[train.user_id.isin(good_users), :]

In [None]:
# # Full data.

# train_df = drop_columns(filtered_train)
# test_df = drop_columns(test)

In [None]:
# print(train_df.shape, test_df.shape)

In [None]:
# model_dict = construct_model_dictionary(train_df)