# Evaluate the use of the similarity code for trip clustering

In the `Explore sim usage (common trips -> labeling) unrolled` notebook, we determined some changes to the similarity settings to work better with our use case. We will now explore the impact of those changes, and of the radius parameter.

Overall, we will experiment with three parameters:
1. filter or not
2. radius
3. cutoff or not

This notebook is intended to be run against the participant-only version of the CanBikeCO Jan 31 minipilot dataset.
If you have the older version that includes data from non-participants as well, please replace

```
all_users = esta.TimeSeries.get_uuid_list()
```

with 

```
participant_uuid_obj = list(edb.get_profile_db().find({"install_group": "participant"}, {"user_id": 1, "_id": 0}))
all_users = [u["user_id"] for u in participant_uuid_obj]
```

### First, we read the data and extract the most common purpose labels

In [None]:
import pandas as pd
import numpy as np
import geojson as gj
import sklearn.cluster as sc
import sklearn.metrics.pairwise as smp
import sklearn.metrics as sm

In [None]:
import json
import copy
import itertools

In [None]:
import folium
import branca.element as bre

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
import seaborn as sns

In [None]:
from IPython import display
from uuid import UUID

import bson.json_util as bju
import bson.objectid as boi

In [None]:
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq
import emission.analysis.modelling.tour_model.similarity as eamts

In [None]:
import emission.core.wrapper.entry as ecwe
import emission.core.wrapper.confirmedtrip as ecwct

### Read data and setup variables

In [None]:
# Load each user's confirmed trips once and cache three per-user dataframes:
# all confirmed trips, the labeled subset, and the result of
# expand_userinputs() on that labeled subset.
all_users = esta.TimeSeries.get_uuid_list()
confirmed_trip_df_map, labeled_trip_df_map, expanded_trip_df_map = {}, {}, {}
for u in all_users:
    ts = esta.TimeSeries.get_time_series(u)
    ct_df = ts.get_data_df("analysis/confirmed_trip")
    labeled_df = esdtq.filter_labeled_trips(ct_df)
    confirmed_trip_df_map[u] = ct_df
    labeled_trip_df_map[u] = labeled_df
    expanded_trip_df_map[u] = esdtq.expand_userinputs(labeled_df)

In [None]:
# Radii passed to the similarity model (presumably metres -- confirm against
# the similarity code).
RADIUS_CHOICES = [
    100,
    300,
    500,
]
# The four filter/cutoff combinations. Joined with a radius they name the
# predicted-label columns, e.g. "filter_cutoff_300".
REGIME_CHOICES = [
    "no_filter_no_cutoff",
    "no_filter_cutoff",
    "filter_no_cutoff",
    "filter_cutoff",
]

In [None]:
# The user-supplied label columns that together form a trip's label tuple.
USER_INPUT_COLS = [
    "mode_confirm",
    "purpose_confirm",
    "replaced_mode",
]

### Standard functions (currently copied over from other notebooks; should be refactored into a python file)

In [None]:
def get_loc_df(loc_series):
    """Turn a series of point-like mappings into a longitude/latitude dataframe.

    Each element of ``loc_series`` must support ``p["coordinates"]`` and yield
    a two-element sequence; the first element becomes ``longitude`` and the
    second ``latitude``. The result has a fresh 0..n-1 index.
    """
    coordinate_pairs = [point["coordinates"] for point in loc_series]
    return pd.DataFrame(coordinate_pairs, columns=["longitude", "latitude"])

### First, we pick a participant to work with

In [None]:
# One row per user: how many confirmed trips they have and how many of those are labeled
n_trips_rows = [
    [u, len(confirmed_trip_df_map[u]), len(labeled_trip_df_map[u])]
    for u in all_users
]
n_trips_df = pd.DataFrame(n_trips_rows, columns=["user_id", "all_trips", "labeled_trips"])
n_trips_df

In [None]:
# Pick the user whose labeled-trip count is closest to the median.
# An exact match is not guaranteed: with an even number of users the median is
# the mean of the two middle counts, so an equality filter can be empty and
# `.iloc[0]` on it raises IndexError. idxmin() returns the first minimum, so
# when an exact match exists this selects the same user as the equality filter.
median_distance = (n_trips_df.labeled_trips - n_trips_df.labeled_trips.median()).abs()
median_user = n_trips_df.user_id.loc[median_distance.idxmin()]
median_user

In [None]:
# Same DataFrame object as the map entry (no copy is made), so columns added
# through this name are also visible via expanded_trip_df_map[median_user].
median_user_df = expanded_trip_df_map[median_user]

In [None]:
user_id = median_user
user_trip_df = median_user_df
# Wrap every dataframe row in an Entry so that it can be passed to the
# similarity code.
user_trip_list = []
for tr in user_trip_df.to_dict("records"):
    wrapped_trip = {
        "data": ecwct.Confirmedtrip(tr),
        "_id": tr["_id"],
        "metadata": {"key": "analysis/confirmed_trip"},
    }
    user_trip_list.append(ecwe.Entry(wrapped_trip))

In [None]:
# For every radius, fit the similarity model under each filter/cutoff
# combination and store the per-trip cluster labels as a new column.
#
# The labels are converted with `.to_list()` (as add_predicted_cols does) so
# that the column assignment is positional. Assigning the labels object
# directly lets pandas align on index, which puts labels on the wrong rows (or
# leaves NaN) when user_trip_df does not have a contiguous 0..n-1 index.
# NOTE(review): assumes labels_ exposes `.to_list()`, which add_predicted_cols
# already relies on.
regime_settings = [
    ("nofilter_nocutoff", False, False),
    ("nofilter_cutoff", False, True),
    ("filter_nocutoff", True, False),
    ("filter_cutoff", True, True),
]
for r in RADIUS_CHOICES:
    for regime_name, should_filter, use_cutoff in regime_settings:
        curr_sim = eamts.similarity(user_trip_list, r, shouldFilter=should_filter, cutoff=use_cutoff)
        curr_sim.fit()
        user_trip_df[f"{regime_name}_{r}"] = curr_sim.labels_.to_list()

In [None]:
# Side-by-side view of the radius-100 labels under all four regimes
radius_100_cols = [
    f"{regime}_100"
    for regime in ("nofilter_nocutoff", "nofilter_cutoff", "filter_nocutoff", "filter_cutoff")
]
user_trip_df[radius_100_cols]

### Assign "ground truth" labels for either tuples or individual columns

In order to get the ground truth labels, we need to find all unique combinations and assign labels based on that. I wonder if we can just send the n-tuple of the user labels directly. Probably better to do a conversion first. Conversion also lets us experiment with individual columns instead of tuples.

In [None]:
def add_ground_truth(trip_df, columns, gt_label):
    """Number the distinct value combinations of ``columns`` and store them.

    Rows are grouped by ``columns``; each group gets a consecutive integer id
    (in the order pandas yields the groups), written in place into
    ``trip_df[gt_label]``. Rows that belong to no group are not assigned by
    this function.
    """
    groups = trip_df.groupby(by=columns).groups
    for cluster_id, row_labels in enumerate(groups.values()):
        trip_df.loc[row_labels, gt_label] = cluster_id

In [None]:
# Ground truth over the full (mode, purpose, replaced mode) tuple
add_ground_truth(user_trip_df, USER_INPUT_COLS, "ground_truth_tuple")

In [None]:
# Largest tuple id assigned (ids start at 0)
user_trip_df["ground_truth_tuple"].max()

In [None]:
# Ground truth for each user input column on its own
for input_col, gt_col in (
    ("mode_confirm", "ground_truth_mc"),
    ("purpose_confirm", "ground_truth_pc"),
    ("replaced_mode", "ground_truth_rm"),
):
    add_ground_truth(user_trip_df, [input_col], gt_col)

In [None]:
# Largest id assigned for mode, purpose and replaced mode respectively
tuple(user_trip_df[f"ground_truth_{suffix}"].max() for suffix in ("mc", "pc", "rm"))

### Let us now compute the homogeneity score and the request percentage in case of no cutoff

- request %: the request % is just the number of clusters, since we will ask the user once for each cluster.
- homogeneity score: we just use the built-in sklearn method.

In [None]:
# Distinct cluster labels for no filter / no cutoff at radius 100
nfnc = user_trip_df["nofilter_nocutoff_100"].unique()
nfnc

In [None]:
# Homogeneity of the radius-100 no-filter/no-cutoff clusters against the full label tuple
sm.homogeneity_score(
    labels_true=user_trip_df["ground_truth_tuple"],
    labels_pred=user_trip_df["nofilter_nocutoff_100"],
)

In [None]:
# Same clustering, scored against the confirmed mode only
sm.homogeneity_score(
    labels_true=user_trip_df["ground_truth_mc"],
    labels_pred=user_trip_df["nofilter_nocutoff_100"],
)

In [None]:
# Same clustering, scored against the confirmed purpose only
sm.homogeneity_score(
    labels_true=user_trip_df["ground_truth_pc"],
    labels_pred=user_trip_df["nofilter_nocutoff_100"],
)

In [None]:
# Same clustering, scored against the replaced mode only
sm.homogeneity_score(
    labels_true=user_trip_df["ground_truth_rm"],
    labels_pred=user_trip_df["nofilter_nocutoff_100"],
)

In [None]:
# Labels of clusters that contain more than one trip (despite the name, this
# is "two or more", not "exactly two")
two_trip_cluster_labels = []
for c in nfnc:
    if np.count_nonzero(user_trip_df.nofilter_nocutoff_100 == c) > 1:
        two_trip_cluster_labels.append(c)
two_trip_cluster_labels

In [None]:
# Inspect the trips of a single cluster. The label 23 is hard-coded;
# presumably it was one of the multi-trip labels printed above for the
# dataset this was run on -- adjust it for your data.
user_trip_df[user_trip_df.nofilter_nocutoff_100 == 23]

In [None]:
# Keep only the trips that fall in a cluster with more than one trip
in_multi_trip_cluster = user_trip_df.nofilter_nocutoff_100.isin(two_trip_cluster_labels)
two_trip_cluster_trips = user_trip_df[in_multi_trip_cluster]
two_trip_cluster_trips

In [None]:
# Homogeneity once the single-trip clusters are excluded
sm.homogeneity_score(
    labels_true=two_trip_cluster_trips["ground_truth_tuple"],
    labels_pred=two_trip_cluster_trips["nofilter_nocutoff_100"],
)

In [None]:
# Number of distinct clusters, i.e. the request count when nothing is cut off
len(nfnc)

### Let us now compute the homogeneity score and the request percentage in case of cutoff

- request %: we will need to add the "noisy" trips (trips with labels of -1) since they will not be in clusters
- homogeneity score: we just use the built-in sklearn method

In [None]:
# Distinct labels for no filter / cutoff at radius 100 (includes -1 if present)
nfc = user_trip_df["nofilter_cutoff_100"].unique()
nfc

In [None]:
# Homogeneity of the radius-100 no-filter/cutoff labels against the full label tuple
sm.homogeneity_score(
    labels_true=user_trip_df["ground_truth_tuple"],
    labels_pred=user_trip_df["nofilter_cutoff_100"],
)

In [None]:
# (real clusters, noisy trips, total requests = clusters + noisy trips)
nfc_cluster_count = len(nfc[nfc != -1])
nfc_noise_count = np.count_nonzero(user_trip_df["nofilter_cutoff_100"] == -1)
nfc_cluster_count, nfc_noise_count, nfc_cluster_count + nfc_noise_count

### Let us now compute the homogeneity score and the request percentage with filtering

- request %: we will need to add the filtered trips (trips with labels of -2) since they will not be in clusters
- homogeneity score: we just use the built-in sklearn method.

In [None]:
# Distinct labels for filter / cutoff at radius 100 (includes -1 / -2 if present)
fc = user_trip_df["filter_cutoff_100"].unique()
fc

In [None]:
# Homogeneity of the radius-100 filter/cutoff labels against the full label tuple
sm.homogeneity_score(
    labels_true=user_trip_df["ground_truth_tuple"],
    labels_pred=user_trip_df["filter_cutoff_100"],
)

In [None]:
# (real clusters, real clusters recomputed from the column, noisy trips,
#  filtered trips, total requests)
fc_labels = user_trip_df["filter_cutoff_100"]
fc_cluster_count = np.count_nonzero(fc >= 0)
fc_noise_count = np.count_nonzero(fc_labels == -1)
fc_filtered_count = np.count_nonzero(fc_labels == -2)
(
    fc_cluster_count,
    np.count_nonzero(fc_labels.unique() >= 0),
    fc_noise_count,
    fc_filtered_count,
    fc_cluster_count + fc_noise_count + fc_filtered_count,
)

### Let us compare the cluster : trip ratio with the request %

In [None]:
# (number of clusters, clusters per trip) for no filter / no cutoff at radius 300
n_clusters_300 = len(user_trip_df.nofilter_nocutoff_300.unique())
n_clusters_300, n_clusters_300 / len(user_trip_df)

## Generalizing to multiple users

In [None]:
def all_ground_truth_cols(trip_df):
    """Add all four ground-truth columns to ``trip_df`` in place.

    One column numbers the full (mode, purpose, replaced mode) tuple; the
    other three number each user input column on its own.
    """
    ground_truth_specs = (
        (["mode_confirm", "purpose_confirm", "replaced_mode"], "ground_truth_tuple"),
        (["mode_confirm"], "ground_truth_mc"),
        (["purpose_confirm"], "ground_truth_pc"),
        (["replaced_mode"], "ground_truth_rm"),
    )
    for source_cols, gt_label in ground_truth_specs:
        add_ground_truth(trip_df, source_cols, gt_label)

In [None]:
def add_predicted_cols(trip_df):
    """Add one predicted-label column per (regime, radius) to ``trip_df`` in place.

    Every row is wrapped in an Entry, the similarity model is fitted for each
    radius in RADIUS_CHOICES under each filter/cutoff combination, and the
    resulting labels are stored positionally in a column named
    ``f"{regime}_{radius}"``.
    """
    trip_list = []
    for tr in trip_df.to_dict("records"):
        wrapped_trip = {
            "data": ecwct.Confirmedtrip(tr),
            "_id": tr["_id"],
            "metadata": {"key": "analysis/confirmed_trip"},
        }
        trip_list.append(ecwe.Entry(wrapped_trip))

    # (column prefix, shouldFilter, cutoff), in the order the columns are added
    regime_flags = (
        ("no_filter_no_cutoff", False, False),
        ("no_filter_cutoff", False, True),
        ("filter_no_cutoff", True, False),
        ("filter_cutoff", True, True),
    )
    for r in RADIUS_CHOICES:
        for regime, should_filter, use_cutoff in regime_flags:
            curr_sim = eamts.similarity(trip_list, r, shouldFilter=should_filter, cutoff=use_cutoff)
            curr_sim.fit()
            trip_df[f"{regime}_{r}"] = curr_sim.labels_.to_list()
    print(f"For {trip_df.user_id.iloc[0]}, returning df with cols {trip_df.columns}")

Some subset of the user labels may be missing, in which case it will be represented by NaN.
This won't match any unique values, so the ground truth will be NaN and the score calculation will break.
We can try to `dropna()` but if the other values are present, it doesn't make sense to drop the entire trip from
similarity modeling. Let's do a separate check instead.

```
mode_confirm 	purpose_confirm 	replaced_mode 	ground_truth_rm
64 	drove_alone 	school 	NaN 	NaN
71 	drove_alone 	work 	NaN 	NaN
```

In [None]:
def h_score_no_na(labels_true, labels_pred):
    """Homogeneity score restricted to trips that have a ground-truth label.

    Predictions at the index labels where ``labels_true`` is NaN are blanked
    out on a copy, then NaNs are dropped from both series before they are
    passed to ``sm.homogeneity_score``. The inputs are not modified.
    """
    na_index = labels_true[labels_true.isna()].index
    # Work on copies so that the caller's series stay untouched
    truth = labels_true.copy()
    pred = labels_pred.copy()
    # Blank the matching predictions so that both series lose the same rows below
    pred.loc[na_index] = np.nan
    if len(na_index) > 0:
        print(f"Dropping nan indices {na_index} before calculating score")
    return sm.homogeneity_score(truth.dropna(), pred.dropna())

In [None]:
def request_count(labels_pred):
    """Count how many times the user would be asked for labels.

    The total is one request per distinct non-negative label (a real
    cluster), plus one per trip labeled -1 (noisy, not in any cluster), plus
    one per trip labeled -2 (filtered; not really necessary for the current
    regime, but good to be prepared).
    """
    distinct_labels = labels_pred.unique()
    n_clusters = np.count_nonzero(distinct_labels >= 0)
    n_noisy = np.count_nonzero(labels_pred == -1)
    n_filtered = np.count_nonzero(labels_pred == -2)
    return n_clusters + n_noisy + n_filtered

In [None]:
def get_all_metrics(trip_df):
    """Compute all per-user metrics from a dataframe with ground-truth and predicted columns.

    ``trip_df`` must already contain the ``ground_truth_{tuple,mc,pc,rm}``
    columns and one ``f"{regime}_{radius}"`` column for every regime and every
    radius in RADIUS_CHOICES.

    Returns a dict with:
    - ``n_trips``
    - ``ground_truth_<name>_unique`` / ``ground_truth_<name>_lower_bound``:
      the distinct ground-truth ids and how many there are
    - per regime and radius: ``..._homogeneity_score_<gts>``,
      ``..._request_count``, ``..._request_pct`` and ``..._cluster_trip_ratio``

    Every value is derived from ``trip_df`` only. Reading notebook globals
    such as ``user_trip_df`` here would report one user's request counts for
    all users.
    """
    n_trips = len(trip_df)
    curr_result = {"n_trips": n_trips}

    # ground-truth column suffix -> name used in the result keys
    gt_result_names = {"tuple": "tuple", "mc": "mode", "pc": "purpose", "rm": "replaced_mode"}
    unique_gt = {gts: trip_df[f"ground_truth_{gts}"].unique() for gts in gt_result_names}

    # Two passes keep the key (and therefore result column) order:
    # all "unique" entries first, then all "lower_bound" entries.
    for gts, name in gt_result_names.items():
        curr_result[f"ground_truth_{name}_unique"] = unique_gt[gts]
    for gts, name in gt_result_names.items():
        curr_result[f"ground_truth_{name}_lower_bound"] = len(unique_gt[gts])

    # print(f"after computing lower bounds: {curr_result}")
    for r in RADIUS_CHOICES:
        for regime in ["no_filter_no_cutoff", "no_filter_cutoff", "filter_no_cutoff", "filter_cutoff"]:
            pred_labels = trip_df[f"{regime}_{r}"]
            unique_pred = pred_labels.unique()
            for gts in gt_result_names:
                print(f"About to calculate score by comparing {unique_gt[gts]} for ground_truth_{gts} with {unique_pred}")
                curr_result[f"{regime}_{r}_homogeneity_score_{gts}"] = \
                    h_score_no_na(trip_df[f"ground_truth_{gts}"], pred_labels)
            # request pct doesn't depend on ground truth, only on the predicted clustering
            n_requests = request_count(pred_labels)
            curr_result[f"{regime}_{r}_request_count"] = n_requests
            curr_result[f"{regime}_{r}_request_pct"] = n_requests / n_trips
            curr_result[f"{regime}_{r}_cluster_trip_ratio"] = len(unique_pred) / n_trips

    # print(f"For {trip_df.user_id.iloc[0]}, returning result {curr_result}")
    return curr_result

In [None]:
# test_trip_df = expanded_trip_df_map[UUID("576e37c7-ab7e-4c03-add7-02486bc3f42e")]
# test_trip_df["no_filter_no_cutoff_100"].unique()

In [None]:
# we assume at least two trips per day (to/from work?!) for a week
# NOTE(review): the comparison is strict, so a user with exactly 14 labeled
# trips is excluded -- confirm whether `>= 14` was intended.
valid_users = [u for u in all_users if len(expanded_trip_df_map[u]) > 14]

In [None]:
# **NOTE**: this cell will take a long time to execute
# Splitting this out from the result map generation to make it easier to generate different lists
for user_id in valid_users:
    print(f"Computing results for user {user_id}")
    curr_trip_df = expanded_trip_df_map[user_id]
    # Resetting the index may be needed because the input dataframe may not have contiguous indices
    # (e.g. since we have filtered it to only trips with user labels).
    # However, not sure how that will work in the overall DF where we have all the trips in one giant dataframe
    # so let us avoid for now
    # curr_trip_df.reset_index(inplace=True)
    # Both helpers add their columns to curr_trip_df in place
    all_ground_truth_cols(curr_trip_df)
    add_predicted_cols(curr_trip_df)

In [None]:
# Collect one metrics dict per valid user.
# curr_trip_df is re-bound at notebook scope on every iteration, so any code
# that reads that global sees the current user's dataframe.
standard_result_list = []
for user_id in valid_users:
    print(f"Computing results for user {user_id}")
    curr_trip_df = expanded_trip_df_map[user_id]
    curr_metrics = get_all_metrics(curr_trip_df)
    # Tag the metrics with the user so that rows can be traced back later
    curr_metrics["user_id"] = user_id
    standard_result_list.append(curr_metrics)

In [None]:
# One row per valid user, one column per metric
result_df = pd.DataFrame(standard_result_list)
result_df.head()

In [None]:
# List the metric columns available for plotting
result_df.columns

## Basic comparisons of the homogeneity score v/s request pct tradeoffs for different regimes and radii

In [None]:
# One panel per radius; within a panel, one colour per regime.
# x = homogeneity score (tuple ground truth), y = request pct.
fig = plt.Figure(figsize=(20,5))
axarr = fig.subplots(1,3)
colors = plt.get_cmap("Accent", 4).colors
scatter_regimes = ["no_filter_no_cutoff", "no_filter_cutoff", "filter_no_cutoff", "filter_cutoff"]

for ax, r in zip(axarr, RADIUS_CHOICES):
    for regime_color, regime in zip(colors, scatter_regimes):
        result_df.plot.scatter(
            x=f"{regime}_{r}_homogeneity_score_tuple",
            y=f"{regime}_{r}_request_pct",
            color=regime_color,
            label=f"{regime}_{r}",
            ax=ax,
        )
    ax.set_xlabel("homogeneity score")
    ax.set_ylabel("request pct")
fig

In [None]:
# Distribution of request pct across users: one panel per radius, one box per regime
fig = plt.Figure(figsize=(20,5))
axarr = fig.subplots(1,3)
for ax, r in zip(axarr, RADIUS_CHOICES):
    request_pct_cols = [f"{regime}_{r}_request_pct" for regime in REGIME_CHOICES]
    result_df.boxplot(column=request_pct_cols, ax=ax)
    ax.set_xticklabels(REGIME_CHOICES)
fig

In [None]:
# Request pct next to the cluster:trip ratio for no filter / no cutoff at radius 500
compared_cols = [
    "no_filter_no_cutoff_500_request_pct",
    "no_filter_no_cutoff_500_cluster_trip_ratio",
]
result_df.boxplot(column=compared_cols)

In [None]:
# Distribution of the tuple homogeneity score across users: one panel per radius, one box per regime
fig = plt.Figure(figsize=(20,5))
axarr = fig.subplots(1,3)
for ax, r in zip(axarr, RADIUS_CHOICES):
    h_score_cols = [f"{regime}_{r}_homogeneity_score_tuple" for regime in REGIME_CHOICES]
    result_df.boxplot(column=h_score_cols, ax=ax)
    ax.set_xticklabels(REGIME_CHOICES)
fig

There is a very clear difference between no_cutoff and cutoff. The cutoff results are significantly worse wrt their homogeneity scores, which are clearly being propped up by the single trip clusters. There is not a significant difference between the various radii.

These values are worse than the cluster:trip ratio that we found in the full dataset. But the no_filter_no_cutoff cases should be almost the same! Let's compute the cluster:trip ratio and confirm.

In [None]:
# for r in RADIUS_CHOICES:
#     for regime in ["no_filter_no_cutoff", "no_filter_cutoff", "filter_no_cutoff", "filter_cutoff"]:
#         result_df[f"{regime}_{r}_cluster_trip_ratio"] = result_df.user_id.apply(lambda u: len(expanded_trip_df_map[u][f"{regime}_{r}"].unique())) / result_df["n_trips"]

In [None]:
# One panel per radius; within a panel, one colour per regime.
# x = request pct, y = cluster:trip ratio.
fig = plt.Figure(figsize=(20,5))
axarr = fig.subplots(1,3)
colors = plt.get_cmap("Accent", 4).colors
scatter_regimes = ["no_filter_no_cutoff", "no_filter_cutoff", "filter_no_cutoff", "filter_cutoff"]

for ax, r in zip(axarr, RADIUS_CHOICES):
    for regime_color, regime in zip(colors, scatter_regimes):
        result_df.plot.scatter(
            x=f"{regime}_{r}_request_pct",
            y=f"{regime}_{r}_cluster_trip_ratio",
            color=regime_color,
            label=f"{regime}_{r}",
            ax=ax,
        )
    ax.set_xlabel("request pct")
    ax.set_ylabel("cluster trip ratio")
fig

As we can see, the no_filter_no_cutoff results are pretty much identical to the cluster_trip_ratio, which is not surprising because in that regime, we include all trips, and all bins. But even the no_filter_no_cutoff 500m ratio (median > ~0.6, max 0.85) is worse than the DBSCAN results (median < 0.4, max ~ 0.7).

On the other hand, the sim v/s DBSCAN comparison indicates that DBSCAN can have weird results sometimes, especially for the triangular trips. Let's stick with binning for now, but keep DBSCAN as an option for later.

## Selected radius: 500, selected regime: no filter, no cutoff

Based on the tradeoff results so far, the radius = 500 + no filter, no cutoff is clearly the way to go.
It has the lowest request % and highest homogeneity score.

## Pick correct metrics

### Fix the h-score calculation

While looking at the comparison between the DBSCAN algorithm and the similarity code, I noticed that the homogeneity score was significantly lower for DBSCAN, which was surprising. On reflection, I realized that this was because the noisy trips were all being put into one cluster instead of multiple clusters.

For example, consider the case in which we have two clusters of length 2 each, and 4 single trip clusters.

If all the single trip clusters are labeled with -1 for noise, we will end up with

```
>>> sm.homogeneity_score([1,1,2,2,3,4,5,6], [0,0,1,1,-1,-1,-1,-1])
0.5999999999999999
```

because it looks like the -1 predicted cluster actually munges entries from 4 different ground truth clusters.
If we replace them with unique cluster labels, we get a perfect score, as expected.

```
>>> sm.homogeneity_score([1,1,2,2,3,4,5,6], [0,0,1,1,2,3,4,5])
1.0
```


We can and should fix this, but this leads me to wonder whether we are in fact picking the correct metrics. Because once we replace the noisy points with unique labels, the non-cutoff and cutoff results will pretty much be identical (unless there are clusters with size > 1 which are cutoff, which seems unlikely). And the request % will be identical since we will ask the user once for each single trip cluster (whether retained or cutoff as noisy).

The main reasons why these are identical is because the distributions appear to be long-tailed, so the cutoff step basically only deletes 1 trip clusters. If this were not true, there would actually be a difference in the noisy trips between the two approaches and a difference in the results.

However, the choice of cutting off **can potentially** affect the modeling. If 50% of the trips are below the cutoff (as we saw in the similarity deep dive), we have a much smaller pool of entries to work with. And if one of the test entries does in fact match the 50% of the trips that are deleted, we won't have to ask the user.

To capture this, we need to build a model with or without the cutoff and match **something else** to model. That will give us an idea of whether those infrequent trips really do matter when matching with trips not in the model.

I can think of two options for **something else**.
- Split the data: we should check the performance with both a large model and small data (state for each iteration) and a small model and large data (state overall/when we reset the pipeline).
- Use the unlabeled data: while the unlabeled data doesn't have labels, it certainly has start and end points. We should be able to match those trips against the model and compute the request pct from the matches.


So it seems like the homogeneity score is a property of the model (which makes sense since it requires labeled data), and the request pct can/should be a property of the **unlabeled data** tested against that model.

In [None]:
# Compare the no-cutoff labels (used as "truth") with the cutoff labels for one user.
# The predicted-label columns are only added for valid_users, so take the user
# from that list: all_users[0] may have no such columns. valid_users keeps the
# order of all_users, so this is the same user whenever all_users[0] is valid.
first_valid_df = expanded_trip_df_map[valid_users[0]]
h_score_no_na(first_valid_df.no_filter_no_cutoff_100, first_valid_df.no_filter_cutoff_100)

## Experiment with a new version of the function that treats trips below cutoff as single trip clusters

In [None]:
# Reimplemented version
def h_score_no_na(labels_true, labels_pred):
    """Homogeneity score that treats every noisy/filtered trip as its own cluster.

    Trips with a negative predicted label are each given a fresh, unique
    label, so e.g. predictions ``[0, 0, 1, 1, -1, -1, -1, -1]`` are scored as
    ``[0, 0, 1, 1, 2, 3, 4, 5]``. Trips whose ground truth is NaN are excluded
    from the score. The inputs are not modified.

    The fresh labels start after the largest non-negative predicted label.
    Numbering them from 0 would reuse the ids of the real clusters and merge
    the noisy trips into those clusters.
    """
    na_index = labels_true[pd.isna(labels_true)].index
    noise_mask = labels_pred < 0
    # Copy before we set the index to nan; we don't want to have a side effect here!
    new_labels_true = labels_true.copy()
    new_labels_pred = labels_pred.copy()
    # Create a set of unique clusters for the noisy trips, numbered so that
    # they cannot collide with an existing cluster label
    real_labels = labels_pred[labels_pred >= 0]
    first_free_label = 0 if real_labels.empty else int(real_labels.max()) + 1
    n_noise = int(noise_mask.sum())
    new_labels_pred.loc[noise_mask] = np.arange(first_free_label, first_free_label + n_noise)
    # Set all the indices corresponding to NaN in the ground truth to NaN in the pred
    # Note that we want to do this **after** replacing noisy trips
    # Since otherwise, if label_true = NaN and label_pred = -1,
    # we would end up with a unique cluster
    new_labels_pred.loc[na_index] = np.nan
    if len(na_index) > 0:
        print(f"Dropping nan indices {na_index} before calculating score")
        # print(f"{labels_true.dropna()}, {new_labels_pred.dropna()}")
    return sm.homogeneity_score(new_labels_true.dropna(), new_labels_pred.dropna())

In [None]:
# Repeat the no-cutoff v/s cutoff comparison with the redefined h_score_no_na.
# The predicted-label columns are only added for valid_users, so take the user
# from that list: all_users[0] may have no such columns. valid_users keeps the
# order of all_users, so this is the same user whenever all_users[0] is valid.
first_valid_df = expanded_trip_df_map[valid_users[0]]
h_score_no_na(first_valid_df.no_filter_no_cutoff_100, first_valid_df.no_filter_cutoff_100)

In [None]:
# This should use the new version of the function
# We are OK with this weird format where we change the function implementation because
# it is an unrolled notebook
# curr_trip_df is re-bound at notebook scope on every iteration, so any code
# that reads that global sees the current user's dataframe.
standard_result_list = []
for user_id in valid_users:
    print(f"Computing results for user {user_id}")
    curr_trip_df = expanded_trip_df_map[user_id]
    curr_metrics = get_all_metrics(curr_trip_df)
    # Tag the metrics with the user so that rows can be traced back later
    curr_metrics["user_id"] = user_id
    standard_result_list.append(curr_metrics)

In [None]:
# Rebuild the results table (one row per valid user) from the list computed in the previous cell
result_df = pd.DataFrame(standard_result_list)

In [None]:
# Distribution of the tuple homogeneity score across users: one panel per radius, one box per regime
fig = plt.Figure(figsize=(20,5))
axarr = fig.subplots(1,3)
for ax, r in zip(axarr, RADIUS_CHOICES):
    h_score_cols = [f"{regime}_{r}_homogeneity_score_tuple" for regime in REGIME_CHOICES]
    result_df.boxplot(column=h_score_cols, ax=ax)
    ax.set_xticklabels(REGIME_CHOICES)
fig

## Experiment with a new version of the function that drops trips below cutoff

In [None]:
# Reimplemented version
def h_score_no_na(labels_true, labels_pred):
    """Homogeneity score over non-noisy trips that have a ground-truth label.

    Trips with a negative predicted label are dropped from both series first.
    Of the remaining trips, those whose ground truth is NaN are excluded as
    well. The inputs are not modified.
    """
    pred_noise_index = labels_pred[labels_pred < 0].index

    # only retain labels from non-noisy trips; drop() returns new objects, so
    # the caller's series are left untouched
    new_labels_true = labels_true.drop(pred_noise_index)
    new_labels_pred = labels_pred.drop(pred_noise_index)

    # Set all the indices corresponding to NaN in the ground truth to NaN in the pred
    # Note that it is probably best to do this **after** dropping noisy trips
    # Since, if label_true = NaN and label_pred = -1,
    # we would have dropped it anyway
    # The mask is built from the already-filtered series so that it lines up
    # with it row for row, instead of relying on pandas to realign a mask
    # computed on the longer, unfiltered series.
    na_index = new_labels_true[pd.isna(new_labels_true)].index
    new_labels_pred.loc[na_index] = np.nan
    if len(na_index) > 0:
        print(f"Dropping nan indices {na_index} before calculating score")
        # print(f"{labels_true.dropna()}, {new_labels_pred.dropna()}")
    return sm.homogeneity_score(new_labels_true.dropna(), new_labels_pred.dropna())

In [None]:
# This should use the new version of the function
# We are OK with this weird format where we change the function implementation because
# it is an unrolled notebook
# curr_trip_df is re-bound at notebook scope on every iteration, so any code
# that reads that global sees the current user's dataframe.
standard_result_list = []
for user_id in valid_users:
    print(f"Computing results for user {user_id}")
    curr_trip_df = expanded_trip_df_map[user_id]
    curr_metrics = get_all_metrics(curr_trip_df)
    # Tag the metrics with the user so that rows can be traced back later
    curr_metrics["user_id"] = user_id
    standard_result_list.append(curr_metrics)

In [None]:
# Rebuild the results table (one row per valid user) from the list computed in the previous cell
result_df = pd.DataFrame(standard_result_list)

In [None]:
# Distribution of the tuple homogeneity score across users: one panel per radius, one box per regime
fig = plt.Figure(figsize=(20,5))
axarr = fig.subplots(1,3)
for ax, r in zip(axarr, RADIUS_CHOICES):
    h_score_cols = [f"{regime}_{r}_homogeneity_score_tuple" for regime in REGIME_CHOICES]
    result_df.boxplot(column=h_score_cols, ax=ax)
    ax.set_xticklabels(REGIME_CHOICES)
fig