### imports

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from uuid import UUID
import matplotlib.pyplot as plt

# import logging
# logging.basicConfig(level=logging.DEBUG)

import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq
import emission.analysis.modelling.trip_model.run_model as eamtr
from performance_eval import get_clf_metrics, cv_for_all_algs, PREDICTORS

### load data

In [None]:
# Build the per-user trip tables that every later cell works from.
# All maps below are keyed by the user id returned by get_uuid_list().
all_users = esta.TimeSeries.get_uuid_list()

ct_entry = {}                      # raw entries from eamtr._get_training_data
confirmed_trip_df_map = {}         # confirmed trips as a dataframe
labeled_trip_df_map = {}           # confirmed trips kept by filter_labeled_trips
expanded_labeled_trip_df_map = {}  # labeled trips after expand_userinputs
expanded_all_trip_df_map = {}      # all confirmed trips after expand_userinputs

for user_id in all_users:
    user_ts = esta.TimeSeries.get_time_series(user_id)
    # NOTE(review): the meaning of the second argument (None) is defined by
    # _get_training_data and is not visible here -- confirm before changing.
    training_entries = eamtr._get_training_data(user_id, None)
    confirmed_df = user_ts.to_data_df("analysis/confirmed_trip",
                                      training_entries)
    labeled_df = esdtq.filter_labeled_trips(confirmed_df)

    ct_entry[user_id] = training_entries
    confirmed_trip_df_map[user_id] = confirmed_df
    labeled_trip_df_map[user_id] = labeled_df
    expanded_labeled_trip_df_map[user_id] = esdtq.expand_userinputs(labeled_df)
    expanded_all_trip_df_map[user_id] = esdtq.expand_userinputs(confirmed_df)

check how many labeled/unlabeled trips there are:

In [None]:
# Summarize, per user and overall, how many confirmed trips exist and how many
# of them are labeled, then report how much data is usable for 5-fold
# cross-validation (users need at least 5 labeled trips).
def _share(part, whole):
    """Return part / whole as a fraction, or NaN when whole is zero."""
    return part / whole if whole else float('nan')


n_trips_df = pd.DataFrame(
    [[u, len(confirmed_trip_df_map[u]),
      len(labeled_trip_df_map[u])] for u in all_users],
    columns=["user_id", "all_trips", "labeled_trips"])

all_trips = n_trips_df.all_trips.sum()
labeled_trips = n_trips_df.labeled_trips.sum()
unlabeled_trips = all_trips - labeled_trips
n_users = len(n_trips_df)

# The ratios below are fractions in [0, 1]. The `%` presentation type
# multiplies by 100 and appends the percent sign, so 0.25 prints as "25.00%"
# (the previous `{:.2f}%` printed it as "0.25%").
print('{} ({:.2%}) unlabeled, {} ({:.2%}) labeled, {} total trips'.format(
    unlabeled_trips, _share(unlabeled_trips, all_trips), labeled_trips,
    _share(labeled_trips, all_trips), all_trips))

n_users_too_few_trips = len(n_trips_df[n_trips_df.labeled_trips < 5])
print(
    '{}/{} ({:.2%}) users have less than 5 labeled trips and cannot do cross-validation'
    .format(n_users_too_few_trips, n_users,
            _share(n_users_too_few_trips, n_users)))

trips_for_crossval = n_trips_df[
    n_trips_df.labeled_trips >= 5].labeled_trips.sum()
n_users_for_crossval = n_users - n_users_too_few_trips
print('{} trips usable for 5-fold cross-validation from {}/{} ({:.2%}) users'.
      format(trips_for_crossval, n_users_for_crossval, n_users,
             _share(n_users_for_crossval, n_users)))


### evaluate performance in aggregate

The following cell will load the cross-validation results for the listed models. (Parameters for the models being tested can be found in tour_model_eval/performance_eval.py)

If the cross-validation results for a model have already been generated, the cell will attempt to load them from the corresponding CSV file to avoid the time-consuming process of re-running the cross-validation. Otherwise, it will run the cross-validation from scratch. (This behavior can be toggled with the override_prior_runs parameter - if True, existing CSV files are ignored and everything is re-run from scratch.)

WARNING! The following cell will take *insanely long* to run - as in, potentially up to 2 days - largely due to the fixed-width clustering algorithms. If you don't care about those, I suggest removing them from the list of models to evaluate. The other models took me ~20min each to run.

In [None]:
# Obtain cross-validation results for every model registered in PREDICTORS.
# Per the notes above this cell, results from earlier runs are reused unless
# override_prior_runs is True.
model_names = list(PREDICTORS)

cv_options = dict(
    uuid_list=all_users,
    expanded_trip_df_map=expanded_labeled_trip_df_map,
    model_names=model_names,
    override_prior_runs=False,
    raise_errors=False,
    random_state=42,
)
cv_results = cv_for_all_algs(ct_entry, **cv_options)


In [None]:
# Store the aggregate metrics of every model in one dataframe.
# Columns are (metric, label_type) pairs plus a leading ('model_name', '')
# column; each model contributes one row.
label_types = ['purpose', 'mode', 'replaced']
metric_names = [
    'trips without prediction',
    'accuracy overall',
    'accuracy of trips w predictions',
    'f1 weighted',
]

all_model_results = {('model_name', ''): []}
for metric_name in metric_names:
    for label_type in label_types:
        all_model_results[(metric_name, label_type)] = []

# Per-model predicted / true purpose labels, kept for a side-by-side
# comparison in a later cell.
purpose_results = {}

for model_name, model_cv in cv_results.items():
    print(f'now evaluating: {model_name}')
    all_model_results[('model_name', '')].append(model_name)
    for label_type in label_types:
        # get results
        results = get_clf_metrics(model_cv,
                                  label_type,
                                  keep_nopred=True,
                                  ignore_custom=False)

        if label_type == "purpose":
            purpose_results["%s_pred" % model_name] = results['label_pred']
            purpose_results["%s_true" % model_name] = results['label_true']

        n_total = len(results['label_true'])
        n_without_prediction = results['n_trips_without_prediction']
        n_with_prediction = n_total - n_without_prediction

        # Rescale the overall accuracy to the trips that received a
        # prediction. A model that predicted nothing has no such trips, so
        # report NaN instead of dividing by zero.
        if n_with_prediction:
            accuracy_with_prediction = (results['accuracy'] * n_total /
                                        n_with_prediction)
        else:
            accuracy_with_prediction = np.nan

        # update our dict of aggregate results
        all_model_results[('trips without prediction',
                           label_type)].append(n_without_prediction)
        all_model_results[('accuracy overall',
                           label_type)].append(results['accuracy'])
        all_model_results[('accuracy of trips w predictions',
                           label_type)].append(accuracy_with_prediction)
        all_model_results[('f1 weighted',
                           label_type)].append(results['weighted_f_score'])

all_model_results_df = pd.DataFrame(all_model_results)
all_model_results_df.to_csv('all_model_results.csv')

In [None]:
# Put every model's predicted and true purpose labels side by side (one column
# per "<model>_pred" / "<model>_true" key) and save the table for inspection.
purpose_results_df = pd.concat(purpose_results, axis="columns", join="outer")
purpose_results_df.to_csv("compare_true_pred_models.csv")

In [None]:
# Compare predicted vs. true purpose for the two fixed-width models on the
# first 20 rows.
fixed_width_columns = [
    'fixed-width (O-D)_pred',
    'fixed-width (O-D)_true',
    'fixed-width (O-D, destination)_pred',
    'fixed-width (O-D, destination)_true',
]
purpose_results_df[fixed_width_columns].head(n=20)

In [None]:
# Same comparison as a random sample of 20 rows drawn from the rows whose
# index label is at most 2000 (.loc slicing is label-based and inclusive).
fixed_width_columns = [
    'fixed-width (O-D)_pred',
    'fixed-width (O-D)_true',
    'fixed-width (O-D, destination)_pred',
    'fixed-width (O-D, destination)_true',
]
leading_rows = purpose_results_df[fixed_width_columns].loc[:2000]
leading_rows.sample(n=20)

In [None]:
# Rank the models by overall purpose accuracy (ascending, so the best model is
# the last row).
purpose_accuracy_column = ('accuracy overall', 'purpose')
all_model_results_df.sort_values(by=[purpose_accuracy_column], axis="index")

In [None]:
# The fixed-width (O-D, destination) results are significantly worse than the
# others and the cause has not been investigated, so that model is left out of
# the saved summary.
excluded_model = 'fixed-width (O-D, destination)'
keep_row = all_model_results_df['model_name'] != excluded_model
all_model_results_df = all_model_results_df.loc[keep_row]
all_model_results_df.to_csv('all_model_results.csv')

### visualize performance (bar graphs)

(code copied from tour_model_eval/eval_comparison_plots.ipynb)

#### prep the dataframe

In [None]:
# NOTE(review): multi_index is not used anywhere in the visible notebook, and
# its "accuracy of trips with predictions" entry does not match the
# "accuracy of trips w predictions" spelling used in the CSV -- confirm whether
# it can be removed.
metric_level = [
    "trips without prediction", "accuracy overall",
    "accuracy of trips with predictions", "f1 weighted"
]
label_type_level = ["mode", "purpose", "replaced"]
multi_index = pd.MultiIndex.from_product([metric_level, label_type_level])

# Reload the saved summary; the CSV has a two-row header, and its first
# (unnamed) column is the dataframe index that to_csv wrote out.
all_eval_results = pd.read_csv("all_model_results.csv", header=[0, 1])
all_eval_results = all_eval_results.drop(columns=["Unnamed: 0_level_0"])

# One row per (metric, label_type) pair, one column per model.
all_eval_results = all_eval_results.set_index("model_name")
all_eval_results = all_eval_results.transpose().reset_index()
all_eval_results = all_eval_results.rename(columns={
    "level_0": "metric",
    "level_1": "label_type",
})

def remove_brackets(cn):
    """Return the first element of a tuple column name, or ``cn`` unchanged.

    Used to flatten column labels: a tuple label such as ``('a', 'b')``
    becomes ``'a'``, while any non-tuple label is passed through as is.
    ``isinstance`` is used so that tuple subclasses (e.g. namedtuples) are
    flattened as well.
    """
    if isinstance(cn, tuple):
        return cn[0]
    return cn


# Flatten tuple column labels to their first element, index the rows by label
# type, and keep only the metrics that are plotted below.
all_eval_results = all_eval_results.rename(mapper=remove_brackets,
                                           axis="columns")
all_eval_results = all_eval_results.set_index("label_type")

is_plotted_metric = (
    (all_eval_results.metric != "trips without prediction")
    & (all_eval_results.metric != "accuracy of trips w predictions"))
all_eval_results = all_eval_results[is_plotted_metric]


#### plot

In [None]:
# One bar chart per remaining metric, with the models as bar series and the
# label types on the x axis.
plt.style.use('default')
fig, ax_arr = plt.subplots(nrows=1,
                           ncols=2,
                           sharex=True,
                           sharey=False,
                           figsize=(12, 3))
ax_list = ax_arr
titles = ['Accuracy', 'Weighted F-score']
labels = ['Purpose', 'Mode', 'Replaced Mode']

plotted_results = all_eval_results[
    all_eval_results.metric != "trips without prediction"]

# groupby sorts its keys by default, so titles[] must be listed in the sorted
# order of the metric names.
for position, (metric, result_df) in enumerate(
        plotted_results.groupby("metric")):
    print(f"plotting {metric} at location {position}")
    panel = ax_list[position]
    result_df.plot(kind="bar",
                   ax=panel,
                   title=titles[position],
                   legend=False,
                   ylim=(0, 1))
    plt.draw()
    panel.set_xticklabels(labels, rotation=0)

# A single shared legend, placed below the left panel.
ax_list[0].legend(loc="lower left", bbox_to_anchor=(0.0, -0.5), ncol=3)

for panel in (ax_list[0], ax_list[1]):
    panel.set_xlabel('')

# plt.tight_layout()
plt.show()

### evaluate performance of different models against the size of each user's dataset

evaluate each algorithm's purpose prediction accuracy against the number of trips per user

In [None]:
# For every user, record the number of labeled trips together with each
# model's purpose accuracy and weighted F-score on that user's trips.
# Columns are ('', 'user_id'), ('', 'num_labeled_trips'), then
# ('accuracy', <model>) for all models and ('f1 weighted', <model>) for all
# models.
results_by_user = {
    ('', 'user_id'): [],
    ('', 'num_labeled_trips'): [],
}
for metric_name in ('accuracy', 'f1 weighted'):
    for model_name in model_names:
        results_by_user[(metric_name, model_name)] = []

for user in all_users:
    results_by_user[('', 'user_id')].append(user)
    results_by_user[('', 'num_labeled_trips')].append(
        len(expanded_labeled_trip_df_map[user]))

    for model_name in model_names:
        cv_for_model = cv_results[model_name]
        known_user_ids = cv_for_model.user_id.unique()

        # Some result frames store user ids as UUID objects and others as
        # strings. Pick the matching representation per model in a separate
        # variable: overwriting `user` here would leak the string form into
        # the following models and make UUID-keyed frames look empty.
        if len(known_user_ids) > 0 and not isinstance(known_user_ids[0], UUID):
            user_key = str(user)
        else:
            user_key = user

        user_rows = cv_for_model.loc[cv_for_model.user_id == user_key]

        # Only score users that appear in the results and have at least one
        # non-missing purpose prediction; everyone else gets NaN.
        if len(user_rows) > 0 and not all(user_rows['purpose_pred'].isna()):
            results = get_clf_metrics(user_rows,
                                      label_type='purpose',
                                      keep_nopred=True,
                                      ignore_custom=False)
            accuracy = results['accuracy']
            f1_weighted = results['weighted_f_score']
        else:
            accuracy = np.nan
            f1_weighted = np.nan

        results_by_user[('accuracy', model_name)].append(accuracy)
        results_by_user[('f1 weighted', model_name)].append(f1_weighted)

size_performance_df = pd.DataFrame(results_by_user)
size_performance_df.to_csv("dataset size vs performance for all algs.csv")
size_performance_df

In [None]:
# Scatter each selected model's per-user purpose accuracy (left) and weighted
# F-score (right) against the user's number of labeled trips.
plt.style.use('default')

model_names = [
    'DBSCAN+SVM (O-D, destination)',
    'random forests (O-D, destination clusters)',
    'random forests (coordinates)'
]

fig, axs = plt.subplots(1, 2, figsize=(11, 4), sharex=True)
trip_count_column = ('', 'num_labeled_trips')

for model_name in model_names:
    print(model_name)
    # Left panel first, then right panel, one point cloud per model.
    for panel, metric_name in zip(axs, ('accuracy', 'f1 weighted')):
        panel.scatter(size_performance_df[trip_count_column],
                      size_performance_df[(metric_name, model_name)],
                      s=10,
                      label=model_name)

y_axis_labels = ('Accuracy for Purpose', 'Weighted F-score for Purpose')
for panel, y_axis_label in zip(axs, y_axis_labels):
    panel.set_ylabel(y_axis_label)
    panel.set_ylim(-0.1, 1.1)
    panel.set_xlabel('Number of Labeled Trips')

# fig.suptitle(
#     'Comparison of Purpose Prediction Performance Against User Dataset Size')
axs[0].legend(loc='upper left', bbox_to_anchor=(0, -0.15), ncol=3)
# plt.tight_layout()
plt.show()

In [None]:
import scipy as scipy

In [None]:
def func(x, a, b, c):
    """Logarithmic model ``a * ln(b * x) + c`` used for curve fitting.

    ``x`` may be a scalar or an array-like; ``a``, ``b`` and ``c`` are the
    parameters being fitted. The result has the same shape as ``x``.
    """
    log_term = np.log(b * x)
    return a * log_term + c

In [None]:
# Fit the logarithmic model to each selected model's per-user weighted F-score
# as a function of the number of labeled trips; one panel per model.
model_names = [
    'DBSCAN+SVM (O-D, destination)',
    'random forests (O-D, destination clusters)',
    'random forests (coordinates)'
]

# Rows with a missing value in any column are dropped before fitting.
size_performance_curve_fit = size_performance_df.dropna().sort_values(
    by=('', 'num_labeled_trips'))

fig, axs = plt.subplots(1, 3, figsize=(11, 4), sharex=True, sharey=True)
for panel, model_name in zip(axs, model_names):
    print(model_name)
    xdata = size_performance_curve_fit[('', 'num_labeled_trips')]
    ydata = size_performance_curve_fit[('f1 weighted', model_name)]

    # Sanity check: report the positions of any remaining NaN values.
    x_nan_positions = np.nonzero(np.isnan(xdata).to_numpy())
    y_nan_positions = np.nonzero(np.isnan(ydata).to_numpy())
    print("xdata has the following nan %s" % x_nan_positions)
    print("ydata has the following nan %s" % y_nan_positions)

    popt, pcov = scipy.optimize.curve_fit(func, xdata, ydata)
    print("After curve fitting, parameters are %s" % popt)

    panel.scatter(xdata, ydata, s=10)
    panel.plot(xdata, func(xdata, *popt), "r-")
    panel.set_xlabel('Number of Labeled Trips')
    panel.set_title(model_name)

# The y axis is shared, so labelling and limiting the first panel is enough.
axs[0].set_ylabel('Weighted F-score for Purpose')
axs[0].set_ylim(-0.1, 1.1)

# fig.suptitle(
#     'Comparison of Purpose Prediction Performance Against User Dataset Size')
# plt.tight_layout()
plt.show()

In [None]:
# Same fits as the per-model panels, drawn on a single set of axes so the
# fitted curves can be compared directly; each curve's legend entry shows its
# fitted formula.
model_names = [
    'DBSCAN+SVM (O-D, destination)',
    'random forests (O-D, destination clusters)',
    'random forests (coordinates)'
]

# Rows with a missing value in any column are dropped before fitting.
size_performance_curve_fit = size_performance_df.dropna().sort_values(
    by=('', 'num_labeled_trips'))

fig, axs = plt.subplots(1, 1, figsize=(12, 6), sharex=True, sharey=True)
for model_name in model_names:
    print(model_name)
    xdata = size_performance_curve_fit[('', 'num_labeled_trips')]
    ydata = size_performance_curve_fit[('f1 weighted', model_name)]

    # Sanity check: report the positions of any remaining NaN values.
    x_nan_positions = np.nonzero(np.isnan(xdata).to_numpy())
    y_nan_positions = np.nonzero(np.isnan(ydata).to_numpy())
    print("xdata has the following nan %s" % x_nan_positions)
    print("ydata has the following nan %s" % y_nan_positions)

    popt, pcov = scipy.optimize.curve_fit(func, xdata, ydata)
    # kneedle = kneed.KneeLocator(xdata, ydata, S=1.0, curve="concave", direction="increasing")
    # print(kneedle.knee, kneedle.knee_y)
    print("After curve fitting, parameters are %s" % popt)

    fitted_formula = "%0.3f ln (%0.3f x) + %0.3f" % tuple(popt)
    axs.scatter(xdata, ydata, s=10, label=model_name)
    axs.plot(xdata, func(xdata, *popt), label=fitted_formula)
    # x_knees = [125, 375]
    # y_knees = func(np.array([x_knees]), *popt)
    # axs.vlines(x_knees, -0.1, y_knees, linestyles="dashed")

axs.set_xlabel('Number of Labeled Trips')
axs.set_ylabel('Weighted F-score for Purpose')
axs.set_ylim(-0.1, 1.1)
# axs.set_xticks(list(axs.get_xticks()) + x_knees)

axs.legend()
# fig.suptitle(
#     'Comparison of Purpose Prediction Performance Against User Dataset Size')
# plt.tight_layout()
plt.show()