### imports


In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from uuid import UUID

import matplotlib.pyplot as plt

import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq

from performance_eval import get_clf_metrics, cv_for_all_algs, PREDICTORS

### load data


In [None]:
# Build four per-user lookup tables, each keyed by user UUID:
#   confirmed_trip_df_map        - every "analysis/confirmed_trip" entry
#   labeled_trip_df_map          - the result of esdtq.filter_labeled_trips
#   expanded_labeled_trip_df_map - labeled trips after esdtq.expand_userinputs
#   expanded_all_trip_df_map     - all confirmed trips after expand_userinputs
# NOTE(review): expand_userinputs presumably turns the user-input labels into
# separate columns - confirm against emission.storage.decorations.trip_queries.
all_users = esta.TimeSeries.get_uuid_list()

confirmed_trip_df_map = {}
labeled_trip_df_map = {}
expanded_labeled_trip_df_map = {}
expanded_all_trip_df_map = {}

for u in all_users:
    ts = esta.TimeSeries.get_time_series(u)
    ct_df = ts.get_data_df("analysis/confirmed_trip")
    labeled_df = esdtq.filter_labeled_trips(ct_df)

    confirmed_trip_df_map[u] = ct_df
    labeled_trip_df_map[u] = labeled_df
    expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(labeled_df)
    expanded_all_trip_df_map[u] = esdtq.expand_userinputs(ct_df)

Check how many labeled and unlabeled trips there are:


In [None]:
# Summarise, per user, how many confirmed trips exist and how many of them
# are labeled, then print the overall labeled/unlabeled split.


def _fraction(part, whole):
    """Return part / whole, or NaN when whole is zero (nothing was loaded)."""
    return part / whole if whole else float('nan')


n_trips_df = pd.DataFrame(
    [[u, len(confirmed_trip_df_map[u]),
      len(labeled_trip_df_map[u])] for u in all_users],
    columns=["user_id", "all_trips", "labeled_trips"])

all_trips = n_trips_df.all_trips.sum()
labeled_trips = n_trips_df.labeled_trips.sum()
unlabeled_trips = all_trips - labeled_trips

# The '%' presentation type multiplies the fraction by 100 before adding the
# percent sign; the former '{:.2f}%' printed the raw fraction (0.35 showed up
# as "0.35%" instead of "35.00%").
print('{} ({:.2%}) unlabeled, {} ({:.2%}) labeled, {} total trips'.format(
    unlabeled_trips, _fraction(unlabeled_trips, all_trips), labeled_trips,
    _fraction(labeled_trips, all_trips), all_trips))

# Users below this threshold are reported as unable to do cross-validation.
n_users_too_few_trips = len(n_trips_df[n_trips_df.labeled_trips < 5])
print(
    '{}/{} ({:.2%}) users have less than 5 labeled trips and cannot do cross-validation'
    .format(n_users_too_few_trips, len(n_trips_df),
            _fraction(n_users_too_few_trips, len(n_trips_df))))

### evaluate performance in aggregate


In [None]:
# Run cross-validation on the labeled trips of every user.
# PREDICTORS lists every available model; only the hand-picked subset in
# model_names is actually evaluated below.
all_model_names = list(PREDICTORS.keys())
model_names = [
    'random forests (O-D, destination clusters)',
    'random forests (coordinates)',
]

# NOTE(review): override_prior_runs=True presumably recomputes results even
# when earlier runs exist, and raise_errors=False presumably lets the run
# continue past per-user failures - confirm against performance_eval.
cv_results = cv_for_all_algs(
    uuid_list=all_users,
    expanded_trip_df_map=expanded_labeled_trip_df_map,
    model_names=model_names,
    override_prior_runs=True,
    raise_errors=False,
    random_state=42,
)


In [None]:
# Store the aggregate metrics of every model in one dataframe whose columns
# are two-level tuples: (metric name, label type).
label_types = ['purpose', 'mode', 'replaced']
metric_names = [
    'trips without prediction',
    'accuracy overall',
    'accuracy of trips w predictions',
    'f1 weighted',
]

# One list per column; each evaluated model appends one value to every list.
all_model_results = {('model_name', ''): []}
for metric_name in metric_names:
    for label_type in label_types:
        all_model_results[(metric_name, label_type)] = []

for model_name in cv_results.keys():
    print(f'now evaluating: {model_name}')
    all_model_results[('model_name', '')].append(model_name)
    for label_type in label_types:
        # get results
        results = get_clf_metrics(cv_results[model_name],
                                  label_type,
                                  keep_nopred=True,
                                  ignore_custom=False)

        n_trips = len(results['label_true'])
        n_trips_with_prediction = (n_trips -
                                   results['n_trips_without_prediction'])

        # Rescale the overall accuracy so that its denominator only counts
        # trips that received a prediction. When no trip was predicted the
        # ratio is undefined, so report NaN instead of dividing by zero.
        # NOTE(review): assumes results['accuracy'] is computed over all
        # trips (keep_nopred=True) - confirm against performance_eval.
        if n_trips_with_prediction > 0:
            accuracy_w_predictions = (results['accuracy'] * n_trips /
                                      n_trips_with_prediction)
        else:
            accuracy_w_predictions = float('nan')

        # update our dict of aggregate results
        all_model_results[('trips without prediction', label_type)].append(
            results['n_trips_without_prediction'])
        all_model_results[('accuracy overall',
                           label_type)].append(results['accuracy'])
        all_model_results[('accuracy of trips w predictions',
                           label_type)].append(accuracy_w_predictions)
        all_model_results[('f1 weighted',
                           label_type)].append(results['weighted_f_score'])

all_model_results_df = pd.DataFrame(all_model_results)
all_model_results_df.to_csv('all_model_results.csv')

In [None]:
# Show the models ordered by overall purpose accuracy (sort_values sorts rows
# in ascending order by default, so the most accurate model is listed last).
purpose_accuracy_column = ('accuracy overall', 'purpose')
all_model_results_df.sort_values(by=[purpose_accuracy_column])