This notebook runs 5-fold cross-validation to assess label assist performance (first-round clustering only). Outputs include confusion matrices and tables of standard performance metrics (precision, recall, f-score, accuracy).

### imports and setup

In [None]:
import pandas as pd
import numpy as np

import models
from performance_eval import cross_val_predict, print_metrics

In [None]:
# our imports
import emission.storage.timeseries.abstract_timeseries as esta
import emission.core.get_database as edb
import emission.storage.decorations.trip_queries as esdtq


### Read data and set up variables

In [None]:
# Distinct values of the "uuid" field in the uuid database; the
# cross-validation loop later in this notebook iterates over this list.
uuid_list = edb.get_uuid_db().distinct("uuid")

In [None]:
# Print how many distinct uuids were found, then display the list itself
# as the cell output.
print(len(uuid_list))
uuid_list

Let's see how many labeled/unlabeled trips there are

In [None]:
# Build three per-user lookup tables keyed by uuid:
#   confirmed_trip_df_map -> every "analysis/confirmed_trip" entry for the user
#   labeled_trip_df_map   -> the subset returned by filter_labeled_trips
#   expanded_trip_df_map  -> the labeled subset passed through expand_userinputs
all_users = esta.TimeSeries.get_uuid_list()
confirmed_trip_df_map = {}
labeled_trip_df_map = {}
expanded_trip_df_map = {}

for user_uuid in all_users:
    user_timeseries = esta.TimeSeries.get_time_series(user_uuid)
    confirmed_df = user_timeseries.get_data_df("analysis/confirmed_trip")
    labeled_df = esdtq.filter_labeled_trips(confirmed_df)

    confirmed_trip_df_map[user_uuid] = confirmed_df
    labeled_trip_df_map[user_uuid] = labeled_df
    expanded_trip_df_map[user_uuid] = esdtq.expand_userinputs(labeled_df)

# One summary row per user: total confirmed trips vs. labeled trips.
n_trips_df = pd.DataFrame({
    "user_id": list(all_users),
    "all_trips": [len(confirmed_trip_df_map[uid]) for uid in all_users],
    "labeled_trips": [len(labeled_trip_df_map[uid]) for uid in all_users],
})
n_trips_df

In [None]:
# Overall share of labeled vs. unlabeled trips, summed across all users.
all_trips = n_trips_df.all_trips.sum()
labeled_trips = n_trips_df.labeled_trips.sum()
unlabeled_trips = all_trips - labeled_trips

if all_trips > 0:
    # Use the `%` presentation type so the fraction is multiplied by 100
    # before the percent sign is appended (e.g. 0.35 -> "35.00%"). The previous
    # "{:.2f}%" printed the raw fraction with a percent sign, i.e. "0.35%".
    print('{:.2%} unlabeled, {:.2%} labeled, {} total trips'.format(
        unlabeled_trips / all_trips, labeled_trips / all_trips, all_trips))
else:
    # Avoid dividing by zero when no trips were loaded.
    print('no trips found; cannot compute labeled/unlabeled percentages')


### get results

In [None]:
# Run cross-validation of the first-round clustering model for every user and
# pool the per-user predictions into a single dataframe, `cross_val_all`.
# Users whose evaluation raises, or for whom cross_val_predict returns None,
# are counted as excluded and contribute no rows.
dfs = []
excluded_user_count = 0
total_users = len(uuid_list)

for user in uuid_list:
    try:
        results = cross_val_predict(models.first_round_cluster, user)
    except Exception as e:
        # Best effort: report the failure and carry on with the next user.
        excluded_user_count += 1
        print('error for user', user)
        print(repr(e))
        continue

    # Identity check rather than `== None`: equality on array-like results is
    # elementwise and would raise when truth-tested. Skip the user entirely
    # so no empty frame is appended for an excluded user.
    # NOTE(review): presumably None means the user could not be evaluated
    # (e.g. too few labeled trips) - confirm against performance_eval.
    if results is None:
        excluded_user_count += 1
        continue

    cross_val_results = pd.DataFrame(data=results)
    cross_val_results['user_id'] = user
    cross_val_results['program'] = 'minipilot'
    dfs.append(cross_val_results)

print('using {}/{} users, excluded {}'.format(
    total_users - excluded_user_count, total_users, excluded_user_count))

if dfs:
    cross_val_all = pd.concat(dfs, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame that
    # still carries the columns added above.
    cross_val_all = pd.DataFrame(columns=['user_id', 'program'])
cross_val_all['top_pred'] = True
cross_val_all

In [None]:
# Report performance metrics for the 'mode' label over the pooled
# cross-validation results.
# NOTE(review): keep_nopred=False / ignore_custom=True presumably drop trips
# without a prediction and custom user labels - confirm in performance_eval.
print_metrics(cross_val_all, 'mode', keep_nopred=False, ignore_custom=True)

In [None]:
# Report performance metrics for the 'purpose' label over the pooled
# cross-validation results.
# NOTE(review): keep_nopred=False / ignore_custom=True presumably drop trips
# without a prediction and custom user labels - confirm in performance_eval.
print_metrics(cross_val_all, 'purpose', keep_nopred=False, ignore_custom=True)

In [None]:
# Report performance metrics for the 'replaced' label over the pooled
# cross-validation results.
# NOTE(review): this call passes keep_nopred=True, whereas the 'mode' and
# 'purpose' calls pass keep_nopred=False - confirm the difference is intended.
print_metrics(cross_val_all, 'replaced', keep_nopred=True, ignore_custom=True)