This notebook runs a randomizedsearchCV to find optimal hyperparameters for the ClusterForestPredictor (where we use both end and trip-level clusters). 

### imports and load data

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from sklearn.model_selection import ParameterSampler

import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq
from performance_eval import cross_val_predict, get_clf_metrics
import models

import logging
# logging.basicConfig()
# logger = logging.getLogger()
# logger.setLevel(logging.WARNING)


storage not configured, falling back to sample, default configuration
URL not formatted, defaulting to "Stage_database"
Connecting to database URL localhost


In [None]:
all_users = esta.TimeSeries.get_uuid_list()
confirmed_trip_df_map = {}
labeled_trip_df_map = {}
expanded_labeled_trip_df_map = {}
expanded_all_trip_df_map = {}
for u in all_users:
    # print(u)
    ts = esta.TimeSeries.get_time_series(u)
    ct_df = ts.get_data_df("analysis/confirmed_trip")

    confirmed_trip_df_map[u] = ct_df
    labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)
    expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(
        labeled_trip_df_map[u])
    expanded_all_trip_df_map[u] = esdtq.expand_userinputs(
        confirmed_trip_df_map[u])

### hyperparameter tuning via randomized search CV

In [3]:
# helper function to run a bunch of cross-validations and generate performance metrics
def run_cv(model, model_params=None, uuid_list=all_users):
    # TODO: rename this
    dfs = []
    excluded_user_count = 0
    total_users = len(uuid_list)

    for user in uuid_list:
        try:
            results = cross_val_predict(
                model,
                model_params,
                data_type='dataframe',
                user_df=expanded_labeled_trip_df_map[user])
            if results == None:
                excluded_user_count += 1
        except Exception as e:
            excluded_user_count += 1
            logging.info(f'error for user {user}')
            logging.info(repr(e))
            # raise e
            continue

        cross_val_results = pd.DataFrame(data=results)
        cross_val_results['user_id'] = user
        cross_val_results['program'] = 'minipilot'
        dfs += [cross_val_results]

    print('using {}/{} users, excluded {}'.format(
        total_users - excluded_user_count, total_users, excluded_user_count))

    cross_val_all = pd.concat(dfs, ignore_index=True)
    cross_val_all['top_pred'] = True

    mode_results = get_clf_metrics(cross_val_all,
                                   'mode',
                                   keep_nopred=False,
                                   ignore_custom=False)
    purpose_results = get_clf_metrics(cross_val_all,
                                      'purpose',
                                      keep_nopred=False,
                                      ignore_custom=False)
    replaced_results = get_clf_metrics(cross_val_all,
                                       'replaced',
                                       keep_nopred=False,
                                       ignore_custom=False)

    return {
        'mode_acc': mode_results['accuracy'],
        'mode_f_macro': mode_results['macro_f_score'],
        'mode_f_weighted': mode_results['weighted_f_score'],
        'purpose_acc': purpose_results['accuracy'],
        'purpose_f_macro': purpose_results['macro_f_score'],
        'purpose_f_weighted': purpose_results['weighted_f_score'],
        'replaced_acc': replaced_results['accuracy'],
        'replaced_f_macro': replaced_results['macro_f_score'],
        'replaced_f_weighted': replaced_results['weighted_f_score'],
    }


We can't use the sklearn RandomizedSearchCV because our model doesn't meet the requirements of an sklearn Estimator class (among other issues, we're producing 3 predictions, not 1). We'll just re-implement what we need here.

In [21]:
results = {
    'param': [],
    'mode_acc': [],
    'mode_f_macro': [],
    'mode_f_weighted': [],
    'purpose_acc': [],
    'purpose_f_macro': [],
    'purpose_f_weighted': [],
    'replaced_acc': [],
    'replaced_f_macro': [],
    'replaced_f_weighted': [],
}


def tune_label_classifier(model,
                          param_grid,
                          prev_results_df=None,
                          uuid_list=all_users,
                          n_iter=10,
                          random_state=42):
    results = {
        # params
        'n_estimators': [],
        'max_depth': [],
        'min_samples_split': [],
        'min_samples_leaf': [],
        'max_features': [],
        'bootstrap': [],

        # performance metrics
        'mode_acc': [],
        'mode_f_macro': [],
        'mode_f_weighted': [],
        'purpose_acc': [],
        'purpose_f_macro': [],
        'purpose_f_weighted': [],
        'replaced_acc': [],
        'replaced_f_macro': [],
        'replaced_f_weighted': [],
}
    params = list(
        ParameterSampler(param_grid, n_iter,
                         random_state=random_state))  # list of dictionaries
    for param in params:
        print('running with params', param)
        # run CV and get metrics
        try:
            result = run_cv(model, param, uuid_list)
        except KeyboardInterrupt:
            print('stopping due to KeyboardInterrupt; saving existing results')
            break 
        except Exception as e:
            print(repr(e) + '; saving existing results')
            break 
        print(result)
        print()

        for key in ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features', 'bootstrap']:
            results[key] += [param[key]]
        for key in list(result.keys()):
            results[key] += [result[key]]

    results_df = pd.DataFrame(results)
    if prev_results_df is not None:
        results_df = pd.concat([prev_results_df, results_df], ignore_index=True).drop_duplicates()

    return results_df

NOTE: the following code will take an insanely long time to run so I've commented it out. It can be run repeatedly (but change the random seed - or just increase the n_iter value) to generate more results. Also, the process will attempt to exit gracefully if it comes across any errors or keyboardinterrupt, so it's fine to stop the cell in the middle - the earlier results will still be saved. 

In [None]:
param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],
    'max_depth': [None, 5, 10, 25, 50, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'use_start_clusters': [False],
    'use_trip_clusters': [True],
    'use_base_clusters': [False],
    'drop_unclustered': [False],
}

# results_df = tune_label_classifier(models.ClusterForestPredictor,
#                                 param_grid,
#                                 prev_results_df=None,
#                                 uuid_list=all_users,
#                                 n_iter=10,
#                                 random_state=42)

we can also load the results dataframe if we saved it to a csv (as we do below)

In [38]:
results_df = pd.read_csv("forest_tuning_results.csv")

In [40]:
results_df.sort_values(by='purpose_acc', axis=0, ascending=False).head(10)

Unnamed: 0.1,Unnamed: 0,n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,bootstrap,mode_acc,mode_f_macro,mode_f_weighted,purpose_acc,purpose_f_macro,purpose_f_weighted,replaced_acc,replaced_f_macro,replaced_f_weighted
36,36,500.0,100.0,2.0,1.0,sqrt,0.0,0.706012,0.241247,0.698719,0.656427,0.216679,0.641678,0.743372,0.280101,0.73902
92,92,200.0,100.0,2.0,1.0,sqrt,0.0,0.703536,0.234369,0.69614,0.655044,0.208304,0.639811,0.744364,0.282833,0.74036
24,24,200.0,500.0,2.0,1.0,sqrt,0.0,0.703536,0.234369,0.696143,0.654905,0.208298,0.639675,0.744364,0.282851,0.740348
33,33,300.0,,5.0,1.0,sqrt,0.0,0.702022,0.222827,0.692716,0.654352,0.194423,0.636333,0.739402,0.245967,0.733902
6,6,300.0,100.0,5.0,1.0,sqrt,0.0,0.702022,0.222827,0.692716,0.654352,0.194423,0.636333,0.739402,0.245967,0.733902
11,11,200.0,,5.0,1.0,log2,0.0,0.690604,0.218469,0.680862,0.65366,0.195087,0.633735,0.734865,0.235507,0.728929
88,88,100.0,,2.0,1.0,sqrt,0.0,0.704774,0.240439,0.697006,0.65283,0.213761,0.638361,0.744222,0.2901,0.740257
18,18,500.0,,2.0,1.0,sqrt,1.0,0.697482,0.210354,0.686876,0.651861,0.164918,0.632054,0.738267,0.235841,0.73176
94,94,200.0,100.0,2.0,1.0,log2,0.0,0.694318,0.234524,0.686629,0.651723,0.212654,0.635241,0.738409,0.252198,0.733763
60,60,200.0,,2.0,1.0,sqrt,1.0,0.695694,0.211631,0.685188,0.650339,0.166968,0.630126,0.738267,0.237035,0.731955


In [36]:
# save the results because it took so long to get these 
results_df.to_csv("forest_tuning_results.csv")