### imports

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt

import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq
import performance_eval
import data_wrangling


### load data

In [None]:
# Build four per-user lookup tables, each keyed by the user's UUID:
#   confirmed_trip_df_map        - the "analysis/confirmed_trip" dataframe
#   labeled_trip_df_map          - esdtq.filter_labeled_trips() of that dataframe
#   expanded_labeled_trip_df_map - esdtq.expand_userinputs() of the labeled trips
#   expanded_all_trip_df_map     - esdtq.expand_userinputs() of all confirmed trips
all_users = esta.TimeSeries.get_uuid_list()

confirmed_trip_df_map = {}
labeled_trip_df_map = {}
expanded_labeled_trip_df_map = {}
expanded_all_trip_df_map = {}
for user_id in all_users:
    confirmed_df = esta.TimeSeries.get_time_series(user_id).get_data_df(
        "analysis/confirmed_trip")
    confirmed_trip_df_map[user_id] = confirmed_df

    labeled_df = esdtq.filter_labeled_trips(confirmed_df)
    labeled_trip_df_map[user_id] = labeled_df

    expanded_labeled_trip_df_map[user_id] = esdtq.expand_userinputs(labeled_df)
    expanded_all_trip_df_map[user_id] = esdtq.expand_userinputs(confirmed_df)

### run DBSCAN at varying distance thresholds

In [None]:
# Cluster metrics for DBSCAN at several distance thresholds. Results are
# cached in a CSV so the evaluation does not have to be repeated on every
# run; delete the file to force a fresh evaluation.
RESULTS_CSV = 'DBSCAN_vary_distance_threshold.csv'

try:
    all_results_df = pd.read_csv(RESULTS_CSV)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty cache file counts as a cache miss. Anything
    # else (KeyboardInterrupt, a typo, a permissions problem) should surface
    # instead of silently triggering a re-run that overwrites the cache.
    param_grid = {
        'DBSCAN': {
            # Every hyperparameter has exactly one candidate value, so the
            # only quantity that varies between runs is the radius.
            'SVM': [False],
            'min_samples': [1],
            'gamma': [0.05],
            'C': [1],
            'size_thresh': [1],
            'purity_thresh': [1],
            # The radii are not part of the grid; they are passed to
            # run_eval_cluster_metrics through its own `radii` argument.
        },
    }
    radii = [50, 100, 150, 200]

    all_results_df = performance_eval.run_eval_cluster_metrics(
        expanded_labeled_trip_df_map,
        user_list=all_users,
        radii=radii,
        loc_type='end',
        algs=['DBSCAN'],
        param_grid=param_grid,
        n_iter=1,
        random_state=42)

    # Expand the 'params' column via data_wrangling.expand_df_dict before
    # persisting, so the cached CSV already has the expanded layout.
    all_results_df = data_wrangling.expand_df_dict(all_results_df, 'params')
    all_results_df.to_csv(RESULTS_CSV, index=False)

### plot resulting cluster metrics

In [None]:
# Three-panel summary of the DBSCAN results per distance threshold:
#   axs[0] - scatter of purity vs. request percentage, one colour per radius
#   axs[1] - box plot of purity for each radius
#   axs[2] - box plot of request percentage for each radius
plt.style.use("default")
fig, axs = plt.subplots(1,
                        3,
                        figsize=(9, 4),
                        gridspec_kw={'width_ratios': [4, 2, 2]})

# One Series per radius, in the same order as `radii`; fed to the box plots.
purities = []
req_pcts = []

# Defined here as well: the cell that runs DBSCAN only assigns `radii` when
# it has to recompute, not when the results are loaded from the cached CSV.
radii = [50, 100, 150, 200]

for i, r in enumerate(radii):
    results = all_results_df.loc[all_results_df['radius'] == r]
    purity = results.loc[:, 'purity']
    # Request percentage = number of clusters divided by number of trips.
    req_pct = results.loc[:, 'n_clusters'] / results.loc[:, 'n_trips']
    purities.append(purity)
    req_pcts.append(req_pct)
    axs[0].scatter(x=purity,
                   y=req_pct,
                   s=12,
                   color=plt.cm.tab10(i),
                   alpha=0.8,
                   edgecolors='white',
                   label=r)
axs[0].set_aspect('equal', adjustable='box')
# NOTE(review): `_legend_box` is a private matplotlib attribute, used here to
# left-align the legend title; it may need revisiting on a matplotlib upgrade.
axs[0].legend(title='Distance Threshold (m)')._legend_box.align = 'left'

axs[0].set_ylabel('Request Percentage')
axs[0].set_xlabel('Mean Cluster Purity by User')
# Same fixed range, slightly wider than [0, 1], on every panel.
axs[0].set_xlim(-0.05, 1.05)
axs[0].set_ylim(-0.05, 1.05)
axs[1].set_ylim(-0.05, 1.05)
axs[2].set_ylim(-0.05, 1.05)

axs[1].boxplot(purities)
axs[2].boxplot(req_pcts)

# Label each box with the radius it corresponds to.
axs[1].set_xticklabels(radii)
axs[2].set_xticklabels(radii)

axs[1].set_ylabel('Mean Cluster Purity by User')
axs[2].set_ylabel('Request Percentage')

axs[1].set_xlabel('Distance Threshold (m)')
axs[2].set_xlabel('Distance Threshold (m)')

# fig.suptitle('DBSCAN Cluster Quality Across All Users')
plt.tight_layout()
plt.show()
