### imports

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from uuid import UUID

from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt


# HACK: jupyter doesn't work properly through my vscode setup, so `emission`
# can't be imported unless the server checkout is added to sys.path by hand.
# TODO: remove this (and the machine-specific path below) before pushing.
###
import sys

sys.path.append('/Users/hlu2/Documents/GitHub/e-mission-server/')
###

import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq
import emission.core.get_database as edb
import mapping

### load data

In [None]:
# to see the same outputs I described, put in the unique tokens for these users
email0 = "replace this"  # shankari
email1 = "replace this"  # tom
user0 = list(edb.get_uuid_db().find({"user_email": email0}))[0]['uuid']
user1 = list(edb.get_uuid_db().find({"user_email": email1}))[0]['uuid']
user2 = UUID('replace this')  # hannah

# Besides the three named users, load a handful of randomly chosen ones.
# The sample is seeded so that Restart & Run All always picks the same users.
RANDOM_USER_SEED = 0
NUM_RANDOM_USERS = 10

all_users = esta.TimeSeries.get_uuid_list()
rng = np.random.default_rng(RANDOM_USER_SEED)
# sampling without replacement raises if we ask for more users than exist,
# so never request more than are available
random_users = rng.choice(all_users,
                          size=min(NUM_RANDOM_USERS, len(all_users)),
                          replace=False)
user_list = np.append([user0, user1, user2], random_users)

# every map below is keyed by the user's position in user_list
# (0 = shankari, 1 = tom, 2 = hannah, 3+ = random users)
confirmed_trip_df_map = {}
labeled_trip_df_map = {}
expanded_labeled_trip_df_map = {}
expanded_all_trip_df_map = {}
for i, u in enumerate(user_list):
    print(u)
    ts = esta.TimeSeries.get_time_series(u)
    ct_df = ts.get_data_df("analysis/confirmed_trip")

    confirmed_trip_df_map[i] = ct_df
    labeled_trip_df_map[i] = esdtq.filter_labeled_trips(ct_df)
    # expanded_* versions are the same trips passed through expand_userinputs()
    expanded_labeled_trip_df_map[i] = esdtq.expand_userinputs(
        labeled_trip_df_map[i])
    expanded_all_trip_df_map[i] = esdtq.expand_userinputs(
        confirmed_trip_df_map[i])


### SVM exploration for single clusters

#### Set up data so that we can look at some specific clusters first. 

In [None]:
import data_wrangling
from clustering import *


def setup(user_df,
          alg,
          loc_type,
          radii=None,
          cluster_unlabeled=False):
    """ Clean one user's trips and attach location-cluster labels to them.

        Copied and modified from plot_clusters() in mapping.

        Args:
            user_df: trip dataframe; must have 'start_loc', 'end_loc' and
                'user_input' columns.
            alg: clustering algorithm name, forwarded to add_loc_clusters().
            loc_type: which trip location to cluster on, forwarded to
                add_loc_clusters().
            radii: list of cluster radii forwarded to add_loc_clusters().
                Defaults to [50, 100, 150, 200]; a fresh list is built on
                every call so the default is never shared between calls.
            cluster_unlabeled: if True, cluster all trips; otherwise only
                trips whose 'user_input' is not an empty dict.

        Returns:
            The result of add_loc_clusters() for the selected trips
            (presumably the dataframe with one cluster column per radius
            added - verify against clustering.add_loc_clusters).
    """
    if radii is None:
        radii = [50, 100, 150, 200]

    # clean up the dataframe by dropping entries with NaN locations and
    # resetting index because oursim needs the position of each trip to match
    # its nominal index
    all_trips_df = user_df.dropna(subset=['start_loc', 'end_loc']).reset_index(
        drop=True)

    # expand the 'start/end_loc' column into 'start/end_lat/lon' columns
    all_trips_df = data_wrangling.expand_coords(all_trips_df)

    # a trip counts as labeled when its user_input dict is non-empty
    labeled_trips_df = all_trips_df.loc[all_trips_df.user_input != {}]
    df_for_cluster = all_trips_df if cluster_unlabeled else labeled_trips_df

    df_for_cluster = add_loc_clusters(df_for_cluster,
                                      radii=radii,
                                      alg=alg,
                                      loc_type=loc_type,
                                      min_samples=2)

    return df_for_cluster


# Run the same DBSCAN end-location clustering (labeled trips only) for the
# three named users. The radii list literal is re-evaluated for every call, so
# each setup() invocation receives its own list.
shankari, tom, hannah = [
    setup(expanded_all_trip_df_map[user_idx],
          alg='DBSCAN',
          loc_type='end',
          radii=[100, 150, 200, 250],
          cluster_unlabeled=False)
    for user_idx in (0, 1, 2)  # 0 = shankari, 1 = tom, 2 = hannah
]


In [None]:
alg = 'DBSCAN'
loc_type = 'end'


def _select_cluster_points(trips, radius, cluster_label):
    """ Return the end coordinates and confirmed purpose of the trips that
        fall in cluster `cluster_label` at the given radius, dropping rows
        where any of those three values is missing.
    """
    in_cluster = trips[f'{loc_type}_{alg}_clusters_{radius}_m'] == cluster_label
    return trips.loc[in_cluster,
                     ['end_lat', 'end_lon', 'purpose_confirm']].dropna()


# hand-picked example clusters to explore
c1 = _select_cluster_points(hannah, 150, 1)
c2 = _select_cluster_points(shankari, 200, 1)
c3 = _select_cluster_points(shankari, 150, 1)
c4 = _select_cluster_points(shankari, 150, 4)

clusters = [c1, c2, c3, c4]


In [None]:
# fraction of the trips in cluster c4 carrying each confirmed purpose
c4[['purpose_confirm']].value_counts().div(len(c4))


#### kernel comparison

Intuitively, I think it makes sense to use an RBF kernel since we expect our clusters to be shaped round-ish-ly. Let's do a quick comparison just to see the different results. 

Plot decision functions

In [None]:
# SVC settings for each kernel being compared. Every model shares C=1 and the
# 'ovr' decision function shape and is wrapped in a StandardScaler pipeline.
kernel_params = {
    'linear_kernel': dict(kernel='linear'),
    'radial_kernel': dict(kernel='rbf', gamma=0.1),
    'polynom2_kernel': dict(kernel='poly', degree=2),
    'polynom4_kernel': dict(kernel='poly', degree=4),
}

for c, cluster in enumerate(clusters):
    # set up the training data
    X = cluster.loc[:, ['end_lon', 'end_lat']]
    # fit() wants Y as an array, not a column vector
    Y = cluster.loc[:, 'purpose_confirm'].to_list()
    # fit on a plain array: the evaluation grid below has no column names, and
    # sklearn warns when a model fitted with feature names is given bare arrays
    features = X.to_numpy()

    # 500x500 grid spanning the cluster's bounding box, used to visualize the
    # decision functions (scalar bounds, so linspace returns 1-D arrays)
    xx, yy = np.meshgrid(
        np.linspace(X['end_lon'].min(), X['end_lon'].max(), 500),
        np.linspace(X['end_lat'].min(), X['end_lat'].max(), 500))
    grid = np.c_[xx.ravel(), yy.ravel()]

    for model_name, params in kernel_params.items():
        print(f'model {model_name} for cluster {c}')

        # train model
        model = make_pipeline(
            StandardScaler(),
            svm.SVC(C=1, decision_function_shape='ovr', **params))
        model.fit(features, Y)

        num_classes = len(model.classes_)
        decisions = model.decision_function(grid)
        if decisions.ndim == 1:
            # With exactly two classes SVC returns a single signed score per
            # sample (positive favours classes_[1]) instead of one column per
            # class; expand it so each class still gets its own panel.
            decisions = np.c_[-decisions, decisions]
        decisions = decisions.reshape((xx.shape[0], xx.shape[1], num_classes))

        # one panel per class, two panels per row
        fig, axs = plt.subplots(num_classes // 2 + num_classes % 2,
                                2,
                                sharex=True,
                                sharey=True,
                                figsize=(6, 6))

        axs = axs.flatten()
        for class_idx in range(num_classes):
            axs[class_idx].set_title(model.classes_[class_idx])
            axs[class_idx].imshow(
                decisions[:, :, class_idx],
                interpolation="nearest",
                extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                aspect="auto",
                origin="lower",
            )

            # overlay the training points on every panel
            axs[class_idx].scatter(X['end_lon'],
                                   X['end_lat'],
                                   edgecolors="k")
        plt.axis('scaled')
        plt.tight_layout()
        plt.show()


Plot actual label predictions:

In [None]:
# SVC settings for each kernel being compared. Every model shares C=1 and the
# 'ovr' decision function shape and is wrapped in a StandardScaler pipeline.
kernel_params = {
    'linear_kernel': dict(kernel='linear'),
    'radial_kernel': dict(kernel='rbf', gamma=0.1),
    'polynom2_kernel': dict(kernel='poly', degree=2),
    'polynom4_kernel': dict(kernel='poly', degree=4),
}

for c, cluster in enumerate(clusters):
    # set up the training data
    X = cluster.loc[:, ['end_lon', 'end_lat']]
    # fit() wants Y as an array, not a column vector
    Y = cluster.loc[:, 'purpose_confirm'].to_list()
    # fit on a plain array: the evaluation grid below has no column names, and
    # sklearn warns when a model fitted with feature names is given bare arrays
    features = X.to_numpy()

    # 500x500 grid spanning the cluster's bounding box, used to visualize the
    # predicted label regions (scalar bounds, so linspace returns 1-D arrays)
    xx, yy = np.meshgrid(
        np.linspace(X['end_lon'].min(), X['end_lon'].max(), 500),
        np.linspace(X['end_lat'].min(), X['end_lat'].max(), 500))
    grid = np.c_[xx.ravel(), yy.ravel()]

    for model_name, params in kernel_params.items():
        print(f'model {model_name} for cluster {c}')

        # train model
        model = make_pipeline(
            StandardScaler(),
            svm.SVC(C=1, decision_function_shape='ovr', **params))
        model.fit(features, Y)

        # imshow needs numbers, so encode each label as its position in
        # model.classes_
        label_map = {label: idx for idx, label in enumerate(model.classes_)}

        pred = model.predict(grid)
        pred = np.array([label_map[p] for p in pred]).reshape(
            (xx.shape[0], xx.shape[1]))

        fig, ax = plt.subplots()

        # vmin/vmax pin label index k to the k-th tab10 colour, so colours
        # line up with the printed label_map
        ax.imshow(pred,
                  interpolation="none",
                  extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                  aspect="auto",
                  origin="lower",
                  cmap=plt.cm.tab10,
                  vmin=0,
                  vmax=9)

        # overlay the training points
        ax.scatter(X['end_lon'], X['end_lat'], edgecolors="k")
        plt.axis('scaled')
        plt.tight_layout()
        print(label_map)
        plt.show()


Yep, we definitely want to use a radial kernel. 

#### SVM issues

We should do some hyperparameter tuning, but tbh these default values are doing pretty well so I'm going to push that off for later. 

A bigger concern: sometimes, the clusters for two labels naturally overlap - for example, home may overlap with roundtrips because the end destination is exactly the same. SVM is going to attempt to distinguish between these but fail epically.

See below for a demo of the 'overlap problem'. In the map, notice how orange 'home' points totally overlap with green 'pick_drop_person' points. 

In [None]:
# DBSCAN end-location clusters for user 0 (all trips) at the 150m radius only;
# cluster_unlabeled / plot_unlabeled are left at plot_clusters()'s defaults
fig = mapping.plot_clusters(expanded_all_trip_df_map[0],
                            radii=[150],
                            loc_type='end',
                            alg='DBSCAN')
fig

Now, look at how SVM attempts to separate the sub-clusters:

In [None]:
# this is just copying the above code into a convenient function
def vis_pred(cluster):
    """ Fit an RBF-kernel SVM on one cluster and plot its predicted regions.

        Args:
            cluster: dataframe with 'end_lon', 'end_lat' and
                'purpose_confirm' columns and no missing values in them.

        Prints the distinct labels predicted anywhere on the grid and the
        label -> colour-index map, then shows a figure of the predicted label
        over the cluster's bounding box with the training points overlaid.
        Returns nothing.
    """
    # set up the training data
    X = cluster.loc[:, ['end_lon', 'end_lat']]
    # fit() wants Y as an array, not a column vector
    Y = cluster.loc[:, 'purpose_confirm'].to_list()

    model = make_pipeline(
        StandardScaler(),
        svm.SVC(kernel='rbf', gamma=0.05, C=1, decision_function_shape='ovr'))

    # train model on a plain array: the evaluation grid below has no column
    # names, and sklearn warns when a model fitted with feature names is later
    # given bare arrays
    model.fit(X.to_numpy(), Y)

    # 500x500 grid spanning the cluster's bounding box (scalar bounds, so
    # linspace returns 1-D arrays)
    xx, yy = np.meshgrid(
        np.linspace(X['end_lon'].min(), X['end_lon'].max(), 500),
        np.linspace(X['end_lat'].min(), X['end_lat'].max(), 500))

    # imshow needs numbers, so encode each label as its position in
    # model.classes_
    label_map = {label: idx for idx, label in enumerate(model.classes_)}

    pred = model.predict(np.c_[xx.ravel(), yy.ravel()])
    print(np.unique(np.array(pred)))
    pred = np.array([label_map[p] for p in pred]).reshape(
        (xx.shape[0], xx.shape[1]))

    fig, ax = plt.subplots()

    # vmin/vmax pin label index k to the k-th tab10 colour, so colours line up
    # with the printed label_map
    ax.imshow(pred,
              interpolation="none",
              extent=(xx.min(), xx.max(), yy.min(), yy.max()),
              aspect="auto",
              origin="lower",
              cmap=plt.cm.tab10,
              vmin=0,
              vmax=9)

    # overlay the training points
    ax.scatter(X['end_lon'], X['end_lat'], edgecolors="k")
    plt.axis('scaled')
    plt.tight_layout()
    print(label_map)
    plt.show()

In [None]:
# c3: shankari's cluster 1 at the 150m radius
vis_pred(c3)

Couple comments: 
- there are only 3 unique labels in the output prediction for this cluster (out of 7 input labels). This is nice, because we may have some rare labels that we mostly want to ignore for now, and we won't have to pre-filter them because SVM already takes care of it. 
- SVM is able to distinguish between shopping and home, which is good. However, it can't distinguish between home and pick_drop because those destinations are inherently the same place. I think this is ok because we can just handle that using other trip features later. The concern would only be if SVM attempted to actually separate home and pick_drop (which could potentially happen if we choose bad hyperparameters or our data is sparse/noisy.)

ok, let's visualize the decision boundaries for some more clusters:

In [None]:
# c1: hannah's cluster 1 at the 150m radius
vis_pred(c1)

In [None]:
# c2: shankari's cluster 1 at the 200m radius
vis_pred(c2)

In [None]:
# c4: shankari's cluster 4 at the 150m radius
vis_pred(c4)

#### pipeline! 

New pipeline:
- make clusters via DBSCAN (containing both labeled and unlabeled trips)
  - If we use SVM, we want to use DBSCAN because that creates better clusters that actually revolve around density cores. Oursim frequently has the issue of splitting up density cores because an outlier was added to the bin early on. (note to self, this is not a good explanation, clarify later)
- if cluster has enough points and a low enough purity, then fit an SVM using labeled data
- then, use SVM to make predictions on the unlabeled (using just their lat lon data)
- (so calling it a second round of 'clustering' isn't quite the correct term, but oh well)

In [None]:
# quick test to make sure pipeline is working:
# SVM pipeline on user 0's labeled trips, over a range of radii

fig = mapping.plot_clusters(
    expanded_labeled_trip_df_map[0],
    radii=[100, 150, 200, 250, 300, 500],
    loc_type='end',
    alg='SVM',
    plot_unlabeled=True,
    cluster_unlabeled=False,
)
fig

In [None]:
# compare with original DBSCAN results:
# same user and radii as the SVM run, only the algorithm differs

fig = mapping.plot_clusters(
    expanded_labeled_trip_df_map[0],
    radii=[100, 150, 200, 250, 300, 500],
    loc_type='end',
    alg='DBSCAN',
    plot_unlabeled=True,
    cluster_unlabeled=False,
)
fig

Yay, it's working! And it seems to be doing decently well, actually. For instance, at 150m, the SVM was able to split the home/shopping clusters and library/shopping clusters that DBSCAN had merged. At 200m, it was able to separate home, library and shopping. The shopping cluster *looks* pretty weird, because there are inherently two separate shopping clusters - but SVM gave them the same 'shopping' label and thus we drew an elongated convex hull. However, if we look above, back to where we plotted the prediction boundaries, we see two separate islands of shopping. So it's lacking a little bit in principle - but would probably perform fine in predicting purpose. It's possible, though, that the two clusters could have different modes since they're located in different places (not in this data, but possible in general).

Also, there is some overfitting with 1-2 trip library clusters at 250m (but this is gone at 300m, weird). We also have an issue at 400m+ that the big shopping cluster is not being split up, maybe because the impurity threshold is too low?

ok now the real test: will the ***same*** alg+hyperparameters work for *my* data, which is on a college campus where all the buildings are much closer to each other? (This is the main issue with all the other clustering algorithms I've tried so far – I can hardcode good hyperparameters for an individual user/region, but they may not work for other regions – e.g. sprawling suburbs vs dense college campus or city downtown)

In [None]:
# SVM pipeline for user2 (hannah), same settings as for user 0
fig = mapping.plot_clusters(
    expanded_labeled_trip_df_map[2],
    radii=[100, 150, 200, 250, 300, 500],
    loc_type='end',
    alg='SVM',
    plot_unlabeled=True,
    cluster_unlabeled=False,
)
fig

In [None]:
# dbscan for user2, for comparison with the SVM result

fig = mapping.plot_clusters(
    expanded_labeled_trip_df_map[2],
    radii=[100, 150, 200, 250, 300, 500],
    loc_type='end',
    alg='DBSCAN',
    plot_unlabeled=True,
    cluster_unlabeled=False,
)
fig

Interesting... so it definitely did help, but still isn't the most optimal. ~~For 150m, it was able to split meal/school, which DBSCAN had merged. At 200m, it was able to split ac end from res end, which DBSCAN had merged; however, it didn't do any subsplitting of ac end. Maybe we want repeated iterations of SVM. This should be feasible, since we would use the same purity threshold.~~ Subsplitting is now implemented. 

omg, it worked. though maybe it overfit itself for 200m (like, why is there a cluster with a single lonely meal trip, sitting right next to a big meal cluster?). Also, it doesn't really fix the issue that some clusters have too-large diameters, but few enough points/high enough purity that SVM is not triggered (or maybe SVM is triggered, but isn't able to distinguish between them). I guess this is something we can try and address by tuning the hyperparameters. 

more checks

In [None]:
# SVM pipeline for user1 (tom), same settings as for the other users
fig = mapping.plot_clusters(
    expanded_labeled_trip_df_map[1],
    radii=[100, 150, 200, 250, 300, 500],
    loc_type='end',
    alg='SVM',
    plot_unlabeled=True,
    cluster_unlabeled=False,
)
fig

#### hyperparameter tuning

parameters to tune:
- cluster size threshold
- purity threshold
- gamma (size of RBF kernel)
- C (boundary softness/regularization)

The SVM is currently using the following parameters, which are yielding decent results but were chosen somewhat arbitrarily: 
- size threshold = 6
- purity threshold = 0.7
- gamma = 0.05
- C = 1