This notebook contains code to generate data/figures needed to finish the TRB poster.


### imports


In [1]:
%load_ext autoreload
%autoreload 2

from uuid import UUID
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm

import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq
import emission.core.get_database as edb
import emission.analysis.modelling.trip_model.run_model as eamtr
import mapping
import data_wrangling
from clustering import add_loc_clusters

storage not configured, falling back to sample, default configuration
URL not formatted, defaulting to "Stage_database"
Connecting to database URL localhost
The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do m

### load data


In [3]:
# Accounts whose trips are analyzed in this notebook.
user1_email = "stage_bvDUDQJ44w8"
user2_email = "stage_d3GBLDSVzn4"


def _lookup_uuid(email):
    """Return the 'uuid' field of the first uuid-db record matching `email`."""
    matches = list(edb.get_uuid_db().find({"user_email": email}))
    return matches[0]['uuid']


user1_uuid = _lookup_uuid(user1_email)
user2_uuid = _lookup_uuid(user2_email)

# user 2 is processed before user 1
uuids = [user2_uuid, user1_uuid]

# per-user results, all keyed by uuid
confirmed_trip_df_map = {}
labeled_trip_df_map = {}
expanded_trip_df_map = {}
ct_entry = {}

for user_id in uuids:
    timeseries = esta.TimeSeries.get_time_series(user_id)
    confirmed_df = timeseries.get_data_df("analysis/confirmed_trip")
    ct_entry[user_id] = eamtr._get_training_data(user_id, None)

    labeled_df = esdtq.filter_labeled_trips(confirmed_df)
    confirmed_trip_df_map[user_id] = confirmed_df
    labeled_trip_df_map[user_id] = labeled_df
    expanded_trip_df_map[user_id] = esdtq.expand_userinputs(labeled_df)

found 972 training rows
found 796 training rows


### naive fixed-width clustering from the first user's data


To check: is there a split purple cluster a bit northwest of the library?

In [4]:
# Keyword options for the naive clustering map of user 1's trips.
naive_map_options = {
    'alg': 'naive',
    'loc_type': 'end',
    'clustering_way': 'destination',
    'plot_unlabeled': False,
    'cluster_unlabeled': False,
    'radii': [50, 100, 150],
}
fig = mapping.find_plot_clusters(expanded_trip_df_map[user1_uuid],
                                 ct_entry[user1_uuid],
                                 **naive_map_options)
fig

### Shankari's data


#### maps with clusters

DBSCAN without SVM: home cluster with a blue cluster to the south that was merged in

In [6]:
# Keyword options for the DBSCAN map of user 2's trips with the SVM step off.
dbscan_no_svm_options = {
    'alg': 'DBSCAN',
    'SVM': False,
    'loc_type': 'end',
    'clustering_way': 'destination',
    'plot_unlabeled': False,
    'cluster_unlabeled': False,
    'radii': [150],
}
fig = mapping.find_plot_clusters(expanded_trip_df_map[user2_uuid],
                                 ct_entry[user2_uuid],
                                 **dbscan_no_svm_options)
fig

DBSCAN + SVM: home cluster and blue cluster to the south have been separated

In [9]:
# Keyword options for the DBSCAN map of user 2's trips with the SVM step on.
dbscan_svm_options = {
    'alg': 'DBSCAN',
    'SVM': True,
    'loc_type': 'end',
    'clustering_way': 'destination',
    'plot_unlabeled': False,
    'cluster_unlabeled': False,
    'radii': [150],
}
fig = mapping.find_plot_clusters(expanded_trip_df_map[user2_uuid],
                                 ct_entry[user2_uuid],
                                 **dbscan_svm_options)
fig

#### SVM decision boundaries

##### setup

In [None]:
# Apply matplotlib's "default" style to the figures drawn from here on.
plt.style.use("default")


def vis_pred(cluster):
    """Fit an RBF-kernel SVM on one cluster's trips and plot its decision regions.

    A StandardScaler + SVC pipeline is trained to predict 'purpose_confirm'
    from the ('end_lon', 'end_lat') coordinates. The model is then evaluated
    on a 500 x 500 grid covering the bounding box of the points plus a 5%
    margin on each side. The predicted classes are drawn as an image and the
    training points are scattered on top, colored by their actual label.

    Args:
        cluster: DataFrame with 'end_lon', 'end_lat' and 'purpose_confirm'
            columns. Every row is used for training.

    Returns:
        None. The figure is shown with plt.show().
    """
    # set up model
    X = cluster.loc[:, ['end_lon', 'end_lat']]
    Y = cluster.loc[:, 'purpose_confirm']

    model = make_pipeline(
        StandardScaler(),
        svm.SVC(kernel='rbf', gamma=0.05, C=1, decision_function_shape='ovr'))

    # train model
    # Fit on a plain array: the grid given to predict() below is a plain
    # array too, and scikit-learn warns when a model fitted with feature
    # names is later given input without them.
    # fit() wants Y as an array, not a column vector
    model.fit(X.to_numpy(), Y.to_list())

    # vars for visualizing decision functions
    # (scalar bounds, so np.linspace returns 1-D arrays for meshgrid)
    min_lat = X['end_lat'].min()
    max_lat = X['end_lat'].max()
    min_lon = X['end_lon'].min()
    max_lon = X['end_lon'].max()
    width = max_lon - min_lon
    height = max_lat - min_lat
    xx, yy = np.meshgrid(
        np.linspace(min_lon - 0.05 * width, max_lon + 0.05 * width, 500),
        np.linspace(min_lat - 0.05 * height, max_lat + 0.05 * height, 500))

    # map each class label to an integer so it can be color-coded
    num_classes = len(model.classes_)
    label_map = {label: i for i, label in enumerate(model.classes_)}

    # grid columns are (lon, lat), the same order the model was trained on
    pred = model.predict(np.c_[xx.ravel(), yy.ravel()])
    pred = np.array([label_map[p] for p in pred]).reshape(xx.shape)

    fig, ax = plt.subplots(figsize=(11, 8))

    ## Prepare bins for the normalizer
    ## normalize the colors
    # boundaries sit at -0.5, 0.5, ..., num_classes - 0.5 so that every
    # integer class index falls into its own bin
    norm_bins = np.sort([*label_map.values()]) + 0.5
    norm_bins = np.insert(norm_bins, 0, np.min(norm_bins) - 1.0)
    norm = matplotlib.colors.BoundaryNorm(norm_bins, num_classes, clip=True)

    if num_classes <= 10:
        cm = plt.cm.tab10
    else:
        cm = plt.cm.tab20

    # predicted class for every grid point
    ax.imshow(
        pred,
        interpolation="none",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        aspect="auto",
        origin="lower",
        cmap=cm,
        norm=norm,
    )

    # training points, colored by their actual label with the same
    # colormap and normalizer as the image
    ax.scatter(
        X['end_lon'],
        X['end_lat'],
        c=Y.map(label_map).to_list(),
        cmap=cm,
        edgecolors="k",
        norm=norm,
    )
    ax.set_xticks([])
    ax.set_yticks([])
    fig.subplots_adjust(bottom=0.1, top=0.9, left=0.5, right=1)

    plt.axis('scaled')
    plt.show()

In [None]:
def setup(user_df,
          alg,
          loc_type,
          radii=[50, 100, 150, 200],
          cluster_unlabeled=False):
    """Prepare a user's trips and attach location-cluster labels.

    Adapted from find_plot_clusters() in mapping.

    Args:
        user_df: trips dataframe with 'start_loc', 'end_loc' and
            'user_input' columns.
        alg: passed through to add_loc_clusters.
        loc_type: passed through to add_loc_clusters.
        radii: passed through to add_loc_clusters.
        cluster_unlabeled: if True, every trip with a location is clustered;
            otherwise only trips whose user_input is not an empty dict.

    Returns:
        The result of add_loc_clusters for the selected trips.
    """
    # Trips missing a start or end location are discarded. The index is then
    # renumbered from zero because oursim needs the position of each trip to
    # match its nominal index.
    located_trips = user_df.dropna(subset=['start_loc', 'end_loc'])
    located_trips = located_trips.reset_index(drop=True)

    # expand the 'start/end_loc' column into 'start/end_lat/lon' columns
    located_trips = data_wrangling.expand_coords(located_trips)

    # always evaluated, even when all trips end up being clustered
    has_labels = located_trips.user_input != {}
    labeled_only = located_trips.loc[has_labels]

    if cluster_unlabeled:
        trips_to_cluster = located_trips
    else:
        trips_to_cluster = labeled_only

    return add_loc_clusters(trips_to_cluster,
                            radii=radii,
                            alg=alg,
                            loc_type=loc_type,
                            min_samples=2)

In [None]:
# Cluster user 2's labeled trip end locations with DBSCAN at a 150 m radius.
user2_df = setup(expanded_trip_df_map[user2_uuid],
                 alg='DBSCAN',
                 loc_type='end',
                 radii=[150],
                 cluster_unlabeled=False)

# Column holding the cluster index, and the columns vis_pred() reads.
cluster_col = 'end_DBSCAN_clusters_150_m'
svm_cols = ['end_lat', 'end_lon', 'purpose_confirm']

# Select rows from user2_df itself: the name used for the mask before
# (shankari_df) is not defined anywhere in this notebook.
cluster_1 = user2_df.loc[user2_df[cluster_col] == 1, svm_cols].dropna()
cluster_4 = user2_df.loc[user2_df[cluster_col] == 4, svm_cols].dropna()

#### plots

I forgot which index corresponded to the cluster I used, but it should be one of the two below. 

In [None]:
# Fit the SVM on cluster_1 and show its decision regions.
vis_pred(cluster_1)

In [None]:
# Fit the SVM on cluster_4 and show its decision regions.
vis_pred(cluster_4)