# Explore multiple datasets

In this notebook, we are going to experiment with characterising the three datasets that we have in terms of data quality and demographic characteristics.

This notebook is intended to be run on the exported, federated JSON file (read below from `/tmp/federated_trip_only_dataset.json`). The file should be exported using `Federating and saving multiple datasets.ipynb`

### First, we read the data and extract the most common purpose labels

In [None]:
import pandas as pd
import numpy as np
import geojson as gj
import sklearn.cluster as sc
import sklearn.metrics.pairwise as smp

In [None]:
import json
import copy

In [None]:
import folium
import branca.element as bre

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
import seaborn as sns

In [None]:
from IPython import display
from uuid import UUID

import bson.json_util as bju
import bson.objectid as boi

In [None]:
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq

### Read data and setup variables

In [None]:
# Load the federated trip export. The context manager guarantees that the file
# handle is closed once the frame has been read (a bare open() call leaves it
# open until it happens to be garbage collected).
with open("/tmp/federated_trip_only_dataset.json") as trip_fp:
    all_expanded_df = pd.read_json(trip_fp, orient="records", typ="frame")

# The id columns hold mappings with an "$oid" key; rebuild the ObjectId instances from them
for id_col in ["_id", "raw_trip", "start_place", "end_place", "cleaned_trip"]:
    all_expanded_df[id_col] = all_expanded_df[id_col].apply(lambda i: boi.ObjectId(i["$oid"]))

# Likewise, user ids hold mappings with a "$uuid" key
all_expanded_df["user_id"] = all_expanded_df["user_id"].apply(lambda u: UUID(u["$uuid"]))
all_expanded_df.tail()

In [None]:
# Inspect which columns are available in the federated trip dataframe
all_expanded_df.columns

In [None]:
def get_unique_program(user_id):
    """Return the one program that every trip of `user_id` is tagged with.

    Reads the module-level `all_expanded_df`. Raises AssertionError when the
    user's trips carry zero or more than one distinct `program` value.
    """
    trips_for_user = all_expanded_df.loc[all_expanded_df.user_id == user_id]
    all_programs = trips_for_user["program"].unique()
    assert len(all_programs) == 1, f"all_programs = {all_programs}"
    return all_programs[0]

# One row per distinct user_id seen in the trips (entries equal to 0 are dropped),
# indexed by user_id and annotated with the program the user belongs to.
participant_df = (
    pd.DataFrame(all_expanded_df.user_id.unique(), columns=["user_id"])
    .loc[lambda frame: frame.user_id != 0]
    .set_index("user_id", drop=True)
)
participant_df["program"] = list(map(get_unique_program, participant_df.index))
participant_df

In [None]:
# Per-user store of clustering artifacts, keyed by user_id; add_loc_clusters
# fills in the distance matrices and fitted location models for each user
modeling_support_objects = {}

In [None]:
# DBSCAN neighbourhood radius. The point model is fitted on the precomputed
# distance matrices produced by get_distance_matrix, so the radius is in the
# same unit as those matrices (meters).
FINAL_RADIUS = 500
# Template estimators; add_loc_clusters copies the point model before each fit.
FINAL_POINT_DBSCAN = sc.DBSCAN(eps=FINAL_RADIUS, min_samples=2, metric="precomputed")
# NOTE(review): the trip-level model (twice the point radius) is not used in the visible cells
FINAL_TRIP_DBSCAN = sc.DBSCAN(eps=FINAL_RADIUS * 2, min_samples=2, metric="precomputed")

### Standard functions (currently copied over from other notebooks; should be refactored into a python file)

In [None]:
def get_loc_df(loc_series):
    """Turn a series of point objects into a longitude/latitude dataframe.

    Each element of `loc_series` must support `["coordinates"]` and yield a
    two-item sequence, which is stored in the "longitude" and "latitude"
    columns in that order. The result has a fresh 0..n-1 index; the index of
    `loc_series` is not carried over.
    """
    coordinate_pairs = [point["coordinates"] for point in loc_series]
    return pd.DataFrame(coordinate_pairs, columns=["longitude", "latitude"])

In [None]:
def get_distance_matrix(loc_df):
    """Return the pairwise haversine distance matrix, in meters, for `loc_df`.

    `loc_df` must have "latitude" and "longitude" columns in degrees. The
    result is a square dataframe with a 0..n-1 index and columns, where entry
    (i, j) is the great-circle distance between rows i and j.
    """
    EARTH_RADIUS = 6371000  # meters
    # haversine_distances expects (latitude, longitude) pairs in radians
    radians_lat_lon = np.radians(loc_df[["latitude", "longitude"]])
    # scale the angular distances by the radius; the constant was previously
    # defined but the literal was repeated here instead of being used
    dist_matrix_meters = pd.DataFrame(smp.haversine_distances(radians_lat_lon, radians_lat_lon) * EARTH_RADIUS)
    return dist_matrix_meters

In [None]:
def add_loc_clusters(user_id, modeling_support_objects, trip_df):
    """Cluster one user's trip start and end locations and record the labels.

    For the rows of `trip_df` belonging to `user_id`, fits a copy of
    FINAL_POINT_DBSCAN on the start-location distance matrix and another on
    the end-location distance matrix. The resulting labels are written to the
    "start_loc_cluster" and "end_loc_cluster" columns of `trip_df` (in place).
    The distance matrices and fitted models are stored under
    `modeling_support_objects[user_id]`.

    Returns `trip_df`.
    """
    user_trips = trip_df.loc[trip_df.user_id == user_id]
    user_rows = user_trips.index

    start_dists = get_distance_matrix(get_loc_df(user_trips["start_loc"]))
    end_dists = get_distance_matrix(get_loc_df(user_trips["end_loc"]))

    # fit independent copies so the shared template estimator is never modified
    start_model, end_model = (
        copy.copy(FINAL_POINT_DBSCAN).fit(dists) for dists in (start_dists, end_dists)
    )

    trip_df.loc[user_rows, "start_loc_cluster"] = start_model.labels_
    trip_df.loc[user_rows, "end_loc_cluster"] = end_model.labels_

    # create the per-user entry on first use, then record this run's artifacts
    if modeling_support_objects.get(user_id) is None:
        modeling_support_objects[user_id] = {}
    modeling_support_objects[user_id].update({
        "start_distance_matrix": start_dists,
        "end_distance_matrix": end_dists,
        "start_loc_model": start_model,
        "end_loc_model": end_model,
    })

    return trip_df

In [None]:
def add_trip_clusters_dbscan(user_id, trip_df):
    """Assign a trip cluster to each of `user_id`'s trips from its location clusters.

    The user's trips are grouped by (start_loc_cluster, end_loc_cluster). A
    combination is valid when neither label is -1 (DBSCAN noise). Valid
    combinations are numbered 0, 1, ... in group order, and that number is
    written to the "trip_cluster_dbscan" column of `trip_df` (in place) for
    every trip in the group. Trips in an invalid combination get -1. Rows of
    other users are not touched.

    Returns `trip_df`.
    """
    user_trip_df = trip_df[trip_df.user_id == user_id]
    all_combos = user_trip_df.groupby(["start_loc_cluster", "end_loc_cluster"])
    all_combos_dict = dict(all_combos.groups)
    valid_combos = [p for p in all_combos_dict if p[0] != -1 and p[1] != -1]
    print(f"After validating, all_combos {len(all_combos_dict)} -> {len(valid_combos)}")
    # Map each valid combination to its cluster number once. A plain dict
    # lookup replaces comparing a Series of tuples against a tuple for every
    # group, which was quadratic and relied on pandas treating the tuple as a
    # scalar rather than as a list-like.
    combo_to_cluster = {combo: cluster_id for cluster_id, combo in enumerate(valid_combos)}
    for g, idxlist in all_combos_dict.items():
        print(g, idxlist)
        cluster_id = combo_to_cluster.get(g)
        if cluster_id is None:
            print(f"invalid combo {g} found for entries {idxlist}, trip is not in a cluster")
            trip_df.loc[idxlist, "trip_cluster_dbscan"] = -1
        else:
            print(f"valid combo {g} found for entries {idxlist}, setting trip cluster to {cluster_id}")
            trip_df.loc[idxlist, "trip_cluster_dbscan"] = cluster_id
    return trip_df

In [None]:
def update_basic_stats(user_id, participant_df, trip_df):
    """Compute per-user labeling and clustering statistics and store them.

    Reads the rows of `trip_df` belonging to `user_id` and writes these cells
    into row `user_id` of `participant_df` (in place), creating the columns on
    first use:

    - n_labeled_trips: number of trips for the user
    - unique_label_combos: list of distinct
      (mode_confirm, purpose_confirm, replaced_mode) tuples
    - start_loc_in_cluster / end_loc_in_cluster: trips whose start / end
      location label is not -1
    - trip_in_cluster_dbscan: trips whose trip cluster label is not -1
    - n_clusters_dbscan: number of distinct trip clusters (labels other than -1)

    Returns `participant_df`.
    """
    user_trip_df = trip_df[trip_df.user_id == user_id]
    trip_labels = user_trip_df.trip_cluster_dbscan
    basic_stats = {}
    basic_stats["n_labeled_trips"] = len(user_trip_df)
    basic_stats["unique_label_combos"] = list(user_trip_df.groupby(["mode_confirm", "purpose_confirm", "replaced_mode"]).groups)
    basic_stats["start_loc_in_cluster"] = np.count_nonzero(user_trip_df.start_loc_cluster != -1)
    # -1 is the noise label; this previously compared against 1, which counted
    # noise points and dropped the trips that ended in cluster 1
    basic_stats["end_loc_in_cluster"] = np.count_nonzero(user_trip_df.end_loc_cluster != -1)
    basic_stats["trip_in_cluster_dbscan"] = np.count_nonzero(trip_labels != -1)
    # Cluster labels start at 0, so the maximum label is one less than the
    # number of clusters (and -1 when there are none); count distinct labels instead
    basic_stats["n_clusters_dbscan"] = trip_labels[trip_labels != -1].nunique()
    # Write one cell at a time: a single multi-column .loc assignment with a
    # list-valued entry is ambiguous between "one list per cell" and "one
    # element per column". The list column must be object dtype to hold a list.
    for col, val in basic_stats.items():
        if col not in participant_df.columns:
            col_dtype = object if isinstance(val, list) else float
            participant_df[col] = pd.Series(index=participant_df.index, dtype=col_dtype)
        participant_df.at[user_id, col] = val
    return participant_df

Target exploratory analysis:

- number of users
- number of trips
- labeled trip/user distribution
- number of unique combinations of labels
- distribution of unique combination of labels (overall)
- distribution of unique combination of labels (per-user)
- number of trips whose end point is in a cluster
- number of trips whose start point is in a cluster
- number of trips where trip is in a cluster
- number of clusters

In [None]:
# Per participant: cluster start/end locations, derive trip clusters from
# them, then refresh that participant's summary row.
for u in participant_df.index:
    all_expanded_df = add_loc_clusters(u, modeling_support_objects, all_expanded_df)
    all_expanded_df = add_trip_clusters_dbscan(u, all_expanded_df)
    participant_df = update_basic_stats(u, participant_df, all_expanded_df)

### Again, let's focus on one dataset before generalizing to other datasets

In [None]:
# Restrict the per-participant summary to the "minipilot" program and preview two rows
minipilot_df = participant_df[participant_df.program == "minipilot"]
minipilot_df.head(n=2)

In [None]:
# Bar chart of the trip and cluster counts for every minipilot participant
stat_cols = ["n_labeled_trips", "start_loc_in_cluster", "end_loc_in_cluster", "trip_in_cluster_dbscan", "n_clusters_dbscan"]
minipilot_df[stat_cols].plot.bar(figsize=(20, 5))

# Final results, generalized to the entire dataset



### First, let's just display everything, without grouping by program

In [None]:
# Run the per-user pipeline for every participant: location clusters, then
# trip clusters, then the summary row in participant_df.
# NOTE(review): this repeats the loop from the earlier cell verbatim; it looks
# redundant when the notebook is run top to bottom - confirm before removing.
for u in participant_df.index:
    all_expanded_df = add_trip_clusters_dbscan(u, add_loc_clusters(u, modeling_support_objects,all_expanded_df))
    participant_df = update_basic_stats(u, participant_df, all_expanded_df)

In [None]:
# Trip and cluster counts for every participant across all programs; the
# index is not used for the x axis (use_index=False)
stat_cols = ["n_labeled_trips", "start_loc_in_cluster", "end_loc_in_cluster", "trip_in_cluster_dbscan", "n_clusters_dbscan"]
participant_df[stat_cols].plot.bar(use_index=False, figsize=(30, 10))

### Next, let's group by program to see if there are consistent program-level differences

In [None]:
# Trip and cluster counts for the participants of the "minipilot" program
stat_cols = ["n_labeled_trips", "start_loc_in_cluster", "end_loc_in_cluster", "trip_in_cluster_dbscan", "n_clusters_dbscan"]
is_minipilot = participant_df.program == "minipilot"
participant_df.loc[is_minipilot, stat_cols].plot.bar(figsize=(20, 5), use_index=False)

In [None]:
# Trip and cluster counts for the participants of the "nrel_lh" program
stat_cols = ["n_labeled_trips", "start_loc_in_cluster", "end_loc_in_cluster", "trip_in_cluster_dbscan", "n_clusters_dbscan"]
is_nrel_lh = participant_df.program == "nrel_lh"
participant_df.loc[is_nrel_lh, stat_cols].plot.bar(figsize=(20, 5), use_index=False)

In [None]:
# Trip and cluster counts for the participants of the "stage" program
stat_cols = ["n_labeled_trips", "start_loc_in_cluster", "end_loc_in_cluster", "trip_in_cluster_dbscan", "n_clusters_dbscan"]
is_stage = participant_df.program == "stage"
participant_df.loc[is_stage, stat_cols].plot.bar(figsize=(20, 5), use_index=False)

### Assessing clustering effectiveness

Assuming that fewer clusters are better than more because there is more commonality, we can display the ratio of clusters to trips in clusters.

In [None]:
# Clusters per clustered trip for each participant (lower means more trips share a cluster)
participant_df["cluster_trip_ratio"] = participant_df["n_clusters_dbscan"].div(participant_df["trip_in_cluster_dbscan"])

In [None]:
# using matplotlib's scatter here instead of pandas.plot since it is non-trivial to use the index as the x axis
# https://stackoverflow.com/questions/49834883/scatter-plot-form-dataframe-with-index-on-x-axis
# x=df.index does not work for me, may be due to an older version of pandas
programs = participant_df.program.unique()
# At least 3 colours keeps the palette of the three-program case; more programs get more colours
# instead of running off the end of the colour list.
color_list = plt.get_cmap("Accent", max(len(programs), 3)).colors
# Create the figure and axes together so that figsize applies to the plot that is drawn
# (a standalone plt.Figure(...) is never used by plt.scatter).
fig, ax = plt.subplots(figsize=(10, 5))
for i, p in enumerate(programs):
    curr_p_df = participant_df[participant_df.program == p]
    # string x values are plotted as categories, one position per participant
    ax.scatter([str(u) for u in curr_p_df.index], curr_p_df["cluster_trip_ratio"], color=color_list[i], label=p)
# Replace the long user id tick labels with running numbers; fix the tick positions first
# so that labels and ticks stay aligned.
ax.set_xticks(range(len(participant_df)))
ax.set_xticklabels(range(len(participant_df)))
ax.set_xlabel("participant")
ax.set_ylabel("cluster_trip_ratio")
ax.legend()

The NREL LH program does in fact have a better cluster ratio overall than the other two programs, but even in the other two programs most of the ratios are fairly low. We can't help everybody: a large number of people are still going to have to label more than half their trips. Even so, it is gratifying to see that the maximum ratio overall is only a bit higher than 0.7.

The same data with a slightly different visualization.

In [None]:
# Distribution of the cluster/trip ratio, one box per program
participant_df.boxplot(column="cluster_trip_ratio", by="program")