In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy
import emission.core.get_database as edb
import logging
import folium
import math
import branca.colormap as cm
import emission.storage.timeseries.abstract_timeseries as esta
import emission.analysis.modelling.tour_model.cluster_pipeline as eamtc
import emission.analysis.modelling.tour_model.similarity as similarity
import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline
import emission.analysis.modelling.tour_model.featurization as featurization
import viz_bin_cluster as viz

In [None]:
# Take the root logger and lower its threshold to DEBUG so that the
# logging.debug(...) records emitted by the cells below pass its level check.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
# Adapted from mode_purpose_share.ipynb.
# For now the users are the real-data profiles whose install_group is "participant".
participant_filter = {"install_group": "participant"}
# Projection: return only the user_id field of each matching profile.
user_id_projection = {"user_id": 1, "_id": 0}
participant_uuid_obj = list(edb.get_profile_db().find(participant_filter, user_id_projection))
all_users = [profile["user_id"] for profile in participant_uuid_obj]

In [None]:
# Report how many participants were found; only one of them is visualized below.
# The count is passed as a logging argument so the message is only formatted
# when the record is actually emitted.
logging.debug('There are %s users, we will plot the graphs for one of them', len(all_users))
# We can choose one user without too many bins/clusters (easier to identify by colors later).
# NOTE(review): the index is hard-coded to 1, so this needs at least two participants.
user = all_users[1]

In [None]:
# Radius handed to similarity.similarity() when the trips are binned below.
# TODO: we should experiment with different values here.
# NOTE(review): the unit is presumably meters - confirm against the similarity module.
radius = 300

In [None]:
# Load the selected user's trips through the cluster pipeline's read_data helper.
trips = pipeline.read_data(uuid=user)

## Visualizing trips from all bins

In [None]:
# Build the similarity object for these trips with the chosen radius and run
# its binning step; the resulting bins are read from sim.bins in the cells below.
sim = similarity.similarity(trips, radius)
sim.bin_data()

In [None]:
# Log how many bins exist before filtering. The count is passed as a logging
# argument so the message is only formatted when the record is emitted.
logging.debug('There are %s bins before filtering', len(sim.bins))

In [None]:
# The second argument of bins_map selects which bins to draw;
# None means "plot all bins".
selected_bins = None
viz.bins_map(sim.bins, selected_bins, trips)

## Visualizing clusters on all data

Note: run the cells under "Visualizing trips from all bins" first; this section reuses the `trips` and `sim` variables created there.

In [None]:
# Copied from the cluster pipeline code so we can work with the featurization code directly.
feat = featurization.featurization(trips)
# Cluster-count search range: from 0 up to 1.5x the number of bins (rounded up).
# Named min_clusters/max_clusters rather than min/max so the Python builtins
# min() and max() are not shadowed for the rest of the kernel session.
min_clusters = 0
max_clusters = int(math.ceil(1.5 * len(sim.bins)))
feat.cluster(min_clusters=min_clusters, max_clusters=max_clusters)

In [None]:
# Log the number of clusters found. The value is passed as a logging argument
# so the message is only formatted when the record is emitted.
logging.debug('number of clusters: %d', feat.clusters)

In [None]:
# Log the cluster labels. Passing them as a logging argument defers formatting
# and also keeps the % operator from unpacking the value if it is a tuple.
logging.debug('The list of labels is %s', feat.labels)

In [None]:
# The second argument of clusters_map selects which cluster to draw;
# None means "plot all clusters".
selected_cluster = None
viz.clusters_map(feat.labels, selected_cluster, feat.points, feat.clusters)

## Data collection

In [None]:
import pandas as pd

In [None]:
# user12 (index 11 in all_users) is not typical, so the data-collection loops below skip it

In [None]:
# Collect the number of bins for every participant that is kept.
all_data_bins = []
# Index 11 is "user12", which is left out of the collected data.
kept_indices = [idx for idx in range(len(all_users)) if idx != 11]
for idx in kept_indices:
    user = all_users[idx]
    trips = pipeline.read_data(uuid=user)
    sim = similarity.similarity(trips, radius)
    sim.bin_data()
    all_data_bins.append(len(sim.bins))
all_data_bins

In [None]:
# Collect the number of clusters for every participant that is kept.
all_data_clusters = []
for i in range(len(all_users)):
    # Index 11 is "user12", which is left out of the collected data.
    if i == 11:
        continue
    user = all_users[i]
    trips = pipeline.read_data(uuid=user)
    # The trips are binned first because the bin count sets the upper bound
    # of the cluster-count search below.
    sim = similarity.similarity(trips, radius)
    sim.bin_data()
    feat = featurization.featurization(trips)
    # Named min_clusters/max_clusters rather than min/max so the Python
    # builtins min() and max() are not shadowed for the rest of the kernel session.
    min_clusters = 0
    max_clusters = int(math.ceil(1.5 * len(sim.bins)))
    feat.cluster(min_clusters=min_clusters, max_clusters=max_clusters)
    all_clusters = feat.clusters
    all_data_clusters.append(all_clusters)
all_data_clusters

In [None]:
# Row labels for the summary table: "user<N>" with N = position in all_users + 1.
# Index 11 ("user12") is left out, matching the data-collection loops, so the
# number of labels always equals the number of collected values instead of
# relying on a hand-written list that assumes exactly 13 participants.
user_index = ['user%d' % (i + 1) for i in range(len(all_users)) if i != 11]

In [None]:
# Column name -> per-user values; each list is copied so the table data is
# independent of the collection lists.
all_data = {
    'all bins': list(all_data_bins),
    'all clusters': list(all_data_clusters),
}

In [None]:
# One row per kept user, one column per metric ('all bins', 'all clusters').
df_all = pd.DataFrame(all_data, index=user_index)
df_all

In [None]:
# Bar chart of the summary table: one group of bars per user, one bar per column.
df_all.plot.bar()