In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy
import emission.core.get_database as edb
import logging
import folium
import math
import branca.colormap as cm
import emission.storage.timeseries.abstract_timeseries as esta
import emission.analysis.modelling.tour_model.cluster_pipeline as eamtc
import emission.analysis.modelling.tour_model.similarity as similarity
import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline
import emission.analysis.modelling.tour_model.featurization as featurization
import viz_bin_cluster as viz

In [None]:
# Lower the root logger's threshold to DEBUG so that the logging.debug()
# calls in the cells below are not filtered out by level.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
# Query adapted from mode_purpose_share.ipynb.
# Only profiles whose install_group is "participant" (real-data users) are considered for now.
participant_filter = {"install_group": "participant"}
# Projection: return only the user_id field and drop the _id field
user_id_projection = {"user_id": 1, "_id": 0}
participant_uuid_obj = list(edb.get_profile_db().find(participant_filter, user_id_projection))
all_users = [profile["user_id"] for profile in participant_uuid_obj]

In [None]:
# Hand the count to logging as an argument instead of pre-formatting with `%`,
# so the message is only built when a DEBUG record is actually emitted.
logging.debug('There are %s users, we will plot the graphs for one of them', len(all_users))
# We use user11 (index 10 in all_users) as an example,
# since its number of bins after filtering is clearly different from its number of clusters.
# NOTE: this raises IndexError if fewer than 11 participants are present in the database.
user = all_users[10]

In [None]:
# Radius passed to pipeline.remove_noise below; we should experiment with different values here.
# NOTE(review): the unit is not visible in this notebook (presumably meters) — confirm in cluster_pipeline.
radius = 300

In [None]:
# Load the trips of the selected user through the cluster pipeline
trips = pipeline.read_data(uuid=user)

## Visualizing trips from bins above cutoff

In [None]:
# Run the pipeline's noise removal on the user's trips with the chosen radius.
# NOTE(review): judging by the section title, `bins` presumably holds only the bins above the
# cutoff and `bin_trips` the trips that belong to them — confirm in cluster_pipeline.remove_noise.
bin_trips, bins = pipeline.remove_noise(trips, radius)

In [None]:
# Pass `bins` as a logging argument rather than interpolating with `%` up front:
# the text is only formatted if the DEBUG record is emitted, and the call does not
# depend on how the `%` operator treats the container type of `bins`.
logging.debug('The list of bins is %s', bins)

In [None]:
# bins_map takes three parameters: the bins, the indices of the bins to plot, and the trips
# set the second parameter to None or [] since we need to plot all trips above cutoff
viz.bins_map(bins,None,trips)

## Visualizing clusters above cutoff

In [None]:
# Cluster the trips kept by remove_noise, passing the number of bins to the pipeline.
# NOTE(review): `clusters` appears to be a cluster count (it is logged with %d in the next cell)
# and `labels` the cluster labels of the trips — confirm against cluster_pipeline.cluster.
clusters,labels,cluster_trips, points = pipeline.cluster(bin_trips, len(bins))

In [None]:
# Pass `clusters` as a logging argument so the message is only formatted
# when the DEBUG record is actually emitted.
logging.debug('number of clusters: %d', clusters)

In [None]:
# clusters_map takes four parameters: labels, the selected labels to plot, points, and clusters
# set the second parameter to None or [] since we need to plot all clusters above cutoff
viz.clusters_map(labels,None,points,clusters)

## Visualizing the difference between specific bins and clusters above cutoff 

### Trips from filtered bins

In [None]:
# put the indices of the selected bin(s) into a list as the second parameter
# from the output above, trips in the first three bins have the same label, so we compare them here
viz.bins_map(bins,[0,1,2],trips)

### Trips in clusters

In [None]:
# put the selected label(s) into a list as the second parameter
# here only the trips labelled 1 are plotted
viz.clusters_map(labels,[1],points,clusters)

## Data collection

In [None]:
import pandas as pd

User 12 (index 11 in `all_users`) is not typical, so we exclude it from the data collected below.

In [None]:
def count_above_cutoff_bins(users, radius, skip_indices=(11,)):
    """Return the number of bins kept by remove_noise for each user.

    The work is done inside a function so that the per-user intermediates stay
    local: the notebook-level `user`, `trips`, `bin_trips` and `bins` used by the
    visualization cells are no longer overwritten when this cell runs.

    Parameters
    ----------
    users : sequence of user ids accepted by pipeline.read_data(uuid=...)
    radius : radius forwarded to pipeline.remove_noise
    skip_indices : positions in `users` to leave out (default: index 11, i.e. user12)

    Returns
    -------
    list with one bin count per user that was not skipped, in the order of `users`
    """
    counts = []
    for position, uuid in enumerate(users):
        if position in skip_indices:
            continue
        user_trips = pipeline.read_data(uuid=uuid)
        # Only the bins are needed here; the trips returned alongside them are ignored
        _, user_bins = pipeline.remove_noise(user_trips, radius)
        counts.append(len(user_bins))
    return counts


above_cutoff_bins = count_above_cutoff_bins(all_users, radius)
above_cutoff_bins

In [None]:
def count_above_cutoff_clusters(users, radius, skip_indices=(11,)):
    """Return the `clusters` value of the featurization for each user.

    For every user the trips are read, passed through remove_noise, and clustered
    with between 0 and ceil(1.5 * number of bins) clusters. The work is done inside
    a function so that neither the builtins `min`/`max` nor the notebook-level
    `user`, `trips`, `bin_trips` and `bins` are overwritten when this cell runs.

    Parameters
    ----------
    users : sequence of user ids accepted by pipeline.read_data(uuid=...)
    radius : radius forwarded to pipeline.remove_noise
    skip_indices : positions in `users` to leave out (default: index 11, i.e. user12)

    Returns
    -------
    list with one `feat.clusters` value per user that was not skipped, in the order of `users`
    """
    counts = []
    for position, uuid in enumerate(users):
        if position in skip_indices:
            continue
        user_trips = pipeline.read_data(uuid=uuid)
        user_bin_trips, user_bins = pipeline.remove_noise(user_trips, radius)
        feat = featurization.featurization(user_bin_trips)
        # Search range for the number of clusters: from 0 up to 1.5x the number of bins
        min_clusters = 0
        max_clusters = int(math.ceil(1.5 * len(user_bins)))
        feat.cluster(min_clusters=min_clusters, max_clusters=max_clusters)
        counts.append(feat.clusters)
    return counts


above_cutoff_clusters = count_above_cutoff_clusters(all_users, radius)
above_cutoff_clusters

In [None]:
# Since we just need a simple graph, labelling the rows 'user1', 'user2', ... is clearer to readers.
# The labels are derived from all_users (1-based position, skipping index 11 = user12, which is
# excluded from the data collection) so they always match the number of collected values;
# with 13 participants this yields user1 ... user11, user13.
user_index = ['user%d' % (position + 1) for position in range(len(all_users)) if position != 11]

In [None]:
# Column name -> per-user values; list() makes a shallow copy of each collected list
cutoff_data = {
    'above cutoff bins': list(above_cutoff_bins),
    'above cutoff clusters': list(above_cutoff_clusters),
}

In [None]:
# Build the comparison table: one row per label in user_index, one column per key of cutoff_data
df_cutoff = pd.DataFrame(data = cutoff_data, index = user_index)
df_cutoff

In [None]:
# Grouped bar chart: one group per row (user), one bar per column of df_cutoff
df_cutoff.plot.bar()