In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy
import emission.core.get_database as edb
import logging
import folium
import math
import branca.colormap as cm
import emission.storage.timeseries.abstract_timeseries as esta
import emission.analysis.modelling.tour_model.cluster_pipeline as eamtc
import emission.analysis.modelling.tour_model.similarity as similarity
import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline
import emission.analysis.modelling.tour_model.featurization as featurization

In [None]:
# Set the root logger to DEBUG so the logging.debug() calls in the cells below are emitted.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
# Copied from mode_purpose_share.ipynb.
# Look up every profile whose install_group is "participant", projecting only
# the user_id field; those ids are the users analysed in this notebook.
participant_uuid_obj = list(
    edb.get_profile_db().find(
        {"install_group": "participant"},
        {"user_id": 1, "_id": 0},
    )
)
all_users = [profile["user_id"] for profile in participant_uuid_obj]

In [None]:
# Report how many participants were found, then pick the second one
# (index 1) as the user whose trips are visualised below.
logging.debug('There are %s users, we will plot the graphs for one of them', len(all_users))

user = all_users[1]

In [None]:
# Radius handed to similarity.similarity() and pipeline.remove_noise() below.
# We should experiment with different values here.
# NOTE(review): units are presumably metres -- confirm against the tour_model code.
radius = 300

In [None]:
# Load the selected user's trips through the cluster pipeline.
# The cells below index the result as trips[i].data.start_loc / end_loc.
trips = pipeline.read_data(uuid=user)

Note: Run the cells that work on all data and the cells that work on the bins/clusters above the cutoff point separately.

## Visualizing trips from bins

### Trips from all bins

In [None]:
# Bin all trips of the user with the chosen radius.
# After bin_data() the bins are read from sim.bins by the following cells.
sim = similarity.similarity(trips, radius)
sim.bin_data()

In [None]:
# Number of bins found when no bin has been filtered out yet.
logging.debug('There are %s bins before filtering', len(sim.bins))

In [None]:
# Build a stepped colormap whose index holds one entry per bin (0 .. n-1),
# then display it as the cell output.
num_all_bins = len(sim.bins)
cmp_all_bins = cm.linear.Set1_07.to_step(num_all_bins, index=list(range(num_all_bins)))
cmp_all_bins

In [None]:
# Draw every binned trip as a straight start -> end line, one colour per bin.
# Coordinates are stored as [lon, lat]; folium expects [lat, lon], hence the swap.
map_center = trips[0].data.start_loc["coordinates"]
m = folium.Map(location=[map_center[1], map_center[0]],
               zoom_start=12, max_zoom=30, control_scale=True)

for bin_id, trip_indices in enumerate(sim.bins):
    for trip_idx in trip_indices:
        start = trips[trip_idx].data.start_loc["coordinates"]
        end = trips[trip_idx].data.end_loc["coordinates"]
        layer = folium.PolyLine([[start[1], start[0]], [end[1], end[0]]],
                                weight=2, color=cmp_all_bins(bin_id))
        layer.add_to(m)

# Attach the colormap as a legend and show the map.
m.add_child(cmp_all_bins)
m

### Trips from bins above the cutoff only

In [None]:
# Keep only the bins above the cutoff point.
# bin_trips is indexed like trips in the cells below (bin_trips[k].data...), and
# bins is iterated as a list of per-bin member lists.
# NOTE(review): the plotting cell assumes bin_trips is ordered bin by bin -- confirm in remove_noise.
bin_trips, bins = pipeline.remove_noise(trips, radius)

In [None]:
# Show the bins that survived the cutoff.
logging.debug('The list of bins is %s', bins)

In [None]:
# Stepped colormap with one index entry per bin above the cutoff.
num_cutoff_bins = len(bins)
cmp_cutoff = cm.linear.Set1_07.to_step(num_cutoff_bins, index=list(range(num_cutoff_bins)))
cmp_cutoff

In [None]:
# Draw the trips of the bins above the cutoff, one colour per bin.
# bin_trips is walked with a running index (t_index) while iterating over the
# bins, i.e. the k-th entry of bin_trips is treated as the k-th bin member overall.
map_center = bin_trips[0].data.start_loc["coordinates"]
m = folium.Map(location=[map_center[1], map_center[0]],
               zoom_start=12, max_zoom=30, control_scale=True)

t_index = 0
for bin_id, members in enumerate(bins):
    for _ in range(len(members)):
        start = bin_trips[t_index].data.start_loc["coordinates"]
        end = bin_trips[t_index].data.end_loc["coordinates"]
        layer = folium.PolyLine([[start[1], start[0]], [end[1], end[0]]],
                                weight=2, color=cmp_cutoff(bin_id))
        t_index += 1
        layer.add_to(m)

# Attach the colormap as a legend and show the map.
m.add_child(cmp_cutoff)
m

## Visualizing clusters

### on all data

Note: run the "Trips from all bins" section first; this section reuses `trips` and `sim` from it.

In [None]:
# Copied from the cluster pipeline code so we can work with the featurization code directly.
# Cluster all trips, allowing up to 1.5x as many clusters as there are bins.
feat = featurization.featurization(trips)
# Use descriptive names instead of `min`/`max`: assigning to those names would
# shadow the Python builtins for every later cell run in this kernel.
min_clusters = 0
max_clusters = int(math.ceil(1.5 * len(sim.bins)))
feat.cluster(min_clusters=min_clusters, max_clusters=max_clusters)

In [None]:
# Number of clusters chosen by feat.cluster().
logging.debug('number of clusters: %d', feat.clusters)

In [None]:
# Labels have to be in ascending order in the colormap index.
# De-duplicate first and sort afterwards: a set has no guaranteed iteration
# order, so sorting *before* building the set did not guarantee an ordered list.
labels_clt = sorted(set(feat.labels))
labels_clt

In [None]:
# Stepped colormap indexed by the distinct cluster labels.
cmp_all_clusters = cm.linear.Set1_07.to_step(feat.clusters, index=list(labels_clt))
cmp_all_clusters

In [None]:
# Plot all clusters with different colors on the map.
# Each entry of feat.points is read as [start_lon, start_lat, end_lon, end_lat].
map_center = feat.points[0]
cluster_all = folium.Map(location=[map_center[1], map_center[0]],
                         zoom_start=12, max_zoom=30, control_scale=True)

print(feat.labels)
if feat.labels:
    for idx, point in enumerate(feat.points):
        start_lon, start_lat = point[0], point[1]
        end_lon, end_lat = point[2], point[3]
        layer = folium.PolyLine([[start_lat, start_lon], [end_lat, end_lon]],
                                weight=2, color=cmp_all_clusters(feat.labels[idx]))
        layer.add_to(cluster_all)

# Attach the colormap as a legend and show the map.
cluster_all.add_child(cmp_all_clusters)
cluster_all

### above the cutoff only

Note: run the "Trips from bins above the cutoff only" section first; this section reuses `bin_trips` and `bins` from it.

In [None]:
# Copied from the cluster pipeline code so we can work with the featurization code directly.
# The minimum number of clusters is set to 0 instead of len(bins), to compare with the bins.
feat = featurization.featurization(bin_trips)
# Use descriptive names instead of `min`/`max`: assigning to those names would
# shadow the Python builtins for every later cell run in this kernel.
min_clusters = 0
max_clusters = int(math.ceil(1.5 * len(bins)))
feat.cluster(min_clusters=min_clusters, max_clusters=max_clusters)

In [None]:
# Number of clusters found on the trips above the cutoff.
logging.debug('number of clusters: %d', feat.clusters)

In [None]:
# Labels have to be in ascending order in the colormap index.
# De-duplicate first and sort afterwards: a set has no guaranteed iteration
# order, so sorting *before* building the set did not guarantee an ordered list.
labels_clt = sorted(set(feat.labels))
labels_clt

In [None]:
# Stepped colormap indexed by the distinct cluster labels (above-cutoff trips).
cmp_clusters_cutoff = cm.linear.Set1_07.to_step(feat.clusters, index=list(labels_clt))
cmp_clusters_cutoff

In [None]:
# Plot the clusters built from above-cutoff trips only, one colour per cluster,
# to see if the algorithm wrongly puts different trips into the same cluster.
# Each entry of feat.points is read as [start_lon, start_lat, end_lon, end_lat].
map_center = feat.points[0]
cluster_cutoff = folium.Map(location=[map_center[1], map_center[0]],
                            zoom_start=12, max_zoom=30, control_scale=True)

print(feat.labels)
if feat.labels:
    for idx, point in enumerate(feat.points):
        start_lon, start_lat = point[0], point[1]
        end_lon, end_lat = point[2], point[3]
        layer = folium.PolyLine([[start_lat, start_lon], [end_lat, end_lon]],
                                weight=2, color=cmp_clusters_cutoff(feat.labels[idx]))
        layer.add_to(cluster_cutoff)

# Attach the colormap as a legend and show the map.
cluster_cutoff.add_child(cmp_clusters_cutoff)
cluster_cutoff

## Visualizing the difference between bins and clusters above cutoff 

Note: this part can be run alone, after importing packages

In [None]:
# Self-contained setup so this section can be run right after the imports:
# enable DEBUG logging, load the participant ids, and set the radius.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

participant_uuid_obj = list(
    edb.get_profile_db().find(
        {"install_group": "participant"},
        {"user_id": 1, "_id": 0},
    )
)
all_users = [profile["user_id"] for profile in participant_uuid_obj]

radius = 300

In [None]:
# We use user11 as an example
#-above cutoff only
#DEBUG:root:number of bins after filtering: 14
#DEBUG:root:number of clusters: 8

In [None]:
# "user11" in the 1-based naming used in this notebook is index 10 of all_users.
user = all_users[10]
# Load that user's trips through the cluster pipeline.
trips = pipeline.read_data(uuid=user)

### Trips from filtered bins

In [None]:
#Show the list of bins from user11 for convenience

#DEBUG:root:The list of bins is [[12, 22, 49, 51, 53, 63, 72, 175], [13, 25, 38, 54, 73, 139, 144], 
#                                [24, 35, 37, 90, 127, 143, 172], [7, 17, 27, 29, 118, 165], 
#                                [78, 111, 133, 150, 201], [8, 28, 119, 166], [23, 36, 52, 126], 
#                                [32, 87, 132, 146], [56, 92, 97, 117], [112, 134, 151, 202], [113, 182, 189, 195],
#                                [26, 140, 145], [50, 62, 64], [131, 154, 161]]

In [None]:
# Keep only the bins above the cutoff for this user and log them.
bin_trips, bins = pipeline.remove_noise(trips, radius)
logging.debug('The list of bins is %s', bins)

In [None]:
# Stepped colormap with one index entry per bin above the cutoff.
num_cutoff_bins = len(bins)
cmp_cutoff = cm.linear.Set1_07.to_step(num_cutoff_bins, index=list(range(num_cutoff_bins)))
cmp_cutoff

In [None]:
# Plot only the trips that belong to the selected bins.
map_center = bin_trips[0].data.start_loc["coordinates"]
m = folium.Map(location=[map_center[1], map_center[0]],
               zoom_start=12, max_zoom=30, control_scale=True)

# Here we choose the specific bins to plot (bin indices 12 and 13).
selected_bins = range(12, 14)

# bin_trips is assumed to list the trips bin by bin in the order of `bins`
# (the same assumption the full above-cutoff plot makes with its running
# t_index). The running index therefore has to advance over *every* bin,
# including the ones that are not drawn; starting it at 0 for bin 12 would draw
# the trips of the first bins instead of the selected ones.
# Iterating over the existing bins also avoids an IndexError when the user has
# fewer bins than the selection asks for.
t_index = 0
for t, members in enumerate(bins):
    for _ in members:
        if t in selected_bins:
            start = bin_trips[t_index].data.start_loc["coordinates"]
            end = bin_trips[t_index].data.end_loc["coordinates"]
            # Coordinates are stored as [lon, lat]; folium expects [lat, lon].
            layer = folium.PolyLine([[start[1], start[0]], [end[1], end[0]]],
                                    weight=2, color=cmp_cutoff(t))
            layer.add_to(m)
        t_index += 1

# Attach the colormap as a legend and show the map.
m.add_child(cmp_cutoff)
m

### Trips in clusters

In [None]:
# Recompute the above-cutoff trips and bins so this subsection does not depend
# on the "Trips from filtered bins" cells having been run.
bin_trips, bins = pipeline.remove_noise(trips, radius)

In [None]:
# Cluster the above-cutoff trips, allowing up to 1.5x as many clusters as bins.
feat = featurization.featurization(bin_trips)
# Use descriptive names instead of `min`/`max`: assigning to those names would
# shadow the Python builtins for every later cell run in this kernel.
min_clusters = 0
max_clusters = int(math.ceil(1.5 * len(bins)))
feat.cluster(min_clusters=min_clusters, max_clusters=max_clusters)
logging.debug('number of clusters: %d', feat.clusters)

In [None]:
# Distinct cluster labels in ascending order, for use as the colormap index.
# De-duplicate first and sort afterwards: a set has no guaranteed iteration
# order, so sorting *before* building the set did not guarantee an ordered list.
labels_clt = sorted(set(feat.labels))
labels_clt

In [None]:
# Stepped colormap indexed by the distinct cluster labels.
cmp_clusters_cutoff = cm.linear.Set1_07.to_step(feat.clusters, index=list(labels_clt))
cmp_clusters_cutoff

In [None]:
# Plot only the trips assigned to one chosen cluster.
# Each entry of feat.points is read as [start_lon, start_lat, end_lon, end_lat].
map_center = feat.points[0]
cluster_cutoff = folium.Map(location=[map_center[1], map_center[0]],
                            zoom_start=12, max_zoom=30, control_scale=True)

# Here we can choose a specific cluster to plot.
cluster_to_plot = 7

print(feat.labels)
if feat.labels:
    for idx, point in enumerate(feat.points):
        if feat.labels[idx] != cluster_to_plot:
            continue
        start_lon, start_lat = point[0], point[1]
        end_lon, end_lat = point[2], point[3]
        layer = folium.PolyLine([[start_lat, start_lon], [end_lat, end_lon]],
                                weight=2, color=cmp_clusters_cutoff(feat.labels[idx]))
        layer.add_to(cluster_cutoff)

# Attach the colormap as a legend and show the map.
cluster_cutoff.add_child(cmp_clusters_cutoff)
cluster_cutoff

## Data collection

Note: Run this part after importing packages above

In [None]:
import pandas as pd

In [None]:
# Setup for the data-collection section: radius and the list of participant ids.
radius = 300
participant_uuid_obj = list(
    edb.get_profile_db().find(
        {"install_group": "participant"},
        {"user_id": 1, "_id": 0},
    )
)
all_users = [profile["user_id"] for profile in participant_uuid_obj]

In [None]:
# Count the bins found on all trips of each user.
# The user at index 11 is skipped, so the result has one entry fewer than all_users.
all_data_bins = []
for user_pos, user_id in enumerate(all_users):
    if user_pos == 11:
        continue
    user = user_id
    trips = pipeline.read_data(uuid=user)
    sim = similarity.similarity(trips, radius)
    sim.bin_data()
    all_data_bins.append(len(sim.bins))
all_data_bins

In [None]:
# Count the clusters found on all trips of each user.
# The user at index 11 is skipped, so the result has one entry fewer than all_users.
all_data_clusters = []
for i in range(len(all_users)):
    if i == 11:
        continue
    user = all_users[i]
    trips = pipeline.read_data(uuid=user)
    # The bins are only needed to derive the upper bound on the cluster count.
    sim = similarity.similarity(trips, radius)
    sim.bin_data()
    feat = featurization.featurization(trips)
    # Use descriptive names instead of `min`/`max`: assigning to those names
    # would shadow the Python builtins for every later cell run in this kernel.
    min_clusters = 0
    max_clusters = int(math.ceil(1.5 * len(sim.bins)))
    feat.cluster(min_clusters=min_clusters, max_clusters=max_clusters)
    all_data_clusters.append(feat.clusters)
all_data_clusters

In [None]:
# Should restart to run this part and the next part, do not run all data first.
# Count the bins above the cutoff for each user (the user at index 11 is skipped).
above_cutoff_bins = []
for user_pos, user_id in enumerate(all_users):
    if user_pos == 11:
        continue
    user = user_id
    trips = pipeline.read_data(uuid=user)
    bin_trips, bins = pipeline.remove_noise(trips, radius)
    above_cutoff_bins.append(len(bins))
above_cutoff_bins

In [None]:
# Count the clusters found on the above-cutoff trips of each user.
# The user at index 11 is skipped, so the result has one entry fewer than all_users.
above_cutoff_clusters = []
for i in range(len(all_users)):
    if i == 11:
        continue
    user = all_users[i]
    trips = pipeline.read_data(uuid=user)
    bin_trips, bins = pipeline.remove_noise(trips, radius)
    feat = featurization.featurization(bin_trips)
    # Use descriptive names instead of `min`/`max`: assigning to those names
    # would shadow the Python builtins for every later cell run in this kernel.
    min_clusters = 0
    max_clusters = int(math.ceil(1.5 * len(bins)))
    feat.cluster(min_clusters=min_clusters, max_clusters=max_clusters)
    above_cutoff_clusters.append(feat.clusters)
above_cutoff_clusters

In [None]:
# Row labels for the summary tables: 'user<k>' with k counted from 1.
# Derived from all_users with the same "skip index 11" rule the collection
# loops use, so the labels always line up with the collected lists
# (with 13 users this yields user1..user11 and user13).
user_index = ['user%d' % (pos + 1) for pos in range(len(all_users)) if pos != 11]

In [None]:
# Column data for the "all data" summary table (copies of the collected lists).
all_data = {
    'all bins': list(all_data_bins),
    'all clusters': list(all_data_clusters),
}

In [None]:
# Column data for the "above cutoff" summary table (copies of the collected lists).
cutoff_data = {
    'above cutoff bins': list(above_cutoff_bins),
    'above cutoff clusters': list(above_cutoff_clusters),
}

In [None]:
# Bins vs. clusters per user, computed on all trips.
df_all = pd.DataFrame(all_data, index=user_index)
df_all

In [None]:
# Bins vs. clusters per user, computed on the above-cutoff trips only.
df_cutoff = pd.DataFrame(cutoff_data, index=user_index)
df_cutoff

In [None]:

# Bar chart comparing above-cutoff bins and clusters for each user.
df_cutoff.plot.bar()