This notebook evaluates the clusters above the cutoff and explores the data for a single user after the first round of clustering.

In [None]:
import logging

# Our imports
import emission.core.get_database as edb
import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline
import emission.analysis.modelling.tour_model.similarity as similarity
import emission.analysis.modelling.tour_model.featurization as featurization
import emission.analysis.modelling.tour_model.representatives as representatives
import emission.storage.decorations.analysis_timeseries_queries as esda
import pandas as pd
from numpy import *
import confirmed_trips_eval_bins_clusters as evaluation
from sklearn import metrics
from pandas.testing import assert_frame_equal

In [None]:
# No name is passed, so this is the root logger; let DEBUG and above through
logger = logging.getLogger()
logger.setLevel(level=logging.DEBUG)

In [None]:
# Fetch every profile in the "participant" install group, projecting only user_id
participant_filter = {"install_group": "participant"}
user_id_projection = {"user_id": 1, "_id": 0}
participant_uuid_obj = list(edb.get_profile_db().find(participant_filter, user_id_projection))
all_users = [profile["user_id"] for profile in participant_uuid_obj]

In [None]:
# Radius passed to pipeline.remove_noise when binning trips
# (units are not stated in this notebook — presumably meters; TODO confirm)
radius = 100

## Choose one user for the experiment

In [None]:
# Use the first participant returned by the profile query as the experiment subject
user = all_users[0]

In [None]:
# Read this user's trips from the database; key=esda.CONFIRMED_TRIP_KEY selects confirmed trips
trips = pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY)

In [None]:
# Keep only the trips whose user_input is not an empty dict, i.e. trips the user labeled
non_empty_trips = [
    trip
    for trip in trips
    if trip["data"]["user_input"] != {}
]
len(non_empty_trips)

In [None]:
# Filter out trips that are not fully labeled (user_input contains NaN in some column)
non_empty_trips_df = pd.DataFrame([t["data"]["user_input"] for t in non_empty_trips])
# how='any' alone expresses the filter. Newer pandas raises a TypeError when
# both `how` and `thresh` are passed, and the remaining arguments that used to
# be spelled out here were just the defaults.
valid_trips_df = non_empty_trips_df.dropna(how='any')
# The frame was built with a default positional index, so the surviving index
# labels are positions in non_empty_trips
valid_trips_idx_ls = valid_trips_df.index.tolist()
valid_trips = [non_empty_trips[i] for i in valid_trips_idx_ls]
len(valid_trips), valid_trips

In [None]:
# Bin the fully labeled trips with the chosen radius; yields the retained trips and the bins
# (presumably trips in bins below the cutoff are dropped — confirm in cluster_pipeline.remove_noise)
bin_trips, bins = pipeline.remove_noise(valid_trips, radius)

In [None]:
# Pass bins as a logging argument: formatting is deferred until the record is
# emitted, and it stays correct even if bins happens to be a tuple
logging.debug('The list of bins is %s', bins)

In [None]:
# `math` was only reachable through `from numpy import *`, which newer numpy
# releases no longer export; import it explicitly
import math

# Cluster the data based only on the sil score (min_clusters = 0) instead of
# the number of bins (len(bins))
feat = featurization.featurization(bin_trips)
# Named min_clusters/max_clusters so the builtins min() and max() are not
# shadowed for the rest of the notebook
min_clusters = 0
max_clusters = int(math.ceil(1.5 * len(bins)))
feat.cluster(min_clusters=min_clusters, max_clusters=max_clusters)

In [None]:
# Lazy logging argument: the message is only formatted if the record is emitted
logging.debug('number of clusters: %d', feat.clusters)

In [None]:
# Lazy logging argument: also safe if feat.labels is ever a tuple
logging.debug('labels list is: %s', feat.labels)

In [None]:
# Trips held by the featurization object after clustering; displayed for inspection
cluster_trips = feat.data
cluster_trips

In [None]:
# One row per clustered trip, one column per user_input field
clustered_user_inputs = [trip["data"]["user_input"] for trip in cluster_trips]
cluster_user_input_df = pd.DataFrame(data=clustered_user_inputs)
cluster_user_input_df

### Original output

In [None]:
# turn the user_input data frame of the clustered trips into a list of rows, without any changes
cluster_user_input_ls = cluster_user_input_df.values.tolist()
cluster_user_input_ls

In [None]:
# Unique user_input rows; the first occurrence of each combination is kept
no_dup_df = cluster_user_input_df.drop_duplicates(keep="first")
no_dup_df, len(no_dup_df)

In [None]:
# turn the non-duplicate user_input rows into a list; a row's position in this list is its label id
no_dup_list = no_dup_df.values.tolist()
no_dup_list

In [None]:
# Ground-truth label of a trip = position of its user_input row among the unique rows
labels_true = [
    no_dup_list.index(row)
    for row in cluster_user_input_ls
    if row in no_dup_list
]
labels_true

In [None]:
# Predicted labels: the cluster assignment produced by feat.cluster
labels_pred = feat.labels

In [None]:
# Build one [start[0], start[1], end[0], end[1]] coordinate row per clustered trip
cluster_ps = []
for trip in cluster_trips:
    start = trip["data"]["start_loc"]["coordinates"]
    end = trip["data"]["end_loc"]["coordinates"]
    cluster_ps.append([start[0], start[1], end[0], end[1]])
cluster_ps_df = pd.DataFrame(data=cluster_ps)
cluster_ps_df

In [None]:
# Points stored on the featurization object, as a data frame
# (presumably the start/end coordinates used as clustering features — confirm in featurization)
label_ps_df = pd.DataFrame(data=feat.points)
label_ps_df

In [None]:
# Sanity check that the rows of feat.data and feat.points line up:
# assert_frame_equal returns nothing if the two data frames are the same and
# raises AssertionError otherwise
assert_frame_equal(cluster_ps_df,label_ps_df)

In [None]:
# Homogeneity of the clustering against the raw user labels
# (1.0 means every cluster contains trips of a single label combination)
metrics.homogeneity_score(labels_true, labels_pred)

In [None]:
# Completeness of the clustering against the raw user labels
# (1.0 means all trips of a label combination fall in the same cluster)
metrics.completeness_score(labels_true, labels_pred)

In [None]:
# V-measure (harmonic mean of homogeneity and completeness) against the raw user labels
metrics.v_measure_score(labels_true, labels_pred)

### After changing language

In [None]:
# Spanish user labels and their English translations
span_eng_dict = {
    'revisado_bike': 'test ride with bike',
    'placas_de carro': 'car plates',
    'aseguranza': 'insurance',
    'iglesia': 'church',
    'curso': 'course',
    'mi_hija recién aliviada': 'my daughter just had a new baby',
    'servicio_comunitario': 'community service',
    'pago_de aseguranza': 'insurance payment',
    'grupo_comunitario': 'community group',
    'caminata_comunitaria': 'community walk',
}

In [None]:
# change language: replace the Spanish labels that are keys of span_eng_dict
# with their English values, then turn the data frame into a list of rows
cluster_sp2en_df = cluster_user_input_df.replace(span_eng_dict)
cluster_sp2en_ls = cluster_sp2en_df.values.tolist()
cluster_sp2en_ls

In [None]:
# drop duplicate user_input rows after translation, so labels that differed
# only by language now count as one
no_dup_sp2en_df=cluster_sp2en_df.drop_duplicates()
no_dup_sp2en_df

In [None]:
# turn the non-duplicate translated user_input rows into a list; a row's position is its label id
no_dup_sp2en_list = no_dup_sp2en_df.values.tolist()
no_dup_sp2en_list

In [None]:
# Ground-truth label of a trip = position of its translated user_input row among the unique rows
labels_true_sp2en = [
    no_dup_sp2en_list.index(row)
    for row in cluster_sp2en_ls
    if row in no_dup_sp2en_list
]
labels_true_sp2en

In [None]:
# Predicted labels are read straight from the clustering result
labels_pred = feat.labels
labels_pred

In [None]:
# Homogeneity of the clustering against the translated user labels
metrics.homogeneity_score(labels_true_sp2en, labels_pred)

In [None]:
# Completeness of the clustering against the translated user labels
metrics.completeness_score(labels_true_sp2en, labels_pred)

In [None]:
# V-measure against the translated user labels
metrics.v_measure_score(labels_true_sp2en, labels_pred)

### After converting purposes and mode

In [None]:
# Purpose labels to merge: each key is rewritten to the label given as its value
map_pur_dict = {
    'course': 'school',
    'work_- lunch break': 'lunch_break',
    'on_the way home': 'home',
    'insurance_payment': 'insurance',
}

In [None]:
# Let pandas display up to 200 rows so the data frames printed below are not truncated
pd.set_option('display.max_rows', 200)

In [None]:
# convert purpose: rewrite the purpose labels that are keys of map_pur_dict
cluster_cvt_pur_df = cluster_sp2en_df.replace(map_pur_dict)
# convert mode: rows whose replaced_mode is "same_mode" take the value of mode_confirm
# NOTE: this is an alias, not a copy, so cluster_cvt_pur_df is updated as well
cluster_cvt_pur_mo_df = cluster_cvt_pur_df
same_mode_rows = cluster_cvt_pur_mo_df["replaced_mode"] == "same_mode"
for pos, is_same_mode in enumerate(same_mode_rows):
    if is_same_mode:
        print(cluster_cvt_pur_mo_df.iloc[pos]) # to see which row will be converted
# Write through one .loc call. The previous chained form
# df.iloc[i]["replaced_mode"] = ... assigns into a temporary row object, so the
# change may never reach the data frame (and never does under Copy-on-Write).
cluster_cvt_pur_mo_df.loc[same_mode_rows, "replaced_mode"] = (
    cluster_cvt_pur_mo_df.loc[same_mode_rows, "mode_confirm"]
)
print(cluster_cvt_pur_mo_df)
cluster_cvt_pur_mo_ls = cluster_cvt_pur_mo_df.values.tolist()
cluster_cvt_pur_mo_ls

In [None]:
# drop duplicate user_input rows after the purpose and mode conversion
no_dup_cvt_pur_mode_df = cluster_cvt_pur_mo_df.drop_duplicates()
no_dup_cvt_pur_mode_df

In [None]:
# turn the non-duplicate converted user_input rows into a list; a row's position is its label id
no_dup_cvt_pur_mo_ls = no_dup_cvt_pur_mode_df.values.tolist()
no_dup_cvt_pur_mo_ls

In [None]:
# Ground-truth label of a trip = position of its converted user_input row among the unique rows
labels_true_cvt = [
    no_dup_cvt_pur_mo_ls.index(row)
    for row in cluster_cvt_pur_mo_ls
    if row in no_dup_cvt_pur_mo_ls
]
labels_true_cvt

In [None]:
# Predicted labels are read straight from the clustering result
labels_pred = feat.labels
labels_pred

In [None]:
# Homogeneity of the clustering against the labels after purpose/mode conversion
metrics.homogeneity_score(labels_true_cvt, labels_pred)

In [None]:
# Completeness of the clustering against the labels after purpose/mode conversion
metrics.completeness_score(labels_true_cvt, labels_pred)

In [None]:
# V-measure against the labels after purpose/mode conversion
metrics.v_measure_score(labels_true_cvt, labels_pred)