In [None]:
import logging

# Our imports
import emission.core.get_database as edb
import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline
import emission.analysis.modelling.tour_model.similarity as similarity
import emission.analysis.modelling.tour_model.featurization as featurization
import emission.analysis.modelling.tour_model.representatives as representatives
import emission.storage.decorations.analysis_timeseries_queries as esda
import pandas as pd
from numpy import *
import confirmed_trips_eval_bins_clusters as evaluation
from sklearn import metrics
from pandas.testing import assert_frame_equal

In [None]:
# Configure the root logger so that DEBUG-and-above records are emitted.
logger = logging.getLogger(name=None)
logger.setLevel(level=logging.DEBUG)

In [None]:
# Fetch the profile documents of the "participant" install group, projecting
# only the user_id field, then pull the ids out into a flat list.
participant_uuid_obj = list(
    edb.get_profile_db().find(
        {"install_group": "participant"},
        {"user_id": 1, "_id": 0},
    )
)
all_users = [profile["user_id"] for profile in participant_uuid_obj]

In [None]:
# Radius handed to similarity.similarity() further down when the trips are binned.
# NOTE(review): presumably expressed in meters — confirm against the similarity module.
radius = 300

## Choose one user for the experiment

In [None]:
# Select one participant by position in all_users. The index is hard-coded, so
# this raises IndexError when fewer than seven participants were returned.
user = all_users[6]

In [None]:
# Read the selected user's trips from the database through the clustering
# pipeline. key=esda.CONFIRMED_TRIP_KEY asks for confirmed trips.
trips = pipeline.read_data(uuid=user,key=esda.CONFIRMED_TRIP_KEY)

In [None]:
# Keep only the trips whose user_input is not an empty dict, i.e. the trips
# that carry at least one user-provided label, and show how many remain.
non_empty_trips = list(
    filter(lambda candidate: candidate["data"]["user_input"] != {}, trips)
)
len(non_empty_trips)

In [None]:
# Keep only fully labeled trips: user_input must contain all three label keys
# (mode_confirm, purpose_confirm and replaced_mode).
required_labels = {'mode_confirm', 'purpose_confirm', 'replaced_mode'}
valid_trips = [
    labeled_trip
    for labeled_trip in non_empty_trips
    if required_labels.issubset(labeled_trip["data"]["user_input"])
]
len(valid_trips),valid_trips

In [None]:
# Build the similarity object from the fully labeled trips and the chosen radius,
# then display the trips it holds (sim.data is what the bins index into below).
sim = similarity.similarity(valid_trips, radius)
sim.data

In [None]:
# Run the binning step; the resulting bins are read from sim.bins in the next cell.
sim.bin_data()

In [None]:
# Flatten the bins into a single list of trip indices (bin order preserved) and
# look every index up in sim.data to obtain the binned trips in the same order.
filter_trips = sim.data
bins = sim.bins
trip_index_ls = [trip_index for members in bins for trip_index in members]
bin_trips = [filter_trips[trip_index] for trip_index in trip_index_ls]

print(len(bin_trips), len(bins))
bin_trips

In [None]:
# Log the bins at DEBUG level; the logger interpolates the argument itself.
logging.debug('The list of bins is %s', bins)

In [None]:
# Print, bin by bin, a table of the user labels of the trips in that bin.
for members in bins:
    labels_in_bin = [filter_trips[i].data["user_input"] for i in members]
    print(pd.DataFrame(data=labels_in_bin))

### Original output

In [None]:
# One row per binned trip (in bin order), one column per user-provided label.
binned_user_inputs = [binned_trip["data"]["user_input"] for binned_trip in bin_trips]
bin_trips_df = pd.DataFrame(data=binned_user_inputs)
bin_trips_df

In [None]:
# Convert the label table into a plain list of rows (one inner list per trip),
# ignoring which bin each trip belongs to.
bin_trips_user_input_ls = bin_trips_df.to_numpy().tolist()
bin_trips_user_input_ls

In [None]:
# Keep the first occurrence of every distinct label combination and show the
# de-duplicated table together with the number of distinct combinations.
no_dup_df = bin_trips_df.drop_duplicates(keep="first")
no_dup_df, no_dup_df.shape[0]

In [None]:
# Convert the distinct label combinations into a plain list of rows.
no_dup_list = no_dup_df.to_numpy().tolist()
no_dup_list

In [None]:
# Let pandas display up to 300 rows before truncating a table.
pd.options.display.max_rows = 300

In [None]:
# Ground-truth cluster labels: each trip is labeled with the position of its
# label combination in the de-duplicated list.
labels_true = [
    no_dup_list.index(label_row)
    for label_row in bin_trips_user_input_ls
    if label_row in no_dup_list
]
labels_true

In [None]:
# Predicted cluster labels: every trip gets the position of the bin that
# contains it, in the same bin-by-bin order used for the binned trips.
labels_pred = [bin_id for bin_id, members in enumerate(bins) for _ in members]
labels_pred

In [None]:
# Start timestamps and user labels of the binned trips, taken from bin_trips.
start_ts_rows, user_input_rows = [], []
for binned_trip in bin_trips:
    start_ts_rows.append(binned_trip["data"]["start_ts"])
    user_input_rows.append(binned_trip["data"]["user_input"])
bin_trips_ts = pd.DataFrame(data=start_ts_rows)
bin_input = pd.DataFrame(data=user_input_rows)
len(bin_trips_ts)  # evaluated but not displayed: only the last expression is shown
bin_input

In [None]:
# The same two tables, this time built by looking the flattened bin indices up
# in filter_trips directly.
indexed_start_ts, indexed_user_input = [], []
for trip_position in trip_index_ls:
    indexed_start_ts.append(filter_trips[trip_position]["data"]["start_ts"])
    indexed_user_input.append(filter_trips[trip_position]["data"]["user_input"])
bins_ts = pd.DataFrame(data=indexed_start_ts)
bins_input = pd.DataFrame(data=indexed_user_input)
len(trip_index_ls)  # evaluated but not displayed: only the last expression is shown
bins_input

In [None]:
# Sanity check: both ways of collecting the start timestamps must agree.
# assert_frame_equal is silent on success and raises AssertionError otherwise.
assert_frame_equal(left=bins_ts, right=bin_trips_ts)

In [None]:
# Homogeneity of the bin assignment (labels_pred) against the user-label groups (labels_true).
metrics.homogeneity_score(labels_true=labels_true, labels_pred=labels_pred)

In [None]:
# Completeness of the bin assignment (labels_pred) against the user-label groups (labels_true).
metrics.completeness_score(labels_true=labels_true, labels_pred=labels_pred)

In [None]:
# V-measure of the bin assignment (labels_pred) against the user-label groups (labels_true).
metrics.v_measure_score(labels_true=labels_true, labels_pred=labels_pred)

### After translating Spanish labels to English

In [None]:
# Spanish user labels (as they appear in user_input) -> English equivalents.
span_eng_dict = {
    'revisado_bike': 'test ride with bike',
    'placas_de carro': 'car plates',
    'aseguranza': 'insurance',
    'iglesia': 'church',
    'curso': 'course',
    'mi_hija recién aliviada': 'my daughter just had a new baby',
    'servicio_comunitario': 'community service',
    'pago_de aseguranza': 'insurance payment',
    'grupo_comunitario': 'community group',
    'caminata_comunitaria': 'community walk',
}

In [None]:
# Print, bin by bin, the user labels with the Spanish values translated through
# span_eng_dict (this step is only for showing the trips in each bin).
# The bins hold positions into sim.data (filter_trips) — the list the earlier
# per-bin display and bin_trips are built from — so the trips are looked up
# there rather than in valid_trips, whose positions are not guaranteed to match.
for bin_indices in bins:
    bin_user_input = [filter_trips[i].data["user_input"] for i in bin_indices]
    bin_df = pd.DataFrame(data=bin_user_input)
    sp2en_bin_df = bin_df.replace(span_eng_dict)
    print(sp2en_bin_df)

In [None]:
# Translate the Spanish labels in the full label table, then convert the
# translated table into a plain list of rows (one inner list per trip).
bin_trips_sp2en_df = bin_trips_df.replace(to_replace=span_eng_dict)
bin_trips_sp2en_ls = bin_trips_sp2en_df.to_numpy().tolist()
bin_trips_sp2en_ls

In [None]:
# Keep the first occurrence of every distinct translated label combination.
no_dup_sp2en_df = bin_trips_sp2en_df.drop_duplicates(keep="first")
no_dup_sp2en_df

In [None]:
# Convert the distinct translated label combinations into a plain list of rows.
no_dup_sp2en_list = no_dup_sp2en_df.to_numpy().tolist()
no_dup_sp2en_list

In [None]:
# Ground-truth labels after translation: each trip is labeled with the position
# of its translated label combination in the de-duplicated list.
labels_true_sp2en = [
    no_dup_sp2en_list.index(label_row)
    for label_row in bin_trips_sp2en_ls
    if label_row in no_dup_sp2en_list
]
labels_true_sp2en

In [None]:
# Predicted cluster labels (rebuilt here): every trip gets the position of the
# bin that contains it, bin by bin.
labels_pred = [bin_id for bin_id, members in enumerate(bins) for _ in members]
labels_pred

In [None]:
# Homogeneity of the bins against the translated user-label groups.
metrics.homogeneity_score(labels_true=labels_true_sp2en, labels_pred=labels_pred)

In [None]:
# Completeness of the bins against the translated user-label groups.
metrics.completeness_score(labels_true=labels_true_sp2en, labels_pred=labels_pred)

In [None]:
# V-measure of the bins against the translated user-label groups.
metrics.v_measure_score(labels_true=labels_true_sp2en, labels_pred=labels_pred)

### After converting purposes and replaced mode

In [None]:
# Purpose labels that should be merged into a canonical purpose.
# This mapping is applied to the already translated table, where span_eng_dict
# turns 'pago_de aseguranza' into 'insurance payment' (with a space). The
# original 'insurance_payment' key never matches that value, so the space
# variant is listed as well; the underscore key is kept for labels stored that way.
# NOTE(review): confirm that 'insurance payment' is meant to merge into 'insurance'.
map_pur_dict = {
    'course': 'school',
    'work_- lunch break': 'lunch_break',
    'on_the way home': 'home',
    'insurance_payment': 'insurance',
    'insurance payment': 'insurance',
}

In [None]:
# Let pandas display up to 200 rows before truncating a table.
pd.options.display.max_rows = 200

In [None]:
# convert purpose: merge equivalent purpose labels through map_pur_dict
bin_trips_cvt_pur_df = bin_trips_sp2en_df.replace(map_pur_dict)
# convert mode: where replaced_mode is "same_mode", substitute the trip's own
# mode_confirm value. Work on an explicit copy so the purpose-only table stays
# intact, and assign through .loc with a boolean mask: the previous
# `df.iloc[i]["replaced_mode"] = ...` was chained assignment, which may write to
# a temporary row copy and leave the table unchanged.
bin_trips_cvt_pur_mo_df = bin_trips_cvt_pur_df.copy()
same_mode_rows = bin_trips_cvt_pur_mo_df["replaced_mode"] == "same_mode"
print(bin_trips_cvt_pur_mo_df[same_mode_rows])  # to see which rows will be converted
bin_trips_cvt_pur_mo_df.loc[same_mode_rows, "replaced_mode"] = (
    bin_trips_cvt_pur_mo_df.loc[same_mode_rows, "mode_confirm"]
)
print(bin_trips_cvt_pur_mo_df)
bin_trips_cvt_pur_mode_ls = bin_trips_cvt_pur_mo_df.values.tolist()
bin_trips_cvt_pur_mode_ls

In [None]:
# Keep the first occurrence of every distinct converted label combination.
no_dup_cvt_pur_mode_df = bin_trips_cvt_pur_mo_df.drop_duplicates(keep="first")
no_dup_cvt_pur_mode_df

In [None]:
# Convert the distinct converted label combinations into a plain list of rows.
no_dup_cvt_pur_mo_ls = no_dup_cvt_pur_mode_df.to_numpy().tolist()
no_dup_cvt_pur_mo_ls

In [None]:
# Ground-truth labels after purpose/mode conversion: each trip is labeled with
# the position of its converted label combination in the de-duplicated list.
labels_true_cvt = [
    no_dup_cvt_pur_mo_ls.index(label_row)
    for label_row in bin_trips_cvt_pur_mode_ls
    if label_row in no_dup_cvt_pur_mo_ls
]
labels_true_cvt

In [None]:
# Predicted cluster labels (rebuilt here): every trip gets the position of the
# bin that contains it, bin by bin.
labels_pred = [bin_id for bin_id, members in enumerate(bins) for _ in members]
labels_pred

In [None]:
# Homogeneity of the bins against the converted user-label groups.
metrics.homogeneity_score(labels_true=labels_true_cvt, labels_pred=labels_pred)

In [None]:
# Completeness of the bins against the converted user-label groups.
metrics.completeness_score(labels_true=labels_true_cvt, labels_pred=labels_pred)

In [None]:
# V-measure of the bins against the converted user-label groups.
metrics.v_measure_score(labels_true=labels_true_cvt, labels_pred=labels_pred)