In [None]:
import emission.core.get_database as edb
import emission.analysis.modelling.tour_model.similarity as similarity
import pandas as pd
import numpy as np
import emission.analysis.modelling.tour_model.get_request_percentage as grp
import emission.analysis.modelling.tour_model.get_scores as gs
import emission.analysis.modelling.tour_model.label_processing as lp
import emission.analysis.modelling.tour_model.get_users as gu
import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
import evaluation_pipeline as ep
import matplotlib.pyplot as plt
import get_plot as plot
import emission.core.common as ecc

In [None]:
# Print floating point numbers in fixed-point notation instead of scientific
# notation, so the score/percentage arrays printed below are easier to read.
np.set_printoptions(suppress=True)

In [None]:
# Widen the pandas display limits: show up to 200 characters per column
# value and never truncate the number of rows.
# Options are addressed by their fully qualified names; abbreviated keys
# such as 'max_colwidth' are resolved by pattern matching and stop working
# as soon as another option with a matching name exists.
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', None)

In [None]:
# Query the profile database for every entry whose install group is
# "participant", asking only for the user_id field, then keep the bare ids.
participant_uuid_obj = list(
    edb.get_profile_db().find(
        {"install_group": "participant"},
        {"user_id": 1, "_id": 0},
    )
)
all_users = [profile["user_id"] for profile in participant_uuid_obj]

In [None]:
# Radius passed to user selection (gu.get_user_ls), trip filtering
# (preprocess.filter_data), tuning (ep.tune) and testing (ep.test) below.
# NOTE(review): presumably expressed in meters - confirm against the tour_model code.
radius = 100

### Using scipy hierarchical clustering

In [None]:
# Get the full and the valid user lists for this radius.
# NOTE(review): neither name is read again in this cell (the loop below works on
# all_users and re-checks validity itself); they are kept because other cells may use them.
user_ls, valid_users = gu.get_user_ls(all_users, radius)

# Every list below receives one entry per evaluated user; each entry is itself
# a list with one item per data subset.

# request percentage for the first or second round (requested trips / total trips)
all_percentage_first_test = []
all_percentage_second_test = []

# homogeneity score for the first/second round
all_homogeneity_score_first_test = []
all_homogeneity_score_second_test = []

# tuning parameters (tradeoffs) returned by ep.tune
all_tradoffs = []
# scores returned by ep.test
all_scores = []

# Only the first participant is evaluated in this cell. Taking a slice rather
# than indexing all_users[0] means an empty participant list simply produces
# empty results instead of raising IndexError.
for a, user in enumerate(all_users[:1]):
    trips = preprocess.read_data(user)
    filter_trips = preprocess.filter_data(trips, radius)
    print('user', a + 1, 'filter_trips len', len(filter_trips))

    # skip users that don't have enough valid labeled trips
    if not gu.valid_user(filter_trips, trips):
        continue
    tune_idx, test_idx = preprocess.split_data(filter_trips)

    # Choose the tuning/test subsets to run the model on.
    # This step uses KFold (5 splits) to split the data into different subsets.
    # The index sets are swapped on purpose: we use the bigger part of the data
    # for testing and the smaller part for tuning, so
    # - tune_data is built from test_idx
    # - test_data is built from tune_idx
    tune_data = preprocess.get_subdata(filter_trips, test_idx)
    test_data = preprocess.get_subdata(filter_trips, tune_idx)

    # per-subset results for this user:
    # request percentage / homogeneity score from the first round
    pct_collect_first = []
    homo_collect_first = []
    # request percentage / homogeneity score from the second round
    pct_collect_second = []
    homo_collect_second = []
    coll_score = []
    coll_tradeoffs = []

    # tuning: collect one set of tuning parameters per tuning subset
    for j in range(len(tune_data)):
        tuning_parameters = ep.tune(tune_data[j], radius, kmeans=False)
        coll_tradeoffs.append(tuning_parameters)
    all_tradoffs.append(coll_tradeoffs)

    # testing: the parameters tuned on subset k are applied to test subset k,
    # so test_data must not have more subsets than tune_data
    for k in range(len(test_data)):
        tradoffs = coll_tradeoffs[k]
        low, dist_pct = tradoffs[0], tradoffs[1]
        homo_first, percentage_first, homo_second, percentage_second, scores = ep.test(
            test_data[k], radius, low, dist_pct, kmeans=False)
        pct_collect_first.append(percentage_first)
        homo_collect_first.append(homo_first)
        pct_collect_second.append(percentage_second)
        homo_collect_second.append(homo_second)
        coll_score.append(scores)
    all_scores.append(coll_score)
    all_percentage_first_test.append(pct_collect_first)
    all_percentage_second_test.append(pct_collect_second)
    all_homogeneity_score_first_test.append(homo_collect_first)
    all_homogeneity_score_second_test.append(homo_collect_second)

print('all_percentage_first_test', all_percentage_first_test)
print('all_homogeneity_score_first_test', all_homogeneity_score_first_test)
print('all_percentage_second_test', all_percentage_second_test)
print('all_homogeneity_score_second_test', all_homogeneity_score_second_test)
print('all_scores',all_scores)
print('all_tradoffs',all_tradoffs)

### Using kmeans (applied in the test step only)

In [None]:
# Get the full and the valid user lists for this radius.
# NOTE(review): neither name is read again in this cell (the loop below works on
# all_users and re-checks validity itself); they are kept because other cells may use them.
user_ls, valid_users = gu.get_user_ls(all_users, radius)

# Every list below receives one entry per evaluated user; each entry is itself
# a list with one item per data subset.

# request percentage for the first or second round (requested trips / total trips)
all_percentage_first_test = []
all_percentage_second_test = []

# homogeneity score for the first/second round
all_homogeneity_score_first_test = []
all_homogeneity_score_second_test = []

# tuning parameters (tradeoffs) returned by ep.tune
all_tradoffs = []
# scores returned by ep.test
all_scores = []

# Only the first participant is evaluated in this cell. Taking a slice rather
# than indexing all_users[0] means an empty participant list simply produces
# empty results instead of raising IndexError.
for a, user in enumerate(all_users[:1]):
    trips = preprocess.read_data(user)
    filter_trips = preprocess.filter_data(trips, radius)
    print('user', a + 1, 'filter_trips len', len(filter_trips))

    # skip users that don't have enough valid labeled trips
    if not gu.valid_user(filter_trips, trips):
        continue
    tune_idx, test_idx = preprocess.split_data(filter_trips)

    # Choose the tuning/test subsets to run the model on.
    # This step uses KFold (5 splits) to split the data into different subsets.
    # The index sets are swapped on purpose: we use the bigger part of the data
    # for testing and the smaller part for tuning, so
    # - tune_data is built from test_idx
    # - test_data is built from tune_idx
    tune_data = preprocess.get_subdata(filter_trips, test_idx)
    test_data = preprocess.get_subdata(filter_trips, tune_idx)

    # per-subset results for this user:
    # request percentage / homogeneity score from the first round
    pct_collect_first = []
    homo_collect_first = []
    # request percentage / homogeneity score from the second round
    pct_collect_second = []
    homo_collect_second = []
    coll_score = []
    coll_tradeoffs = []

    # tuning: collect one set of tuning parameters per tuning subset
    for j in range(len(tune_data)):
        # for tuning, we don't add kmeans for re-clustering. We just need to get tuning parameters
        tuning_parameters = ep.tune(tune_data[j], radius, kmeans=False)
        coll_tradeoffs.append(tuning_parameters)
    all_tradoffs.append(coll_tradeoffs)

    # testing: the parameters tuned on subset k are applied to test subset k,
    # so test_data must not have more subsets than tune_data
    for k in range(len(test_data)):
        tradoffs = coll_tradeoffs[k]
        low, dist_pct = tradoffs[0], tradoffs[1]
        # for testing, we add kmeans to re-build the model
        homo_first, percentage_first, homo_second, percentage_second, scores = ep.test(
            test_data[k], radius, low, dist_pct, kmeans=True)
        pct_collect_first.append(percentage_first)
        homo_collect_first.append(homo_first)
        pct_collect_second.append(percentage_second)
        homo_collect_second.append(homo_second)
        coll_score.append(scores)
    all_scores.append(coll_score)
    all_percentage_first_test.append(pct_collect_first)
    all_percentage_second_test.append(pct_collect_second)
    all_homogeneity_score_first_test.append(homo_collect_first)
    all_homogeneity_score_second_test.append(homo_collect_second)

print('all_percentage_first_test', all_percentage_first_test)
print('all_homogeneity_score_first_test', all_homogeneity_score_first_test)
print('all_percentage_second_test', all_percentage_second_test)
print('all_homogeneity_score_second_test', all_homogeneity_score_second_test)
print('all_scores',all_scores)
print('all_tradoffs',all_tradoffs)