In [None]:
import logging
import emission.core.get_database as edb
import emission.analysis.modelling.tour_model.similarity as similarity
import pandas as pd
import numpy as np
import get_request_percentage as grp
import get_scores as gs
import label_processing as lp
import get_users as gu
import data_preprocessing as preprocess
import matplotlib.pyplot as plt
import get_plot as plot
import emission.core.common as ecc

In [None]:
# Fetch only the user_id field of every profile whose install_group is "participant"
participant_uuid_obj = list(
    edb.get_profile_db().find(
        {"install_group": "participant"},
        {"user_id": 1, "_id": 0},
    )
)
# Flatten the query result into a plain list of user ids
all_users = [profile["user_id"] for profile in participant_uuid_obj]

In [None]:
# Radius handed to gu.get_user_ls, preprocess.filter_data and similarity.similarity
# in the evaluation cell.
# NOTE(review): presumably expressed in meters - confirm against the similarity module.
radius = 100

In [None]:
# full user list and the list of valid users for this radius
user_ls, valid_users = gu.get_user_ls(all_users, radius)

# request percentage (requested trips / total trips) of a single user,
# first round and second round
single_first_round_req_pct, single_second_round_req_pct = [], []

# request percentage (requested trips / total trips) for all users,
# split by clustering round (first/second) and by data part (train/test)
all_percentage_first_train, all_percentage_first_test = [], []
all_percentage_second_train, all_percentage_second_test = [], []
all_median_pct_first = []

# homogeneity score for all users,
# split by clustering round (first/second) and by data part (train/test)
all_homogeneity_score_first_train, all_homogeneity_score_first_test = [], []
all_homogeneity_score_second_train, all_homogeneity_score_second_test = [], []
all_median_homo_first = []

def _round3(value):
    """Return `value` rounded to three decimals through '%.3f' formatting."""
    return float('%.3f' % value)


def _first_round(subset, radius, show_bins=False):
    """Run the first round of clustering (similarity binning) on one subset.

    Args:
        subset: the trips of one subset, handed to similarity.similarity.
        radius: the radius handed to similarity.similarity.
        show_bins: when True, print the bins before the trip-order check.

    Returns:
        A tuple (sim, subset_trips, bins, bin_trips, first_labels, track):
        - sim: the similarity object after bin_data() and delete_bins()
        - subset_trips: sim.data, read before binning
        - bins: sim.bins
        - bin_trips: sim.newdata
        - first_labels: for every trip index listed in bins (in bin order),
          the position of its bin
        - track: one [original trip index, first-round label] pair per entry
          of first_labels, in the same order
    """
    sim = similarity.similarity(subset, radius)
    subset_trips = sim.data
    sim.bin_data()
    sim.delete_bins()
    bins = sim.bins
    bin_trips = sim.newdata
    if show_bins:
        print('bins ', bins)

    # compare the trip orders in bin_trips with those in bins above cutoff
    gs.compare_trip_orders(bins, bin_trips, subset_trips)

    # label of a trip == position of the bin that contains it
    first_labels = [label for label, members in enumerate(bins) for _ in members]
    # the indices of the items stay the same in the label list produced by the
    # second round; item[0] is the original trip index, item[1] the label
    track = [[ori_idx, label] for label, members in enumerate(bins) for ori_idx in members]
    return sim, subset_trips, bins, bin_trips, first_labels, track


def _second_round(bin_trips, first_labels, first_label_set, new_labels, track,
                  low, dist_pct, method='single'):
    """Re-cluster the trips of every first-round label with the given parameters.

    For each label in first_label_set the matching trips are turned into
    [start lon/lat pair, end lon/lat pair, distance, duration] feature rows
    (taken from trip.data) and passed to lp.get_new_labels; track is then
    refreshed with lp.change_track_labels.

    Args:
        bin_trips: trips aligned index-by-index with first_labels.
        first_labels: first-round label of every trip in bin_trips.
        first_label_set: the distinct first-round labels to iterate over.
        new_labels: label list handed to (and returned by) lp.get_new_labels.
        track: index/label pairs handed to lp.change_track_labels.
        low, dist_pct, method: forwarded unchanged to lp.get_new_labels.

    Returns:
        The (new_labels, track) pair produced by the last label processed.
    """
    for label in first_label_set:
        # trips that share this first-round label, plus their position/label
        second_round_trips = []
        second_round_idx_labels = []
        for index, first_label in enumerate(first_labels):
            if first_label == label:
                second_round_trips.append(bin_trips[index])
                second_round_idx_labels.append([index, first_label])

        point_features = []
        for trip in second_round_trips:
            start = trip.data.start_loc["coordinates"]
            end = trip.data.end_loc["coordinates"]
            point_features.append([start[0], start[1], end[0], end[1],
                                   trip.data.distance, trip.data.duration])
        x = np.array(point_features)

        # get labels after two rounds of clustering on common trips
        new_labels = lp.get_new_labels(x, low, dist_pct, second_round_idx_labels,
                                       new_labels, method=method)
        track = lp.change_track_labels(track, new_labels)
    return new_labels, track


for a, user in enumerate(all_users):
    filter_trips, trips = preprocess.filter_data(user, radius)
    print('user', a + 1, 'filter_trips len', len(filter_trips))

    # filter out users that don't have enough valid labeled trips
    if not gu.valid_user(filter_trips, trips):
        continue
    train_idx, test_idx = preprocess.split_data(filter_trips)

    # choose training/test set to run the model
    # this step will use KFold (5 splits) to split the data into different subsets
    # - train: training set (used for tuning)
    # - test: test set
    # Here we use a bigger part of the data for testing and a smaller part for
    # tuning, which is why the index lists are deliberately swapped below
    train_data = preprocess.get_subdata(filter_trips, test_idx)
    test_data = preprocess.get_subdata(filter_trips, train_idx)

    # per-user results on the tuning subsets (request percentage / homogeneity
    # score for the first and the second round); kept for inspection only
    pct_collect_first_train = []
    homo_collect_first_train = []
    pct_collect_second_train = []
    homo_collect_second_train = []

    # run tuning set first
    # collect tuning parameters, one entry per tuning subset
    coll_low = []
    coll_dist_pct = []
    colle_tune_score = []

    # run every tuning subset
    for subset in train_data:
        sim, subset_trips, bins, bin_trips, first_labels, track = _first_round(subset, radius)
        new_labels = first_labels.copy()
        first_label_set = list(set(first_labels))

        # request percentage and homogeneity score for the first round
        percentge_first = _round3(grp.get_req_pct(new_labels, track, subset_trips, sim))
        pct_collect_first_train.append(percentge_first)
        homo_first = _round3(gs.score(bin_trips, first_labels))
        homo_collect_first_train.append(homo_first)

        # tune parameters: keep the (low, dist_pct) pair with the best score
        highest_score = 0
        sel_low = 0
        sel_dist_pct = 0
        sel_homo_second = 0
        sel_percentge_second = 0

        # NOTE(review): new_labels and track are carried over from one parameter
        # pair to the next, exactly as before; this presumably relies on
        # lp.get_new_labels rewriting every label on each pass - confirm.
        # NOTE(review): the per-label feature arrays do not depend on
        # (low, dist_pct) and could be built once per subset if the lp helpers
        # do not modify their inputs - confirm before hoisting.
        for dist_pct in np.arange(0.15, 0.6, 0.02):
            for low in range(250, 600):
                new_labels, track = _second_round(bin_trips, first_labels, first_label_set,
                                                  new_labels, track, low, dist_pct)

                # request percentage and homogeneity score for the second round
                percentge_second = _round3(grp.get_req_pct(new_labels, track, subset_trips, sim))
                homo_second = _round3(gs.score(bin_trips, new_labels))

                # equal weight on high homogeneity and low request percentage
                curr_score = _round3(0.5 * homo_second + 0.5 * (1 - percentge_second))
                if curr_score > highest_score:
                    highest_score = curr_score
                    sel_low = low
                    sel_dist_pct = dist_pct
                    sel_homo_second = homo_second
                    sel_percentge_second = percentge_second
        coll_low.append(sel_low)
        coll_dist_pct.append(sel_dist_pct)
        colle_tune_score.append(highest_score)
        pct_collect_second_train.append(sel_percentge_second)
        homo_collect_second_train.append(sel_homo_second)
    print('coll_low ', coll_low)
    print('coll_dist_pct ', coll_dist_pct)
    print('colle_tune_score ', colle_tune_score)

    # run test set for evaluation
    # per-user results on the test subsets (request percentage / homogeneity
    # score for the first and the second round)
    pct_collect_first_test = []
    homo_collect_first_test = []
    pct_collect_second_test = []
    homo_collect_second_test = []

    # run every test subset
    for k, subset in enumerate(test_data):
        sim, subset_trips, bins, bin_trips, first_labels, track = _first_round(
            subset, radius, show_bins=True)
        new_labels = first_labels.copy()
        first_label_set = list(set(first_labels))

        # request percentage and homogeneity score for the first round
        percentge_first = _round3(grp.get_req_pct(new_labels, track, subset_trips, sim))
        pct_collect_first_test.append(percentge_first)
        homo_first = _round3(gs.score(bin_trips, first_labels))
        homo_collect_first_test.append(homo_first)

        # evaluate with the parameters tuned on the subset at the same position
        low = coll_low[k]
        dist_pct = coll_dist_pct[k]
        new_labels, track = _second_round(bin_trips, first_labels, first_label_set,
                                          new_labels, track, low, dist_pct)

        # request percentage and homogeneity score for the second round
        percentge_second = _round3(grp.get_req_pct(new_labels, track, subset_trips, sim))
        pct_collect_second_test.append(percentge_second)
        homo_second = _round3(gs.score(bin_trips, new_labels))
        homo_collect_second_test.append(homo_second)

    print('pct_collect_second_test ', pct_collect_second_test)
    print('homo_collect_second_test ', homo_collect_second_test)

    # collect the evaluation results of this user for both rounds
    all_percentage_first_test.append(pct_collect_first_test)
    all_homogeneity_score_first_test.append(homo_collect_first_test)
    all_percentage_second_test.append(pct_collect_second_test)
    all_homogeneity_score_second_test.append(homo_collect_second_test)

# print the evaluation results collected for all users
for _name, _values in (
    ('all_percentage_first_test', all_percentage_first_test),
    ('all_homogeneity_score_first_test', all_homogeneity_score_first_test),
    ('all_percentage_second_test', all_percentage_second_test),
    ('all_homogeneity_score_second_test', all_homogeneity_score_second_test),
):
    print(_name, _values)

# plot one evaluation scatter per round: first round, then second round
for _pct, _homo in (
    (all_percentage_first_test, all_homogeneity_score_first_test),
    (all_percentage_second_test, all_homogeneity_score_second_test),
):
    plt.figure()
    plot.get_scatter(_pct, _homo, valid_users)