In [None]:
import logging

# Our imports
import emission.core.get_database as edb
import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline
import emission.analysis.modelling.tour_model.similarity as similarity
import emission.analysis.modelling.tour_model.featurization as featurization
import emission.analysis.modelling.tour_model.representatives as representatives
import emission.storage.decorations.analysis_timeseries_queries as esda
import pandas as pd
from numpy import *
import confirmed_trips_eval_bins_clusters as evaluation
import data_preprocessing as preprocess
from sklearn import metrics
from pandas.testing import assert_frame_equal
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.cluster.hierarchy import linkage, dendrogram,fcluster
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.model_selection import KFold
from past.utils import old_div
from tabulate import tabulate
import folium
import branca.colormap as clm
import get_request_percentage as grp
import get_scores as gs
import label_processing as lp
import get_users as gu
import data_preprocessing as preprocess
import get_plot as plot
import emission.core.common as ecc

In [None]:
def score(bin_trips,filter_trips,labels_pred,second_round=None,first_round_bins=None):
    """Return the homogeneity score of predicted cluster labels against user labels.

    Ground-truth classes come from each trip's ``data.user_input``: the inputs
    are passed through ``evaluation.map_labels`` and every distinct row of the
    resulting frame becomes one class.

    Parameters
    ----------
    bin_trips : list
        Trips in bin order; each is read as ``trip["data"]["user_input"]`` and
        ``trip["data"]["start_ts"]``.
    filter_trips : list
        Trips that the bin indices point into.
    labels_pred : sequence
        Predicted cluster label for each entry of ``bin_trips``.
    second_round : bool, optional
        When truthy, the trip-order sanity check is skipped.
    first_round_bins : list of list of int, optional
        Bins (lists of indices into ``filter_trips``) used by the sanity check.
        When omitted, the notebook-level ``bins`` variable is used, which is
        what this function read before the parameter existed.

    Returns
    -------
    The value of ``metrics.homogeneity_score(labels_true, labels_pred)``.

    Raises
    ------
    AssertionError
        If, in the first round, the start timestamps of ``bin_trips`` differ
        from those of ``filter_trips`` taken in bin order.
    """
    user_input_df = pd.DataFrame(data=[trip["data"]["user_input"] for trip in bin_trips])
    user_input_df = evaluation.map_labels(user_input_df, sp2en=None, cvt_pur_mo=True)

    # every row of user input, and the distinct rows in first-seen order
    all_rows = user_input_df.values.tolist()
    unique_rows = user_input_df.drop_duplicates().values.tolist()

    # a trip's true label is the position of its row among the distinct rows;
    # rows that do not compare equal to any distinct row are skipped, as before
    # NOTE(review): presumably this only happens for rows containing NaN — confirm
    labels_true = [unique_rows.index(row) for row in all_rows if row in unique_rows]

    if not second_round:
        if first_round_bins is None:
            # backward-compatible default: the bins set by the notebook's driver loop
            first_round_bins = bins
        # the trips in bin_trips must be in the same order as the bin indices
        bin_trips_ts = pd.DataFrame(data=[trip["data"]["start_ts"] for trip in bin_trips])
        flat_indices = [index for one_bin in first_round_bins for index in one_bin]
        bins_ts = pd.DataFrame(data=[filter_trips[i]["data"]["start_ts"] for i in flat_indices])
        # scoring only continues when the two frames are identical
        assert_frame_equal(bins_ts, bin_trips_ts)

    return metrics.homogeneity_score(labels_true, labels_pred)

In [None]:
def get_second_labels(x,method,low,dist_pct):
    """Split one first-round bin into sub-clusters with hierarchical clustering.

    Parameters
    ----------
    x : array-like
        One feature row per trip in the bin.
    method : str
        Linkage method handed to ``scipy.cluster.hierarchy.linkage``.
    low : number
        If the distance of the final merge is below this value the bin is left
        unsplit.
    dist_pct : float
        Fraction of the final merge distance used as the ``fcluster`` cut-off.

    Returns
    -------
    A list of zeros (one per row) when the bin is not split, otherwise the
    array returned by ``fcluster`` with ``criterion='distance'``.
    """
    n_rows = len(x)
    # linkage cannot build a tree from fewer than two observations, so a bin
    # with zero or one trip is reported as "not split" instead of raising
    if n_rows < 2:
        return [0] * n_rows

    z = linkage(x, method=method, metric='euclidean')
    # the last row of the linkage matrix is the final merge; column 2 is its distance
    last_d = z[-1][2]
    if last_d < low:
        return [0] * n_rows

    max_d = last_d * dist_pct
    return fcluster(z, max_d, criterion='distance')

In [None]:
def get_new_labels(x,low,dist_pct,second_round_idx_labels,new_labels,method=None):
    """Write the combined first+second round labels for one bin into ``new_labels``.

    Parameters
    ----------
    x : array-like
        Feature rows of the trips in the bin, forwarded to ``get_second_labels``.
    low, dist_pct :
        Tuning parameters forwarded to ``get_second_labels``.
    second_round_idx_labels : list of [index, first_label]
        For each row of ``x``: its position in ``new_labels`` and its
        first-round label.
    new_labels : list
        Label list for all trips; updated in place.
    method : str, optional
        Linkage method forwarded to ``get_second_labels``.

    Returns
    -------
    The same ``new_labels`` list that was passed in, after the update.
    """
    second_labels = get_second_labels(x,method,low,dist_pct)
    total = len(new_labels)
    for i in range(len(second_labels)):
        index = second_round_idx_labels[i][0]
        first_label = second_round_idx_labels[i][1]
        # concatenate the decimal digits of the labels from the two rounds
        # NOTE(review): this encoding is ambiguous once a label has more than one
        # digit (first 1 + second 12 and first 11 + second 2 both give 112)
        combined = int(str(first_label) + str(second_labels[i]))
        # positions outside new_labels are ignored, exactly as the former
        # linear search over range(len(new_labels)) did
        if 0 <= index < total:
            new_labels[index] = combined
    return new_labels

In [None]:
def group_similar_trips(new_labels,track):
    """Group trips that share a label and report them by original index.

    ``new_labels[i]`` is the label of the i-th trip and ``track[i][0]`` is that
    trip's original index. Returns a list of groups in order of first
    appearance of each label; every group lists the original indices of its
    trips in the order they occur in ``new_labels``.
    """
    groups = []         # positions in new_labels, one list per distinct label
    group_labels = []   # label of each group, parallel to `groups`
    for position, label in enumerate(new_labels):
        for members, seen_label in zip(groups, group_labels):
            if label == seen_label:
                members.append(position)
                break
        else:
            # no existing group carries this label: open a new one
            groups.append([position])
            group_labels.append(label)
    # translate positions into the original indices stored in track
    return [[track[position][0] for position in members] for members in groups]

In [None]:
def change_track_labels(track,new_labels):
    """Overwrite the label slot (element 1) of each track entry, in place.

    ``track[i][1]`` is set to ``new_labels[i]`` for every position of
    ``new_labels``; the same ``track`` object is returned.
    """
    for position, label in enumerate(new_labels):
        track[position][1] = label
    return track

In [None]:
def requested_trips_ab_cutoff(new_bins,filter_trips):
    """Split the trips of the above-cutoff bins into requested and not requested.

    For every bin, ``evaluation.find_first_trip(filter_trips, bin)`` returns a
    pair: its first element is recorded as the trip to request, its second is
    treated as that trip's position inside the bin. All other members of the
    bin are common trips that need no request.

    Returns ``(requested_indices, not_requested_indices)``.
    """
    requested = []
    not_requested = []
    for members in new_bins:
        first_trip_index, first_position = evaluation.find_first_trip(filter_trips, members)
        requested.append(first_trip_index)
        # everything in the bin except the entry at first_position
        not_requested.extend(
            member for position, member in enumerate(members) if position != first_position
        )
    return requested, not_requested

In [None]:
def requested_trips_bl_cutoff(sim):
    """Return every trip index found in ``sim.below_cutoff``, flattened.

    Each entry of ``sim.below_cutoff`` is a bin (an iterable of trip indices);
    all of their members are collected in order into one list.
    """
    return [trip_index for below_bin in sim.below_cutoff for trip_index in below_bin]

In [None]:
def get_requested_trips(new_bins,filter_trips,sim):
    """Return the indices of all requested trips.

    The result is the requested trips from the above-cutoff bins
    (``requested_trips_ab_cutoff``) followed by every trip below the cutoff
    (``requested_trips_bl_cutoff``).
    """
    above_cutoff_requests, _not_requested = requested_trips_ab_cutoff(new_bins,filter_trips)
    below_cutoff_requests = requested_trips_bl_cutoff(sim)
    return above_cutoff_requests + below_cutoff_requests

In [None]:
def get_req_pct(new_labels,track,filter_trips,sim_obj=None):
    """Return the fraction of trips for which user input has to be requested.

    Parameters
    ----------
    new_labels : list
        Cluster label of every binned trip.
    track : list of [original_index, label]
        Maps positions in ``new_labels`` back to original trip indices.
    filter_trips : list
        All trips of the subset; its length is the denominator.
    sim_obj : optional
        Object exposing ``below_cutoff``, forwarded to ``get_requested_trips``.
        When omitted, the notebook-level ``sim`` variable is used, which is
        what this function read before the parameter existed.

    Returns
    -------
    float
        ``len(requested trips) / len(filter_trips)``.
    """
    if sim_obj is None:
        # backward-compatible default: the similarity object set by the driver loop
        sim_obj = sim
    # bins holding the original indices of similar trips
    new_bins = group_similar_trips(new_labels,track)
    req_trips = get_requested_trips(new_bins,filter_trips,sim_obj)
    return len(req_trips)/len(filter_trips)

In [None]:
def split_data(filter_trips):
    """Split the trips into 5 shuffled folds with ``KFold``.

    One feature row (start lon/lat, end lon/lat, distance, duration as read
    from ``trip.data``) is built per trip and handed to
    ``KFold(n_splits=5, shuffle=True, random_state=9).split``.

    Returns ``(train_idx, test_idx)``: two lists with one index array per fold.
    """
    features = [
        [
            trip.data.start_loc["coordinates"][0],
            trip.data.start_loc["coordinates"][1],
            trip.data.end_loc["coordinates"][0],
            trip.data.end_loc["coordinates"][1],
            trip.data.distance,
            trip.data.duration,
        ]
        for trip in filter_trips
    ]

    folds = list(KFold(n_splits=5,shuffle=True,random_state=9).split(features))
    train_idx = [train_part for train_part, _test_part in folds]
    test_idx = [test_part for _train_part, test_part in folds]
    return train_idx, test_idx

In [None]:
def get_subdata(sim,train_test_set):
    """Materialise the trips of each fold from ``sim.data``.

    ``train_test_set`` is a collection of index collections (pass in either
    the training-set indices or the test-set indices). For each of them the
    corresponding entries of ``sim.data`` are gathered, giving one list of
    trips per fold.
    """
    return [[sim.data[idx] for idx in fold_indices] for fold_indices in train_test_set]

In [None]:
def get_scatter(percentage,homo_score,valid_users):
    """Scatter homogeneity score against request percentage, one colour per user.

    Parameters
    ----------
    percentage : sequence of sequences
        ``percentage[i]`` holds the x values (request percentages) of user i.
    homo_score : sequence of sequences
        ``homo_score[i]`` holds the matching y values of user i.
    valid_users : sequence
        One legend label per user; its length also sizes the colour map.

    Draws on the current pyplot figure and returns nothing.
    """
    # pyplot.get_cmap is used because matplotlib.cm.get_cmap has been removed
    # from recent Matplotlib releases
    palette = plt.get_cmap('Dark2', len(valid_users))

    handles = []
    for user_pos in range(len(valid_users)):
        n_points = len(percentage[user_pos])
        xs = [percentage[user_pos][n] for n in range(n_points)]
        ys = [homo_score[user_pos][n] for n in range(n_points)]
        # One scatter call per user gives exactly one legend handle per user.
        # Previously the handle was the last point plotted in the inner loop,
        # which was undefined (or left over from the previous user) whenever a
        # user had no points.
        handle = plt.scatter(xs, ys, color=palette.colors[user_pos], s=70, alpha=0.7)
        handles.append(handle)

    plt.legend(handles,valid_users,markerscale=0.8,scatterpoints=1,bbox_to_anchor=(1.23,1))
    plt.xlabel('user input request percentage',fontsize=16)
    plt.ylabel('homogeneity score',fontsize=16)
    plt.xticks(np.arange(0.4,1.1,step=0.1),fontsize=14)
    plt.yticks(np.arange(0.2,1.1,step=0.1),fontsize=14)

In [None]:
def same_cluster_map(cluster,filter_trips,bins):
    """Build a folium map showing the end points of the trips in ``cluster``.

    Parameters
    ----------
    cluster : sequence of int
        Indices into ``filter_trips`` of the trips to draw.
    filter_trips : sequence
        Trips exposing ``data.start_loc`` / ``data.end_loc`` GeoJSON points.
    bins : sequence of sequences of int
        A trip is drawn once for every bin that contains its index.

    Returns the ``folium.Map``, centred on the start of the first trip and
    fitted to the bounding box of all start and end points that were visited.
    """
    def _lat_lon(geojson_point):
        # GeoJSON coordinates are (lon, lat); folium takes [lat, lon]
        return [geojson_point["coordinates"][1], geojson_point["coordinates"][0]]

    # The step colour map is still built, but it is not attached to the map and
    # no polyline is drawn with it (both are disabled in this notebook).
    color_map = clm.linear.Set1_07.to_step(len(bins), index=list(range(len(bins)+1)))

    first_trip = filter_trips[cluster[0]]
    cluster_map = folium.Map(location=_lat_lon(first_trip.data.start_loc),
                             zoom_start=12, max_zoom=30, control_scale=True)

    zoom_points = []
    for curr_trip_index in cluster:
        for one_bin in bins:
            curr_trip = filter_trips[curr_trip_index]
            if curr_trip_index not in one_bin:
                continue
            # the green start marker is created but intentionally not added to the map
            start_points = folium.CircleMarker(_lat_lon(curr_trip.data.start_loc),
                                radius=3,color='green',fill=True,fill_color='green',fill_opacity=1)
            # only the red end marker is drawn
            end_points = folium.CircleMarker(_lat_lon(curr_trip.data.end_loc),
                                radius=3,color='red',fill=True,fill_color='red',fill_opacity=1)
            end_points.add_to(cluster_map)
            # both ends still count towards the bounding box
            zoom_points.append(_lat_lon(curr_trip.data.start_loc))
            zoom_points.append(_lat_lon(curr_trip.data.end_loc))

    bounds_df = pd.DataFrame(zoom_points, columns=['Lat', 'Long'])
    south_west = bounds_df[['Lat', 'Long']].min().values.tolist()
    north_east = bounds_df[['Lat', 'Long']].max().values.tolist()
    cluster_map.fit_bounds([south_west, north_east])

    return cluster_map



In [None]:
# Fetch only the user_id field of every profile whose install_group is "participant".
participant_uuid_obj = list(
    edb.get_profile_db().find(
        {"install_group": "participant"},
        {"user_id": 1, "_id": 0},
    )
)
# Flat list of the user ids taken from those profile documents.
all_users = [profile["user_id"] for profile in participant_uuid_obj]

In [None]:
# Radius passed to gu.get_user_ls, preprocess.filter_data and similarity.similarity below.
# NOTE(review): presumably in metres — TODO confirm against the similarity module.
radius = 100

In [None]:
# Get the user list and the valid-user list from gu.get_user_ls.
# NOTE(review): the evaluation loop below indexes all_users, not valid_users —
# confirm which list is intended.
user_ls, valid_users = gu.get_user_ls(all_users, radius)

# request percentage (requested trips / total trips) of the first and second
# round for one user
single_first_round_req_pct, single_second_round_req_pct = [], []

# request percentage of the first and second round for all users
all_percentage_first_train, all_percentage_first_test = [], []
all_percentage_second_train, all_percentage_second_test = [], []
all_median_pct_first = []

# homogeneity score of the first and second round for all users
all_homogeneity_score_first_train, all_homogeneity_score_first_test = [], []
all_homogeneity_score_second_train, all_homogeneity_score_second_test = [], []
all_median_homo_first = []

# Per-user evaluation of the two-round clustering.
# For each user: split the trips into folds, tune (low, dist_pct) on every tuning
# subset, then apply the parameters tuned on subset k to test subset k and record
# the request percentage and homogeneity score of both rounds.
# NOTE(review): only the first entry of all_users is processed (range(1)), and the
# list indexed is all_users rather than valid_users — confirm both are intended.
# NOTE(review): the calls to score() and get_req_pct() below rely on the
# notebook-level names `bins` and `sim` that are rebound inside the loops.
# NOTE(review): the *_train accumulators declared before this loop are never
# appended to; only the *_test ones are filled.
for a in range(1):
    user = all_users[a]
    filter_trips, trips = preprocess.filter_data(user, radius)
    print('user',a+1,'filter_trips len', len(filter_trips))

    # filter out users that don't have enough valid labeled trips
    if not gu.valid_user(filter_trips, trips):
        continue
    train_idx, test_idx = split_data(filter_trips)
    print('test_idx ',test_idx)
    # choose training/test set to run the model
    # split_data uses KFold (5 splits) to split the data into different subsets
    # Here we use a bigger part of the data for testing and a smaller part for
    # tuning, so the KFold test indices feed the tuning sets and vice versa.
    train_data = preprocess.get_subdata(filter_trips, test_idx)
    test_data = preprocess.get_subdata(filter_trips, train_idx)

    # collect request percentage for a user for the first round
    pct_collect_first_train = []
    # collect homogeneity score for a user for the first round
    homo_collect_first_train = []
    # collect request percentage for a user for the second round
    pct_collect_second_train = []
    # collect homogeneity score for a user for the second round
    homo_collect_second_train = []

    # run training set first
    # collect tuning parameters (one entry per tuning subset)
    coll_low = []
    coll_dist_pct = []
    colle_tune_score = []

    # run every subset
    for j in range(len(train_data)):
        print('tuning set',j)
        sim = similarity.similarity(train_data[j], radius)
        filter_trips = sim.data
        sim.bin_data()
        sim.delete_bins()
        bins = sim.bins
        bin_trips = sim.newdata
        print('bins from the first round ',bins)

        # compare the trip orders in bin_trips with those in bins above cutoff
        gs.compare_trip_orders(bins, bin_trips, filter_trips)

        # create a list idx_labels_track to store indices and labels
        # the indices of the items will be the same in the new label list after the second round clustering
        # item[0] is the original index of the trip in filter_trips
        # item[1] will be the new label after the second round clustering
        idx_labels_track = []
        for bin in bins:
            for ori_idx in bin:
                idx_labels_track.append([ori_idx])

        # get first round labels: every trip is labelled with the position of its bin
        first_labels = []
        for b in range(len(bins)):
            for trip in bins[b]:
                first_labels.append(b)
        new_labels = first_labels.copy()
        first_label_set = list(set(first_labels))

        # store first round labels in idx_labels_track list
        for i in range(len(first_labels)):
            idx_labels_track[i].append(first_labels[i])
        # shallow copy: the inner [index, label] lists are shared with idx_labels_track
        track = idx_labels_track.copy()
        print('track from the first round',track)

        # get request percentage for the subset for the first round
        percentge_first = float('%.3f' % get_req_pct(new_labels,track,filter_trips))
        pct_collect_first_train.append(percentge_first)

        # get homogeneity score for the subset for the first round
        homo_first = float('%.3f' % score(bin_trips,filter_trips,new_labels))
        homo_collect_first_train.append(homo_first)

        # tune parameters: keep the (low, dist_pct) pair with the highest combined score
        highest_score = 0
        sel_low = 0
        sel_dist_pct = 0
        sel_homo_second = 0
        sel_percentge_second = 0

        for dist_pct in np.arange(0.15, 0.6, 0.02):
            for low in range(250,600):
                print('dist_pct is ', dist_pct, 'low is ', low)

                # get second round labels
                for l in first_label_set:
                    print('first label ',l)
                    # store second round trips data
                    second_round_trips = []
                    # create a track to store indices and labels for the second round
                    second_round_idx_labels = []
                    for index, first_label in enumerate(first_labels):
                        if first_label == l:
                            second_round_trips.append(bin_trips[index])
                            second_round_idx_labels.append([index,first_label])
                    points = []
                    point_features = []
                    print('second_round_idx_labels',second_round_idx_labels)

                    for trip in second_round_trips:
                        start = trip.data.start_loc["coordinates"]
                        end = trip.data.end_loc["coordinates"]
                        distance = trip.data.distance
                        duration = trip.data.duration
                        points.append([start[0], start[1], end[0], end[1]])
                        point_features.append([start[0], start[1], end[0], end[1],distance,duration])

                    x = np.array(point_features)

                    method = 'single'
                    # get labels after two rounds of clustering on common trips
                    new_labels = get_new_labels(x,low,dist_pct,second_round_idx_labels,new_labels,method=method)
                    print('first round label is ', l,' new_labels ',new_labels)
                    track = change_track_labels(track,new_labels)
                    print('second round labels ',new_labels)

                # get request percentage for the subset for the second round
                percentge_second = float('%.3f' % get_req_pct(new_labels,track,filter_trips))

                # get homogeneity score for the second round
                homo_second = float('%.3f' % score(bin_trips,filter_trips,new_labels,second_round=True))

                # equal weight on homogeneity and on not having to ask the user
                curr_score = 0.5 * homo_second + 0.5 * (1 - percentge_second)
                curr_score = float('%.3f' % curr_score)
                if curr_score > highest_score:
                    highest_score = curr_score
                    sel_low = low
                    sel_dist_pct = dist_pct
                    sel_homo_second = homo_second
                    sel_percentge_second = percentge_second
        coll_low.append(sel_low)
        coll_dist_pct.append(sel_dist_pct)
        colle_tune_score.append(highest_score)
        pct_collect_second_train.append(sel_percentge_second)
        homo_collect_second_train.append(sel_homo_second)
    print('coll_low ',coll_low)
    print('coll_dist_pct ',coll_dist_pct)
    print('colle_tune_score ',colle_tune_score)

    # run test set for evaluation
    # collect request percentage for a user for the first round
    pct_collect_first_test = []
    # collect homogeneity score for a user for the first round
    homo_collect_first_test= []
    # collect request percentage for a user for the second round
    pct_collect_second_test = []
    # collect homogeneity score for a user for the second round
    homo_collect_second_test = []

    # run every subset
    for k in range(len(test_data)):
        print('test set',k)
        sim = similarity.similarity(test_data[k], radius)
        filter_trips = sim.data
        sim.bin_data()
        sim.delete_bins()
        bins = sim.bins
        bin_trips = sim.newdata
        print('bins ', bins)

        # compare the trip orders in bin_trips with those in filter_trips above cutoff
        gs.compare_trip_orders(bins, bin_trips, filter_trips)

        # create a list idx_labels_track to store indices and labels
        # the indices of the items will be the same in the new label list after the second round clustering
        # item[0] is the original index of the trip in filter_trips
        # item[1] will be the new label after the second round clustering
        idx_labels_track = []
        for bin in bins:
            for ori_idx in bin:
                idx_labels_track.append([ori_idx])

        # get first round labels
        first_labels = []
        for b in range(len(bins)):
            for trip in bins[b]:
                first_labels.append(b)
        new_labels = first_labels.copy()
        first_label_set = list(set(first_labels))

        # store first round labels in idx_labels_track list
        for i in range(len(first_labels)):
            idx_labels_track[i].append(first_labels[i])
        # make a (shallow) copy of idx_labels_track
        track = idx_labels_track.copy()

        # get request percentage for the subset for the first round
        percentge_first = float('%.3f' % get_req_pct(new_labels,track,filter_trips))
        pct_collect_first_test.append(percentge_first)

        # get homogeneity score for the subset for the first round
        homo_first = float('%.3f' % score(bin_trips,filter_trips,new_labels))
        homo_collect_first_test.append(homo_first)

        # parameters tuned on tuning subset k
        low = coll_low[k]
        dist = coll_dist_pct[k]

        # get second round labels
        for l in first_label_set:
            # store second round trips data
            second_round_trips = []
            # create a track to store indices and labels for the second round
            second_round_idx_labels = []
            for index, first_label in enumerate(first_labels):
                if first_label == l:
                    second_round_trips.append(bin_trips[index])
                    second_round_idx_labels.append([index,first_label])
            points = []
            point_features = []

            for trip in second_round_trips:
                start = trip.data.start_loc["coordinates"]
                end = trip.data.end_loc["coordinates"]
                distance = trip.data.distance
                duration = trip.data.duration
                points.append([start[0], start[1], end[0], end[1]])
                point_features.append([start[0], start[1], end[0], end[1],distance,duration])

            x = np.array(point_features)

            method = 'single'
            # get labels after two rounds of clustering on common trips
            # Use the tuned `dist` here: the name `dist_pct` still holds the last
            # value of the tuning grid, not the value selected for this subset.
            new_labels = get_new_labels(x,low,dist,second_round_idx_labels,new_labels,method=method)
            track = change_track_labels(track,new_labels)

        # get request percentage for the subset for the second round
        print('track',track)
        percentge_second = float('%.3f' % get_req_pct(new_labels,track,filter_trips))
        pct_collect_second_test.append(percentge_second)

        # get homogeneity score for the second round
        homo_second = float('%.3f' % score(bin_trips,filter_trips,new_labels,second_round=True))
        homo_collect_second_test.append(homo_second)

    print('pct_collect_second_test ',pct_collect_second_test)
    print('homo_collect_second_test ',homo_collect_second_test)

    # collect request percentage for the first round for all users
    all_percentage_first_test.append(pct_collect_first_test)

    # collect homogeneity score for the first round for all users
    all_homogeneity_score_first_test.append(homo_collect_first_test)

    # collect request percentage for the second round for all users
    all_percentage_second_test.append(pct_collect_second_test)

    # collect homogeneity score for the second round for all users
    all_homogeneity_score_second_test.append(homo_collect_second_test)


print('all_percentage_first_test', all_percentage_first_test)
print('all_homogeneity_score_first_test', all_homogeneity_score_first_test)
print('all_percentage_second_test', all_percentage_second_test)
print('all_homogeneity_score_second_test', all_homogeneity_score_second_test)

