# Trajectory Recommendation - Shared

In [None]:
#% matplotlib inline

import os, sys, time
import math, random
import pandas as pd
import numpy as np
from scipy.stats import kendalltau

#import matplotlib.pyplot as plt
#import seaborn as sns

from sklearn.cluster import KMeans

import cython

In [None]:
# Directory that holds the per-dataset POI and trajectory CSV files.
data_dir = 'data/data-new'
# Dataset name suffixes; `dat_suffix[dat_ix]` is spliced into the CSV file names below.
dat_suffix = ['Osak', 'Glas', 'Edin', 'Toro', 'Melb']

In [None]:
#dat_ix = 0  #NOTE: this variable should be defined in another notebook that runs this notebook

Hyperparameters.

In [None]:
BIN_CLUSTER = 5  # discretisation parameter: number of log-scale bins and of k-means clusters
LOG_SMALL = -10  # log(x) when x is a very small positive real number
LOG_ZERO = -1000 # log(0)

## 1.1 Load Data

In [None]:
# Path of the POI table for the selected dataset (`dat_ix` must be defined by the calling notebook).
fpoi = os.path.join(data_dir, 'poi-' + dat_suffix[dat_ix] + '.csv')

In [None]:
# POI table indexed by POI ID.
poi_all = pd.read_csv(fpoi).set_index('poiID')
poi_all.head()

In [None]:
# Path of the trajectory table for the selected dataset.
ftraj = os.path.join(data_dir, 'traj-' + dat_suffix[dat_ix] + '.csv')

In [None]:
# One row per POI visit; the columns used in this notebook are userID, trajID, poiID,
# startTime, poiDuration and trajLen.
traj_all = pd.read_csv(ftraj)
traj_all.head()

In [None]:
# Basic dataset statistics, displayed as a one-row table labelled with the dataset suffix.
num_user = len(traj_all['userID'].unique())
num_poi = len(traj_all['poiID'].unique())
num_traj = len(traj_all['trajID'].unique())
pd.DataFrame(
    {'#user': num_user, '#poi': num_poi, '#traj': num_traj, '#traj/user': num_traj/num_user},
    index=[str(dat_suffix[dat_ix])],
)

Distribution of the number of POIs in trajectories.

In [None]:
#ax = traj_all['trajLen'].hist(bins=20)
#ax.set_yscale('log')
#ax.set_xlabel('#POIs in trajectory'); ax.set_ylabel('#Trajectories')

Distribution of POI visit duration.

In [None]:
#ax = traj_all['poiDuration'].hist(bins=20)
#ax.set_xscale('log')
#ax.set_yscale('log')
#ax.set_xlabel('POI visit duration (sec)'); ax.set_ylabel('#POI visits')

## 1.2 Utility Functions

Print computing progress.

In [None]:
def print_progress(cnt, total):
    """Draw an 80-column text progress bar on stdout, redrawing the current line.

    cnt   - number of items processed so far, must satisfy 0 < cnt <= total
    total - total number of items, must be positive
    """
    assert(0 < cnt <= total and total > 0)
    bar_width = 80
    fraction = cnt / total
    bar = '-' * int(bar_width * fraction)
    percent = int(fraction * 100)
    # the leading '\r' moves the cursor back to the line start so successive calls overwrite
    sys.stdout.write('\r[%-80s] %d%%' % (bar, percent))
    sys.stdout.flush()

Extract trajectory, i.e., a list of POIs.

In [None]:
def extract_traj(tid, traj_all):
    """Return the POI IDs of trajectory `tid` as a list, ordered by ascending `startTime`.

    `traj_all` is not modified; an unknown `tid` yields an empty list.
    """
    visits = traj_all.loc[traj_all['trajID'] == tid]
    ordered = visits.sort_values(by='startTime', ascending=True)
    return ordered['poiID'].tolist()

Compute POI properties, e.g., popularity, total number of visits, and average visit duration.

In [None]:
def calc_poi_info(trajid_list, traj_all, poi_all):
    """Aggregate per-POI statistics over the given trajectories.

    trajid_list - non-empty sequence of trajectory IDs to aggregate over
    traj_all    - visit table with columns trajID, poiID, poiDuration and userID
    poi_all     - POI table indexed by poiID with columns poiCat, poiLon and poiLat

    Returns a DataFrame indexed by poiID with columns
    avgDuration (mean poiDuration), nVisit (number of visit rows), poiCat, poiLon,
    poiLat and popularity (number of distinct users that visited the POI).
    """
    assert(len(trajid_list) > 0)

    # Collect the visits of every requested trajectory and concatenate once;
    # DataFrame.append was removed in pandas 2.0 and grew the frame quadratically.
    visit_cols = ['poiID', 'poiDuration']
    visits = pd.concat([traj_all.loc[traj_all['trajID'] == tid, visit_cols] for tid in trajid_list],
                       ignore_index=True)

    poi_info = visits.groupby('poiID')['poiDuration'].agg(['mean', 'size'])
    poi_info = poi_info.rename(columns={'mean': 'avgDuration', 'size': 'nVisit'})
    for col in ('poiCat', 'poiLon', 'poiLat'):
        poi_info[col] = poi_all.loc[poi_info.index, col]

    # POI popularity: the number of distinct users that visited the POI
    visitors = traj_all.loc[traj_all['trajID'].isin(trajid_list), ['poiID', 'userID']]
    n_users = visitors.groupby('poiID')['userID'].nunique()
    poi_info['popularity'] = n_users.loc[poi_info.index]

    return poi_info

Compute distance between two POIs using [Haversine formula](http://en.wikipedia.org/wiki/Great-circle_distance).

In [None]:
def calc_dist_vec(longitudes1, latitudes1, longitudes2, latitudes2):
    """Haversine great-circle distance (unit: km) between two places on earth, vectorised.

    All four arguments are coordinates in degrees; they are passed straight to NumPy
    ufuncs, so scalars can be combined with array-likes.
    """
    earth_radius_km = 6371.0088  # mean earth radius, en.wikipedia.org/wiki/Earth_radius#Mean_radius

    # degrees -> radians
    lon_a, lat_a, lon_b, lat_b = (np.radians(v) for v in
                                  (longitudes1, latitudes1, longitudes2, latitudes2))

    # haversine formula, en.wikipedia.org/wiki/Great-circle_distance
    half_dlat = 0.5 * np.fabs(lat_a - lat_b)
    half_dlon = 0.5 * np.fabs(lon_a - lon_b)
    hav = (np.sin(half_dlat))**2 + np.cos(lat_a) * np.cos(lat_b) * (np.sin(half_dlon))**2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

## 1.3 Auxiliary Data Structures

Distance between POIs.

In [None]:
# Square POI-by-POI distance matrix (km), zero-initialised and filled in by the next cell.
# Uses the builtin `float`: the `np.float` alias was removed in NumPy 1.24.
POI_DISTMAT = pd.DataFrame(data=np.zeros((poi_all.shape[0], poi_all.shape[0]), dtype=float),
                           index=poi_all.index, columns=poi_all.index)

In [None]:
# Fill the distance matrix one row at a time: row `ix` holds the haversine distance
# from POI `ix` to every POI in `poi_all`.
for ix in poi_all.index:
    POI_DISTMAT.loc[ix] = calc_dist_vec(poi_all.loc[ix, 'poiLon'], \
                                        poi_all.loc[ix, 'poiLat'], \
                                        poi_all['poiLon'], \
                                        poi_all['poiLat'])

In [None]:
# Sorted list of all distinct trajectory IDs in the dataset.
trajid_set_all = sorted(traj_all['trajID'].unique().tolist())

In [None]:
# Per-POI statistics computed over every trajectory in the dataset.
poi_info_all = calc_poi_info(trajid_set_all, traj_all, poi_all)

Dictionary maps every trajectory ID to the actual trajectory.

In [None]:
# trajID --> list of POI IDs in visiting order (populated by the next cell)
traj_dict = dict()

In [None]:
# Extract every trajectory once; the assert guards against duplicate trajectory IDs.
for trajid in trajid_set_all:
    traj = extract_traj(trajid, traj_all)
    assert(trajid not in traj_dict)
    traj_dict[trajid] = traj

Define a *query* (in IR terminology) using tuple (start POI, #POI).

In [None]:
# Populated by the next cell; query IDs are consecutive integers starting at 0.
QUERY_ID_DICT = dict()  # (start, length) --> qid

In [None]:
# One (start POI, trajectory length) key per trajectory with at least two POIs,
# enumerated in ascending trajectory-ID order.
keys = [(traj_dict[x][0], len(traj_dict[x])) \
        for x in sorted(traj_dict.keys()) if len(traj_dict[x]) > 1]
# Assign the next unused integer to each key the first time it is seen.
cnt = 0
for key in keys:
    if key not in QUERY_ID_DICT:   # (start, length) --> qid
        QUERY_ID_DICT[key] = cnt
        cnt += 1

In [None]:
# Summary counts: all trajectories, trajectories with at least two POIs,
# the longest trajectory, and the number of distinct queries.
print('#traj in total:', len(trajid_set_all))
print('#traj (length >= 2):', traj_all[traj_all['trajLen'] >= 2]['trajID'].unique().shape[0])
print('#traj length max:', traj_all['trajLen'].max())
print('#query tuple:', len(QUERY_ID_DICT))

Number of trajectories for each query.

In [None]:
# (start POI, trajectory length) --> set of trajectory IDs that match the query
TRAJ_GROUP_DICT = dict()

In [None]:
# Group the IDs of all trajectories with at least two POIs by their query key.
for tid in traj_dict:
    if len(traj_dict[tid]) > 1:
        key = (traj_dict[tid][0], len(traj_dict[tid]))
        TRAJ_GROUP_DICT.setdefault(key, set()).add(tid)

In [None]:
# x: query keys in sorted order; y: number of trajectories for each key in x.
x = [key for key in sorted(TRAJ_GROUP_DICT.keys())]
y = [len(TRAJ_GROUP_DICT[key]) for key in x]

In [None]:
# Positions (into x / y) of the queries with the fewest and the most trajectories.
ix1 = np.argmin(y)
ix2 = np.argmax(y)
#print('min:', x[ix1], y[ix1])
#print('max:', x[ix2], y[ix2])

In [None]:
#plt.scatter(list(range(len(x))), sorted(y, reverse=True))
#plt.ylabel('#Trajectories')
#plt.xlabel('Query index')
#plt.xlim(xmin=-1)
#plt.ylim(ymin=-1)

# 2. POI Features

POI Features given query (`startPOI`, `nPOI`):
1. `category`: one-hot encoding of POI category, encode `True` as `1` and `False` as `-1`
1. `neighbourhood`: one-hot encoding of POI cluster, encode `True` as `1` and `False` as `-1`
1. `popularity`: log of POI popularity, i.e., the number of distinct users that visited the POI
1. `nVisit`: log of the total number of visit by all users
1. `avgDuration`: log of average POI visit duration
1. `trajLen`: trajectory length, i.e., the number of POIs `nPOI` in trajectory, copy from query
1. `sameCatStart`: 1 if POI category is the same as that of `startPOI`, -1 otherwise
1. `distStart`: distance (haversine formula) from `startPOI`
1. `diffPopStart`: difference in POI popularity from `startPOI` (NO LOG as it could be negative)
1. `diffNVisitStart`: difference in the total number of visit from `startPOI`
1. `diffDurationStart`: difference in average POI visit duration from the actual duration spent at `startPOI`
1. `sameNeighbourhoodStart`: 1 if POI resides in the same cluster as that of `startPOI`, -1 otherwise

In [None]:
# Column names of the POI feature table: identifier, label and query ID followed by
# the features listed above.
DF_COLUMNS = ['poiID', 'label', 'queryID', 'category', 'neighbourhood', 'popularity', 'nVisit', 'avgDuration', \
              'trajLen', 'sameCatStart', 'distStart', 'diffPopStart', 'diffNVisitStart', 'diffDurationStart', \
              'sameNeighbourhoodStart']

# 3. Factorised Transition Probabilities between POIs

Estimate a transition matrix for each POI feature. The transition probabilities between different POIs can then be computed by taking the Kronecker product of the individual transition matrices corresponding to each feature (with normalisation and a few constraints).

## 3.1 POI Features for Factorisation

POI features used to factorise transition matrix of Markov Chain with POI features (vector) as states:
- Category of POI
- Popularity of POI (discretised with uniform log-scale bins, #bins <= 5)
- The number of POI visits (discretised with uniform log-scale bins, #bins <= 5)
- The average visit duration of POI (discretised with uniform log-scale bins, #bins <= 5)
- The neighborhood relationship between POIs (clustering POI(lat, lon) using k-means, #clusters <= 5)

We first count the number of transitions, then normalise each row; zero counts are handled by adding a constant $k=1$ to every cell before normalising.

In [None]:
def normalise_transmat(transmat_cnt, k=1):
    """Turn a matrix of transition counts into row-stochastic transition probabilities.

    transmat_cnt - pandas DataFrame of transition counts (rows: from-state, columns: to-state)
    k            - additive smoothing constant added to every cell before normalising,
                   so that unseen transitions get a non-zero probability (default 1)

    Returns a new DataFrame with the same index and columns in which every row sums to 1;
    `transmat_cnt` itself is left untouched.
    """
    assert(isinstance(transmat_cnt, pd.DataFrame))
    smoothed = transmat_cnt + k
    rowsums = smoothed.sum(axis=1)
    assert((rowsums > 0).all())
    # divide every row by its own sum (align the row sums on the index)
    return smoothed.div(rowsums, axis=0)

POIs in training set.

In [None]:
# Sorted IDs of all POIs that appear in at least one trajectory.
poi_train = sorted(poi_info_all.index)

Dictionary that maps each POI ID to an index in $\{0, \dots, M-1\}$, where $M$ is the number of POIs in the training set.

In [None]:
# POI ID --> position of the POI in the sorted list `poi_train` (0-based)
POI_DICT = dict((poi, pid) for pid, poi in enumerate(poi_train))

## 3.2 Transition Matrix between POI Categories

In [None]:
# Sorted list of the distinct categories of the training POIs.
poi_cats = sorted(poi_all.loc[poi_train, 'poiCat'].unique().tolist())
POI_CAT_LIST = poi_cats
POI_CAT_LIST

In [None]:
def gen_transmat_cat(trajid_list, traj_dict, poi_info, poi_cats=POI_CAT_LIST):
    """Estimate the transition matrix between POI categories.

    trajid_list - IDs of the trajectories to count transitions in
    traj_dict   - mapping trajID --> list of POI IDs in visiting order
    poi_info    - DataFrame indexed by poiID with a 'poiCat' column
    poi_cats    - list of all category labels (row/column labels of the matrix)

    Returns the smoothed, row-normalised transition matrix as a DataFrame.
    """
    # builtin `float`: the `np.float` alias was removed in NumPy 1.24
    transmat_cat_cnt = pd.DataFrame(data=np.zeros((len(poi_cats), len(poi_cats)), dtype=float),
                                    columns=poi_cats, index=poi_cats)
    for tid in trajid_list:
        t = traj_dict[tid]
        # consecutive POI pairs; yields nothing for trajectories with fewer than two POIs
        for p1, p2 in zip(t[:-1], t[1:]):
            assert(p1 in poi_info.index and p2 in poi_info.index)
            cat1 = poi_info.loc[p1, 'poiCat']
            cat2 = poi_info.loc[p2, 'poiCat']
            transmat_cat_cnt.loc[cat1, cat2] += 1
    return normalise_transmat(transmat_cat_cnt)

In [None]:
#gen_transmat_cat(trajid_set_all, traj_dict, poi_info_all)

## 3.3 Transition Matrix between POI Popularity Classes

In [None]:
# Popularity (number of distinct visitors) of every training POI.
poi_pops = poi_info_all.loc[poi_train, 'popularity']

Discretize POI popularity with uniform log-scale bins.

In [None]:
# Base-10 exponents of the smallest (clamped to at least 1) and largest popularity;
# they delimit the log-scale bin range.
expo_pop1 = np.log10(max(1, min(poi_pops)))
expo_pop2 = np.log10(max(poi_pops))
#print(expo_pop1, expo_pop2)

In [None]:
nbins_pop = BIN_CLUSTER
# nbins_pop+1 bin edges, uniformly spaced on a log10 scale
logbins_pop = np.logspace(np.floor(expo_pop1), np.ceil(expo_pop2), nbins_pop+1)
logbins_pop[0] = 0  # deal with underflow
# np.digitize treats the right edge of a bin as exclusive, so the last edge has to be
# strictly greater than the largest value; use `<=` so that a maximum that is an exact
# power of 10 (and hence equal to the last edge) does not fall outside the last bin.
if logbins_pop[-1] <= poi_info_all['popularity'].max():
    logbins_pop[-1] = poi_info_all['popularity'].max() + 1
logbins_pop

In [None]:
#ax = pd.Series(poi_pops).hist(figsize=(5, 3), bins=logbins_pop)
#ax.set_xlim(xmin=0.1)
#ax.set_xscale('log')

In [None]:
def gen_transmat_pop(trajid_list, traj_dict, poi_info, logbins_pop=logbins_pop):
    """Estimate the transition matrix between POI popularity classes.

    trajid_list - IDs of the trajectories to count transitions in
    traj_dict   - mapping trajID --> list of POI IDs in visiting order
    poi_info    - DataFrame indexed by poiID with a 'popularity' column
    logbins_pop - bin edges used to discretise popularity (classes are 1..len(edges)-1)

    Returns (smoothed row-normalised transition matrix, logbins_pop).
    """
    nbins = len(logbins_pop) - 1
    # builtin `float`: the `np.float` alias was removed in NumPy 1.24
    transmat_pop_cnt = pd.DataFrame(data=np.zeros((nbins, nbins), dtype=float),
                                    columns=np.arange(1, nbins+1), index=np.arange(1, nbins+1))
    for tid in trajid_list:
        t = traj_dict[tid]
        # consecutive POI pairs; yields nothing for trajectories with fewer than two POIs
        for p1, p2 in zip(t[:-1], t[1:]):
            assert(p1 in poi_info.index and p2 in poi_info.index)
            pop1 = poi_info.loc[p1, 'popularity']
            pop2 = poi_info.loc[p2, 'popularity']
            # map both popularity values to their 1-based bin number
            pc1, pc2 = np.digitize([pop1, pop2], logbins_pop)
            transmat_pop_cnt.loc[pc1, pc2] += 1
    return normalise_transmat(transmat_pop_cnt), logbins_pop

In [None]:
#gen_transmat_pop(trajid_set_all, traj_dict, poi_info_all)[0]

## 3.4 Transition Matrix between the Number of POI Visit Classes

In [None]:
# Total number of visits of every training POI.
poi_visits = poi_info_all.loc[poi_train, 'nVisit']

Discretize the number of POI visit with uniform log-scale bins.

In [None]:
# Base-10 exponents of the smallest (clamped to at least 1) and largest visit count;
# they delimit the log-scale bin range.
expo_visit1 = np.log10(max(1, min(poi_visits)))
expo_visit2 = np.log10(max(poi_visits))
#print(expo_visit1, expo_visit2)

In [None]:
nbins_visit = BIN_CLUSTER
# nbins_visit+1 bin edges, uniformly spaced on a log10 scale
logbins_visit = np.logspace(np.floor(expo_visit1), np.ceil(expo_visit2), nbins_visit+1)
logbins_visit[0] = 0  # deal with underflow
# np.digitize treats the right edge of a bin as exclusive, so the last edge has to be
# strictly greater than the largest value; use `<=` so that a maximum that is an exact
# power of 10 (and hence equal to the last edge) does not fall outside the last bin.
if logbins_visit[-1] <= poi_info_all['nVisit'].max():
    logbins_visit[-1] = poi_info_all['nVisit'].max() + 1
logbins_visit

In [None]:
#ax = pd.Series(poi_visits).hist(figsize=(5, 3), bins=logbins_visit)
#ax.set_xlim(xmin=0.1)
#ax.set_xscale('log')

In [None]:
def gen_transmat_visit(trajid_list, traj_dict, poi_info, logbins_visit=logbins_visit):
    """Estimate the transition matrix between classes of the number of POI visits.

    trajid_list   - IDs of the trajectories to count transitions in
    traj_dict     - mapping trajID --> list of POI IDs in visiting order
    poi_info      - DataFrame indexed by poiID with an 'nVisit' column
    logbins_visit - bin edges used to discretise nVisit (classes are 1..len(edges)-1)

    Returns (smoothed row-normalised transition matrix, logbins_visit).
    """
    nbins = len(logbins_visit) - 1
    # builtin `float`: the `np.float` alias was removed in NumPy 1.24
    transmat_visit_cnt = pd.DataFrame(data=np.zeros((nbins, nbins), dtype=float),
                                      columns=np.arange(1, nbins+1), index=np.arange(1, nbins+1))
    for tid in trajid_list:
        t = traj_dict[tid]
        # consecutive POI pairs; yields nothing for trajectories with fewer than two POIs
        for p1, p2 in zip(t[:-1], t[1:]):
            assert(p1 in poi_info.index and p2 in poi_info.index)
            visit1 = poi_info.loc[p1, 'nVisit']
            visit2 = poi_info.loc[p2, 'nVisit']
            # map both visit counts to their 1-based bin number
            vc1, vc2 = np.digitize([visit1, visit2], logbins_visit)
            transmat_visit_cnt.loc[vc1, vc2] += 1
    return normalise_transmat(transmat_visit_cnt), logbins_visit

In [None]:
#gen_transmat_visit(trajid_set_all, traj_dict, poi_info_all)[0]

## 3.5 Transition Matrix between POI Average Visit Duration Classes

In [None]:
# Average visit duration of every training POI.
poi_durations = poi_info_all.loc[poi_train, 'avgDuration']

In [None]:
# Base-10 exponents of the smallest (clamped to at least 1) and largest average duration;
# they delimit the log-scale bin range.
expo_duration1 = np.log10(max(1, min(poi_durations)))
expo_duration2 = np.log10(max(poi_durations))
#print(expo_duration1, expo_duration2)

In [None]:
nbins_duration = BIN_CLUSTER
# nbins_duration+1 bin edges, uniformly spaced on a log10 scale
logbins_duration = np.logspace(np.floor(expo_duration1), np.ceil(expo_duration2), nbins_duration+1)
logbins_duration[0] = 0  # deal with underflow
# push the last edge two orders of magnitude above the largest observed average duration
logbins_duration[-1] = np.power(10, expo_duration2+2)
logbins_duration

In [None]:
#ax = pd.Series(poi_durations).hist(figsize=(5, 3), bins=logbins_duration)
#ax.set_xlim(xmin=0.1)
#ax.set_xscale('log')

In [None]:
def gen_transmat_duration(trajid_list, traj_dict, poi_info, logbins_duration=logbins_duration):
    """Estimate the transition matrix between POI average-visit-duration classes.

    trajid_list      - IDs of the trajectories to count transitions in
    traj_dict        - mapping trajID --> list of POI IDs in visiting order
    poi_info         - DataFrame indexed by poiID with an 'avgDuration' column
    logbins_duration - bin edges used to discretise avgDuration (classes are 1..len(edges)-1)

    Returns (smoothed row-normalised transition matrix, logbins_duration).
    """
    nbins = len(logbins_duration) - 1
    # builtin `float`: the `np.float` alias was removed in NumPy 1.24
    transmat_duration_cnt = pd.DataFrame(data=np.zeros((nbins, nbins), dtype=float),
                                         columns=np.arange(1, nbins+1), index=np.arange(1, nbins+1))
    for tid in trajid_list:
        t = traj_dict[tid]
        # consecutive POI pairs; yields nothing for trajectories with fewer than two POIs
        for p1, p2 in zip(t[:-1], t[1:]):
            assert(p1 in poi_info.index and p2 in poi_info.index)
            d1 = poi_info.loc[p1, 'avgDuration']
            d2 = poi_info.loc[p2, 'avgDuration']
            # map both durations to their 1-based bin number
            dc1, dc2 = np.digitize([d1, d2], logbins_duration)
            transmat_duration_cnt.loc[dc1, dc2] += 1
    return normalise_transmat(transmat_duration_cnt), logbins_duration

In [None]:
#gen_transmat_duration(trajid_set_all, traj_dict, poi_info_all)[0]

## 3.6 Transition Matrix between POI Neighborhood Classes

KMeans in scikit-learn does not appear to support a custom distance metric, and it provides no implementation of the [Haversine formula](http://en.wikipedia.org/wiki/Great-circle_distance), so we use the Euclidean distance on (longitude, latitude) as an approximation.

In [None]:
# Clustering input: (longitude, latitude) of every training POI.
X = poi_all.loc[poi_train, ['poiLon', 'poiLat']]
nclusters = BIN_CLUSTER

In [None]:
# Fixed random_state so that the cluster assignment is reproducible across runs.
kmeans = KMeans(n_clusters=nclusters, random_state=987654321)
kmeans.fit(X)

In [None]:
clusters = kmeans.predict(X)
POI_CLUSTER_LIST = sorted(np.unique(clusters))
# poiID --> clusterID table; rows follow the order of `poi_train`
POI_CLUSTERS = pd.DataFrame(data=clusters, index=poi_train)
POI_CLUSTERS.index.name = 'poiID'
POI_CLUSTERS.rename(columns={0:'clusterID'}, inplace=True)
# builtin `int`: the `np.int` alias was removed in NumPy 1.24
POI_CLUSTERS['clusterID'] = POI_CLUSTERS['clusterID'].astype(int)

Scatter plot of POI coordinates with clustering results.

In [None]:
#diff = poi_all.loc[poi_train, ['poiLon', 'poiLat']].max() - poi_all.loc[poi_train, ['poiLon', 'poiLat']].min()
#ratio = diff['poiLon'] / diff['poiLat']
#height = 6; width = int(round(ratio)*height)
#plt.figure(figsize=[width, height])
#plt.scatter(poi_all.loc[poi_train, 'poiLon'], poi_all.loc[poi_train, 'poiLat'], c=clusters, s=50)

In [None]:
def gen_transmat_neighbor(trajid_list, traj_dict, poi_info, poi_clusters=POI_CLUSTERS):
    """Estimate the transition matrix between POI neighbourhood (cluster) classes.

    trajid_list  - IDs of the trajectories to count transitions in
    traj_dict    - mapping trajID --> list of POI IDs in visiting order
    poi_info     - DataFrame indexed by poiID (only used to check that POIs are known)
    poi_clusters - DataFrame indexed by poiID with a 'clusterID' column; the matrix is
                   labelled 0..n-1, where n is the number of distinct cluster IDs

    Returns (smoothed row-normalised transition matrix, poi_clusters).
    """
    nclusters = len(poi_clusters['clusterID'].unique())
    # builtin `float`: the `np.float` alias was removed in NumPy 1.24
    transmat_neighbor_cnt = pd.DataFrame(data=np.zeros((nclusters, nclusters), dtype=float),
                                         columns=np.arange(nclusters), index=np.arange(nclusters))
    for tid in trajid_list:
        t = traj_dict[tid]
        # consecutive POI pairs; yields nothing for trajectories with fewer than two POIs
        for p1, p2 in zip(t[:-1], t[1:]):
            assert(p1 in poi_info.index and p2 in poi_info.index)
            c1 = poi_clusters.loc[p1, 'clusterID']
            c2 = poi_clusters.loc[p2, 'clusterID']
            transmat_neighbor_cnt.loc[c1, c2] += 1
    return normalise_transmat(transmat_neighbor_cnt), poi_clusters

In [None]:
#gen_transmat_neighbor(trajid_set_all, traj_dict, poi_info_all)[0]

# 4. Evaluation Metrics

Compute the F1 score.

In [None]:
def calc_F1(traj_act, traj_rec, noloop=True):
    """Compute the F1 score (on points) of a recommended trajectory.

    traj_act - ground-truth trajectory, a non-empty sequence of POIs
    traj_rec - recommended trajectory, a non-empty sequence of POIs
    noloop   - True: compare the trajectories as sets of POIs;
               False: count repeated POIs, matching each ground-truth visit at most once

    Returns the harmonic mean of precision and recall, or 0.0 if the two
    trajectories share no POI.
    """
    assert(isinstance(noloop, bool))
    assert(len(traj_act) > 0)
    assert(len(traj_rec) > 0)

    if noloop:
        intersize = len(set(traj_act) & set(traj_rec))
    else: # if there are sub-tours in both ground truth and prediction
        # greedily pair every recommended POI with the first unmatched identical POI
        matched = [False] * len(traj_act)
        for poi in traj_rec:
            for j, poi_act in enumerate(traj_act):
                if not matched[j] and poi == poi_act:
                    matched[j] = True
                    break
        intersize = sum(matched)

    # no overlap: precision + recall would be 0 and the F1 formula would divide by zero
    if intersize == 0:
        return 0.0

    recall = intersize / len(traj_act)
    precision = intersize / len(traj_rec)
    F1 = 2 * precision * recall / (precision + recall)
    return F1

Compute the pairs-F1 score.

In [None]:
%load_ext Cython

In [None]:
%%cython
import numpy as np
cimport numpy as np

cpdef float calc_pairsF1(y, y_hat):
    """Compute the pairs-F1 score of prediction `y_hat` against ground truth `y`.

    y     - ground-truth trajectory, a non-empty sequence of distinct POIs (no loops)
    y_hat - predicted trajectory, a sequence of POIs

    An ordered pair of distinct POIs in `y_hat` counts as correct if both POIs appear
    in `y` and in the same relative order. Returns 0 when no pair is correct, which
    includes trajectories that are too short to contain any pair.
    """
    assert(len(y) > 0)
    assert(len(y) == len(set(y))) # no loops in y
    cdef int n, nr, n0, n0r, nc, poi1, poi2, i, j
    cdef float precision, recall, F1
    n = len(y)
    nr = len(y_hat)
    n0 = n * (n - 1) // 2      # number of ordered pairs in y
    n0r = nr * (nr - 1) // 2   # number of ordered pairs in y_hat

    # y determines the correct visiting order
    order_dict = dict()
    for i in range(n):
        order_dict[y[i]] = i

    nc = 0
    for i in range(nr):
        poi1 = y_hat[i]
        for j in range(i+1, nr):
            poi2 = y_hat[j]
            if poi1 in order_dict and poi2 in order_dict and poi1 != poi2:
                if order_dict[poi1] < order_dict[poi2]: nc += 1

    # Check before dividing: n0 or n0r is 0 for single-POI trajectories, and nc > 0
    # implies that both are positive.
    if nc == 0:
        return 0

    precision = (1.0 * nc) / (1.0 * n0r)
    recall = (1.0 * nc) / (1.0 * n0)
    F1 = 2. * precision * recall / (precision + recall)
    return F1

Kendall's $\tau$ as evaluation metric: 

Every POI in trajectory $\mathbf{y}$ should be ranked higher than every POI that does not appear in $\mathbf{y}$; all POIs absent from $\mathbf{y}$ are required to share the same rank (rank $0$ by default).

In [None]:
def gen_rank(y, M, default_rank=0):
    """
    Build the rank vector of all POIs induced by a trajectory.
    y - trajectory: a sequence of POIs without duplication
    M - total number of POIs
    default_rank - the rank given to every POI that does not appear in y

    The j-th POI of y (j = 0, 1, ...) gets rank M - j, stored at position `poi - 1`
    of the returned length-M array. A POI value of 0 therefore lands in the last
    slot (Python negative indexing).
    """
    assert(0 < len(y) <= M)
    assert(0 <= default_rank <= M)
    rank = np.full(M, default_rank, dtype=float)
    for position, poi in enumerate(y):
        rank[poi - 1] = M - position
    return rank

In [None]:
#y1 = [1, 3, 2, 5]
#y2 = [1, 2, 5, 3]
#M = 10
#default = 0
#print(gen_rank(y1, M, default))
#print(gen_rank(y2, M, default))

Test the effect of the rank for all POIs not in trajectory.

In [None]:
#for default in np.arange(0, 10.5, .5):
#    assert(len(y1) == len(y2))
#    if default >= M - len(y1) + 1: continue
#    r1 = gen_rank(y1, M, default)
#    r2 = gen_rank(y2, M, default)
#    #print(r1, r2)
#    print(default, kendalltau(r1, r2))

In [None]:
#default = 7  # incorrect when missing value >= the lowest rank for observed data
#r1 = gen_rank(y1, M, default)
#r2 = gen_rank(y2, M, default)
#print(r1, r2)
#print(default, kendalltau(r1, r2))

Compute the Kendall's $\tau$ (taking care of ties).

In [None]:
def calc_kendalltau(y, y_hat):
    """Kendall's tau between the POI rankings induced by trajectories y and y_hat.

    Both trajectories are sequences of POI IDs that must be keys of POI_DICT; POIs
    that are absent from a trajectory share the default rank of gen_rank.
    """
    #assert(len(y) == len(y_hat))
    n_poi = len(POI_DICT)
    assert(len(y) <= n_poi)
    assert(len(y_hat) <= n_poi)

    rank_true = gen_rank([POI_DICT[p] for p in y], n_poi)
    rank_pred = gen_rank([POI_DICT[p] for p in y_hat], n_poi)

    result = kendalltau(rank_true, rank_pred)
    return result[0]  # first element is the correlation, second the p-value

In [None]:
#calc_kendalltau(y1, y2)

Compute all evaluation metrics given a prediction and the set of ground truth.

In [None]:
def evaluate(y_hat, ground_truth_id_set, use_max=True):
    """
    compute all evaluation metrics:
    - F1 score on points,
    - F1 score on pairs,
    - Kendall's tau.

    y_hat - the prediction for query x
    ground_truth_id_set - the set of trajectory IDs for query x (keys of `traj_dict`)
    use_max - True: use the maximum of all scores for each metric, False: use the mean

    Returns the tuple (F1, pairs-F1, tau) aggregated over all ground-truth trajectories.
    Every ground-truth trajectory must have the same length as y_hat.
    """
    assert(len(y_hat) > 0)
    assert(len(ground_truth_id_set) > 0)

    ground_truth = [traj_dict[x] for x in ground_truth_id_set]
    # builtin `float`: the `np.float` alias was removed in NumPy 1.24
    F1 = np.zeros(len(ground_truth), dtype=float)
    pF1 = np.zeros(len(ground_truth), dtype=float)
    Tau = np.zeros(len(ground_truth), dtype=float)
    for j, y in enumerate(ground_truth):
        assert(len(y_hat) == len(y))
        F1[j] = calc_F1(y, y_hat)
        pF1[j] = calc_pairsF1(y, y_hat)
        Tau[j] = calc_kendalltau(y, y_hat)
    if use_max:  # use maximum similarity score
        return np.max(F1), np.max(pF1), np.max(Tau)
    else:        # use mean similarity score
        return np.mean(F1), np.mean(pF1), np.mean(Tau)

Function to check the equality of two variables

In [None]:
def vars_equal(d1, d2):
    """Check the equality of two variables, recursing into lists, sets and dicts.

    Supported types: str, Python/NumPy integers, Python/NumPy floats (compared with
    np.isclose), list, set, dict, np.ndarray (np.allclose), pd.DataFrame and pd.Series.

    Returns True if the two variables are equal; raises AssertionError if they differ
    in type or value, or if the type is not supported.
    """

    def list_equal(d1, d2):
        assert type(d1) == type(d2) == list
        assert len(d1) == len(d2)
        for item1, item2 in zip(d1, d2): assert vars_equal(item1, item2)
        return True

    def set_equal(d1, d2):
        assert type(d1) == type(d2) == set
        assert list_equal(sorted(d1), sorted(d2))
        return True

    def dict_equal(d1, d2):
        assert type(d1) == type(d2) == dict
        # both dicts must have exactly the same keys (the original compared d1 with itself)
        assert d1.keys() == d2.keys()
        for key in d1: assert vars_equal(d1[key], d2[key])
        return True

    assert type(d1) == type(d2)
    # Use the abstract NumPy scalar types instead of enumerating concrete ones:
    # np.int0 was removed in NumPy 2.0 and np.float128 does not exist on every platform.
    if type(d1) == str:                       assert d1 == d2
    elif isinstance(d1, (int, np.integer)):   assert d1 == d2
    elif isinstance(d1, (float, np.floating)): assert np.isclose(d1, d2)  # np.isclose(10, 10.0001) is True
    elif type(d1) == list:                    assert list_equal(d1, d2)
    elif type(d1) == set:                     assert set_equal(d1, d2)
    elif type(d1) == dict:                    assert dict_equal(d1, d2)
    elif type(d1) == np.ndarray:              assert np.allclose(d1, d2)
    elif type(d1) in {pd.DataFrame, pd.Series}: assert d1.equals(d2)
    else: assert False, 'UNrecognised type: %s\n' % type(d1)
    return True