# Leave-one-out Evaluation of SSVM

Leave-one-query-out evaluation of (single-/multi-label) SSVM, with the regularisation parameter $C$ tuned using Monte-Carlo cross-validation.

In [None]:
import os, sys, pickle
import numpy as np
import pandas as pd
import random
import cvxopt

In [None]:
# Add 'src/' (relative to the current working directory) to the module search path.
# NOTE(review): presumably this is where the project modules imported below
# (inference_lv, inference, shared, ssvm) live -- confirm.
sys.path.append('src/')

In [None]:
import pyximport; pyximport.install()
from inference_lv import do_inference_list_viterbi

In [None]:
from inference import do_inference_brute_force, do_inference_viterbi
from shared import TrajData
from ssvm import SSVM

In [None]:
# Seed the random number generators of the Python standard library, NumPy and cvxopt
# with fixed values, so that repeated runs draw the same random numbers.
random.seed(1234554321)
np.random.seed(123456789)
cvxopt.base.setseed(123456789)

In [None]:
# Index of the dataset to evaluate on: it is passed to TrajData and is also used
# to look up the dataset suffix in the name of the result file.
dat_ix = 0

In [None]:
# Dataset object for the selected dataset; the cells below read the trajectories
# (traj_dict, trajid_set_all), the query groups (TRAJ_GROUP_DICT), the POI feature
# computation (calc_poi_info), the evaluation routine (evaluate) and data_dir from it.
dat_obj = TrajData(dat_ix)

In [None]:
# Experiment settings used by the evaluation loop below.
N_JOBS = 6         # number of parallel jobs (passed as n_jobs to SSVM training)
C_SET = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000, 3000]  # candidate values of the regularisation parameter C
MC_PORTION = 0.1   # fraction of the non-test queries held out for validation in each Monte-Carlo cross-validation iteration
MC_NITER = 5       # number of iterations for Monte-Carlo cross-validation
SSVM_SHARE_PARAMS = True  # share params among POIs/transitions in SSVM
SSVM_MULTI_LABEL = True  # use multi-label SSVM

Nested cross-validation with Monte-Carlo cross-validation as the inner loop.

In [None]:
# Parallel lists, both indexed by method_ix: inference_methods[k] is the inference
# routine handed to the SSVM (for training and for prediction), and methods_suffix[k]
# is the tag that identifies it in the name of the result file.
inference_methods = [do_inference_brute_force, do_inference_list_viterbi, do_inference_viterbi]
methods_suffix = ['bruteForce', 'listViterbi', 'viterbi']

In [None]:
# Index into inference_methods / methods_suffix; 1 selects do_inference_list_viterbi ('listViterbi').
method_ix = 1

In [None]:
# Leave-one-query-out evaluation with nested cross-validation.
#
# Outer loop: every query key (start POI `ps`, second component `L` -- presumably the
# trajectory length, confirm against TrajData) is held out in turn as the test query.
# Inner loop: for each candidate C, MC_NITER Monte-Carlo splits of the remaining queries
# are trained/validated, and the C with the highest mean tau is kept.
# Finally a model is trained with the best C on all non-test trajectories and its
# prediction for the held-out query is stored in `recdict_ssvm`:
#     recdict_ssvm[(ps, L)] = {'PRED': prediction, 'W': learned weights, 'C': C used}
recdict_ssvm = dict()
cnt = 1
keys = sorted(dat_obj.TRAJ_GROUP_DICT.keys())
inference_fn = inference_methods[method_ix]

# outer loop to evaluate the test performance by cross validation
for i, query in enumerate(keys):
    ps, L = query
    best_C = 1
    best_Tau = 0
    keys_cv = keys[:i] + keys[i+1:]

    # use all training+validation set to compute POI features,
    # make sure features do NOT change for training and validation
    trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJ_GROUP_DICT[query]
    poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))

    # POIs visited by the non-test trajectories with at least two POIs
    poi_set_i = {p for tid in trajid_set_i if len(dat_obj.traj_dict[tid]) >= 2
                 for p in dat_obj.traj_dict[tid]}
    if ps not in poi_set_i:
        sys.stderr.write('start POI of query %s does not exist in training set.\n' % str(query))
        cnt += 1  # keep the progress counter in step with the position in `keys`
        continue

    # tune regularisation constant C
    for ssvm_C in C_SET:
        print('\n--------------- try_C: %f ---------------\n' % ssvm_C); sys.stdout.flush()
        Tau_ssvm = []  # validation tau scores collected over all Monte-Carlo iterations

        # inner loop to evaluate the performance of a model with a specified C by Monte-Carlo cross validation
        for mc_iter in range(MC_NITER):
            # resample until the start POI of every validation query also occurs in the training set
            # NOTE: this loops forever if no such partition exists
            while True:
                rand_ix = np.arange(len(keys_cv)); np.random.shuffle(rand_ix)
                test_ix = rand_ix[:int(MC_PORTION*len(rand_ix))]
                assert len(test_ix) > 0, 'too few queries for the given MC_PORTION'

                # training trajectories = non-test trajectories minus those of the validation queries
                trajid_set_train = set(trajid_set_i)
                for t in test_ix:
                    trajid_set_train -= dat_obj.TRAJ_GROUP_DICT[keys_cv[t]]

                poi_set = {p for tid in trajid_set_train if len(dat_obj.traj_dict[tid]) >= 2
                           for p in dat_obj.traj_dict[tid]}
                if all(keys_cv[t][0] in poi_set for t in test_ix):
                    poi_list = sorted(poi_set)
                    break

            # train
            ssvm = SSVM(inference_train=inference_fn, inference_pred=inference_fn,
                        dat_obj=dat_obj, share_params=SSVM_SHARE_PARAMS, multi_label=SSVM_MULTI_LABEL,
                        C=ssvm_C, poi_info=poi_info_i.loc[poi_list].copy())
            # NOTE(review): the return type of SSVM.train is not visible here, so the explicit
            # comparison with True is kept as is instead of relying on truthiness.
            if ssvm.train(sorted(trajid_set_train), n_jobs=N_JOBS) == True:
                for t in test_ix:  # test
                    ps_cv, L_cv = keys_cv[t]
                    y_hat = ssvm.predict(ps_cv, L_cv)
                    if y_hat is not None:
                        # only tau is used for model selection; the two F1 scores are ignored
                        _, _, tau = dat_obj.evaluate(keys_cv[t], y_hat)
                        Tau_ssvm.append(tau)
            else:
                # a model that failed to train scores 0 on every validation query
                Tau_ssvm.extend([0] * len(test_ix))

        # NOTE: if no prediction was produced at all, Tau_ssvm is empty and the mean is nan,
        # which never compares greater than best_Tau, so such a C is never selected.
        mean_Tau = np.mean(Tau_ssvm)
        print('mean_Tau: %.3f' % mean_Tau)
        if mean_Tau > best_Tau:
            best_Tau = mean_Tau
            best_C = ssvm_C
    print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %f ---------------\n' % (cnt, len(keys), ps, L, best_C))
    sys.stdout.flush()

    # train model using all examples in training set and measure performance on test set
    ssvm = SSVM(inference_train=inference_fn, inference_pred=inference_fn,
                dat_obj=dat_obj, share_params=SSVM_SHARE_PARAMS, multi_label=SSVM_MULTI_LABEL,
                C=best_C, poi_info=poi_info_i)
    if ssvm.train(sorted(trajid_set_i), n_jobs=N_JOBS) == True:
        y_hat = ssvm.predict(ps, L)
        print(cnt, y_hat)
        if y_hat is not None:
            recdict_ssvm[(ps, L)] = {'PRED': y_hat, 'W': ssvm.osssvm.w, 'C': ssvm.C}

    cnt += 1

In [None]:
# Score the stored prediction of every evaluated query, then report for each metric
# its mean and standard error of the mean, plus how many queries reached a perfect
# F1 / pairs-F1 score.
F1_ssvm, pF1_ssvm, tau_ssvm = [], [], []
for key in sorted(recdict_ssvm):
    F1, pF1, tau = dat_obj.evaluate(key, recdict_ssvm[key]['PRED'])
    F1_ssvm.append(F1)
    pF1_ssvm.append(pF1)
    tau_ssvm.append(tau)

# a score counts as perfect when it is within 1e-6 of 1.0
nF1 = np.sum([np.abs(x - 1.0) < 1e-6 for x in F1_ssvm])
npF1 = np.sum([np.abs(x - 1.0) < 1e-6 for x in pF1_ssvm])

# (mean, standard error) pairs in the order expected by the format string
summary = []
for scores in (F1_ssvm, pF1_ssvm, tau_ssvm):
    summary.append(np.mean(scores))
    summary.append(np.std(scores)/np.sqrt(len(scores)))
summary.extend([nF1, len(F1_ssvm), npF1, len(pF1_ssvm)])

print('SSVM: F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f), perfectF1: %d/%d, perfectPairsF1: %d/%d' %
      tuple(summary))

In [None]:
# Save the per-query results (prediction, learned weights, chosen C) as a pickle file
# in the data directory; the file name encodes the inference method and the dataset.
# NOTE(review): `dat_suffix` is neither defined nor imported in the visible part of this
# notebook -- confirm where it comes from (e.g. the `shared` module or an attribute of
# `dat_obj`), otherwise this line raises NameError.
fssvm = os.path.join(dat_obj.data_dir, 'ssvm-' + methods_suffix[method_ix] + '-' + dat_suffix[dat_ix] + '.pkl')
with open(fssvm, 'wb') as f:  # the context manager closes (and flushes) the file even on error
    pickle.dump(recdict_ssvm, f)