# Experiment on Generated Data

The goal of this notebook is to design an experiment to check if multi-user and multi-label (which is what our dataset looks like) is a problem for SSVM.   
To achieve this goal,
1. a trained SSVM $\mathcal{M}_0$ (on Glasgow dataset $\mathcal{D}_0$) is used to generate a single-user, single-label dataset $\mathcal{D}_1$. Concretely, we use $\mathcal{M}_0$ to predict a trajectory for every query $(p, l)$, $p \in \mathcal{P}$, $l \in \{3,4,5,6,7\}$, where $\mathcal{P}$ is taken from $\mathcal{D}_0$.
1. train a new SSVM $\mathcal{M}_1$ using features (POI and transition features) computed from $\mathcal{D}_0$ and labels from $\mathcal{D}_1$, and check its performance on the training set (i.e., $\mathcal{D}_1$).
1. perform leave-one-out cross validation on $\mathcal{D}_1$. The hyperparameter $C$ is chosen by trying several values on a single split, in which one label in $\mathcal{D}_1$ is held out as the test example and all other labels in $\mathcal{D}_1$ form the training set (POI and transition features are computed from $\mathcal{D}_0$); the chosen $C$ is then fixed for all leave-one-out cross validations.
1. note that POI and transition features are computed from $\mathcal{D}_0$ while labels come from $\mathcal{D}_1$, because the duration related features (i.e., avgDuration for POI, and the log transition probability between discretized duration buckets) cannot be computed on $\mathcal{D}_1$: no duration information is generated.
1. we disable the duration related features one by one and repeat step $3$ to check whether the duration related features help.
1. if the duration related features don't help, we can turn them off, compute POI and transition features from $\mathcal{D}_1$ and use the labels in $\mathcal{D}_1$. We then compare the performance of RankSVM and SSVM on $\mathcal{D}_1$ (using leave-one-out cross validation): if SSVM performs better than RankSVM, it means that multi-user and multi-label in our dataset is a problem for SSVM.

In [None]:
import os, pickle, random
import sys

import cvxopt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Seed the random number generators of Python's `random`, NumPy and cvxopt
# with fixed values, so that the results of separate runs can be compared
# (see the "Compare with the results of previous runs" cells below).
random.seed(1234554321)
np.random.seed(123456789)
cvxopt.base.setseed(123456789)

Run notebook ```ssvm.ipynb```.

In [None]:
# Execute ssvm.ipynb inside this kernel. The cells below use names that are
# not defined in this notebook (e.g. data_dir, traj_dict, calc_node_features,
# MyModel, OneSlackSSVM) -- presumably they are provided by this run; TODO confirm.
%run 'ssvm.ipynb'

Load trained parameters and prediction results

In [None]:
# When True, the "dump" cells below write their intermediate results to pickle
# files before the "compare" cells load them back.
# NOTE(review): with this left at True every comparison reads the file that was
# just written in the same run; presumably it should be set to False when the
# goal is to compare against an earlier run -- confirm the intended workflow.
dump_variables = True

In [None]:
# Path of the pickled SSVM results loaded in the next cell
# (`data_dir` is not defined in this notebook).
fname = os.path.join(data_dir, 'ssvm-listViterbi-Glas.pkl')

In [None]:
# Load the stored SSVM results.
# ssvm_lv is a dict: query -> {'PRED': trajectory, 'C': ssvm-c, 'W': model_params}
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(fname, 'rb') as fin:  # the context manager closes the file handle deterministically
    ssvm_lv = pickle.load(fin)

In [None]:
# Select the parameter vector stored for one query of the results dict.
# NOTE(review): the key is presumably a (POI, length) pair like the queries (p, l)
# described in the introduction -- confirm against ssvm.ipynb.
query = (1, 3)
W = ssvm_lv[query]['W']

In [None]:
# Save the selected parameter vector so that it can be reloaded and compared below.
Wname = 'W.pkl'
if dump_variables:
    # Close (and thereby flush) the file before the next cell reads it back.
    with open(Wname, 'wb') as fout:
        pickle.dump(W, fout)

In [None]:
# Reload the stored parameter vector and check that it matches the one in memory.
with open(Wname, 'rb') as fin:
    W0 = pickle.load(fin)
assert np.allclose(W, W0)

In [None]:
# Trajectory ids left after removing the ids listed in TRAJ_GROUP_DICT[query].
trajid_set = set(trajid_set_all) - TRAJ_GROUP_DICT[query]

# Union of the POIs of every remaining trajectory that has at least two POIs.
poi_set = set()
poi_set.update(*(traj_dict[tid] for tid in trajid_set if len(traj_dict[tid]) >= 2))
poi_list = sorted(poi_set)

# W is one flat vector: the first n_states * n_node_features entries are the
# unary (node) parameters, the rest are the pairwise (edge) parameters.
n_states = len(poi_set)
n_edge_features = 5
n_node_features = (len(W) - n_edge_features * n_states * n_states) // n_states
n_unary = n_states * n_node_features
unary_params = W[:n_unary].reshape(n_states, n_node_features)
pw_params = W[n_unary:].reshape((n_states, n_states, n_edge_features))

# Two-way mapping between POI ids and their positions in the sorted poi_list.
poi_id_rdict = dict(enumerate(poi_list))
poi_id_dict = {poi: idx for idx, poi in poi_id_rdict.items()}

print('Finished.')

In [None]:
# Save the trajectory/POI bookkeeping and the split parameters for later comparison.
many = 'many.pkl'
if dump_variables:
    # Close (and thereby flush) the file before the comparison cell reads it back.
    with open(many, 'wb') as fout:
        pickle.dump([sorted(trajid_set), sorted(poi_set), poi_list, poi_id_dict, poi_id_rdict,
                     unary_params, pw_params], fout)

Compare with the results of previous runs.

In [None]:
# Reload the stored bookkeeping and check it against the values in memory.
with open(many, 'rb') as fin:
    [trajid_set0, poi_set0, poi_list0, poi_id_dict0, poi_id_rdict0,
     unary_params0, pw_params0] = pickle.load(fin)

# np.array_equal also returns False when the two sequences differ in length,
# where an element-wise `==` on mismatched shapes would not be a reliable check.
assert np.array_equal(sorted(trajid_set), trajid_set0)
assert np.array_equal(sorted(poi_set), poi_set0)
assert np.array_equal(poi_list, poi_list0)

# POI -> index mapping: same keys, and the same values when read in key order.
assert np.array_equal(sorted(poi_id_dict.keys()), sorted(poi_id_dict0.keys()))
poi_id_v = [poi_id_dict[key] for key in sorted(poi_id_dict.keys())]
poi_id_v0 = [poi_id_dict0[key] for key in sorted(poi_id_dict0.keys())]
assert np.array_equal(poi_id_v, poi_id_v0)

# index -> POI mapping: same keys, and the same values when read in key order.
assert np.array_equal(sorted(poi_id_rdict.keys()), sorted(poi_id_rdict0.keys()))
poi_id_rv = [poi_id_rdict[key] for key in sorted(poi_id_rdict.keys())]
poi_id_rv0 = [poi_id_rdict0[key] for key in sorted(poi_id_rdict0.keys())]
assert np.array_equal(poi_id_rv, poi_id_rv0)

# Floating point parameters are compared with a tolerance.
assert np.allclose(unary_params, unary_params0)
assert np.allclose(pw_params, pw_params0)

print('True')

Compute feature scaling parameters

In [None]:
# POI information computed by calc_poi_info for the selected trajectories.
poi_info = calc_poi_info(sorted(trajid_set), traj_all, poi_all)

# Selected trajectories with at least two POIs, in sorted-id order.
traj_list = [traj for traj in (traj_dict[k] for k in sorted(trajid_set)) if len(traj) >= 2]

# One node-feature matrix per trajectory, computed in parallel from the
# trajectory's first POI and its length.
node_features_list = Parallel(n_jobs=N_JOBS)(
    delayed(calc_node_features)(
        tr[0], len(tr), poi_list, poi_info.copy(),
        poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
    for tr in traj_list)
edge_features = calc_edge_features(list(trajid_set), poi_list, traj_dict, poi_info.copy())
fdim = node_features_list[0].shape

# Fit the max-abs scaler on all node features stacked row-wise.
X_node_all = np.vstack(node_features_list)
scaler = MaxAbsScaler(copy=False).fit(X_node_all)

# Turn off duration: edge feature 3 is overwritten with LOG_ZERO.
# The POI-level duration is left untouched here; to disable it as well:
#   poi_info['avgDuration'] = 0.0
edge_features[:, :, 3] = LOG_ZERO

print('Finished.')

In [None]:
# Node features of the first trajectory, computed serially (without Parallel);
# they are saved and compared with a stored copy in the cells below.
node_features0 = calc_node_features(traj_list[0][0], len(traj_list[0]), poi_list, poi_info.copy(), 
                                    poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)

In [None]:
# Save the feature-related intermediates for later comparison.
manymore = 'manymore.pkl'
if dump_variables:
    # Close (and thereby flush) the file before the comparison cell reads it back.
    with open(manymore, 'wb') as fout:
        pickle.dump([node_features0, traj_list[0],
                     POI_CLUSTERS, POI_CAT_LIST, POI_CLUSTER_LIST, poi_info, traj_list, node_features_list,
                     X_node_all, edge_features], fout)

Compare with the results of previous runs.

In [None]:
# Reload the stored feature-related intermediates and check them against memory.
with open(manymore, 'rb') as fin:
    [nf0, tr0, POI_CLUSTERS0, POI_CAT_LIST0, POI_CLUSTER_LIST0, poi_info0, traj_list0, node_features_list0,
     X_node_all0, edge_features0] = pickle.load(fin)

# np.array_equal also returns False on a length mismatch instead of relying on
# an element-wise `==` between differently shaped arrays.
assert np.array_equal(traj_list[0], tr0)
assert POI_CLUSTERS.equals(POI_CLUSTERS0)
assert pd.Series(POI_CAT_LIST).equals(pd.Series(POI_CAT_LIST0))
assert np.array_equal(POI_CLUSTER_LIST, POI_CLUSTER_LIST0)
assert poi_info.equals(poi_info0)
assert np.allclose(node_features0, nf0)

# Trajectories must match one by one.
assert len(traj_list) == len(traj_list0)
for traj_now, traj_old in zip(traj_list, traj_list0):
    assert np.array_equal(traj_now, traj_old)

# Node feature matrices are floating point: compare with a tolerance.
assert len(node_features_list) == len(node_features_list0)
for feats_now, feats_old in zip(node_features_list, node_features_list0):
    assert np.allclose(feats_now, feats_old)
assert np.allclose(X_node_all, X_node_all0)
assert np.allclose(edge_features, edge_features0)

print('True')

Generating trajectories

In [None]:
# Trajectory lengths to generate. NOTE: the notebook introduction lists
# lengths 3..7; lengths 6 and 7 are currently disabled here.
lengthes = [3, 4, 5]

# For every (start POI, length) query, run list-Viterbi inference with the
# loaded parameters and store the predicted trajectory as a list of POI ids.
fake_labels = []
for poi in sorted(poi_list):
    for L in lengthes:
        # Node features of the query, scaled with the scaler fitted above.
        X_node_test = scaler.transform(
            calc_node_features(poi, L, poi_list, poi_info.copy(), poi_clusters=POI_CLUSTERS,
                               cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST))
        # Inference receives its own copy of the edge features.
        y_pred = do_inference_listViterbi(poi_id_dict[poi], L, len(poi_set),
                                          unary_params, pw_params, X_node_test, edge_features.copy())
        # Map predicted state indices back to POI ids.
        fake_labels.append([poi_id_rdict[p] for p in y_pred])

print('Finished.')

In [None]:
# Save the generated trajectories for later comparison.
fakename = 'fake_labels.pkl'
if dump_variables:
    # Close (and thereby flush) the file before the comparison cell reads it back.
    with open(fakename, 'wb') as fout:
        pickle.dump(fake_labels, fout)

Compare with the results of previous runs.

In [None]:
# Reload the stored generated trajectories and check them against memory.
with open(fakename, 'rb') as fin:
    fake_labels0 = pickle.load(fin)

assert len(fake_labels) == len(fake_labels0)
for fl0, fl1 in zip(fake_labels0, fake_labels):
    # Same length and same POI at every position.
    assert len(fl0) == len(fl1)
    assert np.array_equal(fl0, fl1)

print('True')

Computing scaling on the generated data

In [None]:
# The generated trajectories are the training labels (shallow copy of the list).
train_labels = list(fake_labels)

# Node features for each training query (first POI, length), computed in parallel.
node_features_all = Parallel(n_jobs=N_JOBS)(
    delayed(calc_node_features)(
        tr[0], len(tr), poi_list, poi_info.copy(),
        poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
    for tr in train_labels)
fdim_train = node_features_all[0].shape

# Fit a new scaler on the stacked training features, scale them, and restore
# the per-example layout: one (fdim_train[0], fdim_train[1]) matrix per example.
scaler_train = MaxAbsScaler(copy=False)
X_node_train = scaler_train.fit_transform(np.vstack(node_features_all))
X_node_train = X_node_train.reshape(-1, fdim_train[0], fdim_train[1])
assert len(train_labels) == X_node_train.shape[0]

# Each training input is (node features, own copy of edge features, (start index, length)).
X_train = [(X_node_train[k], edge_features.copy(), (poi_id_dict[tr[0]], len(tr)))
           for k, tr in enumerate(train_labels)]
# Each training label is the trajectory expressed as an array of POI indices.
y_train = [np.array([poi_id_dict[p] for p in tr]) for tr in train_labels]
assert len(X_train) == len(y_train)

print('Finished.')

Training on generated data

In [None]:
# Regularisation constant used for this training run.
C = 0.001

# One-slack SSVM that uses list-Viterbi for inference.
sm = MyModel(inference_fun=do_inference_listViterbi)
osssvm = OneSlackSSVM(model=sm, C=C, n_jobs=N_JOBS, verbose=0)
try:
    osssvm.fit(X_train, y_train, initialize=True)
except Exception as err:
    # Catch only ordinary errors (a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit) and report what actually went wrong.
    sys.stderr.write('SSVM training FAILED.\n')
    sys.stderr.write('%s: %s\n' % (type(err).__name__, err))
else:
    print('SSVM training finished.')

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(osssvm.objective_curve_, label='dual')
plt.plot(osssvm.primal_objective_curve_, label='primal')
plt.legend()

Prediction on generated data

In [None]:
# Predict a trajectory for every generated example with the newly trained SSVM.
# predictions maps (start POI, length) -> {'PRED': predicted POIs, 'REAL': generated POIs}.
predictions = dict()
for real in fake_labels:
    ps_cv, L_cv = real[0], len(real)
    X_node_test = calc_node_features(ps_cv, L_cv, poi_list, poi_info.copy(), poi_clusters=POI_CLUSTERS,
                                     cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
    X_node_test = scaler_train.transform(X_node_test)  # scale with the scaler fitted on the training features
    # Pass a copy of the edge features, as the generation and training cells do,
    # so that the shared array is not handed to inference directly.
    X_test = [(X_node_test, edge_features.copy(), (poi_id_dict[ps_cv], L_cv))]
    y_hat = osssvm.predict(X_test)
    # Map predicted state indices back to POI ids.
    predictions[(ps_cv, L_cv)] = {'PRED': np.array([poi_id_rdict[p] for p in y_hat[0]]), 'REAL': real}

print('Finished.')

Compute evaluation metrics

In [None]:
# Score every prediction against its generated trajectory with three metrics.
F1_ssvm, pF1_ssvm, tau_ssvm = [], [], []
for key in sorted(predictions):
    F1 = calc_F1(predictions[key]['REAL'], predictions[key]['PRED'])
    pF1 = calc_pairsF1(predictions[key]['REAL'], predictions[key]['PRED'])
    tau = calc_kendalltau(predictions[key]['REAL'], predictions[key]['PRED'])
    F1_ssvm.append(F1)
    pF1_ssvm.append(pF1)
    tau_ssvm.append(tau)

# Report each metric as (mean, standard deviation / sqrt(number of examples)).
print('SSVM: F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % (
    np.mean(F1_ssvm), np.std(F1_ssvm) / np.sqrt(len(F1_ssvm)),
    np.mean(pF1_ssvm), np.std(pF1_ssvm) / np.sqrt(len(pF1_ssvm)),
    np.mean(tau_ssvm), np.std(tau_ssvm) / np.sqrt(len(tau_ssvm))))