# Experiment on Generated Data

The goal of this notebook is to design an experiment to check if multi-user and multi-label (which is what our dataset looks like) is a problem for SSVM.   
To achieve this goal,
1. a trained SSVM $\mathcal{M}_0$ (on Glasgow dataset $\mathcal{D}_0$) is used to generate a single user, single label dataset $\mathcal{D}_1$. Concretely, we predict a trajectory for every query $(p, l), p \in \mathcal{P}, l \in \{3,4,5,6,7\}$ using $\mathcal{M}_0$, where $\mathcal{P}$ is from $\mathcal{D}_0$.
1. train a new SSVM $\mathcal{M}_1$ using features (POI and transition features) computed from $\mathcal{D}_0$ and labels from $\mathcal{D}_1$, and check the performance on training set (i.e., $\mathcal{D}_1$).
1. perform leave-one-out cross validation on $\mathcal{D}_1$. Hyperparameter (i.e., $C$) is determined by trying some numbers when holding one label in $\mathcal{D}_1$ as test example and using all other labels in $\mathcal{D}_1$ as training set (POI and transition features are computed from $\mathcal{D}_0$), then fix the $C$ for all leave-one-out cross validations.
1. we noted that POI and transition features are computed from $\mathcal{D}_0$ and labels are from $\mathcal{D}_1$, as we can't compute the duration related features (i.e., avgDuration for POI, and log transition probability between discretized duration buckets) on $\mathcal{D}_1$ as no duration information is generated. So we try to disable duration related features one-by-one, and perform step $3$ to check whether duration related features help.
1. if duration related features don't help, we can turn them off and then compute POI and transition features from $\mathcal{D}_1$ and use labels in $\mathcal{D}_1$, then we want to compare the performance of RankSVM and SSVM on $\mathcal{D}_1$ (using leave-one-out cross validation), if SSVM performs better than RankSVM, it means multi-user and multi-label in our dataset is a problem for SSVM.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

import os, pickle, random
import pandas as pd
import numpy as np
import cvxopt

In [None]:
# Seed the Python, NumPy and CVXOPT random number generators for reproducibility.
random.seed(1234554321)
np.random.seed(123456789)
cvxopt.base.setseed(123456789)

Run notebook ```ssvm.ipynb```.

In [None]:
# Execute ssvm.ipynb in this kernel; names used below that this notebook does not define
# (e.g. data_dir, traj_dict, calc_node_features, MyModel) are presumably provided by it.
%run 'ssvm.ipynb'

## Step 1 - Generate new dataset

Load trained parameters and prediction results

In [None]:
# NOTE(review): this flag is not read anywhere in the visible part of this notebook;
# presumably it is consumed by code loaded from ssvm.ipynb -- confirm or remove.
dump_variables = True

In [None]:
# Path of the pickle file holding the trained SSVM (list Viterbi) results.
# data_dir is not defined in this notebook (presumably set by ssvm.ipynb).
fname = os.path.join(data_dir, 'ssvm-listViterbi-Glas.pkl')

In [None]:
# Load the stored results, a dict: query -> {'PRED': trajectory, 'C': ssvm-c, 'W': model_params}.
# The context manager closes the file handle as soon as loading is done.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open(fname, 'rb') as f:
    ssvm_lv = pickle.load(f)

In [None]:
# Use the model parameters stored for one particular query.
# The key is presumably (start POI, trajectory length) -- TODO confirm against ssvm.ipynb.
query = (1, 3)
W = ssvm_lv[query]['W']

In [None]:
# Trajectories that remain after removing the group associated with `query`
trajid_set = set(trajid_set_all) - TRAJ_GROUP_DICT[query]

# POIs that occur in at least one remaining trajectory with two or more POIs
poi_set = set()
for tid in trajid_set:
    if len(traj_dict[tid]) < 2:
        continue
    poi_set.update(traj_dict[tid])
poi_list = sorted(poi_set)
n_states = len(poi_set)

# Split the flat weight vector W into node (unary) and edge (pairwise) parameters:
# the first n_states * n_node_features entries are unary, the rest pairwise
n_edge_features = 5
n_pw_params = n_states * n_states * n_edge_features
n_node_features = (len(W) - n_pw_params) // n_states
n_unary_params = n_states * n_node_features
unary_params = W[:n_unary_params].reshape(n_states, n_node_features)
pw_params = W[n_unary_params:].reshape((n_states, n_states, n_edge_features))

# Two-way mapping between POI IDs and state indices (position in the sorted POI list)
poi_id_dict = {poi: idx for idx, poi in enumerate(poi_list)}
poi_id_rdict = dict(enumerate(poi_list))

print('Finished.')

Compute feature scaling parameters

In [None]:
# POI statistics computed from the remaining trajectories
poi_info = calc_poi_info(sorted(trajid_set), traj_all, poi_all)

# Node features for every trajectory with at least two POIs, one parallel job per trajectory
traj_list = [traj_dict[tid] for tid in sorted(trajid_set) if len(traj_dict[tid]) >= 2]
node_feature_jobs = (
    delayed(calc_node_features)(tr[0], len(tr), poi_list, poi_info.copy(), poi_clusters=POI_CLUSTERS,
                                cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
    for tr in traj_list
)
node_features_list = Parallel(n_jobs=N_JOBS)(node_feature_jobs)

# Transition (edge) features between POIs
edge_features = calc_edge_features(list(trajid_set), poi_list, traj_dict, poi_info.copy())

# Fit the scaling parameters on the stacked node features
fdim = node_features_list[0].shape
X_node_all = np.vstack(node_features_list)
scaler = MaxAbsScaler(copy=False)
scaler.fit(X_node_all)

# turn off duration related features
#poi_info['avgDuration'] = 0.0
#edge_features[:, :, 3] = LOG_ZERO

print('Finished.')

Generating trajectories

In [None]:
# One query per (start POI, length) pair; lengths 6 and 7 are currently switched off
lengthes = [3, 4, 5]  #, 6, 7]
fake_labels = []
for poi in sorted(poi_list):
    start_state = poi_id_dict[poi]
    for L in lengthes:
        # node features of this query, scaled with the parameters fitted above
        unary_features = scaler.transform(
            calc_node_features(poi, L, poi_list, poi_info.copy(), poi_clusters=POI_CLUSTERS,
                               cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST))
        pw_features = edge_features.copy()
        y_pred = do_inference_listViterbi(start_state, L, len(poi_set),
                                          unary_params, pw_params, unary_features, pw_features)
        # translate the predicted state indices back into POI IDs
        fake_labels.append([poi_id_rdict[p] for p in y_pred])

print('Finished.')

## Step 2 - Train SSVM on generated dataset

Computing scaling parameters and training features/labels

In [None]:
def calc_train_data(train_labels, poi_list, poi_info, edge_features, poi_id_dict):
    """Build the SSVM training inputs and targets for a list of trajectories.

    Node features are computed per trajectory from its first POI and its length,
    stacked, scaled with a newly fitted MaxAbsScaler and reshaped back to one
    feature matrix per trajectory.

    Returns
    -------
    X_train : list of (node feature matrix, copy of edge_features, (start state index, length))
    y_train : list of arrays holding the state index of every POI in a trajectory
    scaler_train : the MaxAbsScaler fitted on the training node features
    """
    feature_jobs = (
        delayed(calc_node_features)(tr[0], len(tr), poi_list, poi_info, poi_clusters=POI_CLUSTERS,
                                    cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
        for tr in train_labels
    )
    per_traj_features = Parallel(n_jobs=N_JOBS)(feature_jobs)
    n_rows, n_cols = per_traj_features[0].shape

    # scale all trajectories together, then restore the per-trajectory layout
    scaler_train = MaxAbsScaler(copy=False)
    scaled = scaler_train.fit_transform(np.vstack(per_traj_features))
    X_node_train = scaled.reshape(-1, n_rows, n_cols)
    assert len(train_labels) == X_node_train.shape[0]

    X_train, y_train = [], []
    for k, tr in enumerate(train_labels):
        start_and_length = (poi_id_dict[tr[0]], len(tr))
        X_train.append((X_node_train[k, :, :], edge_features.copy(), start_and_length))
        y_train.append(np.array([poi_id_dict[p] for p in tr]))
    assert len(X_train) == len(y_train)

    return X_train, y_train, scaler_train

Training on generated data

In [None]:
def train_ssvm(X_train, y_train, C):
    """Train a one-slack structured SVM that uses list Viterbi inference.

    Parameters
    ----------
    X_train, y_train : training inputs and targets, e.g. as returned by calc_train_data
    C : regularisation parameter handed to OneSlackSSVM

    Returns
    -------
    The OneSlackSSVM instance. Training is best effort: if fitting raises, the
    error is reported on stderr and the learner is still returned, so callers
    should treat its predictions with care after a reported failure.
    """
    import sys  # local import: `sys` is not among this notebook's own imports

    sm = MyModel(inference_fun=do_inference_listViterbi)
    osssvm = OneSlackSSVM(model=sm, C=C, n_jobs=N_JOBS, verbose=0)
    try:
        osssvm.fit(X_train, y_train, initialize=True)
        print('SSVM training finished.')
    except Exception as err:
        # Catch Exception rather than everything, so that KeyboardInterrupt and
        # SystemExit still stop a long training run; also report the cause.
        sys.stderr.write('SSVM training FAILED.\n')
        sys.stderr.write('%r\n' % (err,))
    return osssvm

Plot the primal dual objective value curve

In [None]:
def plot_obj_curve(ssvm):
    """Plot the dual and primal objective curves recorded by a trained SSVM."""
    curve_attrs = (('objective_curve_', 'dual'), ('primal_objective_curve_', 'primal'))
    for attr, curve_label in curve_attrs:
        plt.plot(getattr(ssvm, attr), label=curve_label)
    plt.legend()

Make prediction

In [None]:
def predict(ssvm, ps, L, poi_list, poi_info, edge_features, scaler_train, poi_id_dict, poi_id_rdict):
    """Predict a trajectory of length L that starts at POI ps.

    The node features of the query are scaled with `scaler_train`; the predicted
    state indices are mapped back to POI IDs and returned as a numpy array.
    """
    node_feats = scaler_train.transform(
        calc_node_features(ps, L, poi_list, poi_info, poi_clusters=POI_CLUSTERS,
                           cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST))
    start_and_length = (poi_id_dict[ps], L)
    y_hat = ssvm.predict([(node_feats, edge_features, start_and_length)])
    # a single example was submitted, so only the first prediction is relevant
    return np.array([poi_id_rdict[state] for state in y_hat[0]])

Compute evaluation metrics

In [None]:
def evaluation(predictions):
    """Print and return the mean F1, pairs-F1 and Kendall's tau of a set of predictions.

    `predictions` maps a key to a dict with the entries 'REAL' and 'PRED'.
    The printed line shows (mean, standard error) for each metric; the three
    means are returned as a tuple.
    """
    metric_names = ('F1', 'pF1', 'tau')
    scores = {name: [] for name in metric_names}
    for key in sorted(predictions.keys()):
        real, pred = predictions[key]['REAL'], predictions[key]['PRED']
        scores['F1'].append(calc_F1(real, pred))
        scores['pF1'].append(calc_pairsF1(real, pred))
        scores['tau'].append(calc_kendalltau(real, pred))

    means = [np.mean(scores[name]) for name in metric_names]
    stderrs = [np.std(scores[name]) / np.sqrt(len(scores[name])) for name in metric_names]
    print('SSVM: F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' %
          (means[0], stderrs[0], means[1], stderrs[1], means[2], stderrs[2]))
    return means[0], means[1], means[2]

Train on generated dataset

In [None]:
# SSVM regularisation parameter used for the training run on the full generated dataset
C = 0.01

In [None]:
# Fit an SSVM on all generated trajectories
train_labels = list(fake_labels)
X_train, y_train, scaler_train = calc_train_data(
    train_labels, poi_list, poi_info.copy(), edge_features.copy(), poi_id_dict.copy())
ssvm = train_ssvm(X_train, y_train, C)

In [None]:
# Show the dual and primal objective curves of the model trained above
plot_obj_curve(ssvm)

Evaluate on training set

In [None]:
# Predict every training query again; results are keyed by (start POI, length)
predictions = {}
for label in train_labels:
    start, length = label[0], len(label)
    y_pred = predict(ssvm, start, length, poi_list, poi_info.copy(), edge_features.copy(),
                     scaler_train, poi_id_dict, poi_id_rdict)
    predictions[(start, length)] = {'PRED': y_pred, 'REAL': label}

In [None]:
# Print (mean, standard error) of F1, pairs-F1 and Kendall's tau over the predictions collected above
evaluation(predictions)

## Step 3 - Leave-one-out evaluation on generated dataset

### Choose hyper-parameter C

Choose hyper-parameter C using Monte-Carlo cross validation

In [None]:
# Number of generated trajectories (20%) held out as test set in each Monte-Carlo split
num_test = int(len(fake_labels) * 0.2)

In [None]:
# Monte-Carlo cross validation: for every candidate C, train on a random 80% split and
# test on the remaining 20%, MC_NITER times; keep the C with the highest mean Kendall's tau.
# Kendall's tau can be negative, so start below every attainable value: starting at 0 would
# leave best_C unset whenever no candidate reaches a positive mean tau.
best_tau = -np.inf; best_C = None
for C in C_SET:
    F1_test = []; pF1_test = []; tau_test = []
    for t in range(MC_NITER):
        # random train/test split of the generated trajectories
        indices = np.arange(len(fake_labels))
        np.random.shuffle(indices)
        test_ix = indices[:num_test]
        train_ix = indices[num_test:]
        train_labels = [fake_labels[ix] for ix in train_ix]
        test_labels  = [fake_labels[ix] for ix in test_ix]
        X_train, y_train, scaler_train = calc_train_data(train_labels, poi_list, poi_info.copy(), 
                                                         edge_features.copy(), poi_id_dict.copy())
        ssvm = train_ssvm(X_train, y_train, C)
        # evaluate on the held-out trajectories only
        predictions = dict()
        for label in test_labels:
            y_pred = predict(ssvm, label[0], len(label), poi_list, poi_info.copy(), edge_features.copy(), 
                             scaler_train, poi_id_dict, poi_id_rdict)
            predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
        F1, pF1, tau = evaluation(predictions)
        F1_test.append(F1); pF1_test.append(pF1); tau_test.append(tau)
    tau_mean = np.mean(tau_test)
    if tau_mean > best_tau:
        best_tau = tau_mean
        best_C = C
        print('best_tau: %.3f, best_C: %.3f' % (best_tau, best_C))

### Leave-one-out cross validation

In [None]:
# Start from an empty result dict; the leave-one-out loop in the next cell fills it
predictions = dict()

In [None]:
# Leave-one-out cross validation: train on all generated trajectories but one and
# predict the held-out one. Predicting the training trajectories instead (as was done
# before) reports training performance rather than leave-one-out performance.
# Train with the C selected by the hyper-parameter search; fall back to the current C
# if the search did not select one.
loo_C = best_C if best_C else C
for i in range(len(fake_labels)):
    test_label = fake_labels[i]
    train_labels = fake_labels[:i] + fake_labels[i+1:]
    X_train, y_train, scaler_train = calc_train_data(train_labels, poi_list, poi_info.copy(), 
                                                     edge_features.copy(), poi_id_dict.copy())
    ssvm = train_ssvm(X_train, y_train, loo_C)
    y_pred = predict(ssvm, test_label[0], len(test_label), poi_list, poi_info.copy(), edge_features.copy(), 
                     scaler_train, poi_id_dict, poi_id_rdict)
    predictions[(test_label[0], len(test_label))] = {'PRED': y_pred, 'REAL': test_label}

In [None]:
# Print (mean, standard error) of F1, pairs-F1 and Kendall's tau over the predictions collected by the loop above
evaluation(predictions)

## Step 4 - Check the informativeness of duration related features

Turn off duration related features one-by-one, and perform [step 3](#Step-3---Leave-one-out-evaluation-on-generated-dataset) to check whether duration related features help.

Concretely, disable duration related POI and transition features in [step 1](#Step-1---Generate-new-dataset) one-by-one, and run step 1 to step 3.

## Step 5 - Compute POI and transition features on the generated data

### Compute features

In [None]:
# Work on copies so that the POI and transition features computed earlier stay untouched
poi_info_new = poi_info.copy()
edge_features_new = edge_features.copy()

# turn off duration related features
# NOTE(review): assumes poi_info has an 'avgDuration' column and that index 3 on the last
# axis of edge_features is the duration transition feature -- TODO confirm against ssvm.ipynb.
poi_info_new['avgDuration'] = 0.0
edge_features_new[:, :, 3] = LOG_ZERO

# set popularity/nvisit and popularity/nvisit transition related features
# TODO: not implemented in this cell -- all other features are still the copied ones.

### Choose hyper-parameter C

In [None]:
# Monte-Carlo cross validation with the duration related features switched off: for every
# candidate C, train on a random 80% split and test on the remaining 20%, MC_NITER times;
# keep the C with the highest mean Kendall's tau.
# Kendall's tau can be negative, so start below every attainable value: starting at 0 would
# leave best_C unset whenever no candidate reaches a positive mean tau.
best_tau = -np.inf; best_C = None
for C in C_SET:
    F1_test = []; pF1_test = []; tau_test = []
    for t in range(MC_NITER):
        # random train/test split of the generated trajectories
        indices = np.arange(len(fake_labels))
        np.random.shuffle(indices)
        test_ix = indices[:num_test]
        train_ix = indices[num_test:]
        train_labels = [fake_labels[ix] for ix in train_ix]
        test_labels  = [fake_labels[ix] for ix in test_ix]
        X_train, y_train, scaler_train = calc_train_data(train_labels, poi_list, poi_info_new.copy(), 
                                                         edge_features_new.copy(), poi_id_dict.copy())
        ssvm = train_ssvm(X_train, y_train, C)
        # evaluate on the held-out trajectories only
        predictions = dict()
        for label in test_labels:
            y_pred = predict(ssvm, label[0], len(label), poi_list, poi_info_new.copy(), edge_features_new.copy(), 
                             scaler_train, poi_id_dict, poi_id_rdict)
            predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
        F1, pF1, tau = evaluation(predictions)
        F1_test.append(F1); pF1_test.append(pF1); tau_test.append(tau)
    tau_mean = np.mean(tau_test)
    if tau_mean > best_tau:
        best_tau = tau_mean
        best_C = C
        print('best_tau: %.3f, best_C: %.3f' % (best_tau, best_C))

### Leave-one-out cross validation

In [None]:
# Start from an empty result dict; the leave-one-out loop in the next cell fills it
predictions = dict()

In [None]:
# Leave-one-out cross validation with the duration related features switched off: train on
# all generated trajectories but one and predict the held-out one. Predicting the training
# trajectories instead (as was done before) reports training performance rather than
# leave-one-out performance.
# Train with the C selected by the hyper-parameter search; fall back to the current C
# if the search did not select one.
loo_C = best_C if best_C else C
for i in range(len(fake_labels)):
    test_label = fake_labels[i]
    train_labels = fake_labels[:i] + fake_labels[i+1:]
    X_train, y_train, scaler_train = calc_train_data(train_labels, poi_list, poi_info_new.copy(), 
                                                     edge_features_new.copy(), poi_id_dict.copy())
    ssvm = train_ssvm(X_train, y_train, loo_C)
    y_pred = predict(ssvm, test_label[0], len(test_label), poi_list, poi_info_new.copy(), edge_features_new.copy(), 
                     scaler_train, poi_id_dict, poi_id_rdict)
    predictions[(test_label[0], len(test_label))] = {'PRED': y_pred, 'REAL': test_label}

In [None]:
# Print (mean, standard error) of F1, pairs-F1 and Kendall's tau over the predictions collected by the loop above
evaluation(predictions)