# Dataset Preprocess

<a id='toc'></a>

[Table of Contents](#toc)
1. [Load Data](#sec1)
1. [Compute POI Information](#sec2)
1. [Compute Trajectory Statistics](#sec3)
1. [Filtering out Short Trajectories](#sec4)
1. [Filtering out Users with Few Trajectories](#sec5)

In [None]:
% matplotlib inline

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Directory holding the input CSV files (user visits and photo coordinates).
dir_ijcai = 'data/data-ijcai15'
# Directory used for the CSV file paths of this notebook's outputs (POIs and trajectories).
dir_recsys = 'data/data-recsys16'

In [None]:
# Dataset selection: keep exactly one fvisit/fcoord pair uncommented.
# fvisit is the user-visits CSV, fcoord the matching photo-coordinates CSV;
# both files of a pair share the same dataset tag after the '-'.
#fvisit = os.path.join(dir_ijcai, 'userVisits-Osak.csv')
#fcoord = os.path.join(dir_ijcai, 'photoCoords-Osak.csv')
#fvisit = os.path.join(dir_ijcai, 'userVisits-Glas.csv')
#fcoord = os.path.join(dir_ijcai, 'photoCoords-Glas.csv')
#fvisit = os.path.join(dir_ijcai, 'userVisits-Edin.csv')
#fcoord = os.path.join(dir_ijcai, 'photoCoords-Edin.csv')
fvisit = os.path.join(dir_ijcai, 'userVisits-Toro.csv')
fcoord = os.path.join(dir_ijcai, 'photoCoords-Toro.csv')

In [None]:
# Dataset tag: the text between the last '-' in fvisit and the first '.' after it
# (e.g. 'Toro' for '.../userVisits-Toro.csv'). It is reused to name the output files.
suffix = fvisit.rpartition('-')[2].partition('.')[0]

In [None]:
# Output CSV paths, all under dir_recsys and tagged with the dataset suffix:
# POI table, all trajectories, trajectories without short ones, and
# trajectories of users with enough trajectories.
fpoi, ftraj_all, ftraj_noshort, ftraj_nofew = (
    os.path.join(dir_recsys, prefix + suffix + '.csv')
    for prefix in ('poi-', 'traj-all-', 'traj-noshort-', 'traj-nofew-')
)

<a id='sec1'></a>

## 1. Load Data

Load user visit data and photo coordinates.

In [None]:
# Read the two semicolon-separated tables: user visits first, then photo coordinates.
visits, coords = (pd.read_csv(path, sep=';') for path in (fvisit, fcoord))
# Both tables are expected to have the same number of rows.
assert len(visits) == len(coords)
# Attach the coordinate columns to the visits by joining on 'photoID'.
visits = visits.merge(coords, on='photoID')
visits.head()

In [None]:
# Number of distinct photos, users, POIs and trajectories in the merged visits table
# (dropna=False so that a missing value, if any, is counted like any other value).
num_photo, num_user, num_poi, num_traj = (
    visits[col].nunique(dropna=False) for col in ('photoID', 'userID', 'poiID', 'seqID')
)
# One-row summary table, labelled with the dataset suffix.
summary = {
    '#photo': num_photo,
    '#user': num_user,
    '#poi': num_poi,
    '#traj': num_traj,
    '#photo/user': num_photo / num_user,
    '#traj/user': num_traj / num_user,
}
pd.DataFrame(summary, index=[str(suffix)])

<a id='sec2'></a>

## 2. Compute POI Information

Compute POI (Longitude, Latitude) as the average coordinates of the assigned photos.

In [None]:
# Mean photo longitude/latitude per POI, with 'poiID' back as a regular column
# and the coordinate columns renamed to poiLon/poiLat.
poi_coords = (
    visits[['poiID', 'photoLon', 'photoLat']]
    .groupby('poiID')
    .mean()
    .reset_index()
    .rename(columns={'photoLon': 'poiLon', 'photoLat': 'poiLat'})
)

Extract POI category.

In [None]:
# Theme of each POI, taken from the first row of that POI in the visits table.
poi_cat = (
    visits[['poiID', 'poiTheme']]
    .groupby('poiID')
    .first()
    .reset_index()
)

In [None]:
# Combine category and coordinates into one table indexed by 'poiID';
# the theme column is exposed under the name 'poiCat'.
poi_all = (
    pd.merge(poi_cat, poi_coords, on='poiID')
    .set_index('poiID')
    .rename(columns={'poiTheme': 'poiCat'})
)
poi_all

Scatter plot of POI coordinates.

In [None]:
# Size the figure so that its width/height roughly follows the lon/lat extent of the POIs.
height = 3
lon_span = poi_all['poiLon'].max() - poi_all['poiLon'].min()
lat_span = poi_all['poiLat'].max() - poi_all['poiLat'].min()
# A zero (or undefined) latitude span would make the ratio inf/nan; fall back to a square figure.
ratio = lon_span / lat_span if lat_span > 0 else 1.0
# np.round() maps ratios <= 0.5 to 0, which would request a zero-width figure;
# clamp the multiplier to at least 1 so tall, narrow regions still get a valid canvas.
width = height * max(1.0, np.round(ratio))
plt.figure(figsize=[width, height])
plt.scatter(poi_all['poiLon'], poi_all['poiLat'])

Save POI information to CSV file.

In [None]:
#poi_all.to_csv(fpoi, index=True)

<a id='sec3'></a>

## 3. Compute Trajectory Statistics

Compute trajectory information, including simple statistics such as the trajectory length (#POIs) and the start time and end time of each POI visit.

In [None]:
def calc_traj_df(seqid, visits):
    """Compute the POI visits of one trajectory, keeping sub-tours as separate visits.

    The photos of trajectory `seqid` are ordered by 'dateTaken' and collapsed into
    runs of consecutive photos taken at the same POI. A POI that is visited again
    after another POI (a sub-tour) therefore yields a new row instead of being
    merged with its earlier visit.

    Parameters
    ----------
    seqid : value compared against the 'seqID' column to select the trajectory.
    visits : DataFrame with at least the columns 'seqID', 'userID', 'poiID'
        and 'dateTaken'.

    Returns
    -------
    DataFrame with one row per POI visit, in visiting order, and the columns
    'poiID', 'startTime', 'endTime', '#photo', 'userID', 'trajID', 'trajLen'.
    'startTime'/'endTime' are the 'dateTaken' values of the first/last photo of
    the visit, '#photo' is the number of photos in it, and 'trajLen' is the
    number of visits (rows) in the trajectory.

    Raises
    ------
    AssertionError if no row of `visits` belongs to `seqid`.
    """
    # A stable sort keeps photos that share a timestamp in their input order,
    # so the result is deterministic. The boolean selection already yields a
    # new frame, so `visits` itself is never modified.
    traj_df = visits[visits['seqID'] == seqid].sort_values(
        by='dateTaken', ascending=True, kind='mergesort')
    assert traj_df.shape[0] > 0, 'no visits found for seqID %s' % (seqid,)

    # Collect the runs in a plain list and build the DataFrame once; growing a
    # DataFrame row by row reallocates it on every insertion.
    runs = []  # each entry: [poiID, startTime, endTime, #photo]
    for poi, taken in zip(traj_df['poiID'].tolist(), traj_df['dateTaken'].tolist()):
        if runs and runs[-1][0] == poi:
            # Same POI as the previous photo: extend the current visit.
            runs[-1][2] = taken
            runs[-1][3] += 1
        else:
            # A different POI starts a new visit (also when it was seen earlier).
            runs.append([poi, taken, taken, 1])

    df_ = pd.DataFrame(runs, columns=['poiID', 'startTime', 'endTime', '#photo'])
    # User and trajectory id are read from the first photo and repeated on every row.
    df_['userID'] = traj_df['userID'].iloc[0]
    df_['trajID'] = traj_df['seqID'].iloc[0]
    df_['trajLen'] = df_.shape[0]
    return df_

In [None]:
# Build the table of all trajectories: one calc_traj_df() result per trajectory id,
# processed in ascending seqID order and stacked into a single frame.
traj_cols = ['userID', 'trajID', 'poiID', 'startTime', 'endTime', '#photo', 'trajLen']
traj_frames = [calc_traj_df(seqid, visits) for seqid in sorted(visits['seqID'].unique().tolist())]
if traj_frames:
    # DataFrame.append() was removed in pandas 2.0 and copied the whole frame on
    # every call; a single concat does the same work once.
    traj_all = pd.concat(traj_frames, ignore_index=True)[traj_cols]
else:
    # pd.concat() rejects an empty list, so keep an empty frame with the expected columns.
    traj_all = pd.DataFrame(columns=traj_cols)
traj_all.head()

In [None]:
# Inspect the column dtypes before the integer cast in the next cell.
traj_all.dtypes

In [None]:
# Columns that hold integer values and are cast to an integer dtype.
int_cols = ['trajID', 'poiID', 'trajLen', 'startTime', 'endTime', '#photo']
# np.int (a plain alias of the builtin int) was removed in NumPy 1.24; use an explicit
# 64-bit integer dtype instead. The result is assigned back, so no copy flag is needed.
traj_all[int_cols] = traj_all[int_cols].astype(np.int64)

Sanity check.

In [None]:
# Each line prints True when the condition holds for every row of traj_all.
# Every trajectory has at least one POI visit.
print((traj_all['trajLen'] >= 1).all())
# Every POI visit has at least one photo.
print((traj_all['#photo'] >= 1).all())
# No visit ends before it starts.
print((traj_all['startTime'] <= traj_all['endTime']).all())

In [None]:
# Duration of each POI visit: end time minus start time of that visit.
traj_all['poiDuration'] = traj_all['endTime'].sub(traj_all['startTime'])
print(traj_all.shape)
traj_all.head()

In [None]:
# Inspect the column dtypes again, now including the 'poiDuration' column.
traj_all.dtypes

In [None]:
# ERROR: this method doesn't consider sub-tours in trajectory
#traj_all = traj[['userID', 'seqID', 'poiID', 'dateTaken']].copy().groupby(['userID', 'seqID', 'poiID'])\
#           .agg([np.min, np.max, np.size])  
#traj_all.columns = traj_all.columns.droplevel()
#traj_all.reset_index(inplace=True)
#traj_all.rename(columns={'amin':'startTime', 'amax':'endTime', 'size':'#photo', 'seqID':'trajID'}, inplace=True)

Save trajectories and the associated stats to CSV files.

In [None]:
#traj_all.to_csv(ftraj_all, index=False)

<a id='sec4'></a>

## 4. Filtering out Short Trajectories

Filtering out short trajectories, i.e., trajectories with only 1 or 2 POIs.

In [None]:
# Keep only the rows of trajectories that contain at least 3 POI visits.
is_long_enough = traj_all['trajLen'].ge(3)
traj_noshort = traj_all.loc[is_long_enough].copy()
print(traj_noshort.shape)
traj_noshort.head()

In [None]:
# Total number of photos covered by the trajectories that remain after the filter.
traj_noshort['#photo'].sum()

Save trajectories and the associated stats without short trajectories to CSV files.

In [None]:
#traj_noshort.to_csv(ftraj_noshort, index=False)

<a id='sec5'></a>

## 5. Filtering out Users with Few Trajectories

Filter out users (and their trajectories) who have only a few trajectories, e.g. fewer than $5$ trajectories.

In [None]:
# Minimum number of distinct trajectories a user must have to be kept.
MIN_N = 5

In [None]:
# Users with at least MIN_N trajectories; populated by a later cell.
user_list = []

In [None]:
# Number of distinct trajectories per user, computed in one grouped pass instead of
# re-scanning the whole table once per user.
ntraj_per_user = traj_all.groupby('userID')['trajID'].nunique()
# Users with at least MIN_N trajectories, in ascending order. The list is rebound
# rather than appended to, so re-running this cell cannot accumulate duplicates.
user_list = sorted(ntraj_per_user[ntraj_per_user >= MIN_N].index.tolist())

In [None]:
# Keep only the rows whose user is in user_list.
keep_mask = traj_all['userID'].isin(user_list)
traj_nofew = traj_all.loc[keep_mask].copy()
print(traj_nofew.shape)
traj_nofew.head()

Save trajectories.

In [None]:
# Write the filtered trajectories to ftraj_nofew, without the row index.
# NOTE(review): this is the only save in the notebook that is not commented out — confirm that is intended.
traj_nofew.to_csv(ftraj_nofew, index=False)

Sanity check.

In [None]:
# Verify that every remaining user still has at least MIN_N distinct trajectories.
user_ids = traj_nofew['userID'].unique().tolist()
user_ids.sort()
for uid in user_ids:
    user_traj_ids = traj_nofew.loc[traj_nofew['userID'] == uid, 'trajID']
    assert len(user_traj_ids.unique()) >= MIN_N
print('Checking finished.')