# Demo for Semantic Trajectory Analysis Algorithm

#### List the files in the "data"directory to make sure that all the required files exist

In [1]:
ls -al ../data

total 714076
drwxrwxr-x 2 hmm hmm      4096 Jan 23  2016 [0m[01;34m.[0m/
drwxrwxr-x 5 hmm hmm      4096 Jan 23  2016 [01;34m..[0m/
-rw-rw-r-- 1 hmm hmm 107054346 Dec 30  2015 mit_trj_parkinglot_all.csv
-rw-rw-r-- 1 hmm hmm    380886 Jan  2  2016 mit_trj_parkinglot_all_hilbert100.csv
-rw-rw-r-- 1 hmm hmm 135459113 Dec 31  2015 mit_trj_parkinglot_all_hilbert.csv
-rw-rw-r-- 1 hmm hmm 226392785 Dec 30  2015 mit_trj_parkinglot_all.json
-rw-rw-r-- 1 hmm hmm    252362 Aug  9  2015 [01;35mparkinglot.png[0m
-rw-rw-r-- 1 hmm hmm  22128957 Feb 17  2016 trip_as_vec.model
-rw-rw-r-- 1 hmm hmm  48543680 Feb 17  2016 trip_as_vec.model.docvecs.doctag_syn0.npy
-rw-rw-r-- 1 hmm hmm  95480480 Feb 17  2016 trip_as_vec.model.syn0.npy
-rw-rw-r-- 1 hmm hmm  95480480 Feb 17  2016 trip_as_vec.model.syn1.npy


#### Import "pandas" library, which is for I/O

In [2]:
import pandas as pd

#### Using pandas to read structured raw data with hilbert indexing

In [3]:
df = pd.read_csv('../data/mit_trj_parkinglot_all_hilbert.csv')

#### Group data by doc_id (i.e. trajectory ID)

In [4]:
dfgs = df.groupby('doc_id')

#### First 5 rows will be like: 

In [5]:
df.head()

Unnamed: 0,t,x,y,doc_id,hilbert_idx
0,11,315,146,md_0,124105
1,21,327,143,md_0,121578
2,26,333,145,md_0,121682
3,27,334,145,md_0,121687
4,31,339,146,md_0,120993


#### All column names:  

In [6]:
df.columns

Index([u't', u'x', u'y', u'doc_id', u'hilbert_idx'], dtype='object')

#### Trip point index as a document:

In [7]:
list(dfgs.groups.keys())[:2]

['md_27345', 'md_34904']

In [8]:
tmp = df.ix[dfgs.groups['md_27345']]['hilbert_idx']

#### Index of a trip: 

In [9]:
tmp.values

array([121259, 121255, 121250, 121254, 121250, 121255, 121251, 121247,
       121243, 121244, 121226, 121225, 121229, 121228, 121229, 121218,
       121216, 121301, 121215, 121130, 121214, 121215, 121129, 121128,
       121105, 121073, 121072, 120591, 120595, 120594, 120595, 120599,
       120598, 120809, 120808, 120807, 120802, 120801, 120800, 120821,
       120822])

#### Import "gensim" library for document vector

In [10]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

#### Generate labelled sentences (1. labelled 2. only contain clean words)

In [11]:
sentences = []
for k in dfgs.groups.keys():
    tmp = df.ix[dfgs.groups[k]]['hilbert_idx']
    sentences.append(LabeledSentence(words=[str(i) for i in tmp], tags=[k]))

#### First sentence: 

In [12]:
sentences[1]

TaggedDocument(words=['105847', '105875', '105893', '102824', '102813', '102771', '102722', '101426', '101501', '101506', '101557', '101873', '101893', '101951', '101954', '101983', '98713', '98700', '98687', '98599', '98590', '98554', '98548', '98540', '98282', '98292', '98296', '76471', '76429', '76413', '76339', '76348', '76232', '76252', '76057', '76049', '76085', '76032', '75527', '75574', '75572', '75567', '75645', '75635', '75631', '75666', '75673', '74855', '74852', '74846', '74833', '74828', '74817', '74729', '74722', '74723', '74741', '74697', '72782', '74690', '72785', '72798', '72796', '72805', '74667', '74664', '74669', '74670', '74657', '74679', '74678', '74681', '74682', '74703', '74698', '74700', '74705', '74718', '74719', '74721', '74734', '74733', '74732', '74811', '74808', '74812', '74813', '74801', '74800', '74799', '74800', '74799', '74875', '74873', '74872', '74877', '74882', '74878', '74864', '74895', '74892', '74891', '74910', '74911', '74915', '74916', '74919',

#### Initialization of the model where alpha is set to a value and sentences are input

In [13]:
model = Doc2Vec(alpha=0.01)

In [14]:
model.build_vocab(sentences)

#### Train  model: 

In [15]:
for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

#### After training, you can choose to save the model for future use

In [16]:
model.save('../data/trip_as_vec.model')

#### Alternatively, you can skip the processes of initilization/training/saving model, but load a existing model instead

In [22]:
model = Doc2Vec.load('../data/trip_as_vec.model')

#### Size of a document vector: 

In [23]:
model.docvecs[0].shape

(300,)

#### Import "matplotlib" for plotting diagram: 

In [24]:
%matplotlib inline
from matplotlib import pyplot as plt

#### Tag for second sentence: 

In [25]:
t = model.docvecs.index2doctag[1]
print(t)

AttributeError: 'DocvecsArray' object has no attribute 'index2doctag'

#### Document vector with tag above:

In [26]:
model.docvecs.indexed_doctags(t)

NameError: name 't' is not defined

#### Print all the information within one trajectory

In [27]:
tag = 'md_27345'
print('Original description {:s}'.format(tag))
idx = int(tag.split('_')[-1])
tmp = df.ix[dfgs.groups['md_27345']]
print(tmp)

Original description md_27345
               t    x    y    doc_id  hilbert_idx
2950712  2800701  336  129  md_27345       121259
2950713  2800702  338  129  md_27345       121255
2950714  2800703  338  130  md_27345       121250
2950715  2800704  338  128  md_27345       121254
2950716  2800705  338  130  md_27345       121250
2950717  2800706  338  129  md_27345       121255
2950718  2800708  339  130  md_27345       121251
2950719  2800709  340  131  md_27345       121247
2950720  2800710  340  129  md_27345       121243
2950721  2800711  340  130  md_27345       121244
2950722  2800712  340  132  md_27345       121226
2950725  2800713  340  133  md_27345       121225
2950728  2800714  342  133  md_27345       121229
2950730  2800715  342  132  md_27345       121228
2950734  2800717  342  133  md_27345       121229
2950738  2800719  342  134  md_27345       121218
2950740  2800720  343  135  md_27345       121216
2950743  2800721  343  136  md_27345       121301
2950746  2800722  34

#### Define a method to plot a single trip

In [28]:
plt.figure(figsize=(10, 8))
def plot_trip(doc_id, lw=None, c=None, s=1):
    s = str("{0:.2f}".format(s * 100)) + "%"
    idx = dfgs.groups[doc_id]
    tmp = df.ix[idx]
    plt.plot(tmp['x'], 360 - tmp['y'], color=c, linewidth=lw, label=s)
    

<matplotlib.figure.Figure at 0x7ff13e7adb50>

#### Read in the background image

In [29]:
img = plt.imread('../data/parkinglot.png')
img_gry = img[:, :, 0]

#### Check the similarity of three trajectories base on one query trajectory

In [30]:
tag1 =  model.docvecs.index2doctag[100]
tag2 =  model.docvecs.index2doctag[35736]
tag3 =  model.docvecs.index2doctag[10000]
tag4 =  model.docvecs.index2doctag[1000]
plt.figure(figsize=(10, 8))
plt.imshow(img_gry[::-1, :], extent=(0, 480, 0, 360), cmap='gray', origin="lower")
plot_trip(tag1, lw=4, c='c')
plot_trip(tag2, lw=1, s=model.docvecs.similarity(tag1, tag2))
plot_trip(tag3, lw=1, s=model.docvecs.similarity(tag1, tag3))
plot_trip(tag4, lw=1, s=model.docvecs.similarity(tag1, tag4))
plt.legend()

AttributeError: 'DocvecsArray' object has no attribute 'index2doctag'

#### Find the most similar trips based on a query trip

In [31]:
tag =  model.docvecs.index2doctag[100]
tag = "md_33358"
plt.figure(figsize=(10, 8))
plt.imshow(img_gry[::-1, :], extent=(0, 480, 0, 360), cmap='gray', origin="lower")
plot_trip(tag, lw=4, c='c')
print('='*100)
count = 0
for t, s in model.docvecs.most_similar(tag):
    if count < 8:
        plot_trip(t, lw=1, s=s)
        plt.legend()
    count = count + 1

AttributeError: 'DocvecsArray' object has no attribute 'index2doctag'

## THE DEMO FINISHES HERE!