## [Liu and Lee (2017)](https://sci-hub.se/10.1109/iske.2017.8258799)
### What is done:
* for each trajectory, first node is set as (lat,lon,time) = (0,0,0) and is calculated que relative distance an time spent for each consecutive node in relation to the previous one
* data split 80% training and 20% test (no shuffling)

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

### Passo 1

In [2]:
import numpy as np
import pandas as pd
import glob
import os.path
import datetime
import os

def read_plt(plt_file):
    points = pd.read_csv(plt_file, skiprows=6, header=None,
                         parse_dates=[[5, 6]], infer_datetime_format=True)

    # for clarity rename columns
    points.rename(inplace=True, columns={'5_6': 'time', 0: 'lat', 1: 'lon', 3: 'alt'})

    # remove unused columns
    points.drop(inplace=True, columns=[2, 4])

    return points

mode_names = ['walk', 'bike', 'bus', 'car', 'subway','train', 'airplane', 'boat', 'run', 'motorcycle', 'taxi']
mode_ids = {s : i + 1 for i, s in enumerate(mode_names)}

def read_labels(labels_file):
    labels = pd.read_csv(labels_file, skiprows=1, header=None,
                         parse_dates=[[0, 1], [2, 3]],
                         infer_datetime_format=True, delim_whitespace=True)

    # for clarity rename columns
    labels.columns = ['start_time', 'end_time', 'label']

    # replace 'label' column with integer encoding
    labels['label'] = [mode_ids[i] for i in labels['label']]

    return labels

def apply_labels(points, labels):
    indices = labels['start_time'].searchsorted(points['time'], side='right') - 1
    no_label = (indices < 0) | (points['time'].values >= labels['end_time'].iloc[indices].values)
    points['label'] = labels['label'].iloc[indices].values
    points['label'][no_label] = 0

def read_user(user_folder):
    labels = None

    plt_files = glob.glob(os.path.join(user_folder, 'Trajectory', '*.plt'))
    df = pd.concat([read_plt(f) for f in plt_files])

    labels_file = os.path.join(user_folder, 'labels.txt')
    if os.path.exists(labels_file):
        labels = read_labels(labels_file)
        apply_labels(df, labels)
    else:
        df['label'] = 0

    return df

def read_all_users(folder):
    subfolders = os.listdir(folder)
    dfs = []
    for i, sf in enumerate(subfolders):
        print('[%d/%d] processing user %s' % (i + 1, len(subfolders), sf))
        df = read_user(os.path.join(folder,sf))
        df['user'] = int(sf)
        dfs.append(df)
    return pd.concat(dfs)

In [3]:
start = datetime.datetime.now()
df1 = read_all_users('../Data')
df1
end = datetime.datetime.now()
print('Tempo gasto no passo 1:',end-start)

[1/182] processing user 000
[2/182] processing user 001
[3/182] processing user 002
[4/182] processing user 003
[5/182] processing user 004
[6/182] processing user 005
[7/182] processing user 006
[8/182] processing user 007
[9/182] processing user 008
[10/182] processing user 009
[11/182] processing user 010
[12/182] processing user 011
[13/182] processing user 012
[14/182] processing user 013
[15/182] processing user 014
[16/182] processing user 015
[17/182] processing user 016
[18/182] processing user 017
[19/182] processing user 018
[20/182] processing user 019
[21/182] processing user 020
[22/182] processing user 021
[23/182] processing user 022
[24/182] processing user 023
[25/182] processing user 024
[26/182] processing user 025
[27/182] processing user 026
[28/182] processing user 027
[29/182] processing user 028
[30/182] processing user 029
[31/182] processing user 030
[32/182] processing user 031
[33/182] processing user 032
[34/182] processing user 033
[35/182] processing use

In [4]:
df1

Unnamed: 0,time,lat,lon,alt,label,user
0,2008-10-23 02:53:04,39.984702,116.318417,492.000000,0,0
1,2008-10-23 02:53:10,39.984683,116.318450,492.000000,0,0
2,2008-10-23 02:53:15,39.984686,116.318417,492.000000,0,0
3,2008-10-23 02:53:20,39.984688,116.318385,492.000000,0,0
4,2008-10-23 02:53:25,39.984655,116.318263,492.000000,0,0
...,...,...,...,...,...,...
17,2008-03-14 03:39:56,40.914867,111.710500,3802.493438,0,181
18,2008-03-14 03:41:17,40.914267,111.710333,3795.931759,0,181
19,2008-03-14 03:43:02,40.912467,111.710667,3795.931759,0,181
20,2008-03-14 03:43:28,40.911517,111.711317,3779.527559,0,181


### Passo 2

In [5]:
df2 = df1[df1['label'] != 0]
df2.reset_index(inplace=True)
df2.drop('index', axis=1, inplace=True)
df2

Unnamed: 0,time,lat,lon,alt,label,user
0,2008-03-28 14:54:40,39.894178,116.318200,-777.0,6,10
1,2008-03-28 14:55:14,39.894505,116.321132,-777.0,6,10
2,2008-03-28 14:56:13,39.894953,116.326452,-777.0,6,10
3,2008-03-28 14:57:12,39.894600,116.332542,-777.0,6,10
4,2008-03-28 14:58:11,39.889622,116.337040,-777.0,6,10
...,...,...,...,...,...,...
5427112,2008-11-29 02:29:27,40.029529,116.411977,291.0,5,179
5427113,2008-11-29 02:29:29,40.029320,116.411975,289.0,5,179
5427114,2008-11-29 02:29:31,40.029111,116.411963,275.0,5,179
5427115,2008-11-29 02:29:33,40.028904,116.411962,274.0,5,179


### Passo 3

In [6]:
import datetime as dt

start = datetime.datetime.now()

df3 = df2
df3['time_p'],df3['lat_p'],df3['lon_p'] = 0,0,0
for idx, row in df3.iterrows():
    if idx != 0 and row['time'].day==df3.at[idx-1,'time'].day:
        df3.at[idx,'time_p'] = (row['time'] - df3.at[idx-1,'time']).seconds
        df3.at[idx,'lat_p'] = row['lat'] - df3.at[idx-1,'lat']
        df3.at[idx,'lon_p'] = row['lon'] - df3.at[idx-1,'lon']
end = datetime.datetime.now()
print('Tempo gasto no passo 3:',end-start)

df3

Tempo gasto no passo 3: 0:16:32.019474


Unnamed: 0,time,lat,lon,alt,label,user,time_p,lat_p,lon_p
0,2008-03-28 14:54:40,39.894178,116.318200,-777.0,6,10,0,0.000000,0.000000e+00
1,2008-03-28 14:55:14,39.894505,116.321132,-777.0,6,10,34,0.000327,2.932000e-03
2,2008-03-28 14:56:13,39.894953,116.326452,-777.0,6,10,59,0.000448,5.320000e-03
3,2008-03-28 14:57:12,39.894600,116.332542,-777.0,6,10,59,-0.000353,6.090000e-03
4,2008-03-28 14:58:11,39.889622,116.337040,-777.0,6,10,59,-0.004978,4.498000e-03
...,...,...,...,...,...,...,...,...,...
5427112,2008-11-29 02:29:27,40.029529,116.411977,291.0,5,179,2,-0.000210,-3.000000e-06
5427113,2008-11-29 02:29:29,40.029320,116.411975,289.0,5,179,2,-0.000209,-2.000000e-06
5427114,2008-11-29 02:29:31,40.029111,116.411963,275.0,5,179,2,-0.000209,-1.200000e-05
5427115,2008-11-29 02:29:33,40.028904,116.411962,274.0,5,179,2,-0.000207,-1.000000e-06


### Passo 4

In [7]:
df4 = df3[['time_p','lat_p','lon_p','alt','label','user']]
delta_time = df4['time_p'].values.tolist()
def time_interval(t):
    if t<=9:
        return t
    elif t <= 20:
        return [10,20]
    elif t <= 50:
        return [21,50]
    return 51

intervals = list(map(time_interval,delta_time))
df4['interval_class'] = intervals
df4

Unnamed: 0,time_p,lat_p,lon_p,alt,label,user,interval_class
0,0,0.000000,0.000000e+00,-777.0,6,10,0
1,34,0.000327,2.932000e-03,-777.0,6,10,"[21, 50]"
2,59,0.000448,5.320000e-03,-777.0,6,10,51
3,59,-0.000353,6.090000e-03,-777.0,6,10,51
4,59,-0.004978,4.498000e-03,-777.0,6,10,51
...,...,...,...,...,...,...,...
5427112,2,-0.000210,-3.000000e-06,291.0,5,179,2
5427113,2,-0.000209,-2.000000e-06,289.0,5,179,2
5427114,2,-0.000209,-1.200000e-05,275.0,5,179,2
5427115,2,-0.000207,-1.000000e-06,274.0,5,179,2


### Passo 5

In [22]:
from sklearn.model_selection import train_test_split

X = df4[['time_p','lat_p','lon_p','alt','user','interval_class']]
y = df4[['label']].values.tolist()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

### Passo 6

#### Decision Three / No feature selection

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

start = datetime.datetime.now()

depths = [2,3,5]
for d in depths:
    clf = DecisionTreeClassifier(max_depth=d)
    clf.fit(X_train, y_train)
    scores = cross_val_score(clf,X_test,y_test)
    print(scores)
    
end = datetime.datetime.now()
print('Tempo gasto:',end-start)

ValueError: setting an array element with a sequence.