## [Endo et al (2016)](https://link.springer.com/article/10.1007/s41060-016-0014-1)
### What is done:
* We removed the data of users who have only ten annotations or fewer
* We labeled every stretch of a GPS trajectory that falls between an annotation's start time and end time with that annotation, and treated each such stretch as one segment with a single transportation mode
* Although there are 11 types of annotations, we used only seven (walking, bus, car, bike, taxi, subway, and train)

In [1]:
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning; the option's default is 'warn'.
pd.options.mode.chained_assignment = None  # default='warn'

### Passo 1

In [106]:
import numpy as np
import pandas as pd
import glob
import os.path
import datetime
import os

def read_plt(plt_file):
    """Read one trajectory ``.plt`` file into a DataFrame of GPS points.

    The first six lines of the file are skipped as a header. Each remaining
    row has seven comma-separated fields; fields 5 and 6 (date and time
    strings) are merged into a single timestamp, and fields 2 and 4 are
    discarded.

    Parameters
    ----------
    plt_file : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``time`` (datetime64), ``lat``, ``lon``, ``alt`` - in that
        order, one row per GPS point.
    """
    raw = pd.read_csv(plt_file, skiprows=6, header=None)

    # Build the timestamp explicitly instead of using the nested
    # ``parse_dates=[[5, 6]]`` / ``infer_datetime_format`` options, which are
    # deprecated in pandas 2.x.
    timestamps = pd.to_datetime(raw[5] + ' ' + raw[6])

    # Assemble a fresh frame (no in-place mutation); 'time' stays the first
    # column, as it was when read_csv produced the merged '5_6' column.
    return pd.DataFrame({
        'time': timestamps,
        'lat': raw[0],
        'lon': raw[1],
        'alt': raw[3],
    })

# Integer code for every transportation-mode string found in labels.txt.
# Code 0 marks a mode that is dropped later (read_user filters out label 0).
# NOTE(review): the notebook intro lists subway among the seven modes kept and
# does not mention motorcycle, yet here subway -> 0 and motorcycle -> 7.
# Confirm which of the two is intended.
mode_ids = {
    'walk': 1,
    'bike': 2,
    'bus': 3,
    'car': 4,
    'taxi': 5,
    'train': 6,
    'motorcycle': 7,
    'subway': 0,
    'airplane': 0,
    'boat': 0,
    'run': 0,
}

def read_labels(labels_file):
    """Read a user's ``labels.txt`` annotation file.

    The first line (column header) is skipped. Every other line is split on
    whitespace into five fields: start date, start time, end date, end time
    and the transportation-mode name.

    Parameters
    ----------
    labels_file : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``start_time`` and ``end_time`` (datetime64) and ``label``
        (the integer code looked up in the module-level ``mode_ids``).

    Raises
    ------
    KeyError
        If a mode name in the file is not a key of ``mode_ids``.
    """
    # sep=r'\s+' replaces the deprecated ``delim_whitespace=True``.
    raw = pd.read_csv(labels_file, skiprows=1, header=None, sep=r'\s+')

    # Join the date and time fields explicitly rather than through the
    # deprecated nested ``parse_dates`` / ``infer_datetime_format`` options.
    return pd.DataFrame({
        'start_time': pd.to_datetime(raw[0] + ' ' + raw[1]),
        'end_time': pd.to_datetime(raw[2] + ' ' + raw[3]),
        # replace the mode name with its integer encoding
        'label': [mode_ids[name] for name in raw[4]],
    })

def apply_labels(points, labels):
    """Add a ``label`` column to ``points`` in place.

    A point receives the label of the annotation whose
    ``[start_time, end_time]`` interval contains the point's ``time``.
    Points that fall outside every interval (before the first annotation,
    in a gap between annotations, or after the last one) receive label 0.

    Parameters
    ----------
    points : pandas.DataFrame
        Must have a ``time`` column; a ``label`` column is added/overwritten.
    labels : pandas.DataFrame
        Must have ``start_time``, ``end_time`` and ``label`` columns. It is
        not modified.

    Returns
    -------
    None
    """
    # Without any annotation every point is unlabeled; indexing an empty
    # frame with iloc below would otherwise raise IndexError.
    if len(labels) == 0:
        points['label'] = 0
        return

    # searchsorted needs ascending start times; sort a local copy only.
    labels = labels.sort_values('start_time')

    # Position of the last annotation starting at or before each point
    # (-1 when the point precedes every annotation).
    indices = labels['start_time'].searchsorted(points['time'], side='right') - 1

    # Previously only start_time was consulted, so points after an
    # annotation's end inherited its label, and index -1 silently wrapped
    # around to the LAST annotation. Both cases are marked unlabeled here.
    end_times = labels['end_time'].iloc[indices].to_numpy()
    outside = (indices < 0) | (points['time'].to_numpy() > end_times)

    assigned = labels['label'].iloc[indices].to_numpy(copy=True)
    assigned[outside] = 0
    points['label'] = assigned

def read_user(user_folder):
    """Load every trajectory of one user and attach transportation labels.

    All ``*.plt`` files under ``<user_folder>/Trajectory`` are read with
    ``read_plt`` and concatenated. If ``<user_folder>/labels.txt`` exists, its
    annotations are read with ``read_labels``, annotations with label 0 are
    dropped, and the remaining ones are applied to the points with
    ``apply_labels``. Otherwise every point gets label 0.

    Parameters
    ----------
    user_folder : str
        Path of the user's directory.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame or None)
        The points frame (with a ``label`` column) and the filtered
        annotations, or ``None`` when the user has no ``labels.txt``.

    Raises
    ------
    ValueError
        If the folder contains no ``.plt`` trajectory files.
    """
    labels = None

    # glob returns matches in arbitrary order; sort so the row order of the
    # concatenated frame is reproducible from run to run.
    plt_files = sorted(glob.glob(os.path.join(user_folder, 'Trajectory', '*.plt')))
    if not plt_files:
        # pd.concat([]) would raise a ValueError too, but without saying
        # which folder was the problem.
        raise ValueError('no .plt trajectory files found under %s' % user_folder)
    df = pd.concat([read_plt(f) for f in plt_files])

    labels_file = os.path.join(user_folder, 'labels.txt')
    if os.path.exists(labels_file):
        labels = read_labels(labels_file)
        # label 0 marks modes that are not used; drop those annotations
        labels = labels[labels['label'] != 0]
        apply_labels(df, labels)
    else:
        df['label'] = 0
    return df, labels

def read_all_users(folder):
    """Load the labeled points of every sufficiently annotated user.

    Each sub-directory of ``folder`` is treated as one user and read with
    ``read_user``. A user is kept only when it has a labels file and more
    than ten annotations remain in it; kept frames get a ``user`` column
    holding the directory name converted to ``int``.

    Parameters
    ----------
    folder : str
        Directory containing one sub-directory per user.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the kept users' frames.

    Raises
    ------
    ValueError
        If a kept user's directory name is not an integer, or if no user
        qualifies (``pd.concat`` is then called with an empty list).
    """
    # os.listdir returns entries in arbitrary order and also lists plain
    # files; keep directories only and sort for a reproducible result.
    subfolders = sorted(
        entry for entry in os.listdir(folder)
        if os.path.isdir(os.path.join(folder, entry))
    )
    dfs = []
    for i, sf in enumerate(subfolders):
        print('[%d/%d] processing user %s' % (i + 1, len(subfolders), sf))
        df, labels = read_user(os.path.join(folder, sf))
        # drop users without a labels file or with ten annotations or fewer
        if labels is not None and labels.shape[0] > 10:
            df['user'] = int(sf)
            dfs.append(df)
    return pd.concat(dfs)

In [107]:
# Run step 1 over the dataset stored in '../Data' and measure the wall-clock time it takes.
start = datetime.datetime.now()
df1 = read_all_users('../Data')
end = datetime.datetime.now()
# The printed message is Portuguese for "Time spent on step 1".
print('Tempo gasto no passo 1:',end-start)
# Bare last expression: the notebook renders the resulting frame as the cell output.
df1

[1/182] processing user 000
[2/182] processing user 001
[3/182] processing user 002
[4/182] processing user 003
[5/182] processing user 004
[6/182] processing user 005
[7/182] processing user 006
[8/182] processing user 007
[9/182] processing user 008
[10/182] processing user 009
[11/182] processing user 010
[12/182] processing user 011
[13/182] processing user 012
[14/182] processing user 013
[15/182] processing user 014
[16/182] processing user 015
[17/182] processing user 016
[18/182] processing user 017
[19/182] processing user 018
[20/182] processing user 019
[21/182] processing user 020
[22/182] processing user 021
[23/182] processing user 022
[24/182] processing user 023
[25/182] processing user 024
[26/182] processing user 025
[27/182] processing user 026
[28/182] processing user 027
[29/182] processing user 028
[30/182] processing user 029
[31/182] processing user 030
[32/182] processing user 031
[33/182] processing user 032
[34/182] processing user 033
[35/182] processing use

Unnamed: 0,time,lat,lon,alt,label,user
0,2007-08-04 03:30:32,39.921712,116.472343,13.0,3,10
1,2007-08-04 03:30:33,39.921705,116.472343,13.0,3,10
2,2007-08-04 03:30:34,39.921695,116.472345,13.0,3,10
3,2007-08-04 03:30:35,39.921683,116.472342,13.0,3,10
4,2007-08-04 03:30:36,39.921672,116.472342,13.0,3,10
...,...,...,...,...,...,...
4704,2008-11-29 08:15:52,40.007802,116.319362,84.0,1,179
4705,2008-11-29 08:15:54,40.007780,116.319360,88.0,1,179
4706,2008-11-29 08:15:56,40.007756,116.319362,92.0,1,179
4707,2008-11-29 08:15:58,40.007740,116.319361,97.0,1,179


In [109]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows of df1 for testing. A fixed random_state makes the
# shuffle, and therefore the split, identical on every run.
# NOTE(review): this splits individual rows at random, so rows from the same
# user/trajectory can land in both train and test - confirm this is intended.
train, test = train_test_split(df1, test_size=0.2, random_state=42)