In [14]:
import os
import pandas as pd
import uuid
from datetime import datetime

In [11]:
def check_labels_file(folder):
    subfolders = os.listdir(folder)
    dfs = []
    for i, sf in enumerate(subfolders):
        user_folder = os.path.join(folder,sf)
        labels_file = os.path.join(user_folder, 'labels.txt')
        if os.path.exists(labels_file):
            print('Reading user %s ---------------------------' % (sf))
check_labels_file('single_sample')

Reading user 010 ---------------------------


In [12]:
''' 
Ao ler os usuários, verificar a existência do arquivo labels.txt
'''
def create_base_dataframes(folder):
    subfolders = os.listdir(folder)
    dfs = []
    
    # df com cada linha sendo uma trajetória classificada
    transport_df = pd.DataFrame(columns=['Start Time','End Time','Transportation Mode'])
    
    # df com cada linha sendo um registro de gps com o identificador da trajetória
    gps_points_df = pd.DataFrame(columns=['Identifier', 'Timestamp', 'Latitude', 'Longitude', 'Altitude', 'Label'])
    
    for i, sf in enumerate(subfolders):
        user_folder = os.path.join(folder,sf)
        labels_file = os.path.join(user_folder, 'labels.txt')
        
        print('Reading user %s ---------------------------' % (sf))
        
        # cria uma tabela do usuário em que cada linha é uma trajetória
        u_transport_df = pd.read_csv(labels_file, sep="\t")
        u_transport_df['Identifier'] = 0
        
        transport_df = pd.concat((transport_df,u_transport_df),axis=0)
        # cria uma tabela do usuário que contém todos os seus registros de gps
#             u_gps_points_df = pd.DataFrame(columns=['Identifier', 'Timestamp', 'Latitude', 'Longitude', 'Altitude', 'Label'])
            
        plt_files = os.path.join(user_folder, 'Trajectory')
        
        for filename in os.listdir(plt_files): 
            u_gps_points_df = pd.read_csv(os.path.join(plt_files, filename), skiprows=6, header=None,
                                  parse_dates=[[5, 6]], infer_datetime_format=True)
            u_gps_points_df.rename(inplace=True, columns={'5_6': 'Timestamp', 0: 'Latitude', 
                                                  1: 'Longitude', 3: 'Altitude'})
            u_gps_points_df.drop(inplace=True, columns=[2, 4])
        
            gps_points_df = pd.concat((gps_points_df,u_gps_points_df),axis=0)
    
    return transport_df,gps_points_df

def label_trajectories(transport_df,gps_points_df):
    count = 1
    total = transport_df.shape[0]
    for index_transport,row_transport in transport_df.iterrows():
        print('Processing trajectory (%s/%s)' % (count,total))
        
        identifier = uuid.uuid1() # identifiers are repeating in trajectories. How to create unique ones?
        
        transport_df.loc[index_transport, 'Identifier'] = identifier
        
        start = datetime.strptime(row_transport['Start Time'],'%Y/%m/%d %H:%M:%S')
        end = datetime.strptime(row_transport['End Time'],'%Y/%m/%d %H:%M:%S')
        label = row_transport['Transportation Mode']
        
        for index_gps,row_gps in gps_points_df.iterrows():
            if start <= row_gps['Timestamp'] <= end:
                gps_points_df.loc[index_gps,'Identifier'] = identifier
                gps_points_df.loc[index_gps,'Label'] = label
                
        count += 1
        
    return transport_df,gps_points_df

In [15]:
trajectories,gps_points_df = create_base_dataframes('single_sample')
trajectories,gps_points_df = label_trajectories(trajectories,gps_points_df)

Reading user 010 ---------------------------
Processing trajectory (1/434)
Processing trajectory (2/434)
Processing trajectory (3/434)
Processing trajectory (4/434)
Processing trajectory (5/434)
Processing trajectory (6/434)
Processing trajectory (7/434)
Processing trajectory (8/434)
Processing trajectory (9/434)
Processing trajectory (10/434)
Processing trajectory (11/434)
Processing trajectory (12/434)
Processing trajectory (13/434)
Processing trajectory (14/434)
Processing trajectory (15/434)
Processing trajectory (16/434)
Processing trajectory (17/434)
Processing trajectory (18/434)
Processing trajectory (19/434)
Processing trajectory (20/434)
Processing trajectory (21/434)
Processing trajectory (22/434)
Processing trajectory (23/434)
Processing trajectory (24/434)
Processing trajectory (25/434)
Processing trajectory (26/434)
Processing trajectory (27/434)
Processing trajectory (28/434)
Processing trajectory (29/434)
Processing trajectory (30/434)
Processing trajectory (31/434)
Pro

Processing trajectory (259/434)
Processing trajectory (260/434)
Processing trajectory (261/434)
Processing trajectory (262/434)
Processing trajectory (263/434)
Processing trajectory (264/434)
Processing trajectory (265/434)
Processing trajectory (266/434)
Processing trajectory (267/434)
Processing trajectory (268/434)
Processing trajectory (269/434)
Processing trajectory (270/434)
Processing trajectory (271/434)
Processing trajectory (272/434)
Processing trajectory (273/434)
Processing trajectory (274/434)
Processing trajectory (275/434)
Processing trajectory (276/434)
Processing trajectory (277/434)
Processing trajectory (278/434)
Processing trajectory (279/434)
Processing trajectory (280/434)
Processing trajectory (281/434)
Processing trajectory (282/434)
Processing trajectory (283/434)
Processing trajectory (284/434)
Processing trajectory (285/434)
Processing trajectory (286/434)
Processing trajectory (287/434)
Processing trajectory (288/434)
Processing trajectory (289/434)
Processi

In [16]:
trajectories

Unnamed: 0,Start Time,End Time,Transportation Mode,Identifier
0,2007/06/26 11:32:29,2007/06/26 11:40:29,bus,e76bf222-b1a3-11ed-b9a9-d683eb738645
1,2008/03/28 14:52:54,2008/03/28 15:59:59,train,fb876924-b1a3-11ed-99a2-d683eb738645
2,2008/03/28 16:00:00,2008/03/28 22:02:00,train,0f7368f5-b1a4-11ed-8412-d683eb738645
3,2008/03/29 01:27:50,2008/03/29 15:59:59,train,2536cbd4-b1a4-11ed-ae23-d683eb738645
4,2008/03/29 16:00:00,2008/03/30 15:59:59,train,3c7a96c4-b1a4-11ed-99f5-d683eb738645
...,...,...,...,...
429,2008/12/07 10:30:54,2008/12/07 10:34:14,taxi,fffed867-b1cb-11ed-ab93-d683eb738645
430,2008/12/07 10:59:29,2008/12/07 11:29:48,train,136d7d91-b1cc-11ed-bbca-d683eb738645
431,2008/12/07 11:43:12,2008/12/07 12:23:26,bus,28d5cf3d-b1cc-11ed-b41f-d683eb738645
432,2008/12/07 12:23:34,2008/12/07 12:25:07,walk,440ebbb1-b1cc-11ed-899c-d683eb738645


In [17]:
gps_points_df

Unnamed: 0,Identifier,Timestamp,Latitude,Longitude,Altitude,Label
0,d8f01f95-b1cb-11ed-8c90-d683eb738645,2007-08-04 03:30:32,39.921712,116.472343,13,walk
1,d8f01f95-b1cb-11ed-8c90-d683eb738645,2007-08-04 03:30:33,39.921705,116.472343,13,walk
2,d8f01f95-b1cb-11ed-8c90-d683eb738645,2007-08-04 03:30:34,39.921695,116.472345,13,walk
3,d8f01f95-b1cb-11ed-8c90-d683eb738645,2007-08-04 03:30:35,39.921683,116.472342,13,walk
4,d8f01f95-b1cb-11ed-8c90-d683eb738645,2007-08-04 03:30:36,39.921672,116.472342,13,walk
...,...,...,...,...,...,...
2738,28d5cf3d-b1cc-11ed-b41f-d683eb738645,2009-03-21 05:34:49,39.136261,117.218261,-59,bus
2739,28d5cf3d-b1cc-11ed-b41f-d683eb738645,2009-03-21 05:34:50,39.136256,117.218276,-59,bus
2740,28d5cf3d-b1cc-11ed-b41f-d683eb738645,2009-03-21 05:34:51,39.136256,117.218291,-59,bus
2741,28d5cf3d-b1cc-11ed-b41f-d683eb738645,2009-03-21 05:34:52,39.136256,117.218303,-59,bus


In [18]:
trajectories.to_csv('dataframes/trajectories_singlesample.csv', index = False, encoding='utf-8')
gps_points_df.to_csv('dataframes/gps_points_df_singlesample.csv', index = False, encoding='utf-8')