In [7]:
import os
import pandas as pd
import uuid
from datetime import datetime

In [2]:
def check_labels_file(folder):
    """Report which entries of ``folder`` contain a ``labels.txt`` file.

    One banner line is printed for every directory entry that has a
    ``labels.txt`` directly inside it; entries without the file are
    silently ignored. Nothing is returned.
    """
    for entry in os.listdir(folder):
        candidate = os.path.join(folder, entry, 'labels.txt')
        if not os.path.exists(candidate):
            continue
        print('Reading user %s ---------------------------' % (entry))
# List the entries of the sample folder that contain a labels.txt file.
check_labels_file('small_sample')

Reading user 020 ---------------------------
Reading user 021 ---------------------------
Reading user 052 ---------------------------
Reading user 053 ---------------------------
Reading user 056 ---------------------------


In [3]:
def create_base_dataframes(folder):
    """Load the labelled trajectories and the GPS records of every user.

    ``folder`` is scanned for per-user sub-folders. A user is read only when
    its sub-folder contains a ``labels.txt`` file; every other entry (users
    without labels, stray files) is skipped instead of raising.

    Parameters
    ----------
    folder : str
        Directory that holds one sub-folder per user. Each labelled user
        folder must contain ``labels.txt`` (tab separated) and a
        ``Trajectory`` directory with the GPS files.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``transport_df`` has one row per labelled trajectory: the columns of
        ``labels.txt`` (``Start Time``, ``End Time``, ``Transportation Mode``)
        plus a placeholder ``Identifier`` column set to 0.
        ``gps_points_df`` has one row per GPS record with the columns
        ``Identifier``, ``Timestamp``, ``Latitude``, ``Longitude``,
        ``Altitude`` and ``Label``; ``Identifier`` and ``Label`` are empty
        object columns to be filled later.
        Both frames carry a unique 0..n-1 index.
    """
    transport_columns = ['Start Time','End Time','Transportation Mode']
    gps_columns = ['Identifier', 'Timestamp', 'Latitude', 'Longitude', 'Altitude', 'Label']

    # Frames are collected in lists and concatenated once at the end; growing
    # a DataFrame with concat inside the loop copies all previous rows again
    # on every iteration.
    transport_frames = []
    gps_frames = []

    # sorted() makes the row order reproducible (os.listdir order is arbitrary)
    for sf in sorted(os.listdir(folder)):
        user_folder = os.path.join(folder,sf)
        labels_file = os.path.join(user_folder, 'labels.txt')

        # only users that provide labels can contribute classified trajectories
        if not os.path.isfile(labels_file):
            continue

        print('Reading user %s ---------------------------' % (sf))

        # per-user table in which every row is one labelled trajectory
        u_transport_df = pd.read_csv(labels_file, sep="\t")
        u_transport_df['Identifier'] = 0
        transport_frames.append(u_transport_df)

        plt_files = os.path.join(user_folder, 'Trajectory')

        for filename in sorted(os.listdir(plt_files)):
            plt_path = os.path.join(plt_files, filename)
            if not os.path.isfile(plt_path):
                continue

            # the first six lines of each file are skipped as a header
            raw = pd.read_csv(plt_path, skiprows=6, header=None)

            # columns 5 and 6 are combined into a single timestamp; this
            # replaces the deprecated parse_dates=[[5, 6]] form
            timestamp = pd.to_datetime(raw[5].astype(str) + ' ' + raw[6].astype(str))

            u_gps_points_df = raw.drop(columns=[2, 4, 5, 6]).rename(
                columns={0: 'Latitude', 1: 'Longitude', 3: 'Altitude'})
            u_gps_points_df.insert(0, 'Timestamp', timestamp)
            gps_frames.append(u_gps_points_df)

    if transport_frames:
        # ignore_index gives every trajectory its own index label; without it
        # each user's rows restart at 0 and label-based writes hit several rows
        transport_df = pd.concat(transport_frames, axis=0, ignore_index=True)
        extra = [c for c in transport_df.columns if c not in transport_columns]
        transport_df = transport_df.reindex(columns=transport_columns + extra)
    else:
        transport_df = pd.DataFrame(columns=transport_columns)

    if gps_frames:
        gps_points_df = pd.concat(gps_frames, axis=0, ignore_index=True)
        extra = [c for c in gps_points_df.columns if c not in gps_columns]
        gps_points_df = gps_points_df.reindex(columns=gps_columns + extra)
        # object dtype so identifiers and text labels can be stored later
        gps_points_df = gps_points_df.astype({'Identifier': object, 'Label': object})
    else:
        gps_points_df = pd.DataFrame(columns=gps_columns)

    return transport_df,gps_points_df

def label_trajectories(transport_df,gps_points_df):
    """Give every trajectory a unique identifier and tag the GPS points inside it.

    For each row of ``transport_df`` a fresh UUID is generated and stored in
    its ``Identifier`` column. Every row of ``gps_points_df`` whose
    ``Timestamp`` lies in the closed interval [``Start Time``, ``End Time``]
    of that trajectory receives the same identifier in ``Identifier`` and the
    trajectory's ``Transportation Mode`` in ``Label``. Trajectories are
    processed in row order, so when windows overlap the later one wins.

    Both frames are modified in place and also returned.

    Parameters
    ----------
    transport_df : pandas.DataFrame
        Needs the columns ``Start Time`` and ``End Time`` (strings formatted
        as ``%Y/%m/%d %H:%M:%S``) and ``Transportation Mode``.
    gps_points_df : pandas.DataFrame
        Needs a ``Timestamp`` column convertible by ``pandas.to_datetime``.
        ``Identifier`` and ``Label`` are created when missing.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same two objects that were passed in, now labelled.

    Raises
    ------
    ValueError
        If a start or end time does not match the expected format.
    """
    # NOTE(review): points are matched by timestamp only, so a point recorded
    # by any user inside the window is tagged - confirm whether a per-user key
    # should also be compared.
    time_format = '%Y/%m/%d %H:%M:%S'
    total = transport_df.shape[0]

    # The target columns must exist and be able to hold UUID objects / text.
    for column in ('Identifier', 'Label'):
        if column in gps_points_df.columns:
            gps_points_df[column] = gps_points_df[column].astype(object)
        else:
            gps_points_df[column] = None

    # Converted once, outside the loop; also normalises an object column.
    timestamps = pd.to_datetime(gps_points_df['Timestamp'])

    identifiers = []
    trajectory_rows = zip(transport_df['Start Time'],
                          transport_df['End Time'],
                          transport_df['Transportation Mode'])
    for count, (start_text, end_text, label) in enumerate(trajectory_rows, start=1):
        print('Processing trajectory (%s/%s)' % (count,total))

        identifier = uuid.uuid1()
        identifiers.append(identifier)

        start = datetime.strptime(start_text, time_format)
        end = datetime.strptime(end_text, time_format)

        # A positional boolean mask is used instead of index labels: the
        # frames may carry repeated index labels, and label-based .loc writes
        # would then touch every row sharing a label. That was the cause of
        # identifiers repeating across trajectories.
        in_window = ((timestamps >= start) & (timestamps <= end)).to_numpy()
        gps_points_df.loc[in_window, 'Identifier'] = identifier
        gps_points_df.loc[in_window, 'Label'] = label

    # Assigned positionally in one go, so each row gets exactly its own UUID.
    transport_df['Identifier'] = identifiers

    return transport_df,gps_points_df

In [8]:
# Build the trajectory and GPS tables from the sample folder, then tag each
# GPS point with the trajectory whose time window contains it.
trajectories,gps_points_df = create_base_dataframes('small_sample')
trajectories,gps_points_df = label_trajectories(trajectories,gps_points_df)

Reading user 020 ---------------------------
Reading user 021 ---------------------------
Reading user 052 ---------------------------
Reading user 053 ---------------------------
Reading user 056 ---------------------------
Processing trajectory (1/719)
Processing trajectory (2/719)
Processing trajectory (3/719)
Processing trajectory (4/719)
Processing trajectory (5/719)
Processing trajectory (6/719)
Processing trajectory (7/719)
Processing trajectory (8/719)
Processing trajectory (9/719)
Processing trajectory (10/719)
Processing trajectory (11/719)
Processing trajectory (12/719)
Processing trajectory (13/719)
Processing trajectory (14/719)
Processing trajectory (15/719)
Processing trajectory (16/719)
Processing trajectory (17/719)
Processing trajectory (18/719)
Processing trajectory (19/719)
Processing trajectory (20/719)
Processing trajectory (21/719)
Processing trajectory (22/719)
Processing trajectory (23/719)
Processing trajectory (24/719)
Processing trajectory (25/719)
Processin

Processing trajectory (254/719)
Processing trajectory (255/719)
Processing trajectory (256/719)
Processing trajectory (257/719)
Processing trajectory (258/719)
Processing trajectory (259/719)
Processing trajectory (260/719)
Processing trajectory (261/719)
Processing trajectory (262/719)
Processing trajectory (263/719)
Processing trajectory (264/719)
Processing trajectory (265/719)
Processing trajectory (266/719)
Processing trajectory (267/719)
Processing trajectory (268/719)
Processing trajectory (269/719)
Processing trajectory (270/719)
Processing trajectory (271/719)
Processing trajectory (272/719)
Processing trajectory (273/719)
Processing trajectory (274/719)
Processing trajectory (275/719)
Processing trajectory (276/719)
Processing trajectory (277/719)
Processing trajectory (278/719)
Processing trajectory (279/719)
Processing trajectory (280/719)
Processing trajectory (281/719)
Processing trajectory (282/719)
Processing trajectory (283/719)
Processing trajectory (284/719)
Processi

Processing trajectory (511/719)
Processing trajectory (512/719)
Processing trajectory (513/719)
Processing trajectory (514/719)
Processing trajectory (515/719)
Processing trajectory (516/719)
Processing trajectory (517/719)
Processing trajectory (518/719)
Processing trajectory (519/719)
Processing trajectory (520/719)
Processing trajectory (521/719)
Processing trajectory (522/719)
Processing trajectory (523/719)
Processing trajectory (524/719)
Processing trajectory (525/719)
Processing trajectory (526/719)
Processing trajectory (527/719)
Processing trajectory (528/719)
Processing trajectory (529/719)
Processing trajectory (530/719)
Processing trajectory (531/719)
Processing trajectory (532/719)
Processing trajectory (533/719)
Processing trajectory (534/719)
Processing trajectory (535/719)
Processing trajectory (536/719)
Processing trajectory (537/719)
Processing trajectory (538/719)
Processing trajectory (539/719)
Processing trajectory (540/719)
Processing trajectory (541/719)
Processi

In [10]:
# Persist both tables as UTF-8 CSV files without the index column.
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('dataframes', exist_ok=True)
trajectories.to_csv('dataframes/trajectories_smallsample.csv', index = False, encoding='utf-8')
gps_points_df.to_csv('dataframes/gps_points_df_smallsample.csv', index = False, encoding='utf-8')