Boa caracterização da base (média, variancia dos tempos de amostragem, qntd de amostras, velocidade média, dist entre pontos)

Mesmo usuários que possuem o arquivo labels.txt tem registros sem labels

Apenas o usuário 106 tem labels para todos os registros de latlong

Quando não há label, é atribuído o valor 0 ao campo

Alguns artigos reduzem a quantidade de labels para abranger apenas os que forem relevantes ou agrupam tipos de transporte semelhantes, como subway e train.

A distribuição de labels na base de dados é:

Quantidade de amostras: 24876978

Quantidade de amostras/label:

    0-----------19449861 (0,78)
    walk---------1585401
    bus----------1276632
    bike----------948061
    train---------560979
    car-----------512807
    subway--------286168
    taxi----------242018
    airplane--------9183
    boat------------3559
    run-------------1971
    motorcycle-------338

In [1]:
import numpy as np
import pandas as pd
import glob
import os.path
import datetime
import os

def read_plt(plt_file):
    points = pd.read_csv(plt_file, skiprows=6, header=None,
                         parse_dates=[[5, 6]], infer_datetime_format=True)

    # for clarity rename columns
    points.rename(inplace=True, columns={'5_6': 'time', 0: 'lat', 1: 'lon', 3: 'alt'})

    # remove unused columns
    points.drop(inplace=True, columns=[2, 4])

    return points

mode_names = ['walk', 'bike', 'bus', 'car', 'subway','train', 'airplane', 'boat', 'run', 'motorcycle', 'taxi']
mode_ids = {s : i + 1 for i, s in enumerate(mode_names)}

def read_labels(labels_file):
    labels = pd.read_csv(labels_file, skiprows=1, header=None,
                         parse_dates=[[0, 1], [2, 3]],
                         infer_datetime_format=True, delim_whitespace=True)

    # for clarity rename columns
    labels.columns = ['start_time', 'end_time', 'label']

    # replace 'label' column with integer encoding
    labels['label'] = [mode_ids[i] for i in labels['label']]

    return labels

def apply_labels(points, labels):
    indices = labels['start_time'].searchsorted(points['time'], side='right') - 1
    no_label = (indices < 0) | (points['time'].values >= labels['end_time'].iloc[indices].values)
    points['label'] = labels['label'].iloc[indices].values
    points['label'][no_label] = 0

def read_user(user_folder):
    labels = None

    plt_files = glob.glob(os.path.join(user_folder, 'Trajectory', '*.plt'))
    df = pd.concat([read_plt(f) for f in plt_files])
    labels_file = os.path.join(user_folder, 'labels.txt')
    if os.path.exists(labels_file):
        labels = read_labels(labels_file)
        apply_labels(df, labels)    
    else:
        df['label'] = 0
    return df


def read_all_users(folder):
    subfolders = os.listdir(folder)
    dfs = []
    for i, sf in enumerate(subfolders):
        print('[%d/%d] processing user %s' % (i + 1, len(subfolders), sf))
        df = read_user(os.path.join(folder,sf))
        df['user'] = int(sf)
        dfs.append(df)
    return pd.concat(dfs)

In [2]:
df = read_all_users('Data')
df

[1/182] processing user 000
[2/182] processing user 001
[3/182] processing user 002
[4/182] processing user 003
[5/182] processing user 004
[6/182] processing user 005
[7/182] processing user 006
[8/182] processing user 007
[9/182] processing user 008
[10/182] processing user 009
[11/182] processing user 010


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  points['label'][no_label] = 0


[12/182] processing user 011
[13/182] processing user 012
[14/182] processing user 013
[15/182] processing user 014
[16/182] processing user 015
[17/182] processing user 016
[18/182] processing user 017
[19/182] processing user 018
[20/182] processing user 019
[21/182] processing user 020
[22/182] processing user 021
[23/182] processing user 022
[24/182] processing user 023
[25/182] processing user 024
[26/182] processing user 025
[27/182] processing user 026
[28/182] processing user 027
[29/182] processing user 028
[30/182] processing user 029
[31/182] processing user 030
[32/182] processing user 031
[33/182] processing user 032
[34/182] processing user 033
[35/182] processing user 034
[36/182] processing user 035
[37/182] processing user 036
[38/182] processing user 037
[39/182] processing user 038
[40/182] processing user 039
[41/182] processing user 040
[42/182] processing user 041
[43/182] processing user 042
[44/182] processing user 043
[45/182] processing user 044
[46/182] proce

Unnamed: 0,time,lat,lon,alt,label,user
0,2008-10-23 02:53:04,39.984702,116.318417,492.000000,0,0
1,2008-10-23 02:53:10,39.984683,116.318450,492.000000,0,0
2,2008-10-23 02:53:15,39.984686,116.318417,492.000000,0,0
3,2008-10-23 02:53:20,39.984688,116.318385,492.000000,0,0
4,2008-10-23 02:53:25,39.984655,116.318263,492.000000,0,0
...,...,...,...,...,...,...
17,2008-03-14 03:39:56,40.914867,111.710500,3802.493438,0,181
18,2008-03-14 03:41:17,40.914267,111.710333,3795.931759,0,181
19,2008-03-14 03:43:02,40.912467,111.710667,3795.931759,0,181
20,2008-03-14 03:43:28,40.911517,111.711317,3779.527559,0,181


In [3]:
#### only for testing: selecting users that bring a variety of labels
sp = df[df['user'].isin([11,60,85,115,175])].reset_index()

In [5]:
from geopy.distance import geodesic
from geographiclib.geodesic import Geodesic
import datetime

def get_distance(df):
    df['distance'] = 0
    for idx, row in df.iterrows():
        if idx != 0 and df.at[idx-1,'user']==df.at[idx,'user']:
            coords_1 = (df.at[idx-1,'lat'], df.at[idx-1,'lon'])
            coords_2 = (df.at[idx,'lat'], df.at[idx,'lon'])
            df.at[idx,'distance'] = geodesic(coords_1, coords_2).m
    print('........................Finished dist......................')
    return df

def get_time(df):
    df['delta_time'] = 0
    for idx, row in df.iterrows():
        if idx != 0 and df.at[idx-1,'user']==df.at[idx,'user']:
            start = df.at[idx-1,'time']
            end = df.at[idx,'time']
            df.at[idx,'delta_time'] = pd.Timedelta((end - start)).total_seconds()
    print('........................Finished time......................')
    return df

def get_speed(df):
    df['speed'] = 0
    for idx, row in df.iterrows():
        if idx != 0 and df.at[idx-1,'user']==df.at[idx,'user']:
            if df.at[idx,'distance'] != 0 and df.at[idx,'delta_time'] != 0:
                df.at[idx,'speed'] = df.at[idx,'distance']/df.at[idx,'delta_time']
    print('........................Finished speed......................')
    return df

def get_acc(df):
    df['acceleration'] = 0
    for idx, row in df.iterrows():
        if idx != 0 and df.at[idx-1,'user']==df.at[idx,'user']:
            if df.at[idx,'speed'] != 0 and df.at[idx,'delta_time'] != 0:
                df.at[idx,'acceleration'] = df.at[idx,'speed']/df.at[idx,'delta_time']
    print('........................Finished acc......................')
    return df

In [6]:
def get_features(df):
#     df.reset_index()
    df = get_distance(df) 
    df = get_time(df)
    df = get_speed(df)
    df = get_acc(df)
    return df

get_features(sp)
df

........................Finished dist......................
........................Finished time......................
........................Finished speed......................
........................Finished acc......................


Unnamed: 0,time,lat,lon,alt,label,user
0,2008-10-23 02:53:04,39.984702,116.318417,492.000000,0,0
1,2008-10-23 02:53:10,39.984683,116.318450,492.000000,0,0
2,2008-10-23 02:53:15,39.984686,116.318417,492.000000,0,0
3,2008-10-23 02:53:20,39.984688,116.318385,492.000000,0,0
4,2008-10-23 02:53:25,39.984655,116.318263,492.000000,0,0
...,...,...,...,...,...,...
17,2008-03-14 03:39:56,40.914867,111.710500,3802.493438,0,181
18,2008-03-14 03:41:17,40.914267,111.710333,3795.931759,0,181
19,2008-03-14 03:43:02,40.912467,111.710667,3795.931759,0,181
20,2008-03-14 03:43:28,40.911517,111.711317,3779.527559,0,181


In [15]:
def get_general_data_description(df):
    users = df['user'].unique()
    print('Quantidade de amostras:',df.shape)
    print('Quantidade de amostras/label:')
    print(df['label'].value_counts())
    print('-----MÉDIAS-----')
    print('Velocidade:',df['speed'].mean())
    print('Aceleração:',df['acceleration'].mean())
    print('Distância:',df['distance'].mean())
    print('Tempo:',df['delta_time'].mean())
    print('-----VARIÂNCIAS-----')
    print('Velocidade:',df['speed'].var())
    print('Aceleração:',df['acceleration'].var())
    print('Distância:',df['distance'].var())
    print('Tempo:',df['delta_time'].var())

get_general_data_description(sp)

Quantidade de amostras: (841410, 11)
Quantidade de amostras/label:
0     323643
3     184953
1     149348
4     111232
5      67514
11      3882
2        838
Name: label, dtype: int64
-----MÉDIAS-----
Velocidade: 5.22985464874437
Aceleração: 2.6067755315482346
Distância: 28.253547022260253
Tempo: 121.52810639284058
-----VARIÂNCIAS-----
Velocidade: 51.216454365503026
Aceleração: 22.97240886738156
Distância: 11638715.1189422
Tempo: 401482386.55047184


In [10]:
def get_data_description_by_user(df):
    subset = df[['user','alt','distance','delta_time','speed','acceleration','label']]
    users = df['user'].unique()
    lst = []
    for u in users:
        filtered = subset[(subset['user']==u)]
        
        registros = filtered.shape[0]
        
        mean_dist = filtered['distance'].mean()
        var_dist = filtered['distance'].var()
        
        mean_time = filtered['delta_time'].mean()
        var_time = filtered['delta_time'].var()
        
        mean_speed = filtered['speed'].mean()
        var_speed = filtered['speed'].var()
        
        mean_acc = filtered['acceleration'].mean()
        var_acc = filtered['acceleration'].var()
        
        counted_labels = filtered['label'].value_counts().to_dict()
        
        lst.append([u,registros,mean_dist,var_dist,mean_time,var_time,mean_speed,var_speed,mean_acc,var_acc,counted_labels])
    new_df = pd.DataFrame(lst,columns=['User','NumRegistros','Dist Média','Variância Dist','Tempo Médio','Variância Tempo','Vel Média','Variância Vel','Acel Média','Variância Acel','Labels'])
    return new_df
  
def get_data_description_by_label(df):
    subset = df[['label','alt','distance','delta_time','speed','acceleration']]
    labels = df['label'].unique()
    lst = []
    for l in labels:
        filtered = subset[(subset['label']==l)]
        
        registros = filtered.shape[0]
        
        mean_dist = filtered['distance'].mean()
        var_dist = filtered['distance'].var()
        
        mean_time = filtered['delta_time'].mean()
        var_time = filtered['delta_time'].var()
        
        mean_speed = filtered['speed'].mean()
        var_speed = filtered['speed'].var()
        
        mean_acc = filtered['acceleration'].mean()
        var_acc = filtered['acceleration'].var()
        
        lst.append([l,registros,mean_dist,var_dist,mean_time,var_time,mean_speed,var_speed,mean_acc,var_acc])
    new_df = pd.DataFrame(lst,columns=['Label','Num Registros','Dist Média','Variância Dist','Tempo Médio','Variância Tempo','Vel Média','Variância Vel','Acel Média','Variância Acel'])
    return new_df    
    
get_data_description_by_user(sp)

Unnamed: 0,User,NumRegistros,Dist Média,Variância Dist,Tempo Médio,Variância Tempo,Vel Média,Variância Vel,Acel Média,Variância Acel,Labels
0,11,90803,12.007026,26964.07,75.209431,21407320.0,3.186007,13.467504,1.215037,2.478857,{0: 90803}
1,60,17,84.705882,3081.971,9646.352941,1543725000.0,2.411765,28.257353,0.058824,0.058824,"{1: 15, 0: 2}"
2,85,601871,14.606755,105667.6,85.129666,108421600.0,4.3399,45.004284,2.588502,29.585591,"{0: 185365, 3: 184855, 1: 148109, 5: 67329, 4:..."
3,115,148411,91.484533,65349080.0,294.977104,1823094000.0,10.092749,70.091424,3.537858,6.669039,"{4: 99680, 0: 47450, 1: 1099, 5: 182}"
4,175,308,1014.318182,89208340.0,801.331169,45843910.0,3.821429,49.912634,0.116883,2.976522,"{1: 125, 3: 98, 11: 59, 0: 23, 5: 3}"


In [11]:
# 'walk', 'bike', 'bus', 'car', 'subway','train', 'airplane', 'boat', 'run', 'motorcycle', 'taxi'
get_data_description_by_label(sp)

Unnamed: 0,Label,Num Registros,Dist Média,Variância Dist,Tempo Médio,Variância Tempo,Vel Média,Variância Vel,Acel Média,Variância Acel
0,0,323643,30.641871,16638980.0,179.068165,727059000.0,3.539693,41.029962,1.249432,5.878684
1,1,149348,24.159888,29188080.0,195.375177,675994800.0,1.434971,14.691312,0.525404,5.554371
2,3,184953,16.042697,169069.2,29.71157,2474699.0,5.275605,41.043716,2.712354,23.572224
3,11,3882,26.441525,7629.138,60.618238,2322442.0,8.136012,35.884098,4.163575,15.091247
4,5,67514,33.938635,42796.62,20.959016,999466.8,14.533415,48.869242,12.03854,61.249326
5,2,838,12.554893,166.6177,112.257757,10076310.0,6.724344,28.730374,4.22315,22.03975
6,4,111232,43.835533,129856.0,70.86316,8983380.0,9.407167,50.993099,3.383909,6.249849


In [6]:
# Testes com a base completa
# obtém a média de registros por usuário
mean = round(df['user'].value_counts().values.mean())
print('Média arredondada de registros =',mean)

# Dicionário com os usuários e a quantidade de registros de cada um
dict_users = df.groupby(['user']).size().to_dict()

# cria sample dos dados com apenas os usuários que possuem quantidade
# de registros maior ou igual a média
avareged_users = [key for key,value in dict_users.items() if value >= mean]
sp = df[df['user'].isin(avareged_users)]

# Drop linhas sem informação do label
sp = sp.drop(sp[sp['label']==0].index)

sp['label'].unique()

Média arredondada de registros = 136687


KeyboardInterrupt: 