In [16]:
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import json

In [17]:
# Load the raw training trips from the CSV export of the dataset.
train_csv_path = './Taxi Trajectory Prediction/train.csv'
raw_train = pd.read_csv(train_csv_path)

In [18]:
# Report how many trips the raw file holds before any cleaning is applied.
raw_trip_count = len(raw_train)
print(f'Número de trajetórias antes da limpeza: {raw_trip_count}')

Número de trajetórias antes da limpeza: 1710670


In [19]:
# Discard the trips flagged as MISSING_DATA; only rows where the flag is False are kept.
has_complete_data = raw_train['MISSING_DATA'].eq(False)
raw_train_df = raw_train[has_complete_data]

In [20]:
# Drop columns with missing or low-relevance data: DAY_TYPE, MISSING_DATA, ORIGIN_STAND, ORIGIN_CALL.
# The drop is applied to the filtered frame (raw_train_df), which is the one sampled later,
# and returns a new frame instead of mutating raw_train in place. That keeps raw_train intact,
# so the MISSING_DATA filter cell can still be re-run. errors='ignore' makes this cell safe to
# re-run after the columns have already been removed.
raw_train_df = raw_train_df.drop(
    columns=['DAY_TYPE', 'MISSING_DATA', 'ORIGIN_STAND', 'ORIGIN_CALL'],
    errors='ignore',
)

In [21]:
# Report the number of trips left after cleaning. This reads the filtered frame (raw_train_df);
# len(raw_train) would only repeat the pre-cleaning count.
print(f'Número de trajetórias depois da limpeza: {len(raw_train_df)}')

Número de trajetórias antes da limpeza: 1710670


In [22]:
# Draw a reproducible sample of 0.05% of the cleaned dataset (fixed random_state).
sample_fraction = 0.0005
sample_train = raw_train_df.sample(frac=sample_fraction, random_state=3)
# Last expression of the cell: shows the sample size.
len(sample_train)

855

In [23]:
# Empty, typed frame describing the per-point layout: one column per field, in output order.
column_dtypes = {
  'taxi_id': 'int',
  'lat': 'float',
  'lng': 'float',
  'timestamp': 'int',
  'trip_id': 'int',
  'call_type': 'str',
}
formatted_sample = pd.DataFrame(
  {column: pd.Series(dtype=kind) for column, kind in column_dtypes.items()}
)

In [24]:
# Explode every sampled trip into one row per GPS point.
#
# POLYLINE is parsed as a JSON list of coordinate pairs; element 0 of each pair is stored as
# 'lng' and element 1 as 'lat'. Point k of a trip is stamped TIMESTAMP + k * 15 seconds.
#
# Records are collected in a plain list and turned into a DataFrame once at the end.
# Concatenating a one-row frame per point copies the whole frame each time (quadratic cost).
# Assigning the result also makes the cell idempotent: re-running it no longer appends
# duplicates to formatted_sample.
#
# NOTE(review): datetime.fromtimestamp converts to the local time zone of the machine running
# the notebook, so the 'timestamp' values depend on where this is executed.
point_columns = ['taxi_id', 'lat', 'lng', 'timestamp', 'trip_id', 'call_type']
point_records = []
for _, trip in tqdm(sample_train.iterrows(), total=len(sample_train)):
  taxi_id = int(trip['TAXI_ID'])
  trip_id = int(trip['TRIP_ID'])
  call_type = str(trip['CALL_TYPE'])
  start_timestamp = trip['TIMESTAMP']
  # A distinct name for the point counter avoids shadowing the outer loop variable.
  for point_idx, coor in enumerate(json.loads(trip['POLYLINE'])):
    point_records.append({
      'taxi_id': taxi_id,
      'lat': coor[1],
      'lng': coor[0],
      'timestamp': datetime.fromtimestamp(start_timestamp + point_idx * 15),
      'trip_id': trip_id,
      'call_type': call_type,
    })

# The records are reversed so the frame keeps the newest-first row order this cell has always
# produced (each new point used to be prepended), which consumers of the saved CSV may rely on.
formatted_sample = pd.DataFrame(point_records[::-1], columns=point_columns)

855it [03:24,  4.18it/s]


In [25]:
# Preview the layout of the formatted data (first five rows; head() defaults to 5).
formatted_sample.head()

Unnamed: 0,taxi_id,lat,lng,timestamp,trip_id,call_type
0,20000648,41.15826,-8.682777,2013-09-20 22:36:04,1379726734620000648,B
1,20000648,41.158251,-8.682786,2013-09-20 22:35:49,1379726734620000648,B
2,20000648,41.15826,-8.682795,2013-09-20 22:35:34,1379726734620000648,B
3,20000648,41.157405,-8.682102,2013-09-20 22:35:19,1379726734620000648,B
4,20000648,41.155947,-8.680914,2013-09-20 22:35:04,1379726734620000648,B


In [26]:
# Keep only trajectories with at least 5 points; trips with fewer than 5 points are dropped.
# Each row is mapped to the point count of its trip in one vectorised pass, instead of
# re-scanning the whole frame once per trip id. Row order and index labels are preserved.
min_points_per_trip = 5
points_per_trip = formatted_sample['trip_id'].map(formatted_sample['trip_id'].value_counts())
formatted_sample = formatted_sample[points_per_trip >= min_points_per_trip]

In [27]:
# Report how many distinct trips remain in the formatted sample.
remaining_trip_ids = formatted_sample['trip_id'].unique()
print('Número de trajetórias extraidos do dataset:', len(remaining_trip_ids))

Número de trajetórias extraidos do dataset: 831


In [28]:
# Persist the cleaned, formatted sample so it can be reused in other experiments.
# index=False leaves the DataFrame index out of the file.
output_csv_path = './train_formatted_sample.csv'
formatted_sample.to_csv(output_csv_path, index=False)