# GeoLife 1.3 Data Preparation

In this notebook, the geolife 1.3 dataset is prepared for further processing in the Bachelor thesis "Design and Implementation of an iPhone Application to Determine the Parking Position of a Car by Trajectory Analysis". It is based on the work of https://github.com/jmharkins/trajectory-data 

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import os
import time

Load trajectory file names

In [3]:
data_dir = 'geolife-data/Data'
output_dir = 'geolife-data/Prepared'
dirlist = os.listdir(data_dir)

label_dirs = []
for dir in dirlist[1:]:
    if not os.path.isdir(data_dir + "/" + dir):
        continue
    trajlist = os.listdir(data_dir + '/' + dir)
    if 'labels.txt' in trajlist:
        label_dirs.append(data_dir + '/' + dir)

traj_columns = ['latitude','longitude','altitude','days_total','date','time']

### Function to Check if DateTime Record is in Given Trip

In [4]:
def get_trans_trip(record_dt,ref_df):
    time_fit = (record_dt >= ref_df['Start Time']) & (record_dt <= ref_df['End Time'])
    nmatch = time_fit.sum()
    if nmatch == 0:
        t_idx = None
    else:
        t_idx = ref_df.loc[time_fit].iloc[0].name
    return t_idx

In [7]:
ldirs_counter = 0
for ldirs in label_dirs[:2]:
    all_traj = pd.DataFrame()
    ldirs_counter+=1
    print( time.asctime(time.localtime(time.time())) + '| ' + str(ldirs_counter) + '/' + str(len(label_dirs)) + ": " + ldirs)
    user = ldirs[-3:] # loads usr id from given directory
    trajpath = ldirs + '/Trajectory/'
    traj_files = os.listdir(trajpath) # load trajectory file names
    
    # load transport mode labels into dataframe 
    trip_trans = pd.read_csv(ldirs+'/labels.txt',sep='\t')
    trip_trans['Start Time'] = pd.to_datetime(trip_trans['Start Time'])
    trip_trans['End Time'] = pd.to_datetime(trip_trans['End Time'])
    trip_s_dates = trip_trans['Start Time'].dt.date.unique()
    trip_e_dates = trip_trans['End Time'].dt.date.unique()
    trip_a_dates = np.unique(np.append(trip_s_dates,trip_e_dates))
    
    for tf in traj_files:        
        
        # create trajectory_df
        traj_df = (pd.read_csv(trajpath+tf,
                               skiprows=6,
                               usecols=[0,1,3,4,5,6],
                               names=traj_columns)
            .assign(
                record_dt = lambda x: pd.to_datetime(x['date'] + ' ' + x['time']), # create datetime column 
                user = user)) # assign user to distinguish the trajectories when merged later.
        
        if traj_df['record_dt'].dt.date.isin(trip_a_dates).any():
            traj_df['trans_trip'] = traj_df.apply(lambda x: get_trans_trip(x.record_dt,trip_trans),axis=1)
            has_trip = ~(traj_df.trans_trip.isnull())
            traj_df['trans_mode'] = np.nan
            traj_df.loc[has_trip,'trans_mode'] = traj_df.loc[has_trip].apply(lambda x: trip_trans.loc[x.trans_trip,'Transportation Mode'],axis=1)
            all_traj = pd.concat([all_traj,traj_df])
    all_traj.to_csv(output_dir + '/'+ ldirs[-3:] + '_trip_labeled.csv')
    
print( time.asctime(time.localtime(time.time())) + \
      '| Finished. Read ' + str(ldirs_counter) + \
      ' out of ' + str(len(label_dirs)) + " labeled trajectory sets.")

Tue Aug 27 16:32:34 2019| 1/69: geolife-data/Data/104
Tue Aug 27 16:32:41 2019| 2/69: geolife-data/Data/161
Tue Aug 27 16:32:42 2019| Finished. Read 2 ot of 69 labeled trajectory sets.
