In [1]:
import pandas as pd
import os, sys
import shutil

# Make the repository root importable so the `utils` package can be found.
# Note: a notebook has no real __file__, so the working directory is used as
# the notebook's directory; its parent is what gets added to sys.path.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)

import pandas as pd
from utils.helpers.alphabetical_number import increment_alphabetical
from utils.helpers.save_trajectory import save_current_trajectory
from utils.helpers.metafile_handler import create_meta_files

# Notebook-wide configuration: flags, paths and extraction limits

# Behaviour flags
SHOULD_DELETE_OLD_FILES = True
LOG = False  # Set to true for printing/debug during data extraction

# Input / output locations (relative to the notebook's working directory)
RAW_DATA_FILE = "../dataset/kolumbus/2023-09-04.csv"
OUTPUT_FOLDER = "../dataset/kolumbus/output"

# Extraction limits
NUMBER_OF_TRACES = 1000
# Maximum time difference between two consecutive trackpoints in a trajectory
MAX_TIME_DIFF_SECONDS = 32

In [2]:
# Load only the columns needed for trajectory extraction

raw_df = pd.read_csv(
    RAW_DATA_FILE,
    usecols=["serviceJourney", "recordedAtTime", "longitude", "latitude"],
)
# Keep the trackpoints that fall inside the bounding box (bounds inclusive)
filtered_df = raw_df.loc[
    raw_df["latitude"].between(58.71, 59.02)
    & raw_df["longitude"].between(5.53, 5.85)
]
# Drop the reference to the unfiltered frame for performance reasons
del raw_df
print(filtered_df.head(10))

                                 serviceJourney        recordedAtTime  \
0  KOL:ServiceJourney:1000_230509093645391_1000  2023-09-04T03:19:30Z   
1  KOL:ServiceJourney:1000_230509093645391_1000  2023-09-04T03:19:32Z   
2  KOL:ServiceJourney:1000_230509093645391_1000  2023-09-04T03:19:34Z   
3  KOL:ServiceJourney:1000_230509093645391_1000  2023-09-04T03:19:36Z   
4  KOL:ServiceJourney:1000_230509093645391_1000  2023-09-04T03:19:38Z   
5  KOL:ServiceJourney:1000_230509093645391_1000  2023-09-04T03:19:40Z   
6  KOL:ServiceJourney:1000_230509093645391_1000  2023-09-04T03:19:42Z   
7  KOL:ServiceJourney:1000_230509093645391_1000  2023-09-04T03:19:44Z   
8  KOL:ServiceJourney:1000_230509093645391_1000  2023-09-04T03:19:46Z   
9  KOL:ServiceJourney:1000_230509093645391_1000  2023-09-04T03:19:48Z   

   longitude   latitude  
0   5.733078  59.003486  
1   5.732808  59.003419  
2   5.732552  59.003352  
3   5.732305  59.003284  
4   5.732059  59.003217  
5   5.731810  59.003149  
6   5.731591  

In [3]:
# Run this cell to clear previously generated files from OUTPUT_FOLDER.
# Entries whose name starts with "META" are kept. Nothing happens when
# SHOULD_DELETE_OLD_FILES is False or when the folder does not exist yet
# (previously a missing folder made os.listdir raise FileNotFoundError).

if SHOULD_DELETE_OLD_FILES and os.path.isdir(OUTPUT_FOLDER):
    for filename in os.listdir(OUTPUT_FOLDER):
        if filename.startswith("META"):
            continue
        file_path = os.path.join(OUTPUT_FOLDER, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report the failure and keep removing the other entries
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [4]:
# Split the filtered trackpoints into trajectories and save each one to
# OUTPUT_FOLDER, stopping once NUMBER_OF_TRACES trajectories have been written.
#
# A new trajectory starts when the serviceJourney changes or when the gap to
# the previous trackpoint exceeds MAX_TIME_DIFF_SECONDS.
# NOTE(review): this assumes rows of one serviceJourney are contiguous and
# ordered by recordedAtTime, as they appear in the raw file - confirm.
#
# Fixes compared to the previous version of this cell:
#   * the first trackpoint of every trajectory is no longer dropped
#   * the trajectory still open when the data ends is saved as well
#   * MAX_TIME_DIFF_SECONDS is actually applied (it was declared but unused)
#   * timestamps are parsed once for the whole column instead of once per row
counter = 0
name_counter = "AAA"

last_driver_id = None
last_timestamp = None

trajectory = []

# Vectorised parse; much cheaper than calling pd.to_datetime inside the loop
timestamps = pd.to_datetime(filtered_df["recordedAtTime"])

for current_driver_id, current_timestamp, lat, lon in zip(
    filtered_df["serviceJourney"],
    timestamps,
    filtered_df["latitude"],
    filtered_df["longitude"],
):
    if counter >= NUMBER_OF_TRACES:
        break

    starts_new_trajectory = (
        last_timestamp is None
        or current_driver_id != last_driver_id
        or (current_timestamp - last_timestamp).total_seconds()
        > MAX_TIME_DIFF_SECONDS
    )

    # Flush the finished trajectory before starting the next one
    if starts_new_trajectory and trajectory:
        save_current_trajectory(
            OUTPUT_FOLDER=OUTPUT_FOLDER,
            file_name=name_counter,
            trajectory=trajectory,
            trajectory_file_prefix="K",
        )
        counter += 1
        name_counter = increment_alphabetical(name_counter)
        # Fresh list, so the saved one is never mutated afterwards
        trajectory = []

    trajectory.append((float(lat), float(lon)))
    last_driver_id = current_driver_id
    last_timestamp = current_timestamp

# Save the trajectory that was still open when the data ran out, unless the
# requested number of traces has already been reached
if trajectory and counter < NUMBER_OF_TRACES:
    save_current_trajectory(
        OUTPUT_FOLDER=OUTPUT_FOLDER,
        file_name=name_counter,
        trajectory=trajectory,
        trajectory_file_prefix="K",
    )
    counter += 1
    name_counter = increment_alphabetical(name_counter)
    trajectory = []

In [6]:
# Optional step, currently disabled: uncomment to run create_meta_files on OUTPUT_FOLDER.
# create_meta_files(path_to_files=OUTPUT_FOLDER, data_prefix="K_", create_test_set=True)