In [1]:
import pandas as pd
import os, sys
import shutil

currentdir = os.path.dirname(os.path.abspath("__file__"))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)

import pandas as pd
from utils.helpers.alphabetical_number import increment_alphabetical
from utils.helpers.save_trajectory import save_current_trajectory
from utils.helpers.metafile_handler import create_meta_files
from utils.helpers.file_handler import delete_old_files

from constants import K_MAX_LON, K_MAX_LAT, K_MIN_LAT, K_MIN_LON, KOLUMBUS_OUTPUT_FOLDER 

# Declaring variables and constants for this sheet

SHOULD_DELETE_OLD_FILES = True
LOG = False  # Set to true for printing/debug during data extraction

OUTPUT_FOLDER = f"../{KOLUMBUS_OUTPUT_FOLDER}"
RAW_DATA_FILE = "../dataset/kolumbus/2023-09-04.csv"

NUMBER_OF_TRACES = 3000
MAX_TIME_DIFF_SECONDS = (
    32  # Maximum time difference between two consecutive trackpoints in a trajectory
)

In [2]:
# Reading the dataset into a dataframe

raw_df = pd.read_csv(
    RAW_DATA_FILE, usecols=["serviceJourney",
                            "recordedAtTime", "longitude", "latitude"]
)

print("Total unique trips before boundaries: ", raw_df["serviceJourney"].nunique())

filtered_df = raw_df[
    (raw_df["latitude"] >= K_MIN_LAT)
    & (raw_df["latitude"] <= K_MAX_LAT)
    & (raw_df["longitude"] >= K_MIN_LON)
    & (raw_df["longitude"] <= K_MAX_LON)
]
# Deleting raw_df for performance reasons
del raw_df
print("Total unique trips after boundaries: ", filtered_df["serviceJourney"].nunique())

Total unique trips before boundaries:  5194
Total unique trips after boundaries:  4022


In [3]:
# Run this cell to clear the chosen files in the PORTO folder

if SHOULD_DELETE_OLD_FILES:
    delete_old_files(OUTPUT_FOLDER, "META")

In [4]:
# Cell to extract traces that are of a minimum length of N locations and within a given distance window
counter = 0
name_counter = "AAA"

last_driver_id = None
last_timestamp = None

trajectory = []
skipped_trajs = 0
for index, row in filtered_df.iterrows():
    current_driver_id = row["serviceJourney"]
    current_timestamp = pd.to_datetime(row["recordedAtTime"])

    lat = row["latitude"]
    lon = row["longitude"]
    current_location = [lat, lon]

    # On first run or change in driver_id:
    if (not last_timestamp) or (last_driver_id != current_driver_id):
        last_timestamp = current_timestamp
        last_driver_id = current_driver_id
        # Check if trajectory is empty
        if len(trajectory) >= 100:
            save_current_trajectory(
                OUTPUT_FOLDER=OUTPUT_FOLDER,
                file_name=name_counter,
                trajectory=trajectory,
                trajectory_file_prefix="K",
            )
            counter += 1
            name_counter = increment_alphabetical(name_counter)
        else:
            skipped_trajs += 1
        trajectory.clear()
        continue


    trajectory.append((float(lat), float(lon)))

    if counter >= NUMBER_OF_TRACES:
        break

print(f"Skipped trajectories: {skipped_trajs}")

Skipped trajectories: 11


In [6]:
create_meta_files(path_to_files=OUTPUT_FOLDER, data_prefix="K_", number=50, create_test_set=True)