Itererer gjennom listen med datapunkter og oppretter trajectories.

Det opprettes en trajectory, der alle påfølgende punkter legges inn, så lenge det er samme sjåfør, innenfor et tidsvindu og innenfor et bounding rectangle.

Et trajectory blir lagret dersom det er minst MIN_LEN punkter langt, er innenfor MIN_DIST og MAX_DIST, samt at avstanden mellom start og sluttpunkt ikke er mer enn 2.5 ganger sporets reelle lengde.

Sporene blir konstruert slik at siste punkt i et spor, kan ha sitt neste (naturlige) punkt i det neste genererte sporet, såfremt det tilfreddsstiller de andre kravene under generering. Dette bør ikke bli et problem, men er greit å opplyse om i oppgaven. De genererte sporene fra Roma-settet vil trolig ha mer komplisert data enn Porto, da det ikke er gitt om de genererte sporene er faktiske taxiturer, deler av taxiturer, snirkling - det kan være hva som helst, og mest sannsynlig en blanding av flere kategorier av kjøretyper.


In [1]:

from haversine import haversine, Unit
import shutil
from datetime import datetime, timedelta
import numpy as np
import re
import sys
import os
import pandas as pd

currentdir = os.path.dirname(os.path.abspath("__file__"))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)
from utils.helpers.metafile_handler import create_meta_files
from utils.helpers.save_trajectory import save_current_trajectory
from utils.helpers.alphabetical_number import increment_alphabetical
from utils.helpers.file_handler import delete_old_files

def calculate_trajectory_distance(positions: list[tuple[float]]) -> float:
    """
    Calculate the trajectory distance for a trajectory

    :param: List of coordinates (lat, lon)

    :return: Float (km) -> Combined distance between all pairs of points in km
    """
    distance = 0
    for i in range(1, len(positions)):
        from_location = positions[i - 1]
        to_location = positions[i]

        distance += haversine(from_location, to_location, unit=Unit.KILOMETERS)
    return distance

In [2]:
from constants import ROME_OUTPUT_FOLDER, R_MAX_LAT, R_MAX_LON, R_MIN_LAT, R_MIN_LON, R_MIN_LON
# Declaring variables and constants for this sheet

SHOULD_DELETE_OLD_FILES = True

OUTPUT_FOLDER = f"../{ROME_OUTPUT_FOLDER}"
RAW_DATA_FILE = "../dataset/rome/rome_raw.txt"
OUTPUT_DATA_FILE = "../dataset/rome/rome.csv"


LOG = False  # Set to true for printing/debug during data extraction

MIN_LEN = 40  # Perhaps change this one
MIN_DIST = 4  # In km
MAX_DIST = 6  # In km

MAX_TIME_DIFF_SECONDS = (
    32  # Maximum time difference between two consecutive trackpoints in a trajectory
)
MAX_DIST_BETWEEN_LOCATIONS = 0.4  # Km

NUMBER_OF_TRACES = 100000

X = calculate_trajectory_distance([(R_MIN_LAT, R_MIN_LON), (R_MAX_LAT, R_MIN_LON)])
Y = calculate_trajectory_distance([(R_MIN_LAT, R_MIN_LON), (R_MIN_LAT, R_MAX_LON)])

In [12]:
# Reading the dataset into a dataframe

raw_df = pd.read_csv(
    RAW_DATA_FILE, delimiter=";", names=["index", "timestamp", "location"]
)

print(raw_df.info)

<bound method DataFrame.info of           index                      timestamp  \
0           156  2014-02-01 00:00:00.739166+01   
1           187  2014-02-01 00:00:01.148457+01   
2           297  2014-02-01 00:00:01.220066+01   
3            89  2014-02-01 00:00:01.470854+01   
4            79  2014-02-01 00:00:01.631136+01   
...         ...                            ...   
21817846    220  2014-03-02 23:59:58.282599+01   
21817847    324  2014-03-02 23:59:58.460481+01   
21817848    266  2014-03-02 23:59:58.468372+01   
21817849    360  2014-03-02 23:59:58.886768+01   
21817850    113  2014-03-02 23:59:58.943143+01   

                                          location  
0         POINT(41.8836718276551 12.4877775603346)  
1         POINT(41.9285433333333 12.4690366666667)  
2         POINT(41.8910686119733 12.4927045625339)  
3         POINT(41.7931766914244 12.4321219603157)  
4                   POINT(41.90027472 12.46274618)  
...                                            ..

In [4]:
# Must sort the values according to their index and timestamp
# Also copying the dataframe as we will touch the values in the "location" column
# Will probably take some time

df = raw_df.head(1500000).sort_values(by=["index", "timestamp"]).copy()

# Deleting raw_df for performance reasons
del raw_df

In [5]:
# Cell that translates the location column of the dataframe to only the locations
# Will probably take some time to finish

df["location"] = df["location"].apply(
    lambda x: re.search(r"\(.*?\)", x).group(0)[1:-1].replace(" ", ",")
)

In [6]:
# Run this cell to clear the chosen files in the PORTO folder

if SHOULD_DELETE_OLD_FILES:
   delete_old_files(OUTPUT_FOLDER, "META")

In [7]:
# Cell to extract traces that are of a minimum length of N locations and within a given distance window
counter = 0
name_counter = "AAA"

last_driver_id = None
last_timestamp = None
last_location = None
trajectory = []

cleaned_trajectories = []
for index, row in df.iterrows():
    current_driver_id = row["index"]
    current_timestamp = row["timestamp"]

    current_location = row["location"].split(",")
    lat, lon = current_location

    # On first run or change in driver_id:
    if (not last_timestamp) or (last_driver_id != current_driver_id):
        last_timestamp = current_timestamp
        last_location = current_location
        last_driver_id = current_driver_id
        trajectory.clear()
        continue

    # Location outside bounded rectangle, go to next row
    if (not (R_MIN_LON <= float(lon) <= R_MAX_LON)) or (
        not (R_MIN_LAT <= float(lat) <= R_MAX_LAT)
    ):
        if LOG:
            print("Location outside bounded rectangle: ", lat, lon)
        continue

    try:
        # Checking whether time between two consecutive points are greater than a given const
        t_last = datetime.strptime(
            last_timestamp[0 : last_timestamp.index(".")], "%Y-%m-%d %H:%M:%S"
        )
        t_current = datetime.strptime(
            current_timestamp[0 : current_timestamp.index(".")], "%Y-%m-%d %H:%M:%S"
        )

    except:
        last_timestamp = current_timestamp
        last_location = current_location
        last_driver_id = current_driver_id
        trajectory.clear()
        continue

    # If timedelta small enough
    if t_current - t_last <= timedelta(seconds=MAX_TIME_DIFF_SECONDS):
        if LOG:
            print("Timedelta is small enough")

        trajectory.append((float(lat), float(lon)))
    if MIN_LEN <= len(trajectory):
        trajectory_distance = calculate_trajectory_distance(trajectory)
        shortest_distance = calculate_trajectory_distance(
            [trajectory[0], trajectory[-1]]
        )

        if (
            MIN_DIST <= trajectory_distance <= MAX_DIST
            and trajectory_distance < shortest_distance * 2.5
        ):
            # TODO Ensure that all points in trajectory are valid

            old_loc = trajectory[0]
            for loc in trajectory:
                if old_loc != loc:
                    distance = calculate_trajectory_distance([(old_loc), (loc)])

                    # If distance between two consecutive coordinates are too great:
                    if distance > MAX_DIST_BETWEEN_LOCATIONS:
                        if LOG:
                            print("Possibly noisy data - continuing with next trace")
                        break

                old_loc = loc

            else:
                # Saving trajectory and updating variables
                # print("------------------------------------")
                # print(trajectory)
                # coordinates_lists = [[lat, lon] for lat, lon in trajectory]
                cleaned_trajectories.append(
                    [current_driver_id, current_timestamp, [float(lat), float(lon)]]
                )
                # if LOG:
                #     print(
                #         f"Total distance: {trajectory_distance,
                #                            shortest_distance, len(trajectory)}"
                #     )
                save_current_trajectory(
                    OUTPUT_FOLDER=OUTPUT_FOLDER,
                    file_name=name_counter,
                    trajectory=trajectory,
                    trajectory_file_prefix="R",
                )
                # if LOG:

                #     print(
                #         f"Total distance: {
                #             name_counter, trajectory_distance, shortest_distance, len(trajectory)}"
                #     )

                name_counter = increment_alphabetical(name_counter)

                counter += 1

            trajectory.clear()

        if trajectory_distance > MAX_DIST:
            trajectory.clear()

    # If enough trajetories are generated:
    if counter >= NUMBER_OF_TRACES:
        break

    last_timestamp = current_timestamp
    # last_driver_id = last_driver_id
    # last_location = last_location
print(cleaned_trajectories[0:10])
cleaned_df = pd.DataFrame(
    cleaned_trajectories, columns=["index", "timestamp", "location"]
)
cleaned_df.to_csv(OUTPUT_DATA_FILE, index=False)

print(f"Created {counter} trajectories")

[[2, '2014-02-01 00:14:10.132878+01', [41.9033913036255, 12.4876087503095]], [2, '2014-02-01 01:14:53.440003+01', [41.8999142436576, 12.4441906822934]], [2, '2014-02-01 01:52:19.7449+01', [41.8959594244013, 12.4496450994481]], [2, '2014-02-01 02:58:57.352827+01', [41.8824816421777, 12.454844364531]], [2, '2014-02-01 04:42:58.23288+01', [41.9077406913423, 12.4910772764437]], [2, '2014-02-01 21:42:43.685802+01', [41.9053566960219, 12.4493586509094]], [2, '2014-02-01 21:58:53.645706+01', [41.9031474155762, 12.4867018941079]], [2, '2014-02-01 23:40:25.264024+01', [41.9127874299986, 12.4929061846492]], [2, '2014-02-02 04:58:28.729974+01', [41.9126299945405, 12.4845480461226]], [2, '2014-02-02 05:19:42.090741+01', [41.9129688470329, 12.4502509292558]]]
Created 3059 trajectories


In [8]:
create_meta_files(path_to_files=OUTPUT_FOLDER,
                  data_prefix="R_", number=50, create_test_set=True)

## Average number of points per trajectory

In [10]:
total_lines = 0
file_count = 0

for root, dirs, files in os.walk(OUTPUT_FOLDER):
    for file in files:
        if file.startswith("META-"):
            continue
        try:
            with open(os.path.join(root, file), 'r') as f:
                total_lines += sum(1 for line in f)
                file_count += 1
        except Exception as e:
            print(f"Error reading {file}: {e}")

if file_count > 0:
    print(total_lines / file_count)
    
print(total_lines)
print(file_count)

111.0228832951945
339619
3059
