# Notebook for extracting raw data from the Porto dataset


In [10]:
# Importing necessary modules
import pandas as pd
import os
import sys
import shutil
from haversine import haversine, Unit

currentdir = os.path.dirname(os.path.abspath("__file__"))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)

from utils.helpers.file_handler import delete_old_files
from utils.helpers.alphabetical_number import increment_alphabetical
from utils.helpers.save_trajectory import save_current_trajectory
from utils.helpers.metafile_handler import create_meta_files

# Helper function to compute distance between a list of coordinates (Trajectory distance)
# Haversine distance used
def calculate_trajectory_distance(positions: list[tuple[float, float]]) -> float:
    """
    Calculate the length of a trajectory as the sum of the haversine distances
    between each pair of consecutive points.

    :param positions: List of coordinates (lat, lon), in travel order

    :return: Float (km) -> Summed distance between consecutive points in km;
        0.0 when the trajectory contains fewer than two points
    """
    # Start from a float so the declared return type also holds for empty and
    # single-point trajectories, where the loop body never runs.
    distance = 0.0
    for from_location, to_location in zip(positions, positions[1:]):
        distance += haversine(from_location, to_location, unit=Unit.KILOMETERS)
    return distance

In [11]:
from constants import P_MAX_LAT, P_MAX_LON, P_MIN_LAT, P_MIN_LON, PORTO_OUTPUT_FOLDER

# Variables and constants used throughout this notebook

# When True, the cleanup cell calls delete_old_files on OUTPUT_FOLDER before extraction
SHOULD_DELETE_OLD_FILES = True

# Relative paths (resolved against the current working directory)
OUTPUT_FOLDER = f"../{PORTO_OUTPUT_FOLDER}"
RAW_DATA_FILE = "../dataset/porto/porto_raw.csv"
OUTPUT_DATA_FILE = "../dataset/porto/porto.csv"

LOG = False  # Set to true for printing during data extraction


# Minimum number of points a trace must contain to be kept
MIN_LEN = 40

# Extraction stops once this many traces have been accepted
NUMBER_OF_TRACES = 3000
# Largest allowed distance between two consecutive points of a trace
MAX_DIST_BETWEEN_COORDINATES = 0.2  # Km

# Side lengths (km) of the bounding rectangle:
# X spans P_MIN_LAT..P_MAX_LAT at the minimum longitude,
# Y spans P_MIN_LON..P_MAX_LON at the minimum latitude
X = calculate_trajectory_distance([(P_MIN_LAT, P_MIN_LON), (P_MAX_LAT, P_MIN_LON)])
Y = calculate_trajectory_distance([(P_MIN_LAT, P_MIN_LON), (P_MIN_LAT, P_MAX_LON)])

print(X, Y)

5.559754011677007 7.536733782089804


In [12]:
# Load the raw Porto CSV into a DataFrame (the whole file is read into memory at once)

raw_df = pd.read_csv(RAW_DATA_FILE)

In [13]:
# Optionally clear previously generated files from the Porto output folder
# NOTE(review): the meaning of the "META" argument is defined by delete_old_files — confirm there

if SHOULD_DELETE_OLD_FILES:
    delete_old_files(OUTPUT_FOLDER, "META")

In [14]:
# Read the raw trips, drop the unusable ones, save every accepted trajectory to
# its own file and write the accepted rows to the cleaned CSV file

cleaned_trajectories = []
counter = 0
name_counter = "AAA"

for index, row in raw_df.iterrows():
    trace_id = row["TRIP_ID"]

    # If row is missing data: ignore row
    # (explicit comparison kept on purpose: a NaN flag compares unequal to True)
    if row["MISSING_DATA"] == True:
        if LOG:
            print(trace_id, "is missing data")
        continue

    # POLYLINE is expected to look like "[[lon,lat],[lon,lat],...]": strip the
    # outer brackets and split it into one "lon,lat" string per point
    trace = row["POLYLINE"][2:-2].split("],[")

    # If trace-length less than "MIN_LEN": ignore row
    if len(trace) < MIN_LEN:
        if LOG:
            print(trace_id, "is less than preferred length")
        continue

    # Parse every point exactly once while validating it. The (lat, lon) tuples
    # collected here are what gets saved, so no second parsing pass is needed.
    trajectory = []
    for coordinate in trace:
        lon, lat = map(float, coordinate.split(","))

        # Outside bounded rectangle: reject the whole trace
        if not (P_MIN_LAT <= lat <= P_MAX_LAT and P_MIN_LON <= lon <= P_MAX_LON):
            if LOG:
                print(trace_id, "is outside bounded rectangle")
            break

        # Noisy data: reject the trace when two consecutive, distinct points are
        # further apart than MAX_DIST_BETWEEN_COORDINATES (the first point has
        # no predecessor and is never checked)
        position = (lat, lon)
        if trajectory and trajectory[-1] != position:
            distance = calculate_trajectory_distance([trajectory[-1], position])

            if distance > MAX_DIST_BETWEEN_COORDINATES:
                if LOG:
                    print("Possibly noisy data - continuing with next trace")
                break

        trajectory.append(position)

    # for/else: this branch only runs when the loop above finished without a
    # `break`, i.e. every point of the trace passed both checks
    else:
        cleaned_trajectories.append(row)
        save_current_trajectory(
            OUTPUT_FOLDER=OUTPUT_FOLDER,
            file_name=name_counter,
            trajectory=trajectory,
            trajectory_file_prefix="P",
        )

        counter += 1
        name_counter = increment_alphabetical(name_counter)

        # The inner loop has already ended, so this `break` leaves the outer
        # row loop: enough traces have been collected
        if counter >= NUMBER_OF_TRACES:
            break

cleaned_df_columns = [
    "TRIP_ID",
    "CALL_TYPE",
    "ORIGIN_CALL",
    "ORIGIN_STAND",
    "TAXI_ID",
    "TIMESTAMP",
    "DAY_TYPE",
    "MISSING_DATA",
    "POLYLINE",
]
cleaned_df = pd.DataFrame(cleaned_trajectories, columns=cleaned_df_columns)
cleaned_df.to_csv(OUTPUT_DATA_FILE, index=False)


print("Number of cleaned trajectories written to file:", counter)

Number of cleaned trajectories written to file: 3000


In [15]:
# Presumably generates the META files for the trajectories saved in OUTPUT_FOLDER.
# NOTE(review): the meaning of `number=50` and `create_test_set=True` is defined in
# utils.helpers.metafile_handler — confirm there before changing them.
create_meta_files(
    path_to_files=OUTPUT_FOLDER, data_prefix="P_", number=50, create_test_set=True
)

## Average number of points per trajectory


In [16]:
# Collect the line count of every file below OUTPUT_FOLDER whose name does not
# start with "META-", then report the mean number of lines per file
line_counts = []

for folder, _subfolders, filenames in os.walk(OUTPUT_FOLDER):
    for filename in filenames:
        if not filename.startswith("META-"):
            file_path = os.path.join(folder, filename)
            try:
                with open(file_path, "r") as handle:
                    line_counts.append(sum(1 for _line in handle))
            except Exception as error:
                # A file that cannot be read is reported and left out of the average
                print(f"Error reading {filename}: {error}")

total_lines = sum(line_counts)
file_count = len(line_counts)

# Nothing is printed when no file was counted (avoids dividing by zero)
if file_count > 0:
    print(total_lines / file_count)

57.50966666666667
