# Sheet for extracting raw-data from Porto dataset

In [1]:
# Importing necessary modules

import pandas as pd
import os, shutil
from haversine import haversine, Unit
# Helper function to compute distance between a list of coordinates (Trajectory distance)
# Haversine distance used

def calculate_trajectory_distance(positions: list[tuple[float]]) -> float:
    """
    Calculate the trajectory distance for a trajectory

    :param: List of coordinates (lat, lon)
    
    :return: Float (km) -> Combined distance between all pairs of points in km
    """
    distance = 0
    for i in range(1, len(positions)):
        from_location = positions[i-1]
        to_location = positions[i]

        distance += haversine(from_location, to_location, unit=Unit.KILOMETERS)
    return distance



In [2]:
# Containning variables and constants for this spreadsheet
SHOULD_DELETE_OLD_FILES= False

OUTPUT_FOLDER = "../dataset/porto"
RAW_DATA_FILE = "../dataset/porto/porto_raw.csv"
CLEAN_DATA_FILE = "../dataset/porto/porto.csv"

LOG = False # Set to true for printing during data extraction

MAX_LON = -8.57
MIN_LON = -8.66
MAX_LAT = 41.19
MIN_LAT = 41.14
MIN_LEN = 40

NUMBER_OF_TRACES = 1000
MAX_DIST_BETWEEN_COORDINATES = 0.2 # Km

X = calculate_trajectory_distance([ ( MIN_LAT, MIN_LON ) , ( MAX_LAT, MIN_LON ) ])
Y = calculate_trajectory_distance([ ( MIN_LAT, MIN_LON ) , ( MIN_LAT, MAX_LON ) ])


In [3]:
# Reading the dataset into dataframe

raw_df = pd.read_csv(RAW_DATA_FILE)

In [4]:
# Run this cell to clear the chosen files in the PORTO folder

if SHOULD_DELETE_OLD_FILES:
    for filename in os.listdir(OUTPUT_FOLDER):
        file_path = os.path.join(OUTPUT_FOLDER, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print("Failed to remove %s. Reason: %s" % (file_path, e))

FileNotFoundError: [Errno 2] No such file or directory: '../dataset/porto/output/'

In [None]:
#Read the data, clean it and insert to cleaned csv file

cleaned_trajectories = []
counter = 0
for index, row in raw_df.iterrows():
    trace_id = row["TRIP_ID"] 

    # If row is missing data: ignore row
    if row["MISSING_DATA"] == True: 
        if LOG: print(trace_id, "is missing data") 
        continue

    trace = row["POLYLINE"][2:-2].split("],[")

    # If trace-length less than "MIN_LEN": ignore row
    if len(trace) < MIN_LEN: 
        if LOG:  print(trace_id, "is less than preferred length")
        continue


    # If trace are outside bounded rectangle or noisy data: ignore row
    
    last_coordinate = trace[0]
    for coordinate in trace:

        lon, lat = list(map(float, coordinate.split(",")))

        # Outside bounded rectangle
        if ( not ( MIN_LAT <= lat <= MAX_LAT )) or ( not ( MIN_LON <= lon <= MAX_LON )):
            if LOG: print(trace_id, "is outside bounded rectangle")
            break

        # Traces with noisy data:
        if last_coordinate != coordinate:
            last_lon, last_lat = list(map(float, last_coordinate.split(",")))
            distance = calculate_trajectory_distance([(last_lat, last_lon), (lat, lon)])
            
            # If distance between two consecutive coordinates are too great:
            if distance > MAX_DIST_BETWEEN_COORDINATES:
                if LOG: print("Possibly noisy data - continuing with next trace")
                break 
            


        last_coordinate = coordinate
        # IMPLEMENT HERE, use distance function 

    # Else, everything is good so far --> Write trajectory to file
    else:
        cleaned_trajectories.append(row)
        counter += 1
        if counter >= NUMBER_OF_TRACES: 
            break

cleaned_df_columns = ["TRIP_ID","CALL_TYPE","ORIGIN_CALL","ORIGIN_STAND","TAXI_ID","TIMESTAMP","DAY_TYPE","MISSING_DATA","POLYLINE"]
cleaned_df = pd.DataFrame(cleaned_trajectories, columns=cleaned_df_columns)
cleaned_df.to_csv(CLEAN_DATA_FILE, index=False)


print("Number of cleaned trajectories written to file:", counter)

Number of cleaned trajectories written to file: 1000
