# Notebook for computing hashes, buckets and similarity values for the disk scheme. 

Uses the disk hashing scheme on the Rome and Porto trajectory datasets.

Incorporates:
* Hashing of trajectories using disk scheme
* Bucketing of hashes made from disk scheme
* Similarity computation between trajectories that share a bucket
    * For both DTW and Fréchet distance

Produces:
* JSON file containing buckets
* Similarity values for trajectories within buckets


In [1]:
import os
import sys

def find_project_root(target_folder="masteroppgave", start=None):
    """Find the absolute path of a folder by searching upward.

    Args:
        target_folder (str): Name of the directory to look for.
        start (str | None): Directory to start the search from. Defaults to
            the current working directory.

    Returns:
        str | None: Absolute path of the closest directory named
        ``target_folder`` (the start directory itself or one of its
        ancestors), or None if the filesystem root is reached first.
    """
    # `__file__` is not defined inside a notebook, so the search is anchored
    # explicitly at the working directory instead of at the literal string
    # "__file__" resolved against it.
    currentdir = os.path.abspath(os.getcwd() if start is None else start)
    while True:
        if os.path.basename(currentdir) == target_folder:
            return currentdir  # Found the target folder
        parentdir = os.path.dirname(currentdir)
        if parentdir == currentdir:  # dirname() is a fixed point at the root
            return None
        currentdir = parentdir  # Move one level up

# Locate the repository root so that project packages can be imported below.
project_root = find_project_root("masteroppgave")

# Fail fast when the notebook is started from outside the repository tree.
if not project_root:
    raise RuntimeError("Could not find 'masteroppgave' directory")

sys.path.append(project_root)
print(f"Project root found: {project_root}")


Project root found: c:\Users\eivin\dev\JoonEndreLSH\masteroppgave


# Rome


In [None]:
#write to file

### DTW


In [2]:
from computation.disk_similarity import generate_disk_hash_similarity_with_bucketing


# Hash the Rome trajectories with the disk scheme, bucket them, and compute
# DTW similarities for trajectories that share a bucket.
measure = "dtw"
similarities, bucket_system = generate_disk_hash_similarity_with_bucketing(
    city="rome", diameter=0.6, layers=5, disks=50, measure=measure, size=50
)

# # print all elements in bucket_system
# for key, value in bucket_system.items():
#     print(key, value)

print(similarities)

output_path = "../../../results_hashed/similarity_values/disk/rome/dtw/"
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
os.makedirs(output_path, exist_ok=True)

similarities.to_csv(os.path.join(output_path, "disk_rome-dtw.csv"))

Project root found: c:\Users\eivin\dev\JoonEndreLSH\masteroppgave
          R_ABU     R_ADV  R_AFZ     R_AKK  R_AKY    R_ARC    R_ARU     R_AVB  \
R_ABU  0.000000  0.000000    0.0  0.000000    0.0  0.00000  0.12624  0.000000   
R_ADV  0.000000  0.000000    0.0  0.128451    0.0  0.00000  0.00000  0.181262   
R_AFZ  0.000000  0.000000    0.0  0.000000    0.0  0.00000  0.00000  0.000000   
R_AKK  0.000000  0.128451    0.0  0.000000    0.0  0.00000  0.00000  0.090051   
R_AKY  0.000000  0.000000    0.0  0.000000    0.0  0.00000  0.00000  0.000000   
R_ARC  0.000000  0.000000    0.0  0.000000    0.0  0.00000  0.00000  0.000000   
R_ARU  0.126240  0.000000    0.0  0.000000    0.0  0.00000  0.00000  0.000000   
R_AVB  0.000000  0.181262    0.0  0.090051    0.0  0.00000  0.00000  0.000000   
R_AVD  0.000000  0.000000    0.0  0.000000    0.0  0.00000  0.00000  0.000000   
R_AVF  0.000000  0.000000    0.0  0.000000    0.0  0.00000  0.00000  0.000000   
R_AVK  0.000000  0.000000    0.0  0.104621 

In [3]:
import pandas as pd

ROME_TRUE_SIMILARITY_FILE = "../../../results_true/similarity_values/rome/dtw/rome-dtw-3050.csv"

def get_true_similarity(filename1: str, filename2: str) -> float | None:
    """
    Find the true similarity between two trajectory filenames using a similarity matrix file.

    Args:
        filename1 (str): First trajectory file (with or without `.txt`).
        filename2 (str): Second trajectory file (with or without `.txt`).

    Returns:
        float | None: The similarity value if found, otherwise None.
    """
    # Load the similarity matrix CSV
    similarity_df = pd.read_csv(ROME_TRUE_SIMILARITY_FILE, index_col=0)
    
    # Clean file names by removing '.txt'
    t1_clean = filename1.replace('.txt', '')
    t2_clean = filename2.replace('.txt', '')

    # Ensure correct row-column order for the matrix
    if t1_clean < t2_clean:
        t1_clean, t2_clean = t2_clean, t1_clean

    # Check if both are in the DataFrame
    if t1_clean in similarity_df.index and t2_clean in similarity_df.columns:
        print(f"Accessing row {t1_clean} and column {t2_clean}")
        return float(similarity_df.loc[t1_clean, t2_clean])
    elif t2_clean in similarity_df.index and t1_clean in similarity_df.columns:
        print(f"Accessing row {t2_clean} and column {t1_clean}")
        return float(similarity_df.loc[t2_clean, t1_clean])
    else:
        print(f"Missing pair in similarity matrix: {filename1}, {filename2}")
        return None
    

# Spot check: look up a single pair. As the last expression in the cell, the
# returned value is displayed as the cell output.
get_true_similarity("R_CAV", "R_DVK")


Accessing row R_DVK and column R_CAV


0.4753952612424382

### Frechet


In [None]:
# Frechet similarities for Rome trajectories hashed with the disk scheme.
measure = "frechet"
# `generate_disk_hash_similarity` is not imported by any visible cell, so the
# original call raised NameError on a fresh kernel. Use the bucketing variant
# imported for the Rome DTW run; it returns (similarities, bucket_system).
# NOTE(review): confirm the bucketed variant is the intended one here.
similarities, bucket_system = generate_disk_hash_similarity_with_bucketing(
    city="rome", diameter=1.6, layers=5, disks=100, measure=measure, size=500
)
output_path = f"similarity_values/disk/rome/disk_rome-{measure}.csv"
# Writing is intentionally left disabled, as in the original cell.
# similarities.to_csv(os.path.abspath(output_path))

# Porto


In [None]:
# DTW similarities for Porto trajectories hashed with the disk scheme.
measure = "dtw"
# `generate_disk_hash_similarity` is not imported by any visible cell, so the
# original call raised NameError on a fresh kernel. Use the bucketing variant
# imported for the Rome DTW run; it returns (similarities, bucket_system).
# NOTE(review): confirm the bucketed variant is the intended one here.
similarities, bucket_system = generate_disk_hash_similarity_with_bucketing(
    city="porto", diameter=2.2, layers=4, disks=60, measure=measure, size=50
)
output_path = f"similarity_values/disk/porto/disk_porto-{measure}.csv"
output_file = os.path.abspath(output_path)
# to_csv does not create missing parent directories, so create them first.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
similarities.to_csv(output_file)