# Notebook for placing the hashed trajectories into buckets

In [34]:
# Importing nescessary modules
import os
import sys
import shutil
import timeit as ti
from tqdm import tqdm

from multiprocessing import Pool

def find_project_root(target_folder="masteroppgave"):
    """Find the absolute path of a folder by searching upward."""
    currentdir = os.path.abspath("__file__")  # Get absolute script path
    while True:
        if os.path.basename(currentdir) == target_folder:
            return currentdir  # Found the target folder
        parentdir = os.path.dirname(currentdir)
        if parentdir == currentdir:  # Stop at filesystem root
            return None
        currentdir = parentdir  # Move one level up

# Example usage
project_root = find_project_root("masteroppgave")

if project_root:
    sys.path.append(project_root)
    print(f"Project root found: {project_root}")
else:
    raise RuntimeError("Could not find 'masteroppgave' directory")

from utils.helpers.save_trajectory import save_trajectory_hashes
from utils.helpers import file_handler as fh
from utils.helpers import metafile_handler as mfh
from schemes.lsh_disk import DiskLSH

Project root found: c:\Users\eivin\dev\JoonEndreLSH\masteroppgave


# GRID

## Using CityHash

In [46]:
import cityhash
import os
from constants import NUMBER_OF_TRAJECTORIES

# Paths
ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER = "../../dataset/hashed_data/grid/rome/"
ROME_HASHED_TRAJECTORIES_FOLDER_META_FILE = f"{ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER}META-{NUMBER_OF_TRAJECTORIES}.txt"

ROME_FULL_TRAJECTORIES_OUTPUT_FOLDER = "../../dataset/rome/output/"

# Dictionary for the bucket system
bucket_system = {}

# Get filenames from the metafile
files = mfh.read_meta_file(ROME_HASHED_TRAJECTORIES_FOLDER_META_FILE)

ABU_HASH = []

# Iterate through trajectory files and read their hashes
for filename in files:
    file_path = os.path.join(ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER, filename)
    
    # Read the hashes for the trajectory
    trajectory_hashes = fh.read_hash_file(file_path)
    
    # Iterate over each layer's hash
    
    for layer_hash in trajectory_hashes:
        # Convert the list of coordinates into a string
        hash_string = "_".join(map(str, layer_hash))
        
        # Use CityHash for creating a unique key
        hash_key = cityhash.CityHash128(hash_string)
        
        # Place trajectory into the appropriate bucket
        if hash_key not in bucket_system:
            bucket_system[hash_key] = []
        bucket_system[hash_key].append(filename)
        if filename == "R_ABU.txt":
            ABU_HASH.append(hash_key)


# Print the contents of the buckets for "R_ABU.txt"
for hash_key in ABU_HASH:
    print(f"Bucket {hash_key}: {bucket_system[hash_key]}")
   


# Analyze and display results

total_buckets = len(bucket_system)
buckets_with_multiple = sum(1 for trajectories in bucket_system.values() if len(trajectories) > 1)
buckets_with_single = total_buckets - buckets_with_multiple
largest_bucket_size = max(len(trajectories) for trajectories in bucket_system.values())
largest_bucket = max(bucket_system, key=lambda key: len(bucket_system[key]))
print(f"Largest Bucket: {largest_bucket}")

print(f"Total Buckets: {total_buckets}")
print(f"Buckets with more than one trajectory: {buckets_with_multiple}")
print(f"Buckets with only one trajectory: {buckets_with_single}")
print(f"Largest Bucket Size: {largest_bucket_size}")

# Optional: Display distribution percentages
multiple_bucket_percentage = (buckets_with_multiple / total_buckets) * 100 if total_buckets > 0 else 0
single_bucket_percentage = (buckets_with_single / total_buckets) * 100 if total_buckets > 0 else 0

print(f"Percentage of buckets with more than one trajectory: {multiple_bucket_percentage:.2f}%")
print(f"Percentage of buckets with only one trajectory: {single_bucket_percentage:.2f}%")




Bucket 9708320301858696672609084931132828922: ['R_CAV.txt', 'R_DYX.txt', 'R_CDU.txt', 'R_ECN.txt', 'R_EFS.txt', 'R_DOY.txt', 'R_CFV.txt', 'R_AVK.txt', 'R_EDS.txt', 'R_COQ.txt', 'R_AKY.txt', 'R_BCU.txt', 'R_CCZ.txt', 'R_CPD.txt', 'R_AVF.txt', 'R_AVD.txt', 'R_CYW.txt', 'R_AVB.txt', 'R_BNG.txt', 'R_DVK.txt', 'R_ADV.txt', 'R_EHK.txt', 'R_AKK.txt', 'R_BRF.txt', 'R_ARU.txt', 'R_DUB.txt', 'R_CEX.txt', 'R_DJT.txt', 'R_EBK.txt', 'R_CCQ.txt', 'R_AWU.txt', 'R_DGV.txt', 'R_BTH.txt', 'R_DUV.txt', 'R_AFZ.txt', 'R_EDX.txt', 'R_CIV.txt', 'R_ABU.txt', 'R_BDC.txt', 'R_BML.txt', 'R_ECP.txt', 'R_AZS.txt', 'R_BFS.txt', 'R_BUX.txt', 'R_CCJ.txt', 'R_CRC.txt', 'R_ARC.txt', 'R_DDN.txt', 'R_DAQ.txt', 'R_CNH.txt']
Bucket 29427960669666322764846783719845337828: ['R_CAV.txt', 'R_DYX.txt', 'R_CDU.txt', 'R_ECN.txt', 'R_EFS.txt', 'R_DOY.txt', 'R_CFV.txt', 'R_AVK.txt', 'R_EDS.txt', 'R_COQ.txt', 'R_AKY.txt', 'R_BCU.txt', 'R_CCZ.txt', 'R_CPD.txt', 'R_AVF.txt', 'R_AVD.txt', 'R_CYW.txt', 'R_AVB.txt', 'R_BNG.txt', 'R_DVK.t

In [47]:
for i in range(len(ABU_HASH)):
    print(f"ABU.txt buckets: {bucket_system[ABU_HASH[i]]}")





ABU.txt buckets: ['R_CAV.txt', 'R_DYX.txt', 'R_CDU.txt', 'R_ECN.txt', 'R_EFS.txt', 'R_DOY.txt', 'R_CFV.txt', 'R_AVK.txt', 'R_EDS.txt', 'R_COQ.txt', 'R_AKY.txt', 'R_BCU.txt', 'R_CCZ.txt', 'R_CPD.txt', 'R_AVF.txt', 'R_AVD.txt', 'R_CYW.txt', 'R_AVB.txt', 'R_BNG.txt', 'R_DVK.txt', 'R_ADV.txt', 'R_EHK.txt', 'R_AKK.txt', 'R_BRF.txt', 'R_ARU.txt', 'R_DUB.txt', 'R_CEX.txt', 'R_DJT.txt', 'R_EBK.txt', 'R_CCQ.txt', 'R_AWU.txt', 'R_DGV.txt', 'R_BTH.txt', 'R_DUV.txt', 'R_AFZ.txt', 'R_EDX.txt', 'R_CIV.txt', 'R_ABU.txt', 'R_BDC.txt', 'R_BML.txt', 'R_ECP.txt', 'R_AZS.txt', 'R_BFS.txt', 'R_BUX.txt', 'R_CCJ.txt', 'R_CRC.txt', 'R_ARC.txt', 'R_DDN.txt', 'R_DAQ.txt', 'R_CNH.txt']
ABU.txt buckets: ['R_CAV.txt', 'R_DYX.txt', 'R_CDU.txt', 'R_ECN.txt', 'R_EFS.txt', 'R_DOY.txt', 'R_CFV.txt', 'R_AVK.txt', 'R_EDS.txt', 'R_COQ.txt', 'R_AKY.txt', 'R_BCU.txt', 'R_CCZ.txt', 'R_CPD.txt', 'R_AVF.txt', 'R_AVD.txt', 'R_CYW.txt', 'R_AVB.txt', 'R_BNG.txt', 'R_DVK.txt', 'R_ADV.txt', 'R_EHK.txt', 'R_AKK.txt', 'R_BRF.txt', 'R

In [48]:
import pandas as pd
from itertools import combinations

ROME_TRUE_SIMILARITY_FILE = "../../results_true/similarity_values/rome/dtw/rome-dtw-3050.csv"

# Load the similarity matrix CSV
similarity_df = pd.read_csv(ROME_TRUE_SIMILARITY_FILE, index_col=0)

# Results storage for statistics
bucket_stats = {}

# Filter buckets to process only those with more than one trajectory
filtered_buckets = {bucket: trajectories for bucket, trajectories in bucket_system.items() if len(trajectories) > 1}

# Track the bucket with the most trajectories
max_bucket_size = 0
max_bucket = None

# Process each filtered bucket
for bucket, trajectories in filtered_buckets.items():
    similarities = []
    best_pair = None
    worst_pair = None

    # Compute all pairwise similarities within the bucket
    for t1, t2 in combinations(trajectories, 2):
        # Strip `.txt` from the trajectory names
        t1_clean = t1.replace('.txt', '')
        t2_clean = t2.replace('.txt', '')

        # Ensure correct row-column order (t1_clean should be lexicographically larger)
        if t1_clean < t2_clean:
            t1_clean, t2_clean = t2_clean, t1_clean  # Swap for correct matrix access

        if t1_clean in similarity_df.index and t2_clean in similarity_df.columns:
            similarity = float(similarity_df.at[t1_clean, t2_clean])  # Convert to native Python float
            similarities.append(similarity)
            
            # Track the pair with the best similarity (lowest value)
            if best_pair is None or similarity < best_pair[0]:
                best_pair = (similarity, t1, t2)

            # Track the pair with the worst similarity (highest value)
            if worst_pair is None or similarity > worst_pair[0]:
                worst_pair = (similarity, t1, t2)

    if similarities:
        best_similarity = min(similarities)
        worst_similarity = max(similarities)
        avg_similarity = sum(similarities) / len(similarities)
        bucket_stats[bucket] = {
            "best": best_similarity,
            "worst": worst_similarity,
            "average": avg_similarity,
            "best_pair": best_pair,
            "worst_pair": worst_pair,
        }
        
        # Check if this bucket has the most trajectories
        if len(trajectories) > max_bucket_size:
            max_bucket_size = len(trajectories)
            max_bucket = bucket
    else:
        # Handle missing trajectory pairs in the CSV
        bucket_stats[bucket] = {"best": None, "worst": None, "average": None}

# Display results for the bucket with the most trajectories
if max_bucket:
    print(f"Bucket with the most trajectories: {max_bucket} ({max_bucket_size} trajectories)")
    best_similarity, best_t1, best_t2 = bucket_stats[max_bucket]["best_pair"]
    worst_similarity, worst_t1, worst_t2 = bucket_stats[max_bucket]["worst_pair"]
    print(f"Most Similar Pair (Lowest similarity value): {best_similarity} between {best_t1} and {best_t2}")
    print(f"Least Similar Pair (Highest similarity value): {worst_similarity} between {worst_t1} and {worst_t2}")
    print(f"Average Similarity inside bucket '{max_bucket}': {bucket_stats[max_bucket]['average']:.2f}")
else:
    print("No buckets with more than one trajectory found.")



Bucket with the most trajectories: 9708320301858696672609084931132828922 (50 trajectories)
Most Similar Pair (Lowest similarity value): 0.1406260090323557 between R_DUB.txt and R_ECP.txt
Least Similar Pair (Highest similarity value): 10.327691364230498 between R_AVD.txt and R_BDC.txt
Average Similarity inside bucket '9708320301858696672609084931132828922': 2.25


In [49]:
def get_similarity_between_files(file1, file2, similarity_df):
    # Clean the filenames by removing the '.txt' extension
    file1_clean = file1.replace('.txt', '')
    file2_clean = file2.replace('.txt', '')

    # Ensure correct row-column order (file1_clean should be lexicographically larger)
    if file1_clean < file2_clean:
        file1_clean, file2_clean = file2_clean, file1_clean  # Swap for correct matrix access

    # Check if both files are in the DataFrame
    if file1_clean in similarity_df.index and file2_clean in similarity_df.columns:
        similarity_value = float(similarity_df.at[file1_clean, file2_clean])  # Get the similarity value
        return similarity_value
    else:
        # If files are not in the matrix, return None
        return None


get_similarity_between_files("R_DUB.txt", "R_ECP.txt", similarity_df)

0.1406260090323557

# DISK

## Using CityHash

In [65]:
import cityhash
import os
from constants import NUMBER_OF_TRAJECTORIES

# Paths
ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER = "../../dataset/hashed_data/disk/rome/"
ROME_HASHED_TRAJECTORIES_FOLDER_META_FILE = f"{ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER}META-{NUMBER_OF_TRAJECTORIES}.txt"

ROME_FULL_TRAJECTORIES_OUTPUT_FOLDER = "../../dataset/rome/output/"

# Dictionary for the bucket system
bucket_system = {}

# Get filenames from the metafile
files = mfh.read_meta_file(ROME_HASHED_TRAJECTORIES_FOLDER_META_FILE)

ABU_HASH = []

# Iterate through trajectory files and read their hashes
for filename in files:
    file_path = os.path.join(ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER, filename)
    
    # Read the hashes for the trajectory
    trajectory_hashes = fh.read_hash_file(file_path)
    
    # Iterate over each layer's hash
    
    for layer_hash in trajectory_hashes:

        # Convert the list of coordinates into a string
        hash_string = "_".join(map(str, layer_hash))
        print(filename, hash_string)
        # Use CityHash for creating a unique key
        hash_key = cityhash.CityHash128(hash_string)
        if hash_key == "":
            print(filename, hash_string)
        
        # Place trajectory into the appropriate bucket
        if hash_key not in bucket_system:
            bucket_system[hash_key] = []
        bucket_system[hash_key].append(filename)
        if filename == "R_CAV.txt":
            ABU_HASH.append(hash_key)


# Print the contents of the buckets for "R_ABU.txt"
for hash_key in ABU_HASH:
    print(f"Bucket {hash_key}: {bucket_system[hash_key]}")
   


# Analyze and display results

total_buckets = len(bucket_system)
buckets_with_multiple = sum(1 for trajectories in bucket_system.values() if len(trajectories) > 1)
buckets_with_single = total_buckets - buckets_with_multiple
largest_bucket_size = max(len(trajectories) for trajectories in bucket_system.values())
largest_bucket = max(bucket_system, key=lambda key: len(bucket_system[key]))
print(f"Largest Bucket: {largest_bucket}")

print(f"Total Buckets: {total_buckets}")
print(f"Buckets with more than one trajectory: {buckets_with_multiple}")
print(f"Buckets with only one trajectory: {buckets_with_single}")
print(f"Largest Bucket Size: {largest_bucket_size}")

# Optional: Display distribution percentages
multiple_bucket_percentage = (buckets_with_multiple / total_buckets) * 100 if total_buckets > 0 else 0
single_bucket_percentage = (buckets_with_single / total_buckets) * 100 if total_buckets > 0 else 0

print(f"Percentage of buckets with more than one trajectory: {multiple_bucket_percentage:.2f}%")
print(f"Percentage of buckets with only one trajectory: {single_bucket_percentage:.2f}%")




R_CAV.txt AO_AH_AC
R_CAV.txt 
R_CAV.txt 
R_CAV.txt AK_AL_AO
R_DYX.txt 
R_DYX.txt AC_AC_AA_AA
R_DYX.txt AD_AD_AD_AD_AR
R_DYX.txt 
R_CDU.txt AG_AB
R_CDU.txt AC_AC_AC_AC_AM_AB
R_CDU.txt AD_AD_AL
R_CDU.txt 
R_ECN.txt AN_AN
R_ECN.txt AN
R_ECN.txt AG_AF_AK_AD_AR
R_ECN.txt 
R_EFS.txt 
R_EFS.txt AC_AH
R_EFS.txt AD_AR
R_EFS.txt AJ_AB
R_DOY.txt AG_AJ
R_DOY.txt AH_AC_AA_AA
R_DOY.txt AD_AI
R_DOY.txt AR
R_CFV.txt 
R_CFV.txt 
R_CFV.txt AD_AA_AD_AF
R_CFV.txt AJ
R_AVK.txt AG_AB_AB_AH_AC
R_AVK.txt AA_AC_AH
R_AVK.txt 
R_AVK.txt AL
R_EDS.txt AH_AC_AD_AJ_AG
R_EDS.txt AI_AA
R_EDS.txt 
R_EDS.txt AC_AL_AO_AR_AR
R_COQ.txt AB_AG
R_COQ.txt AC_AC_AH_AA
R_COQ.txt AI
R_COQ.txt AR
R_AKY.txt AG
R_AKY.txt AA
R_AKY.txt AR_AD_AR
R_AKY.txt 
R_BCU.txt 
R_BCU.txt AA
R_BCU.txt AF_AD_AR_AR
R_BCU.txt AJ
R_CCZ.txt AE
R_CCZ.txt 
R_CCZ.txt AF_AD_AR
R_CCZ.txt AJ
R_CPD.txt AD_AJ_AB_AG_AD_AJ_AB
R_CPD.txt AH_AM
R_CPD.txt 
R_CPD.txt AL_AO_AR_AO_AL_AK
R_AVF.txt AB_AG
R_AVF.txt AC_AC_AA
R_AVF.txt AD_AI
R_AVF.txt AR_AQ
R_AVD.txt AB
R_A

In [63]:
for i in range(len(ABU_HASH)):
    print(f"ABU.txt buckets: {bucket_system[ABU_HASH[i]]}")



ABU.txt buckets: ['R_CAV.txt', 'R_CCQ.txt']
ABU.txt buckets: ['R_CAV.txt', 'R_CAV.txt', 'R_DYX.txt', 'R_DYX.txt', 'R_CDU.txt', 'R_ECN.txt', 'R_EFS.txt', 'R_CFV.txt', 'R_CFV.txt', 'R_AVK.txt', 'R_EDS.txt', 'R_AKY.txt', 'R_BCU.txt', 'R_CCZ.txt', 'R_CPD.txt', 'R_AVD.txt', 'R_CYW.txt', 'R_BNG.txt', 'R_DVK.txt', 'R_ADV.txt', 'R_EHK.txt', 'R_BRF.txt', 'R_ARU.txt', 'R_CEX.txt', 'R_DJT.txt', 'R_EBK.txt', 'R_CCQ.txt', 'R_AWU.txt', 'R_DGV.txt', 'R_DGV.txt', 'R_BTH.txt', 'R_BTH.txt', 'R_DUV.txt', 'R_AFZ.txt', 'R_BDC.txt', 'R_BDC.txt', 'R_BUX.txt', 'R_CCJ.txt', 'R_DAQ.txt', 'R_DAQ.txt', 'R_DAQ.txt', 'R_CNH.txt', 'R_CNH.txt']
ABU.txt buckets: ['R_CAV.txt', 'R_CAV.txt', 'R_DYX.txt', 'R_DYX.txt', 'R_CDU.txt', 'R_ECN.txt', 'R_EFS.txt', 'R_CFV.txt', 'R_CFV.txt', 'R_AVK.txt', 'R_EDS.txt', 'R_AKY.txt', 'R_BCU.txt', 'R_CCZ.txt', 'R_CPD.txt', 'R_AVD.txt', 'R_CYW.txt', 'R_BNG.txt', 'R_DVK.txt', 'R_ADV.txt', 'R_EHK.txt', 'R_BRF.txt', 'R_ARU.txt', 'R_CEX.txt', 'R_DJT.txt', 'R_EBK.txt', 'R_CCQ.txt', 'R_AWU.txt

In [64]:
import pandas as pd
from itertools import combinations

ROME_TRUE_SIMILARITY_FILE = "../../results_true/similarity_values/rome/dtw/rome-dtw-3050.csv"

# Load the similarity matrix CSV
similarity_df = pd.read_csv(ROME_TRUE_SIMILARITY_FILE, index_col=0)

# Results storage for statistics
bucket_stats = {}

# Filter buckets to process only those with more than one trajectory
filtered_buckets = {bucket: trajectories for bucket, trajectories in bucket_system.items() if len(trajectories) > 1}

# Track the bucket with the most trajectories
max_bucket_size = 0
max_bucket = None

# Process each filtered bucket
for bucket, trajectories in filtered_buckets.items():
    similarities = []
    best_pair = None
    worst_pair = None

    # Compute all pairwise similarities within the bucket
    for t1, t2 in combinations(trajectories, 2):
        # Strip `.txt` from the trajectory names
        t1_clean = t1.replace('.txt', '')
        t2_clean = t2.replace('.txt', '')

        # Ensure correct row-column order (t1_clean should be lexicographically larger)
        if t1_clean < t2_clean:
            t1_clean, t2_clean = t2_clean, t1_clean  # Swap for correct matrix access

        if t1_clean in similarity_df.index and t2_clean in similarity_df.columns:
            similarity = float(similarity_df.at[t1_clean, t2_clean])  # Convert to native Python float
            similarities.append(similarity)
            
            # Track the pair with the best similarity (lowest value)
            if best_pair is None or similarity < best_pair[0]:
                best_pair = (similarity, t1, t2)

            # Track the pair with the worst similarity (highest value)
            if worst_pair is None or similarity > worst_pair[0]:
                worst_pair = (similarity, t1, t2)

    if similarities:
        best_similarity = min(similarities)
        worst_similarity = max(similarities)
        avg_similarity = sum(similarities) / len(similarities)
        bucket_stats[bucket] = {
            "best": best_similarity,
            "worst": worst_similarity,
            "average": avg_similarity,
            "best_pair": best_pair,
            "worst_pair": worst_pair,
        }
        
        # Check if this bucket has the most trajectories
        if len(trajectories) > max_bucket_size:
            max_bucket_size = len(trajectories)
            max_bucket = bucket
    else:
        # Handle missing trajectory pairs in the CSV
        bucket_stats[bucket] = {"best": None, "worst": None, "average": None}

# Display results for the bucket with the most trajectories
if max_bucket:
    print(f"Bucket with the most trajectories: {max_bucket} ({max_bucket_size} trajectories)")
    best_similarity, best_t1, best_t2 = bucket_stats[max_bucket]["best_pair"]
    worst_similarity, worst_t1, worst_t2 = bucket_stats[max_bucket]["worst_pair"]
    print(f"Most Similar Pair (Lowest similarity value): {best_similarity} between {best_t1} and {best_t2}")
    print(f"Least Similar Pair (Highest similarity value): {worst_similarity} between {worst_t1} and {worst_t2}")
    print(f"Average Similarity inside bucket '{max_bucket}': {bucket_stats[max_bucket]['average']:.2f}")
else:
    print("No buckets with more than one trajectory found.")



Bucket with the most trajectories: 82332263323914296566372529678324145705 (43 trajectories)
Most Similar Pair (Lowest similarity value): 0.0 between R_CAV.txt and R_CAV.txt
Least Similar Pair (Highest similarity value): 10.327691364230498 between R_AVD.txt and R_BDC.txt
Average Similarity inside bucket '82332263323914296566372529678324145705': 2.55
