# Notebook for placing the hashed trajectories into buckets

In [21]:
# Importing necessary modules
import os
import sys
import shutil
import timeit as ti
from tqdm import tqdm

from multiprocessing import Pool

def find_project_root(target_folder="masteroppgave", start_dir=None):
    """Find the absolute path of an enclosing folder by searching upward.

    Args:
        target_folder: Name (basename) of the directory to look for.
        start_dir: Directory to start the search from. Defaults to the
            current working directory, because ``__file__`` is not defined
            inside a notebook kernel.

    Returns:
        The absolute path of the nearest directory (``start_dir`` itself or
        one of its ancestors) whose basename equals ``target_folder``, or
        ``None`` if the filesystem root is reached without a match.
    """
    # Start explicitly from a real directory instead of a fake "__file__" path
    currentdir = os.path.abspath(os.getcwd() if start_dir is None else start_dir)
    while True:
        if os.path.basename(currentdir) == target_folder:
            return currentdir  # Found the target folder
        parentdir = os.path.dirname(currentdir)
        if parentdir == currentdir:  # dirname() is a fixed point at the filesystem root
            return None
        currentdir = parentdir  # Move one level up

# Locate the repository root and make its packages importable
project_root = find_project_root("masteroppgave")

# Fail fast: the project-local imports below cannot work without the root
if not project_root:
    raise RuntimeError("Could not find 'masteroppgave' directory")

sys.path.append(project_root)
print(f"Project root found: {project_root}")

from utils.helpers.save_trajectory import save_trajectory_hashes
from utils.helpers import file_handler as fh
from utils.helpers import metafile_handler as mfh
from schemes.lsh_disk import DiskLSH

Project root found: c:\Users\eivin\dev\JoonEndreLSH\masteroppgave


In [24]:

import json
import os
from constants import NUMBER_OF_TRAJECTORIES

# Locations of the hashed Rome trajectories and of their metafile
ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER = "../../dataset/hashed_data/grid/rome/"

ROME_HASHED_TRAJECTORIES_FOLDER_META_FILE = f"{ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER}META-{NUMBER_OF_TRAJECTORIES}.txt"


# Bucket system: maps a layer hash (tuple of floats) to the list of
# trajectory filenames that produced that hash
bucket_system = {}

# Filenames listed in the metafile
files = mfh.read_meta_file(ROME_HASHED_TRAJECTORIES_FOLDER_META_FILE)

# Walk over every trajectory file and bucket it once per layer hash
for filename in files:
    file_path = os.path.join(ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER, filename)
    trajectory_hashes = fh.read_hash_file(file_path)

    for layer_hash in trajectory_hashes:
        # Each coordinate is a string that may still carry '[' or ']';
        # strip those, convert to float, and freeze into a hashable tuple
        hash_key = tuple(float(coord.strip('[]')) for coord in layer_hash)

        # Create the bucket on first sight, then register the trajectory
        bucket_system.setdefault(hash_key, []).append(filename)


# Display the result
# for bucket, trajectories in bucket_system.items():
#     print(f"Bucket {bucket}: {trajectories}")


# Becomes True as soon as one bucket holds more than one entry
found_multiple = False

# Report every bucket whose list has more than one entry
for bucket, trajectories in bucket_system.items():
    if len(trajectories) <= 1:
        continue
    print(f"Bucket {bucket} contains more than one trajectory: {trajectories}")
    found_multiple = True

# Nothing was reported above: say so explicitly
if found_multiple is False:
    print("No buckets contain more than one trajectory.")

Bucket (41.88235006936425, 12.479330961679777, 41.88235006936425, 12.443135232662291, 41.88235006936425, 12.479330961679777) contains more than one trajectory: ['R_CAV.txt', 'R_ADV.txt', 'R_BXO.txt', 'R_DBJ.txt', 'R_END.txt', 'R_DCA.txt', 'R_CKC.txt', 'R_BDJ.txt', 'R_BIS.txt', 'R_AEO.txt', 'R_ACD.txt', 'R_EHB.txt', 'R_CBI.txt', 'R_DQH.txt', 'R_BZJ.txt', 'R_DRW.txt', 'R_BKF.txt', 'R_DSA.txt', 'R_BSS.txt', 'R_CRT.txt', 'R_BFQ.txt', 'R_CIZ.txt', 'R_CVY.txt', 'R_CBE.txt', 'R_CHP.txt', 'R_DDO.txt', 'R_BLT.txt', 'R_APK.txt', 'R_DEI.txt', 'R_AXJ.txt', 'R_AHB.txt', 'R_ADH.txt', 'R_CQV.txt', 'R_DGP.txt', 'R_ANY.txt', 'R_ADK.txt', 'R_AJG.txt', 'R_EEF.txt', 'R_BCA.txt', 'R_CHI.txt', 'R_CCW.txt', 'R_BYG.txt', 'R_CXE.txt', 'R_ELW.txt', 'R_APG.txt', 'R_AVE.txt', 'R_BUU.txt', 'R_EAS.txt', 'R_CFR.txt', 'R_AJE.txt', 'R_CBF.txt', 'R_CIJ.txt', 'R_CGX.txt', 'R_AFW.txt']
Bucket (41.88994808286067, 12.489467488049982, 41.88994808286067, 12.453271759032496) contains more than one trajectory: ['R_CAV.txt', 'R

## Using CityHash

In [29]:
import cityhash
import os
from constants import NUMBER_OF_TRAJECTORIES

# Locations of the hashed Rome trajectories and of their metafile
ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER = "../../dataset/hashed_data/grid/rome/"
ROME_HASHED_TRAJECTORIES_FOLDER_META_FILE = f"{ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER}META-{NUMBER_OF_TRAJECTORIES}.txt"

# Bucket system: maps a CityHash128 digest of a layer hash to the list of
# trajectory filenames that produced that hash
bucket_system = {}

# Filenames listed in the metafile
files = mfh.read_meta_file(ROME_HASHED_TRAJECTORIES_FOLDER_META_FILE)

# Walk over every trajectory file and bucket it once per layer hash
for filename in files:
    file_path = os.path.join(ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER, filename)
    trajectory_hashes = fh.read_hash_file(file_path)

    for layer_hash in trajectory_hashes:
        # Serialise the layer's coordinates into one "_"-separated string
        hash_string = "_".join(str(coord) for coord in layer_hash)

        # The 128-bit CityHash digest of that string is the bucket key
        hash_key = cityhash.CityHash128(hash_string)

        # Create the bucket on first sight, then register the trajectory
        bucket_system.setdefault(hash_key, []).append(filename)

    
# Dump every bucket key together with the trajectories stored under it
for bucket in bucket_system:
    trajectories = bucket_system[bucket]
    print(f"Bucket {bucket}: {trajectories}")

        


# Analyze and display results

# Number of entries per bucket, computed once and reused below
bucket_sizes = [len(trajectories) for trajectories in bucket_system.values()]

total_buckets = len(bucket_sizes)
buckets_with_multiple = sum(1 for size in bucket_sizes if size > 1)
buckets_with_single = total_buckets - buckets_with_multiple
# default=0 keeps an empty bucket system from raising ValueError in max(),
# consistent with the zero-guards on the percentages below
largest_bucket_size = max(bucket_sizes, default=0)

print(f"Total Buckets: {total_buckets}")
print(f"Buckets with more than one trajectory: {buckets_with_multiple}")
print(f"Buckets with only one trajectory: {buckets_with_single}")
print(f"Largest Bucket Size: {largest_bucket_size}")

# Optional: Display distribution percentages (0 when there are no buckets)
multiple_bucket_percentage = (buckets_with_multiple / total_buckets) * 100 if total_buckets > 0 else 0
single_bucket_percentage = (buckets_with_single / total_buckets) * 100 if total_buckets > 0 else 0

print(f"Percentage of buckets with more than one trajectory: {multiple_bucket_percentage:.2f}%")
print(f"Percentage of buckets with only one trajectory: {single_bucket_percentage:.2f}%")



Bucket 100657865843187534431046306598972899699: ['R_CAV.txt']
Bucket 86809954938569504382459289287904174922: ['R_CAV.txt', 'R_BFR.txt']
Bucket 323626341694653821992615051140435304208: ['R_CAV.txt']
Bucket 320405125132832260922402498010467510323: ['R_CAV.txt', 'R_BFR.txt']
Bucket 329523963297630838221542793380673161684: ['R_CAV.txt']
Bucket 52999031790255740084782228167102238468: ['R_DYX.txt']
Bucket 160952401644028276865477555861846523046: ['R_DYX.txt']
Bucket 295754741011221879136206658381508381782: ['R_DYX.txt', 'R_BBX.txt', 'R_ATX.txt']
Bucket 37634255560314215975791031883093853729: ['R_DYX.txt']
Bucket 277765703504900986145943480527168186607: ['R_DYX.txt']
Bucket 118104786251226571872439303983142147339: ['R_CDU.txt']
Bucket 78642392749710630521755069986730046611: ['R_CDU.txt']
Bucket 220434460052649977856163781573367668780: ['R_CDU.txt', 'R_CCX.txt', 'R_EKT.txt', 'R_EMY.txt']
Bucket 81883059127918837032680031855881612793: ['R_CDU.txt']
Bucket 179131858031829573740529581944015237479

In [11]:
import sys
# Compare sys.getsizeof() for an integer key and for a coordinate-string key
number = 209355297435745564371190308466757490001
key = '41.90297084408296, 12.48512374477741, 41.91382330487844, 12.48512374477741, 41.91382330487844, 12.470645453170416'

# Report the integer first, then the string
for candidate in (number, key):
    print(f"Memory used: {sys.getsizeof(candidate)} bytes")

Memory used: 44 bytes
Memory used: 154 bytes
