# Notebook for placing the hashed trajectories into buckets

In [15]:
# Importing necessary modules
import os
import sys
import shutil
import timeit as ti
from tqdm import tqdm

from multiprocessing import Pool

def find_project_root(target_folder="masteroppgave", start_dir=None):
    """Find the absolute path of an ancestor folder by searching upward.

    The search starts at ``start_dir`` and walks towards the filesystem root,
    comparing the last path component of each directory with
    ``target_folder``. The starting directory itself is also a candidate.

    Args:
        target_folder: Name of the directory to look for.
        start_dir: Directory to start from. When omitted, the current working
            directory is used (``__file__`` is not defined inside a notebook,
            so the working directory is the only available anchor).

    Returns:
        The absolute path of the closest matching directory, or ``None`` when
        the filesystem root is reached without a match.
    """
    if start_dir is None:
        start_dir = os.getcwd()
    currentdir = os.path.abspath(start_dir)

    while True:
        if os.path.basename(currentdir) == target_folder:
            return currentdir  # Found the target folder
        parentdir = os.path.dirname(currentdir)
        if parentdir == currentdir:  # dirname() is a fixed point at the root
            return None
        currentdir = parentdir  # Move one level up

# Locate the repository root and make its packages importable.
project_root = find_project_root("masteroppgave")

# Fail early: the project-local imports further down cannot resolve without it.
if not project_root:
    raise RuntimeError("Could not find 'masteroppgave' directory")

sys.path.append(project_root)
print(f"Project root found: {project_root}")

from utils.helpers.save_trajectory import save_trajectory_hashes
from utils.helpers import file_handler as fh
from utils.helpers import metafile_handler as mfh
from schemes.lsh_disk import DiskLSH

Project root found: c:\Users\eivin\dev\JoonEndreLSH\masteroppgave


## Using CityHash

In [16]:
import cityhash
import os
from constants import NUMBER_OF_TRAJECTORIES

# Locations of the hashed Rome trajectories and of the metafile listing them
ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER = "../../dataset/hashed_data/grid/rome/"
ROME_HASHED_TRAJECTORIES_FOLDER_META_FILE = f"{ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER}META-{NUMBER_OF_TRAJECTORIES}.txt"

ROME_FULL_TRAJECTORIES_OUTPUT_FOLDER = "../../dataset/rome/output/"

# Bucket system: CityHash128 key -> list of trajectory filenames
bucket_system = {}

# The metafile supplies the names of the hash files to process
files = mfh.read_meta_file(ROME_HASHED_TRAJECTORIES_FOLDER_META_FILE)

# Fill the buckets, one trajectory file at a time
for filename in files:
    file_path = os.path.join(ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER, filename)

    # One hash (a sequence of values) per layer for this trajectory
    trajectory_hashes = fh.read_hash_file(file_path)

    for layer_hash in trajectory_hashes:
        # Flatten the layer hash into a single "_"-separated string
        hash_string = "_".join(str(part) for part in layer_hash)

        # Condense the string into a 128-bit integer bucket key
        hash_key = cityhash.CityHash128(hash_string)

        # Create the bucket on first use, then register the trajectory in it.
        # NOTE(review): the layer index is not part of the key, so equal hash
        # strings from different layers share a bucket, and a file is appended
        # once per matching layer -- confirm this is intended.
        bucket_system.setdefault(hash_key, []).append(filename)

    
# # print the bucket system
# for bucket, trajectories in bucket_system.items():
#     print(f"Bucket {bucket}: {trajectories}")

        


# Analyze and display results
#
# Summarises how the trajectories are spread over the buckets: the number of
# buckets, how many hold one vs. several entries, and the largest bucket.
# An empty bucket system is reported with zero/None values instead of raising.

total_buckets = len(bucket_system)
buckets_with_multiple = sum(1 for trajectories in bucket_system.values() if len(trajectories) > 1)
buckets_with_single = total_buckets - buckets_with_multiple

# max() raises ValueError on an empty iterable; `default` keeps this cell
# usable when no hashes were read. Ties resolve to the first key in dict order.
largest_bucket = max(bucket_system, key=lambda key: len(bucket_system[key]), default=None)
largest_bucket_size = len(bucket_system[largest_bucket]) if bucket_system else 0
print(f"Largest Bucket: {largest_bucket}")

print(f"Total Buckets: {total_buckets}")
print(f"Buckets with more than one trajectory: {buckets_with_multiple}")
print(f"Buckets with only one trajectory: {buckets_with_single}")
print(f"Largest Bucket Size: {largest_bucket_size}")

# Optional: Display distribution percentages (guarded against division by zero)
multiple_bucket_percentage = (buckets_with_multiple / total_buckets) * 100 if total_buckets > 0 else 0
single_bucket_percentage = (buckets_with_single / total_buckets) * 100 if total_buckets > 0 else 0

print(f"Percentage of buckets with more than one trajectory: {multiple_bucket_percentage:.2f}%")
print(f"Percentage of buckets with only one trajectory: {single_bucket_percentage:.2f}%")



Largest Bucket: 284810363214875717150971988534373310147
Total Buckets: 4587
Buckets with more than one trajectory: 1340
Buckets with only one trajectory: 3247
Largest Bucket Size: 40
Percentage of buckets with more than one trajectory: 29.21%
Percentage of buckets with only one trajectory: 70.79%


In [3]:
import sys
# Compare the in-memory size of a 128-bit integer bucket key with that of a
# coordinate string.
number = 244816790760326619827359513135372348218
key = '41.90297084408296, 12.48512374477741, 41.91382330487844, 12.48512374477741, 41.91382330487844, 12.470645453170416'

# sys.getsizeof reports the size of the Python object itself, in bytes
for _sample in (number, key):
    print(f"Memory used: {sys.getsizeof(_sample)} bytes")

Memory used: 44 bytes
Memory used: 154 bytes
