# Notebook for placing the hashed trajectories into buckets

In [1]:
# Importing necessary modules
import os
import sys
import shutil
import timeit as ti
from tqdm import tqdm

from multiprocessing import Pool

def find_project_root(target_folder="masteroppgave"):
    """Return the absolute path of the nearest enclosing directory named ``target_folder``.

    The search starts at the current working directory (a notebook kernel has
    no ``__file__`` to anchor on) and walks upward one directory at a time.
    The working directory itself is checked first.

    Parameters
    ----------
    target_folder : str
        Base name of the directory to look for.

    Returns
    -------
    str or None
        Absolute path of the first matching directory, or None when the
        filesystem root is reached without a match.
    """
    currentdir = os.getcwd()  # Explicit starting point: the kernel's working directory
    while True:
        if os.path.basename(currentdir) == target_folder:
            return currentdir  # Found the target folder
        parentdir = os.path.dirname(currentdir)
        if parentdir == currentdir:  # dirname() of the root is the root itself
            return None
        currentdir = parentdir  # Move one level up

# Locate the repository root and expose its packages to the import system
project_root = find_project_root("masteroppgave")

# Fail fast: the project-local imports that follow cannot work without the root
if not project_root:
    raise RuntimeError("Could not find 'masteroppgave' directory")

sys.path.append(project_root)
print(f"Project root found: {project_root}")

from utils.helpers.save_trajectory import save_trajectory_hashes
from utils.helpers import file_handler as fh
from utils.helpers import metafile_handler as mfh
from schemes.lsh_disk import DiskLSH
from schemes.lsh_grid import GridLSH

Project root found: /Users/thomasnitsche/Developer/skole/master/masteroppgave


In [4]:
from constants import (
    P_MAX_LON,
    P_MIN_LON,
    P_MAX_LAT,
    P_MIN_LAT,
    R_MAX_LON,
    R_MIN_LON,
    R_MAX_LAT,
    R_MIN_LAT,
    NUMBER_OF_TRAJECTORIES
)

# Notebook-wide configuration

# When True, old files in the output folder are deleted before new hashes are written
SHOULD_DELETE_OLD_FILES = True

# Porto: source data folder and destination folder for the grid hashes
PORTO_DATA_FOLDER = "../dataset/porto/output/"
PORTO_OUTPUT_FOLDER = "../dataset/hashed_data/grid/porto/"

# Rome: source data folder and destination folder for the grid hashes
ROME_DATA_FOLDER = "../dataset/rome/output/"
ROME_OUTPUT_FOLDER = "../dataset/hashed_data/grid/rome/"


In [6]:
# Rebuild the grid hashes for the Rome dataset.
# Steps, in order: (1) optionally clear the output folder, (2) build the grid,
# (3) copy the dataset metafiles next to the hashes, (4) compute and save the hashes.
# The order matters: the copy in step 3 must happen after the wipe in step 1.

if SHOULD_DELETE_OLD_FILES:
    # presumably ".gitkeep" names the file to preserve — confirm in file_handler
    fh.delete_old_files(ROME_OUTPUT_FOLDER, ".gitkeep")

# Create the grid hash object for Rome. The metafiles describing the different
# dataset sizes are copied further down, and the hashes are saved at the end.

resolution = 1.2  # km
layers = 3
# NOTE(review): this path points into ROME_OUTPUT_FOLDER, which may just have been
# wiped and is only repopulated by the copy loop below — presumably GridLSH does not
# read the metafile until compute_dataset_hashes(); confirm.
meta_file = f"{ROME_OUTPUT_FOLDER}META-{NUMBER_OF_TRAJECTORIES}.txt"

GridRome = GridLSH(
    "Rome G1",
    R_MIN_LAT,
    R_MAX_LAT,
    R_MIN_LON,
    R_MAX_LON,
    resolution,
    layers,
    meta_file,
    ROME_DATA_FOLDER,
)

# Print the grid summary for a quick sanity check
print(GridRome)

# Copy the metafiles from the data folder into the output folder
meta_files = mfh.get_meta_files(ROME_DATA_FOLDER)

for filename in meta_files:
    # Plain concatenation relies on ROME_DATA_FOLDER ending with "/"
    shutil.copy(ROME_DATA_FOLDER + filename, ROME_OUTPUT_FOLDER)

# Generate the hashes and save them to the output folder

hashes = GridRome.compute_dataset_hashes()

save_trajectory_hashes(ROME_OUTPUT_FOLDER, hashes)



Grid: Rome G1
Covering: (5.559754011676299, 7.451072531046803) km 
Resolution: 1.2 km 
Distortion: [0.7227265475179453, 0.7142753456966077, 0.2408295525289906] km 
Dimensions: (4, 6) cells


## Using CityHash

In [11]:
import cityhash
import os
from constants import NUMBER_OF_TRAJECTORIES

# Folder holding the hashed Rome trajectories, and the metafile listing them
ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER = "../dataset/hashed_data/grid/rome/"
ROME_HASHED_TRAJECTORIES_FOLDER_META_FILE = (
    f"{ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER}META-{NUMBER_OF_TRAJECTORIES}.txt"
)

# Folder holding the full Rome trajectories
ROME_FULL_TRAJECTORIES_OUTPUT_FOLDER = "../dataset/rome/output/"

# bucket_system: CityHash128 key -> filenames of the trajectories placed under that key
# bucket_meta:   CityHash128 key -> index of the layer that most recently produced the key
bucket_system = {}
bucket_meta = {}

# The metafile supplies the trajectory filenames to process
files = mfh.read_meta_file(ROME_HASHED_TRAJECTORIES_FOLDER_META_FILE)

for filename in files:
    file_path = os.path.join(ROME_HASHED_TRAJECTORIES_OUTPUT_FOLDER, filename)
    trajectory_hashes = fh.read_hash_file(file_path)

    # One entry per layer: flatten it to a string, then hash that string into a bucket key
    for idx, layer_hash in enumerate(trajectory_hashes):
        hash_string = "_".join(str(element) for element in layer_hash)
        hash_key = cityhash.CityHash128(hash_string)

        # setdefault creates the bucket on first sight of the key
        bucket_system.setdefault(hash_key, []).append(filename)
        bucket_meta[hash_key] = idx
        

    
# # print the bucket system
# for bucket, trajectories in bucket_system.items():
#     print(f"Bucket {bucket}: {trajectories}")

        


# Analyze and display results.
# Every statistic below is well defined for an empty bucket_system, matching the
# `total_buckets > 0` guards on the percentage lines.

total_buckets = len(bucket_system)
buckets_with_multiple = sum(1 for trajectories in bucket_system.values() if len(trajectories) > 1)
buckets_with_single = total_buckets - buckets_with_multiple

# Find the largest bucket once and derive its size from it.
# default=None avoids the ValueError that max() raises on an empty mapping.
largest_bucket = max(bucket_system, key=lambda key: len(bucket_system[key]), default=None)
largest_bucket_size = len(bucket_system[largest_bucket]) if largest_bucket is not None else 0
print(f"Largest Bucket: {largest_bucket}")

print(f"Total Buckets: {total_buckets}")
print(f"Buckets with more than one trajectory: {buckets_with_multiple}")
print(f"Buckets with only one trajectory: {buckets_with_single}")
print(f"Largest Bucket Size: {largest_bucket_size}")

# Optional: Display distribution percentages
multiple_bucket_percentage = (buckets_with_multiple / total_buckets) * 100 if total_buckets > 0 else 0
single_bucket_percentage = (buckets_with_single / total_buckets) * 100 if total_buckets > 0 else 0

print(f"Percentage of buckets with more than one trajectory: {multiple_bucket_percentage:.2f}%")
print(f"Percentage of buckets with only one trajectory: {single_bucket_percentage:.2f}%")

# Print the meta data (layer index) recorded for the largest bucket
print("Meta data for the largest bucket:")
print(bucket_meta[largest_bucket] if largest_bucket is not None else None)


Largest Bucket: 223863007633513911083797650969882632785
Total Buckets: 4735
Buckets with more than one trajectory: 1469
Buckets with only one trajectory: 3266
Largest Bucket Size: 31
Percentage of buckets with more than one trajectory: 31.02%
Percentage of buckets with only one trajectory: 68.98%
Meta data for the largest bucket:
2
{208492699008294433848431044087067989336: 0, 88611886760239931386444773291785641695: 1, 147552373173644790283683933928472211582: 2, 199402383768227516194608955274422955287: 0, 12211205129200059904063639686572106553: 1, 104321442063092589903660764460046422200: 2, 117754908651903379734417171605179527048: 0, 137841443500791052560343067597970517037: 1, 52429190530537764384641199587898940407: 2, 93400544077314867127264996328402412818: 0, 108385268092536705412830117822225558560: 1, 330409430379281936322883331344201511274: 2, 237740209875100249059839050043927129241: 0, 317427068383185683749970880286455179805: 1, 80889307511881570358640595349136332684: 2, 1546663845

In [9]:
import folium
from folium import plugins
from folium.plugins import FeatureGroupSubGroup


def visualize_bucket_trajectories(grid_lsh, bucket_key, bucket_system, folder_path):
    """
    Visualize all trajectories from a specific bucket key on a Folium map.

    The map shows the dataset bounding box, one toggleable overlay per grid
    layer, and every trajectory stored in the requested bucket.

    Parameters:
    -----------
    grid_lsh : object
        Grid description exposing ``min_lat``, ``max_lat``, ``min_lon``,
        ``max_lon`` and ``grid``, a mapping whose values unpack into
        ``(latitudes, longitudes)`` sequences of cell boundaries.
    bucket_key : str or int
        The key of the bucket to visualize.
    bucket_system : dict
        A dictionary containing bucket keys and corresponding file names.
    folder_path : str
        Path to the folder where trajectory files are stored.

    Returns:
    --------
    folium.Map or None
        The populated map, or None (after printing a message) when
        ``bucket_key`` is not present in ``bucket_system``.
    """
    # Validate the bucket first so an unknown key returns before any map is built
    if bucket_key not in bucket_system:
        print(f"No bucket found with key {bucket_key}.")
        return None

    trajectory_files = bucket_system[bucket_key]

    # Center the map on the middle of the grid's bounding box
    center_lat = (grid_lsh.min_lat + grid_lsh.max_lat) / 2
    center_lon = (grid_lsh.min_lon + grid_lsh.max_lon) / 2

    trajectory_map = folium.Map(location=[center_lat, center_lon], zoom_start=12, tiles="OpenStreetMap")
    layer_colors = ["red", "blue", "green", "purple", "orange"]
    base_layer = folium.FeatureGroup(name="Base Map").add_to(trajectory_map)

    # Add bounding box (dataset boundary)
    folium.Rectangle(
        bounds=[(grid_lsh.min_lat, grid_lsh.min_lon), (grid_lsh.max_lat, grid_lsh.max_lon)],
        color="black",
        weight=2,
        fill=True,
        fill_opacity=0.1,
        popup="Bounding Box"
    ).add_to(base_layer)

    # Draw every grid layer in its own toggleable subgroup
    for layer_index, grid_points in enumerate(grid_lsh.grid.values()):
        color = layer_colors[layer_index % len(layer_colors)]  # Cycle colors

        layer_group = FeatureGroupSubGroup(base_layer, name=f"Layer {layer_index + 1}")
        trajectory_map.add_child(layer_group)

        latitudes, longitudes = grid_points  # Unpack grid points

        # Each pair of consecutive boundaries spans one cell
        for i in range(len(latitudes) - 1):
            for j in range(len(longitudes) - 1):
                first_corner = (latitudes[i], longitudes[j])
                opposite_corner = (latitudes[i + 1], longitudes[j + 1])

                folium.Rectangle(
                    bounds=[first_corner, opposite_corner],
                    color=color,
                    fill=True,
                    fill_opacity=0.3,
                    popup=f"Layer {layer_index + 1}\nCell: ({i}, {j})",
                ).add_to(layer_group)

    # Load trajectory coordinates from files
    trajectories = fh.load_trajectory_files(trajectory_files, folder_path)

    # Plot each non-empty trajectory as a polyline
    for name, coordinates in trajectories.items():
        if not coordinates:
            continue

        folium.PolyLine(
            locations=coordinates,
            color='blue',
            weight=2.5,
            opacity=0.8,
            tooltip=name
        ).add_to(trajectory_map)

    # Add exactly one layer control, after all layers exist, so the map does not
    # end up with two control widgets
    folium.LayerControl(collapsed=False).add_to(trajectory_map)

    # Return the map object for display in the notebook
    return trajectory_map


In [10]:
# Key of the bucket to draw; replace with your bucket key
bucket_key = 223863007633513911083797650969882632785
visualize_bucket_trajectories(
    GridRome,
    bucket_key,
    bucket_system,
    ROME_FULL_TRAJECTORIES_OUTPUT_FOLDER,
)
