# Notebook for creating the disk-based LSH hashes for both datasets

Notebook that converts the extracted trajectory data in the `dataset/<city>/output/` folders into disk-LSH hashes, which are stored in `dataset/hashed_data/disk/<city>/` (for the Rome and Porto datasets).


In [13]:
# Importing necessary modules
import os
import sys
import shutil
import timeit as ti
from tqdm import tqdm

from multiprocessing import Pool

def find_project_root(target_folder="masteroppgave", start_dir=None):
    """Find the absolute path of an ancestor folder by searching upward.

    The search begins at ``start_dir`` (or the current working directory when
    it is omitted) and walks towards the filesystem root, returning the first
    directory whose name equals ``target_folder``.

    Note: ``__file__`` is not defined inside a notebook kernel, and the former
    ``os.path.abspath("__file__")`` passed a *string literal*, which resolves
    relative to the working directory anyway. Starting from ``os.getcwd()``
    states that intent explicitly.

    Parameters:
    - target_folder (str): Name of the directory to look for.
    - start_dir (str | None): Directory to start searching from. Defaults to
      the current working directory.

    Returns:
    - The absolute path of the matching directory, or None if the filesystem
      root is reached without a match.
    """
    if start_dir is None:
        start_dir = os.getcwd()
    currentdir = os.path.abspath(start_dir)

    while True:
        if os.path.basename(currentdir) == target_folder:
            return currentdir  # Found the target folder
        parentdir = os.path.dirname(currentdir)
        if parentdir == currentdir:  # dirname() is a fixed point at the root
            return None
        currentdir = parentdir  # Move one level up

# Locate the repository root and put it on sys.path; the project-local imports
# that follow in this cell are issued only after this step.
project_root = find_project_root("masteroppgave")

if not project_root:
    raise RuntimeError("Could not find 'masteroppgave' directory")

# Re-running this cell in the same kernel must not stack duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)
print(f"Project root found: {project_root}")

from utils.helpers.save_trajectory import save_trajectory_hashes
from utils.helpers import file_handler as fh
from utils.helpers import metafile_handler as mfh
from schemes.lsh_disk import DiskLSH

Project root found: /Users/thomasnitsche/Developer/skole/master/masteroppgave


In [14]:
# Declaring global variables:

from constants import (
    P_MAX_LON,
    P_MIN_LON,
    P_MAX_LAT,
    P_MIN_LAT,
    R_MAX_LON,
    R_MIN_LON,
    R_MAX_LAT,
    R_MIN_LAT,
    NUMBER_OF_TRAJECTORIES
)

# When True, the "clear" cells below call fh.delete_old_files on the output
# folders before new hashes are written.
SHOULD_DELETE_OLD_FILES = True


# Destination folders for the generated hashes (note the trailing "/").
PORTO_OUTPUT_FOLDER = "../../dataset/hashed_data/disk/porto/"
ROME_OUTPUT_FOLDER = "../../dataset/hashed_data/disk/rome/"

# Source folders holding the trajectory files and their META files.
PORTO_DATA_FOLDER = "../../dataset/porto/output/"
ROME_DATA_FOLDER = "../../dataset/rome/output/"

# NOTE(review): this rebinds the NUMBER_OF_TRAJECTORIES imported from
# `constants` above, so the imported value is never used in this notebook.
# Confirm the override is intentional.
NUMBER_OF_TRAJECTORIES = 3050

# Rome LSH Disk

Beginning with the Rome set


In [15]:
# Run this cell to clear previously generated files from the Rome output folder.
# Skipped entirely when SHOULD_DELETE_OLD_FILES is False.

if SHOULD_DELETE_OLD_FILES:
    # ".gitkeep" is presumably the file name to preserve - confirm in file_handler.
    fh.delete_old_files(ROME_OUTPUT_FOLDER, ".gitkeep")

In [16]:
# Creating a disk-based LSH hashing object over Rome

layers = 4
diameter = 1.2
num_disks = 10
# ROME_OUTPUT_FOLDER already ends with "/", so join the parts instead of
# inserting another separator (an f-string with "/" yields ".../rome//META-...").
meta_file = os.path.join(ROME_OUTPUT_FOLDER, f"META-{NUMBER_OF_TRAJECTORIES}.txt")

# Arguments are positional: name, lat bounds, lon bounds, disk count,
# layer count, disk diameter, META file and the trajectory data folder.
DiskRome = DiskLSH(
    "Rome D1",
    R_MIN_LAT,
    R_MAX_LAT,
    R_MIN_LON,
    R_MAX_LON,
    num_disks,
    layers,
    diameter,
    meta_file,
    ROME_DATA_FOLDER,
)

In [17]:
# Copy the META files next to the hashes so the output folder carries them too
meta_files = mfh.get_meta_files(ROME_DATA_FOLDER)

for filename in meta_files:
    # os.path.join keeps the source path valid whether or not the folder
    # constant ends with a path separator.
    shutil.copy(os.path.join(ROME_DATA_FOLDER, filename), ROME_OUTPUT_FOLDER)

# Generating the hashes and storing them in output folder along with the meta-files
hashes = DiskRome.compute_dataset_hashes_with_KD_tree()

save_trajectory_hashes(ROME_OUTPUT_FOLDER, hashes)

# Porto LSH Disk

Continuing with the Porto set


In [18]:
# Run this cell to clear previously generated files from the Porto output folder.
# Skipped entirely when SHOULD_DELETE_OLD_FILES is False.

if SHOULD_DELETE_OLD_FILES:
    # ".gitkeep" is presumably the file name to preserve - confirm in file_handler.
    fh.delete_old_files(PORTO_OUTPUT_FOLDER, ".gitkeep")

In [19]:
# Create a disk-based LSH object over Porto

# These names are shared with the Rome cell and are rebound here, so the
# values in the kernel depend on which of the two cells ran last.
layers = 4
diameter = 1.5
num_disks = 50
# NOTE(review): the META file name is hard-coded to "META-100.txt", whereas the
# Rome cell derives it from NUMBER_OF_TRAJECTORIES - confirm this is intended.
meta_file = f"{PORTO_OUTPUT_FOLDER}META-100.txt"

# Arguments are positional: name, lat bounds, lon bounds, disk count,
# layer count, disk diameter, META file and the trajectory data folder.
DiskPorto = DiskLSH(
    "Porto D1",
    P_MIN_LAT,
    P_MAX_LAT,
    P_MIN_LON,
    P_MAX_LON,
    num_disks,
    layers,
    diameter,
    meta_file,
    PORTO_DATA_FOLDER,
)

In [20]:
# Copy the META files next to the hashes so the output folder carries them too
meta_files = mfh.get_meta_files(PORTO_DATA_FOLDER)

for filename in meta_files:
    # os.path.join keeps the source path valid whether or not the folder
    # constant ends with a path separator.
    shutil.copy(os.path.join(PORTO_DATA_FOLDER, filename), PORTO_OUTPUT_FOLDER)

# Generating the disk-based LSH objects hashes and saving them to file

hashes = DiskPorto.compute_dataset_hashes_with_KD_tree()

save_trajectory_hashes(PORTO_OUTPUT_FOLDER, hashes)

# Measuring run-times of hash generation

The cell below was created to measure the time-efficiency of the hash computation. It is currently commented out and therefore does nothing when run.


In [21]:
# #Cell for measuring DiskLSH hash generation times for all methods in both datasets - using coordinate hashes
# import pandas as pd
# from itertools import chain

# output_folder = "schemes/experiments/runtimes/"
# file_name = "hashing_runtimes_disk_lsh.csv"

# hashing_map = {
#     "porto_naive" : hashing.fun_wrapper_p_naive,
#     "porto_quadrants" : hashing.fun_wrapper_p_quadrants,
#     "porto_kd_tree" : hashing.fun_wrapper_p_KD_tree,
#     "rome_naive" : hashing.fun_wrapper_r_naive,
#     "rome_quadrants" : hashing.fun_wrapper_r_quadrants,
#     "rome_kd_tree" : hashing.fun_wrapper_r_KD_tree
# }

# config = {
#     "porto" : [1000, 60, 4, 2.2],
#     # "rome" : [1000, 50, 5, 1.6]

# }

# runs = 10

# df = pd.DataFrame(columns=[f"Run_{run+1}" for run in range(runs)])

# for key in hashing_map.keys():
#     with Pool() as pool:
#         result = pool.map(hashing_map[key], [config[key.split("_")[0]] for _ in range(runs)])
#         df.loc[key] = list(chain.from_iterable(result))

# df.to_csv(os.path.join(output_folder, file_name))

# Visualize disk and trajectories on Map

## Setup

In [22]:
# Dataset to visualise; the next cell handles the values "rome" and "porto".
CITY = "rome"

In [23]:
# Resolve the data folder, file-name prefix and DiskLSH instance for the chosen
# city. An unknown CITY fails loudly here instead of surfacing later as a
# NameError or silently reusing variables left over from an earlier run.
if CITY == "rome":
    output_folder = ROME_DATA_FOLDER
    prefix = "R_"
    disk = DiskRome
elif CITY == "porto":
    output_folder = PORTO_DATA_FOLDER
    prefix = "P_"
    disk = DiskPorto
else:
    raise ValueError(f"Unknown CITY {CITY!r}; expected 'rome' or 'porto'")

trajectories = fh.load_trajectory_files([f"{prefix}ABA.txt"], output_folder) ## Reads a list of trajectory files, return a dictionary with the filename as key and coordinates as values


In [24]:
import folium
from folium.plugins import FeatureGroupSubGroup
import random

def visualize_disks_with_boundary(disk_lsh, trajectories=None):
    """
    Visualizes the disks of a DiskLSH object using Folium, together with the
    dataset bounding box and, optionally, a set of trajectories.

    Parameters:
    - disk_lsh (DiskLSH): An instance of the DiskLSH class. The attributes
      `min_lat`, `max_lat`, `min_lon`, `max_lon`, `diameter` and `disks`
      (a mapping from layer to an iterable of (lat, lon) disk centres) are read.
    - trajectories (dict | None): Mapping of trajectory name to a sequence of
      (lat, lon) points. With None (the default) or an empty mapping only the
      disks and the bounding box are drawn.

    Returns:
    - A Folium map object.
    """
    # The default is None; treat it as "no trajectories" instead of failing on
    # `None.items()` further down.
    if trajectories is None:
        trajectories = {}

    # Define center of the map (midpoint of the bounding box)
    center_lat = (disk_lsh.min_lat + disk_lsh.max_lat) / 2
    center_lon = (disk_lsh.min_lon + disk_lsh.max_lon) / 2

    # Initialize folium map
    map_disks = folium.Map(location=[center_lat, center_lon], zoom_start=14, tiles="OpenStreetMap")

    # Define colors for different layers
    layer_colors = ["red", "blue", "green", "purple", "orange"]

    # Create a base layer group
    base_layer = folium.FeatureGroup(name="Base Map").add_to(map_disks)

    # Add bounding box (dataset boundary)
    folium.Rectangle(
        bounds=[(disk_lsh.min_lat, disk_lsh.min_lon), (disk_lsh.max_lat, disk_lsh.max_lon)],
        color="black",
        weight=2,
        fill=True,
        fill_opacity=0.1,
        popup="Bounding Box"
    ).add_to(base_layer)

    # folium.Circle takes a radius in meters; the diameter is taken to be in
    # kilometers, so radius = diameter / 2 * 1000.
    disk_radius_m = disk_lsh.diameter * 500

    # Iterate over each layer in the disk structure (the layer keys are not needed)
    for layer_index, layer_disks in enumerate(disk_lsh.disks.values()):
        color = layer_colors[layer_index % len(layer_colors)]  # Cycle colors

        # Create a subgroup for each layer so it can be toggled on its own
        layer_group = FeatureGroupSubGroup(base_layer, name=f"Layer {layer_index + 1}")
        map_disks.add_child(layer_group)

        # Plot disks as circles around their centres
        for lat, lon in layer_disks:
            folium.Circle(
                location=[lat, lon],
                radius=disk_radius_m,
                color=color,
                fill=True,
                fill_opacity=0.4,
                popup=f"Layer {layer_index + 1}\nDisk: ({lat:.5f}, {lon:.5f})",
            ).add_to(layer_group)

    # Define a set of distinct colors for trajectories
    traj_colors = [
        "red", "blue", "green", "purple", "orange", "pink", "brown", "cyan", "magenta", "yellow", "lime"
    ]
    random.shuffle(traj_colors)  # Shuffle colors for randomness

    # Add a FeatureGroup for each trajectory to enable toggling
    for idx, (traj_name, coords) in enumerate(trajectories.items()):
        # Colors repeat once there are more trajectories than palette entries
        traj_color = traj_colors[idx % len(traj_colors)]
        traj_layer = folium.FeatureGroup(name=f"Trajectory: {traj_name}")

        # Add trajectory line to its feature group
        folium.PolyLine(
            coords,
            color=traj_color,
            weight=6.5,
            opacity=1,
            popup=f"Trajectory: {traj_name}",
        ).add_to(traj_layer)

        # Add a small marker for each point along the trajectory
        for lat, lon in coords:
            folium.CircleMarker(
                location=(lat, lon),
                radius=1,
                color="black",
                fill=True,
                fill_color="red",
                fill_opacity=1,
                popup=f"Point: ({lat:.5f}, {lon:.5f})"
            ).add_to(traj_layer)

        traj_layer.add_to(map_disks)  # Add trajectory layer to map

    # Add LayerControl to enable checkboxes
    folium.LayerControl(collapsed=False).add_to(map_disks)

    return map_disks

# Build the map for the selected city's DiskLSH object and loaded trajectories
map_disk = visualize_disks_with_boundary(disk, trajectories)

# Bare last expression so the notebook displays the map as the cell output
map_disk
