# Notebook for the creation of the disk LSH hashes for the Porto, Rome and Kolumbus datasets

Notebook that converts the extracted data from the data/chosen_data folder into hashes, which are stored in data/hashed_data/disk


In [1]:
# Importing necessary modules
import os, sys
import shutil
import timeit as ti
from tqdm import tqdm

from multiprocessing import Pool

# Append the parent of the current directory to sys.path before the project imports below.
# NOTE(review): "__file__" is a quoted string here, not the __file__ variable, so
# abspath() resolves it against the current working directory at the time this
# cell runs. `currentdir` is therefore the working directory — confirm the
# notebook is always launched from its own folder.
currentdir = os.path.dirname(os.path.abspath("__file__"))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)


# Project-local modules (presumably resolved through the sys.path entry added above)
from helpers.lsh_disk import DiskLSH
from utils.helpers import metafile_handler as mfh
from utils.helpers import file_handler as fh
from schemes.experiments import hashing



In [3]:
# Coordinate bounds (P_ = Porto, R_ = Rome, K_ = Kolumbus) and the names of the
# folders holding each dataset, all taken from the project-wide constants module.
from constants import (
    P_MAX_LON, P_MIN_LON, P_MAX_LAT, P_MIN_LAT,
    R_MAX_LON, R_MIN_LON, R_MAX_LAT, R_MIN_LAT,
    K_MAX_LON, K_MIN_LON, K_MAX_LAT, K_MIN_LAT,
    PORTO_OUTPUT_FOLDER, ROME_OUTPUT_FOLDER, KOLUMBUS_OUTPUT_FOLDER,
)

# Notebook-wide settings

# When True, the clean-up cells call fh.delete_old_files on the output folders
SHOULD_DELETE_OLD_FILES = True

# Porto: input data folder and destination for the generated hash files
PORTO_DATA = f"../{PORTO_OUTPUT_FOLDER}/"
OUTPUT_FOLDER_PORTO = "../hashed_data/disk/porto/"

# Rome: input data folder and destination for the generated hash files
ROME_DATA = f"../{ROME_OUTPUT_FOLDER}/"
OUTPUT_FOLDER_ROME = "../hashed_data/disk/rome/"

# Kolumbus: input data folder and destination for the generated hash files
KOLUMBUS_DATA = f"../{KOLUMBUS_OUTPUT_FOLDER}/"
OUTPUT_FOLDER_KOLUMBUS = "../hashed_data/disk/kolumbus/"

# Rome LSH Disk

Hashing the Rome dataset


In [5]:
# Run this cell to clear previously generated files from the Rome output folder.
# NOTE(review): ".gitkeep" is presumably the file that delete_old_files leaves in place — confirm in utils.helpers.file_handler.

if SHOULD_DELETE_OLD_FILES:
    fh.delete_old_files(OUTPUT_FOLDER_ROME, ".gitkeep")

['R_DYR.txt', 'R_AKD.txt', 'R_CRS.txt', 'META-2000.txt', 'META-200.txt', 'R_BOX.txt', 'R_DNZ.txt', 'META-50.txt', 'R_DYD.txt', 'R_AIO.txt', 'R_DMT.txt', 'R_CHI.txt', 'R_BVN.txt', 'META-2200.txt', 'R_DAV.txt', 'R_DTW.txt', 'R_AFR.txt', 'META-2600.txt', 'R_BMW.txt', 'R_BZI.txt', 'R_BYR.txt', 'R_BLR.txt', 'R_EIO.txt', 'R_DLC.txt', 'META-400.txt', 'R_AHO.txt', 'R_DWI.txt', 'R_DZN.txt', 'META-600.txt', 'R_AJE.txt', 'R_CFS.txt', 'R_CFD.txt', 'R_AQX.txt', 'R_DLA.txt', 'R_BMT.txt', 'META-1900.txt', 'R_CKB.txt', 'R_DTF.txt', 'META-2400.txt', 'R_ALT.txt', 'META-100.txt', 'R_DKC.txt', 'R_DJG.txt', 'R_CLV.txt', 'R_BRQ.txt', 'R_BSB.txt', 'R_DQL.txt', 'META-2300.txt', 'R_CWH.txt', 'META-2100.txt', 'R_CMF.txt', 'R_EAC.txt', '.gitkeep', 'R_BFW.txt', 'R_BDH.txt', 'R_EBY.txt', 'META-300.txt', 'R_CLC.txt', 'R_DGB.txt', 'R_CUE.txt', 'META-700.txt', 'META-2500.txt', 'R_CNY.txt', 'R_DFC.txt', 'R_CBM.txt', 'META-1800.txt', 'R_CBX.txt', 'META-2700.txt', 'R_BRU.txt', 'META-500.txt', 'R_CLR.txt', 'R_CAB.txt', '

In [9]:
# Set up the disk-based LSH scheme for the Rome dataset

layers, diameter, num_disks = 4, 1.5, 50
meta_file = f"../{ROME_OUTPUT_FOLDER}/META-100.txt"

# Positional arguments: scheme name, latitude bounds, longitude bounds,
# disk/layer configuration, meta-file path and data folder.
DiskRome = DiskLSH(
    "Rome D1",
    R_MIN_LAT, R_MAX_LAT,
    R_MIN_LON, R_MAX_LON,
    num_disks, layers, diameter,
    meta_file,
    ROME_DATA,
)

In [10]:
# Generating the Rome hashes and storing them in the output folder along with the meta-files
hashes = DiskRome.compute_dataset_hashes_with_KD_tree()

# One output file per key in `hashes`, one hash per line.
for key in hashes:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(f"{OUTPUT_FOLDER_ROME}/{key}.txt", "w") as out_file:
        # Named `hash_value` so the builtin hash() is not shadowed in the kernel namespace.
        for hash_value in hashes[key]:
            out_file.write("%s\n" % hash_value)

# Copying meta_files as well, so the output folder carries the same META files as the input
meta_files = mfh.get_meta_files(ROME_DATA)

for filename in meta_files:
    shutil.copy(ROME_DATA + filename, OUTPUT_FOLDER_ROME)

# Porto LSH Disk

Hashing the Porto dataset


In [11]:
# Run this cell to clear previously generated files from the Porto output folder.
# NOTE(review): ".gitkeep" is presumably the file that delete_old_files leaves in place — confirm in utils.helpers.file_handler.

if SHOULD_DELETE_OLD_FILES:
    fh.delete_old_files(OUTPUT_FOLDER_PORTO, ".gitkeep")

In [12]:
# Set up the disk-based LSH scheme for the Porto dataset

layers, diameter, num_disks = 4, 1.5, 50
meta_file = f"../{PORTO_OUTPUT_FOLDER}/META-100.txt"

# Positional arguments: scheme name, latitude bounds, longitude bounds,
# disk/layer configuration, meta-file path and data folder.
DiskPorto = DiskLSH(
    "Porto D1",
    P_MIN_LAT, P_MAX_LAT,
    P_MIN_LON, P_MAX_LON,
    num_disks, layers, diameter,
    meta_file,
    PORTO_DATA,
)

In [13]:
# Generating the Porto hashes and storing them in the output folder along with the meta-files

hashes = DiskPorto.compute_dataset_hashes_with_KD_tree()

# One output file per key in `hashes`, one hash per line.
for key in hashes:
    # The `with` block closes the file on exit; the former bare `file.close`
    # (no parentheses) never called anything and has been dropped.
    with open(f"{OUTPUT_FOLDER_PORTO}/{key}.txt", "w") as out_file:
        # Named `hash_value` so the builtin hash() is not shadowed in the kernel namespace.
        for hash_value in hashes[key]:
            out_file.write("%s\n" % hash_value)

# Copying meta_files as well, so the output folder carries the same META files as the input
meta_files = mfh.get_meta_files(PORTO_DATA)

for filename in meta_files:
    shutil.copy(PORTO_DATA + filename, OUTPUT_FOLDER_PORTO)

# Kolumbus LSH Disk


In [14]:
# Run this cell to clear previously generated files from the Kolumbus output folder.
# NOTE(review): ".gitkeep" is presumably the file that delete_old_files leaves in place — confirm in utils.helpers.file_handler.

if SHOULD_DELETE_OLD_FILES:
    fh.delete_old_files(OUTPUT_FOLDER_KOLUMBUS, ".gitkeep")

In [15]:
# Set up the disk-based LSH scheme for the Kolumbus dataset

layers, diameter, num_disks = 4, 1.5, 50
meta_file = f"../{KOLUMBUS_OUTPUT_FOLDER}/META-100.txt"

# Positional arguments: scheme name, latitude bounds, longitude bounds,
# disk/layer configuration, meta-file path and data folder.
DiskKolumbus = DiskLSH(
    "Kolumbus D1",
    K_MIN_LAT, K_MAX_LAT,
    K_MIN_LON, K_MAX_LON,
    num_disks, layers, diameter,
    meta_file,
    KOLUMBUS_DATA,
)

In [16]:
# Generating the Kolumbus hashes and storing them in the output folder along with the meta-files
hashes = DiskKolumbus.compute_dataset_hashes_with_KD_tree()

# One output file per key in `hashes`, one hash per line.
for key in hashes:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(f"{OUTPUT_FOLDER_KOLUMBUS}/{key}.txt", "w") as out_file:
        # Named `hash_value` so the builtin hash() is not shadowed in the kernel namespace.
        for hash_value in hashes[key]:
            out_file.write("%s\n" % hash_value)

# Copying meta_files as well, so the output folder carries the same META files as the input
meta_files = mfh.get_meta_files(KOLUMBUS_DATA)

for filename in meta_files:
    shutil.copy(KOLUMBUS_DATA + filename, OUTPUT_FOLDER_KOLUMBUS)

# Measuring run-times of hash generation

The cells below are created to measure the time-efficiency of the hash computation


In [17]:
# #Cell for measuring DiskLSH hash generation times for all methods in both datasets - using coordinate hashes
# import pandas as pd
# from itertools import chain

# output_folder = "schemes/experiments/runtimes/"
# file_name = "hashing_runtimes_disk_lsh.csv"

# hashing_map = {
#     "porto_naive" : hashing.fun_wrapper_p_naive,
#     "porto_quadrants" : hashing.fun_wrapper_p_quadrants,
#     "porto_kd_tree" : hashing.fun_wrapper_p_KD_tree,
#     "rome_naive" : hashing.fun_wrapper_r_naive,
#     "rome_quadrants" : hashing.fun_wrapper_r_quadrants,
#     "rome_kd_tree" : hashing.fun_wrapper_r_KD_tree
# }

# config = {
#     "porto" : [1000, 60, 4, 2.2],
#     # "rome" : [1000, 50, 5, 1.6]

# }

# runs = 10

# df = pd.DataFrame(columns=[f"Run_{run+1}" for run in range(runs)])

# for key in hashing_map.keys():
#     with Pool() as pool:
#         result = pool.map(hashing_map[key], [config[key.split("_")[0]] for _ in range(runs)])
#         df.loc[key] = list(chain.from_iterable(result))

# df.to_csv(os.path.join(output_folder, file_name))