# Notebook for computing hashes, buckets and similarity values for the disk scheme. 

Uses the disk scheme to hash the trajectories before they are bucketed and compared.

Incorporates:
* Hashing of trajectories using disk scheme
* Bucketing of hashes made from disk scheme
* Similarity computation between trajectories within buckets.
    * Both for DTW and Frechet
* Analysis of the produced bucket system

Produces:
* JSON file containing buckets
* Similarity values for trajectories within buckets


In [None]:
import os
import sys

def find_project_root(target_folder="masteroppgave"):
    """Walk upward from the working directory until *target_folder* is found.

    The search starts at the absolute path of the literal name ``"__file__"``
    (a string, not the ``__file__`` variable), i.e. a path located directly
    inside the current working directory, and then moves up one directory at
    a time.

    Args:
        target_folder: Directory name to look for among the ancestors.

    Returns:
        The absolute path whose final component equals *target_folder*, or
        ``None`` if the filesystem root is reached without a match.
    """
    candidate = os.path.abspath("__file__")
    while os.path.basename(candidate) != target_folder:
        parent = os.path.dirname(candidate)
        if parent == candidate:
            # dirname() no longer changes the path: we are at the root.
            return None
        candidate = parent
    return candidate

# Locate the repository root and make it importable; abort if it is missing.
project_root = find_project_root("masteroppgave")

if not project_root:
    raise RuntimeError("Could not find 'masteroppgave' directory")

sys.path.append(project_root)
print(f"Project root found: {project_root}")

from computation.similarity import generate_disk_hash_similarity_with_bucketing
from utils.helpers.bucket_evaluation import *
import json
import pandas as pd


# Setup

In [611]:
# Run configuration: these values are interpolated into the path of the true
# similarity CSV and passed to the disk-hash generation call further down.
CITY = "rome" # City dataset to use: "rome" or "porto"
MEASURE = "dtw" # Similarity measure: "dtw" or "frechet"
DIAMETER = 1.6 # Diameter of the disks
LAYERS = 10 # Number of layers
DISKS = 4 # Number of disks
SIZE = 3050 # How many trajectories to use

## Retrieving the true similarity values

In [612]:
# Location of the precomputed true similarity values for this city/measure/size.
file_path = (
    f"../../../results_true/similarity_values/{CITY}/{MEASURE}/"
    f"{CITY}-{MEASURE}-{SIZE}.csv"
)

# Load the CSV, using its first column as the row labels.
true_sim_matrix_df = pd.read_csv(file_path, index_col=0)

# Function to convert values to float if possible
def convert_to_float(value):
    """Return *value* as a ``float`` when possible, otherwise *value* unchanged.

    Args:
        value: Any object, typically a single DataFrame cell.

    Returns:
        ``float(value)`` if the conversion succeeds; the original object if
        ``float()`` rejects it.
    """
    try:
        return float(value)
    except (ValueError, TypeError):
        # ValueError: unparsable string; TypeError: unsupported type (e.g. None).
        return value

# Apply the conversion to every cell of the DataFrame (element-wise `DataFrame.map`);
# cells that `convert_to_float` cannot parse as a float are kept as they are.
true_sim_matrix_df = true_sim_matrix_df.map(convert_to_float)
# Add the matrix to its own transpose (pandas aligns the operands on labels).
# NOTE(review): presumably the CSV stores only one triangle of the matrix with
# the other half zero; if it already holds the full symmetric matrix this
# doubles every value — confirm against the file that produced the CSV.
true_sim_matrix_df = (true_sim_matrix_df + true_sim_matrix_df.T)

## Generate hashes with disk scheme, bucket system and similarity values for the given city and measure

In [None]:
# Run the disk-scheme pipeline for the configured city and measure; the call
# yields the per-trajectory similarities and the bucket system used below.
hashed_similarities, bucket_system = generate_disk_hash_similarity_with_bucketing(
    city=CITY,
    measure=MEASURE,
    size=SIZE,
    diameter=DIAMETER,
    layers=LAYERS,
    disks=DISKS,
)

In [607]:
# hashed_similarities.head(40)

# Bucket analysis 

## Bucket stats

In [None]:
# Number of trajectories in every bucket, keyed by bucket id. Computed once so
# the statistics below do not have to re-measure each bucket.
bucket_sizes = {
    bucket_id: len(trajectories)
    for bucket_id, trajectories in bucket_system.items()
}

total_buckets = len(bucket_sizes)
buckets_with_multiple = sum(1 for size in bucket_sizes.values() if size > 1)
# Everything that is not a multi-trajectory bucket is counted as "single".
buckets_with_single = total_buckets - buckets_with_multiple

# `default=None` keeps an empty bucket system from raising ValueError, so the
# `total_buckets > 0` guards further down are actually reachable.
largest_bucket = max(bucket_sizes, key=bucket_sizes.get, default=None)
smallest_bucket = min(bucket_sizes, key=bucket_sizes.get, default=None)
largest_bucket_size = bucket_sizes.get(largest_bucket, 0)
smallest_bucket_size = bucket_sizes.get(smallest_bucket, 0)


print(f"Total Buckets: {total_buckets}")
print(f"Largest Bucket(id): {largest_bucket}")
print(f"Buckets with more than one trajectory: {buckets_with_multiple}")
print(f"Buckets with only one trajectory: {buckets_with_single}")
print(f"Largest Bucket Size: {largest_bucket_size}")
print("smallest bucket: ", smallest_bucket_size)

# Optional: Display distribution percentages (0 when there are no buckets)
multiple_bucket_percentage = (buckets_with_multiple / total_buckets) * 100 if total_buckets > 0 else 0
single_bucket_percentage = (buckets_with_single / total_buckets) * 100 if total_buckets > 0 else 0

print(f"Percentage of buckets with more than one trajectory: {multiple_bucket_percentage:.2f}%")
print(f"Percentage of buckets with only one trajectory: {single_bucket_percentage:.2f}%")



# print(bucket_system[largest_bucket])
# print(bucket_system[smallest_bucket])

# for key, value in bucket_system.items():
#     print(key, value)

## TP, FP, FN, PRECISION, RECALL

In [None]:
import numpy as np

# Thresholds to evaluate: np.arange(1, 6.0, 1) yields [1.0, 2.0, 3.0, 4.0, 5.0]
THRESHOLDS = np.arange(1, 6.0, 1)

# One list per metric; each list gets one entry per threshold, in order.
results = {
    "Precision": [],
    "Recall": [],
    "F1 Score": []
}

# The set of trajectory names does not depend on the threshold: build it once.
all_trajectory_names = list(hashed_similarities.keys())

for threshold in THRESHOLDS:

    # Counts accumulated over all trajectories for the current threshold
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for trajectory in all_trajectory_names:

        # Prediction from the bucket system vs. ground truth from the true
        # similarity matrix at the current threshold
        predicted_similar = find_predicted_similar_trajectories(trajectory, bucket_system)
        ground_truth = get_nearest_neighbour_under_threshold(
            trajectory, threshold, true_sim_matrix_df
        ).index.to_list()

        true_positives += calculate_true_positives(predicted_similar, ground_truth)
        false_positives += calculate_false_positives(predicted_similar, ground_truth)
        false_negatives += calculate_false_negatives(predicted_similar, ground_truth)

    # Aggregate metrics for this threshold
    precision = compute_bucket_system_precision(true_positives, false_positives)
    recall = compute_bucket_system_recall(true_positives, false_negatives)
    f1_score = compute_bucket_system_f1_score(precision, recall)

    results["Precision"].append(precision)
    results["Recall"].append(recall)
    results["F1 Score"].append(f1_score)


print(f"Bucket system statistics for city: {CITY}, measure: {MEASURE}, diameter: {DIAMETER}, layers: {LAYERS}, disks: {DISKS}, size: {SIZE}")
# Create DataFrame with thresholds as columns and metrics as row indexes
df = pd.DataFrame(results, index=[f"Threshold = {t}" for t in THRESHOLDS]).T

df

# Write to CSV