# Notebook for computing hashes, buckets and similarity values for the grid scheme. 

Utilizes the grid scheme

Incorporates:
* Hashing of trajectories using grid scheme
* Bucketing of hashes made from grid scheme
* Similarity computation between trajectories within buckets.
    * Both for DTW and Frechet
* Analysis of the produced bucket system

Produces:
* JSON file containing buckets
* Similarity values for trajectories within buckets


In [None]:

# Importing necessary modules
import os
import sys

def find_project_root(target_folder="masteroppgave", start_dir=None):
    """Find the absolute path of an ancestor folder by searching upward.

    The search starts at ``start_dir`` and walks towards the filesystem
    root, returning the first directory whose name equals ``target_folder``.

    Args:
        target_folder: Name of the directory to look for.
        start_dir: Directory to start the search from. Defaults to the
            current working directory, because notebooks do not define
            ``__file__``.

    Returns:
        The absolute path of the matching directory, or ``None`` when the
        filesystem root is reached without a match.
    """
    # Start explicitly from a real directory; resolving the quoted string
    # "__file__" would only yield a fake entry inside the working directory.
    currentdir = os.path.abspath(os.getcwd() if start_dir is None else start_dir)
    while True:
        if os.path.basename(currentdir) == target_folder:
            return currentdir  # Found the target folder
        parentdir = os.path.dirname(currentdir)
        if parentdir == currentdir:  # Stop at filesystem root
            return None
        currentdir = parentdir  # Move one level up

# Locate the repository root so that project packages become importable.
project_root = find_project_root("masteroppgave")

# Fail early: the project imports cannot work without the root on sys.path.
if not project_root:
    raise RuntimeError("Could not find 'masteroppgave' directory")

sys.path.append(project_root)
print(f"Project root found: {project_root}")


#Other imports
from computation.similarity import generate_grid_hash_similarity_with_bucketing
from utils.helpers.bucket_evaluation import *
import json
import pandas as pd



# Setup

In [None]:
# Experiment configuration for this notebook run.
CITY = "rome" # Dataset city: "rome" or "porto"
MEASURE = "dtw" # Similarity measure: "dtw" or "frechet"
RESOLUTION = 10 # Resolution of the grid (passed as `res` to the grid hashing routine)
LAYERS = 3 # Number of layers
SIZE = 3050 # How many trajectories to use; also part of the true-similarity CSV file name

## Retrieving the true similarity values

In [None]:
# CSV holding the true similarity values for the configured city, measure
# and number of trajectories.
file_path = (
    "../../../results_true/similarity_values/"
    f"{CITY}/{MEASURE}/{CITY}-{MEASURE}-{SIZE}.csv"
)

# Load the matrix; the first CSV column supplies the row labels.
true_sim_matrix_df = pd.read_csv(file_path, index_col=0)

# Function to convert values to float if possible
def convert_to_float(value):
    """Return ``value`` converted to ``float`` when possible, else unchanged.

    Args:
        value: Any cell value, e.g. a number or a numeric string.

    Returns:
        ``float(value)`` if the conversion succeeds; otherwise the original
        ``value`` object.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: strings that are not numbers.
        # TypeError: objects float() cannot accept at all (e.g. None, lists).
        return value

# Convert every cell to float where possible; other cells are left as-is.
true_sim_matrix_df = true_sim_matrix_df.map(convert_to_float)
# Add the transpose element-wise.
# NOTE(review): presumably the CSV stores only one triangle of the matrix, so
# this yields the full symmetric matrix - confirm (a non-zero diagonal or an
# already-full matrix would be doubled).
true_sim_matrix_df = true_sim_matrix_df.add(true_sim_matrix_df.T)

## Generate hashes with grid scheme, bucket system and similarity values for the given city and measure

# Rome


### DTW


In [None]:
# Creates buckets and similarity matrix for the configured city, grid
# resolution, number of layers, measure and number of trajectories.
# Later cells use `bucket_system` as a mapping whose values are collections of
# trajectories, and `hashed_similarities` as a mapping keyed by trajectory name.
hashed_similarities, bucket_system = generate_grid_hash_similarity_with_bucketing(
    city=CITY, res=RESOLUTION, layers=LAYERS, measure=MEASURE, size=SIZE
)

# Bucket analysis 

## Bucket stats

In [None]:
# Summary statistics of the bucket system.
# Bucket lengths are computed once and reused by every statistic below.
bucket_sizes = {
    bucket_id: len(trajectories)
    for bucket_id, trajectories in bucket_system.items()
}

total_buckets = len(bucket_sizes)
buckets_with_multiple = sum(1 for size in bucket_sizes.values() if size > 1)
# Everything that is not a multi-trajectory bucket is counted as "single".
buckets_with_single = total_buckets - buckets_with_multiple

# max() raises ValueError on an empty iterable; use a default so an empty
# bucket system is reported instead of crashing, consistent with the
# zero-guard on the percentages below.
largest_bucket = max(bucket_sizes, key=bucket_sizes.get, default=None)
largest_bucket_size = bucket_sizes.get(largest_bucket, 0)

print(f"Total Buckets: {total_buckets}")
print(f"Largest Bucket(id): {largest_bucket}")
print(f"Buckets with more than one trajectory: {buckets_with_multiple}")
print(f"Buckets with only one trajectory: {buckets_with_single}")
print(f"Largest Bucket Size: {largest_bucket_size}")

# Optional: Display distribution percentages (0 when there are no buckets)
multiple_bucket_percentage = (buckets_with_multiple / total_buckets) * 100 if total_buckets > 0 else 0
single_bucket_percentage = (buckets_with_single / total_buckets) * 100 if total_buckets > 0 else 0

print(f"Percentage of buckets with more than one trajectory: {multiple_bucket_percentage:.2f}%")
print(f"Percentage of buckets with only one trajectory: {single_bucket_percentage:.2f}%")

## TP, FP, FN, PRECISION, RECALL

In [None]:
# Threshold passed to get_nearest_neighbour_under_threshold when building the
# ground-truth set for each trajectory.
THRESHOLD = 2.5

# Counters accumulated over every trajectory
all_trajectory_names = list(hashed_similarities.keys())  # All trajectory names
true_positives = 0
false_positives = 0
false_negatives = 0

# Compare the bucket-based prediction with the ground truth per trajectory
for trajectory in all_trajectory_names:
    # Prediction from the bucket system
    predicted_similar = find_predicted_similar_trajectories(trajectory, bucket_system)
    # Ground truth from the true similarity matrix
    ground_truth = get_nearest_neighbour_under_threshold(
        trajectory, THRESHOLD, true_sim_matrix_df
    ).index.to_list()

    true_positives += calculate_true_positives(predicted_similar, ground_truth)
    false_positives += calculate_false_positives(predicted_similar, ground_truth)
    false_negatives += calculate_false_negatives(predicted_similar, ground_truth)

# Calculate precision, recall and F1 from the accumulated counts
precision = compute_bucket_system_precision(true_positives, false_positives)
recall = compute_bucket_system_recall(true_positives, false_negatives)
f1_Score = compute_bucket_system_f1_score(precision, recall)

# Report the grid-scheme parameters defined in the setup cell; this notebook
# has no DIAMETER or DISKS settings, so referencing them raised NameError.
print(f"Bucket system statistics for city: {CITY}, measure: {MEASURE}, resolution: {RESOLUTION}, layers: {LAYERS}, size: {SIZE}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1_Score}")
    