# Notebook for generating ground truth for bucketing. 

Takes a dataset of pairwise trajectory similarity values and builds, for each trajectory in the dataset, a dictionary entry listing the other trajectories whose similarity value to it is below the threshold.

## Code for checking true similarity between a pair of trajectories

In [2]:
import pandas as pd

ROME_TRUE_SIMILARITY_FILE = "../../results_true/similarity_values/rome/dtw/rome-dtw-3050.csv"

def get_true_similarity(filename1: str, filename2: str) -> float | None:
    """
    Find the true similarity between two trajectory filenames using a similarity matrix file.

    Args:
        filename1 (str): First trajectory file (with or without `.txt`).
        filename2 (str): Second trajectory file (with or without `.txt`).

    Returns:
        float | None: The similarity value if found, otherwise None.
    """
    # Load the similarity matrix CSV
    similarity_df = pd.read_csv(ROME_TRUE_SIMILARITY_FILE, index_col=0)
    
    # Clean file names by removing '.txt'
    t1_clean = filename1.replace('.txt', '')
    t2_clean = filename2.replace('.txt', '')

    # Ensure correct row-column order for the matrix
    if t1_clean < t2_clean:
        t1_clean, t2_clean = t2_clean, t1_clean

    # Check if both are in the DataFrame
    if t1_clean in similarity_df.index and t2_clean in similarity_df.columns:
        print(f"Accessing row {t1_clean} and column {t2_clean}")
        return float(similarity_df.loc[t1_clean, t2_clean])
    elif t2_clean in similarity_df.index and t1_clean in similarity_df.columns:
        print(f"Accessing row {t2_clean} and column {t1_clean}")
        return float(similarity_df.loc[t2_clean, t1_clean])
    else:
        print(f"Missing pair in similarity matrix: {filename1}, {filename2}")
        return None
    

# Example lookup: true similarity between trajectories R_CAV and R_DVK
get_true_similarity("R_CAV", "R_DVK")


Accessing row R_DVK and column R_CAV


0.4753952612424382

## Setup

In [120]:
# Dataset configuration. CITY, MEASURE and NUMBER_OF_TRAJECTORIES are used for the file path.
CITY = "rome"  # city to use
MEASURE = "dtw"  # similarity measure to use
NUMBER_OF_TRAJECTORIES = 3050  # number of trajectories (selects which metafile to use)
THRESHOLD = 0.1  # threshold for the similarity value

## Importing data

In [None]:
import pandas as pd
import numpy as np

# Similarity matrix CSV for the configured city, measure and number of trajectories
file_path = f"../../results_true/similarity_values/{CITY}/{MEASURE}/{CITY}-{MEASURE}-{NUMBER_OF_TRAJECTORIES}.csv"

# The first CSV column holds the row labels, so it becomes the DataFrame index
similarity_df = pd.read_csv(filepath_or_buffer=file_path, index_col=0)

def convert_to_float(value):
    """
    Convert a value to float when possible.

    Args:
        value: Any cell value (number, numeric string, or something else).

    Returns:
        The value as a float, or the original value unchanged when it cannot
        be converted.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: non-numeric string; TypeError: None or another
        # object float() cannot handle. Either way, keep the value as it is.
        return value

# Coerce every cell to float where possible (element-wise)
similarity_df = similarity_df.map(convert_to_float)
column_names = similarity_df.columns
number_of_trajectories_in_dataframe = len(column_names)

print(f"Currently working with true similarity values for {CITY} using {MEASURE} with {NUMBER_OF_TRAJECTORIES} trajectories")

print(f"Number of trajectories: {number_of_trajectories_in_dataframe}")
# print(f"Column names: {column_names}")

# Add the matrix to its transpose to make it symmetric.
# NOTE(review): presumably the CSV stores only one triangle with zeros elsewhere;
# a matrix that is already full would have every value doubled - confirm.
symmetric_df = similarity_df.add(similarity_df.T)
# Show the first 40 rows
symmetric_df.head(40)



### Print a single trajectory's column from the dataset

In [None]:
# Select the R_ABU column; the list selector keeps the result a one-column DataFrame
single_row = symmetric_df.loc[:, ["R_ABU"]]
single_row

### Find lowest similarity value in dataset

In [None]:
# Blank out the diagonal (self-similarity) with NaN. mask() returns a new
# DataFrame, so the original is untouched and nothing is written through
# `.values`, which may be a copy or a read-only view.
df_no_diag = symmetric_df.mask(np.eye(*symmetric_df.shape, dtype=bool))

# Flatten to a Series indexed by (row, column) so the value and its location
# come from the same data
off_diagonal_values = df_no_diag.stack()

# Find the minimum value excluding the diagonal
min_value = off_diagonal_values.min()

# Find the corresponding row and column (trajectory names)
min_location = off_diagonal_values.idxmin()

# Print results
print(f"Lowest similarity value (excluding diagonal): {min_value}")
print(f"Between trajectories: {min_location[0]} and {min_location[1]}")


### Find the average similarity over all row × column pairs

In [None]:
# Work on a float NumPy copy of the symmetric matrix so the DataFrame is left untouched
avg_df = symmetric_df.copy()
similarity_matrix = avg_df.to_numpy().astype(float)

# Mean over every cell, self-similarity on the diagonal included
mean_including_diagonal = np.mean(similarity_matrix)

# Replace the diagonal with NaN, then take the mean that skips NaN values
np.fill_diagonal(similarity_matrix, np.nan)
mean_excluding_diagonal = np.nanmean(similarity_matrix)

# Print results
print(f"Mean similarity including diagonal: {mean_including_diagonal}")
print(f"Mean similarity excluding diagonal: {mean_excluding_diagonal}")

## Generate ground truth for trajectories in dataset

In [125]:
# One single-key dictionary per trajectory:
# {trajectory name: [names of other trajectories with similarity below THRESHOLD]}
result_df = []

# Iterate over the symmetric frame's own columns rather than a positional range,
# so the loop always matches the frame being read.
for trajectory, similarities in symmetric_df.items():
    # Rows whose similarity to this trajectory is below the threshold
    below_threshold = (similarities < THRESHOLD).to_numpy()

    # Exclude the trajectory's own row (self-reference)
    not_self = similarities.index != trajectory

    matches = similarities.index[below_threshold & not_self]

    # Store the matching trajectory names under the column's name
    result_df.append({f"{trajectory}": matches.tolist()})

### Print all lists

In [None]:
# Print every entry of result_df, one per line
for entry in result_df:
    print(entry)

### Print one list and corresponding values from dataframe

In [None]:
# Trajectory whose list of matches should be shown
TRAJECTORY = "R_ABU"
values = []

for item in result_df:
    # Test for the key itself: a trajectory with an empty match list is still
    # found, which a truthiness check on item.get(...) would miss.
    if TRAJECTORY in item:
        values = item[TRAJECTORY]
        print(f"{TRAJECTORY}: {values}")
        break

# Show the matched trajectories with their similarity values from the
# trajectory's column
filtered_column = symmetric_df[[TRAJECTORY]]
filtered_result = filtered_column.loc[values]
filtered_result
