In [None]:
from datetime import datetime
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

BASE_DATA_PATH = os.path.join("out")
OUTPUT_FOLDER = os.path.join("out", "plots")

# Format of the timestamp embedded in benchmark filenames: <prefix>-<timestamp>.csv
_TIMESTAMP_FORMAT = "%Y-%m-%d-%H:%M:%S"

def _timestamp_from_filename(path):
    """Return the timestamp string embedded in a `<prefix>-<timestamp>.csv` path.

    Only the basename is inspected, so hyphens in parent folders cannot leak
    into the timestamp. Everything after the first '-' of the basename is
    treated as the timestamp.
    """
    filename = os.path.basename(path)
    return '-'.join(filename.split('-')[1:]).replace('.csv', '')

def get_dataset_file(cloud_name, timestamp = "latest", data_path = BASE_DATA_PATH):
    """Load one benchmark CSV of a point cloud as a DataFrame.

    CSV files are looked up in `<data_path>/<cloud_name>/*.csv`.

    Args:
        cloud_name: Name of the sub-folder holding the CSV files of the cloud.
        timestamp: Either 'latest' (pick the file with the most recent
            timestamp in its name) or the exact timestamp string of the
            wanted file, e.g. '2024-03-01-09:30:00'.
        data_path: Root folder containing one sub-folder per cloud.

    Returns:
        The loaded DataFrame, with the 'mean', 'stdev' and 'warmup_time'
        columns multiplied by 1000 (conversion to milliseconds; presumably
        the files store seconds).

    Raises:
        FileNotFoundError: If the folder has no CSV files, or no file matches
            the requested timestamp.
        ValueError: With timestamp='latest', if a filename does not follow the
            `<prefix>-%Y-%m-%d-%H:%M:%S.csv` pattern.
    """
    # Get all CSV files in the folder
    csv_folder = os.path.join(data_path, cloud_name)
    csv_files = glob.glob(os.path.join(csv_folder, "*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in the folder: {csv_folder}")

    if timestamp == 'latest':
        # Parse filenames and find the latest based on the timestamp in the name
        file = max(
            csv_files,
            key=lambda path: datetime.strptime(_timestamp_from_filename(path), _TIMESTAMP_FORMAT),
        )
        print(f"Loading latest file: {file}")
    else:
        # Exact match against the timestamp part of the filename (ignoring the prefix)
        matches = [path for path in csv_files if _timestamp_from_filename(path) == timestamp]
        if not matches:
            raise FileNotFoundError(f"File with date '{timestamp}' not found in folder: {csv_folder}")
        # If several prefixes share the timestamp, keep the last match found
        file = matches[-1]
        print(f"Loading file: {file}")

    df = pd.read_csv(file)
    # Convert times to milliseconds
    for time_column in ('mean', 'stdev', 'warmup_time'):
        df[time_column] = df[time_column] * 1000
    return df

def read_multiple_datasets(clouds_datasets, data_path = BASE_DATA_PATH):
    """Load the latest benchmark DataFrame of every cloud.

    Args:
        clouds_datasets: Mapping whose keys are cloud names. The values are
            not used when loading the files.
        data_path: Root folder containing one sub-folder per cloud.

    Returns:
        A dict mapping each cloud name to the DataFrame returned by
        `get_dataset_file(cloud, "latest", data_path)`.
    """
    return {
        cloud: get_dataset_file(cloud, "latest", data_path)
        for cloud, _ in clouds_datasets.items()
    }

def output_fig(fig, filename, dataset = "all", cloud = None):
    """Save `fig` under OUTPUT_FOLDER and close it.

    The target folder is `OUTPUT_FOLDER/<dataset>` or, when `cloud` is given,
    `OUTPUT_FOLDER/<dataset>/<cloud>`. It is created if it does not exist.
    The figure is written at 300 dpi with a tight bounding box.
    """
    folder_parts = [OUTPUT_FOLDER, dataset]
    if cloud is not None:
        folder_parts.append(cloud)
    target_dir = os.path.join(*folder_parts)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    target_path = os.path.join(target_dir, filename)
    fig.savefig(target_path, dpi=300, bbox_inches='tight')
    # Release the figure so batches of exports do not accumulate open figures
    plt.close(fig)

In [None]:
# Global matplotlib style shared by every plot in the notebook, applied group by group.

# Typography: sans-serif family with an explicit fallback chain, plus per-element sizes
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'Helvetica', 'DejaVu Sans'],
    'font.size': 10,
    'axes.labelsize': 9,
    'axes.titlesize': 11,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 8,
    'figure.titlesize': 12,
})

# Grid: thin, solid, light gray horizontal lines drawn below the data
plt.rcParams.update({
    'axes.grid': True,
    'axes.grid.axis': 'y',
    'grid.linestyle': '-',
    'grid.linewidth': 0.4,
    'grid.color': '#CCCCCC',
    'axes.axisbelow': True,
})

# Figure: publication-friendly size on a white background.
# dpi 100 keeps the inline preview small (300 was the alternative kept for reference).
plt.rcParams.update({
    'figure.figsize': (6, 4),
    'figure.dpi': 100,
    'figure.facecolor': 'white',
    'figure.edgecolor': 'white',
})

# Lines: consistent thickness and marker size
plt.rcParams.update({
    'lines.linewidth': 1.0,
    'lines.markersize': 4,
})

In [None]:
# Some aux. functions for plot information
def add_execution_details(cloud, dataset, searches, repeats, points, fig, h_ex=0.93):
    """Write a three-line summary of a benchmark run onto `fig`.

    The lines show the number of searches and repeats, the point cloud name
    with its number of points, and the dataset name. The text is placed at
    x=0.10, y=`h_ex` (the coordinates are passed straight to `fig.text`),
    left/top aligned, in gray monospace.
    """
    summary = (
        f"- {searches:,} searches x {repeats:,} repeats\n"
        f"- Point cloud: {cloud} ({points:,} points)\n"
        f"- Dataset: {dataset}"
    )
    text_style = dict(
        fontfamily='monospace',
        color='#505050',
        ha='left',
        va='top',
        linespacing=1.3,
    )
    fig.text(0.10, h_ex, summary, **text_style)
    
def add_execution_details_multiple_datasets(clouds, searches, repeats, fig, h_ex=0.90):
    """Write a two-line summary of a multi-cloud benchmark onto `fig`.

    The lines show the number of searches and repeats and how many point
    clouds were used (`len(clouds)`). The text is placed at x=0.10,
    y=`h_ex`, left/top aligned, in gray monospace.
    """
    summary = (
        f"- {searches:,} searches x {repeats:,} repeats\n"
        f"- {len(clouds)} point clouds"
    )
    text_style = dict(
        fontfamily='monospace',
        color='#505050',
        ha='left',
        va='top',
        linespacing=1.3,
    )
    fig.text(0.10, h_ex, summary, **text_style)

def add_title_subtitle(title, subtitle, fig, h_title=0.98, h_subtitle=0.95):
    """Draw a bold title and an italic gray subtitle on `fig`.

    Both texts are left/top aligned at x=0.10; `h_title` and `h_subtitle`
    are their y positions (passed straight to `fig.text`).
    """
    anchor = dict(ha='left', va='top')

    # Title first, then the subtitle below it
    fig.text(0.10, h_title, title, fontsize=16, fontweight='bold', **anchor)
    fig.text(0.10, h_subtitle, subtitle, fontsize=12, fontstyle='italic', color='#404040', **anchor)

def add_octree_types_legend(legend_handles, legend_labels, legend_title, fig):
    """Attach a figure-level legend to `fig`.

    The legend uses the given handles, labels and title, with
    loc="upper right" and bbox_to_anchor=(0.9, 1).
    """
    legend_options = {
        "title": legend_title,
        "loc": "upper right",
        "bbox_to_anchor": (0.9, 1),
    }
    fig.legend(legend_handles, legend_labels, **legend_options)

In [None]:
# These dicts store, for each visualization, the type parameters being compared, the available
# combinations of those parameters, and the color palette (one color per combination)
def _make_types_info(type_parameters, palette):
    """Bundle type parameters, their available combinations and the palette.

    The combinations are read from the palette keys (in insertion order), one
    DataFrame column per type parameter, so they always match the palette.
    """
    combinations = list(palette)
    return {
        "type_parameters": type_parameters,
        "available_types": pd.DataFrame({
            parameter: [combination[position] for combination in combinations]
            for position, parameter in enumerate(type_parameters)
        }),
        "palette": palette,
    }

TYPES_INFO_OCTREE_ENCODER = _make_types_info(
    ['octree', 'encoder'],
    {
        ('LinearOctree', 'HilbertEncoder3D'): '#1984c5',
        ('LinearOctree', 'MortonEncoder3D'): '#63bff0',
        ('Octree', 'HilbertEncoder3D'): '#c23728',
        ('Octree', 'MortonEncoder3D'): '#de6e56',
        ('Octree', 'Unencoded'): '#e1a692',
    },
)

TYPES_INFO_OCTREE_POINT = _make_types_info(
    ['octree', 'point_type'],
    {
        ('LinearOctree', 'Point'): '#0f5f87',
        ('LinearOctree', 'Lpoint64'): '#1984c5',
        ('LinearOctree', 'Lpoint'): '#63bff0',
        ('Octree', 'Point'): '#9f1b17',
        ('Octree', 'Lpoint64'): '#de6e56',
        ('Octree', 'Lpoint'): '#e1a692',
    },
)

In [None]:
# Per-experiment result folders, all rooted at BASE_DATA_PATH
(
    OCTREE_COMP_DATA_PATH,
    ALGO_COMP_DATA_PATH,
    POINT_COMP_DATA_PATH,
    APPROX_SEARCH_DATA_PATH,
) = (
    os.path.join(BASE_DATA_PATH, experiment)
    for experiment in ("octree_comp", "algo_comp", "point_comp", "approx_search")
)

In [None]:
# tpp plots
TPP_DATA_PATH = os.path.join("out_old", "tpp_comp")

# Clouds used in the tpp plots, grouped by the dataset they belong to
_TPP_CLOUDS_BY_DATASET = {
    "alcoy": ["alcoy"],
    "Paris_Lille": ["Lille_0"],
    "Dales_LAS": ["5110_54320", "5135_54435"],
    "Semantic3D": [
        "bildstein_station1_xyz_intensity_rgb",
        "sg27_station8_intensity_rgb",
        "station1_xyz_intensity_rgb",
    ],
    "Speulderbos": ["Speulderbos_2017_TLS"],
}

# Flat cloud -> dataset mapping, in the same order as the groups above
CLOUDS_DATASETS_TPP = {
    cloud: dataset
    for dataset, clouds in _TPP_CLOUDS_BY_DATASET.items()
    for cloud in clouds
}

In [None]:
# Ideas for plots over multiple datasets:
# 1. Fix the kernel, the radius (e.g. the maximum one) and an operation. Iterate over each dataframe and octree implementation.
# Plot dataset size vs. execution time / avg_result_size. Each point of the line graph should carry the dataset name so it is easy to interpret.
# Plot one line per octree implementation.
# Dataset sizes are sorted and shown on a log-scale x axis.
# It should show that a bigger dataset makes the efficiency per point found worse, since there will be more cache misses and more overhead.
# The difficulty is choosing a good radius/kernel so that all datasets have a similar avg_result_size, because if it differs a lot they cannot be compared fairly:
# when avg_result_size is very small, the time per point found is higher because of the overheads. So I implemented several methods: one fixes the radius,
# another picks, for each dataset, the radius with the largest avg_result_size, and the last one picks the radius closest to a target value (avg_size_target).
# The last two methods choose a different radius for each dataframe, and the last one is the most interesting in my opinion.

# 2. Fix the kernel and, for each radius (not all of them will be available in every dataset because of the different densities), plot each of the runtimes
# for each octree type. It is the simplest option but it is not independent of the point density. It is the direct generalization of the earlier plots and, unlike
# the previous idea, it does not show how the total dataset size affects efficiency.

# Another idea (TODO): fix a kernel and an operation, iterate over each dataset, radius and octree implementation.
# Plot the avg_result_size of each (dataset, radius) pair vs. the execution time of each octree implementation. One line in a line graph
# per octree implementation. The avg_result_sizes are sorted by size; obviously a bigger radius => bigger avg_result_size, but not necessarily
# across datasets, because the density varies. The problem I see with this idea is that you cannot see how the dataset size affects performance, since the
# dataset of each avg_result_size is not plotted, so it is only an extension of the plots that could be made with a single dataframe.

In [None]:
# Idea 1.

# tpp = time per point found (ms/point)
# This one is not used right now because I did not do an execution with all datasets having same radius
def tpp_fixed_radius(clouds_datasets, kernel, radius, operation, operation_name, types_info=TYPES_INFO_OCTREE_ENCODER, data_path=TPP_DATA_PATH):
    """Plot the time per point found (ms/point) against dataset size for a fixed radius.

    The latest benchmark CSV of every cloud in `clouds_datasets` is loaded and
    filtered by `kernel`, `radius` and `operation`. For each implementation
    listed in `types_info["available_types"]`, the `mean` runtime is divided
    by the dataset's `avg_result_size`. One line per implementation is drawn
    over a log-scaled x axis of dataset sizes (`npoints`).

    Datasets or implementations without matching rows are reported with a
    printed warning and skipped.

    Args:
        clouds_datasets: Mapping whose keys are the cloud names to load.
        kernel: Value of the 'kernel' column to keep.
        radius: Value of the 'radius' column to keep.
        operation: Value of the 'operation' column to keep.
        operation_name: Human-readable operation name used in the title.
        types_info: Dict with 'type_parameters', 'available_types' and
            'palette' describing the implementations to compare.
        data_path: Root folder containing one sub-folder per cloud.

    Returns:
        The matplotlib figure.

    Raises:
        ValueError: If no dataset has rows for the requested kernel, radius
            and operation.
    """
    dfs = read_multiple_datasets(clouds_datasets, data_path)
    type_columns = types_info["type_parameters"]
    fig, ax = plt.subplots(figsize=(12, 6))
    implementation_data = {}
    legend_handles, legend_labels = [], []
    x_labels, x_ticks = [], []
    # Taken from the datasets themselves; expected to be the same in all of them
    nsearches, nrepeats = None, None

    for df_name, df in dfs.items():
        df = df[(df['kernel'] == kernel) & (df['radius'] == radius) & (df['operation'] == operation)]
        df = df.reset_index(drop=True)
        if df.empty:
            print("Warning, no data with kernel = ", kernel,
                  ", radius = ", radius, " and operation = ", operation, " at dataset ", df_name)
            continue

        avg_result_size = df['avg_result_size'].iloc[0]
        dataset_size = df['npoints'].iloc[0]
        nsearches, nrepeats = df['num_searches'].iloc[0], df['repeats'].iloc[0]
        x_labels.append(f"{df_name}\nN = {dataset_size}")
        x_ticks.append(dataset_size)

        for _, params in types_info["available_types"].iterrows():
            key = tuple(params[col] for col in type_columns)
            df_data = df[
                (df[type_columns] == pd.Series(key, index=type_columns)).all(axis=1)
            ]
            if df_data.empty:
                print("Warning, no data for ", key, " with kernel = ", kernel,
                      ", radius = ", radius, " and operation = ", operation, " at dataset ", df_name)
                # Nothing to plot for this implementation in this dataset
                continue

            norm_time = df_data['mean'].iloc[0] / avg_result_size # ms / point
            series = implementation_data.setdefault(key, {'sizes': [], 'times': []})
            series['sizes'].append(dataset_size)
            series['times'].append(norm_time)

    if nsearches is None:
        # Do not leave an empty figure open when there is nothing to plot
        plt.close(fig)
        raise ValueError(
            f"No data found for kernel '{kernel}', radius {radius} and operation '{operation}'"
        )

    # Plot a line for each implementation
    for key, data in implementation_data.items():
        # Sort by size to ensure proper line connection
        sizes = np.array(data['sizes'])
        times = np.array(data['times'])
        sort_idx = np.argsort(sizes)

        formatted_label = ", ".join(f"{value}" for value in key)
        line = ax.plot(sizes[sort_idx], times[sort_idx], 'o-',
                      label=formatted_label,
                      color=types_info["palette"][key],
                      linewidth=2, markersize=8)[0]

        if formatted_label not in legend_labels:
            legend_handles.append(line)
            legend_labels.append(formatted_label)

    ax.set_xscale('log')

    # One tick per dataset, labelled with its name and size
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_labels, rotation=45, ha='right')

    ax.set_ylabel('Average time per point found (ms/point)', fontsize=12)

    ax.grid(True, which="both", ls="-", alpha=0.2)

    add_octree_types_legend(legend_handles, legend_labels, "Octree type", fig)
    add_title_subtitle(
        f"{operation_name} performance analysis",
        f"Average time per point found using {kernel} kernel and radius {radius}",
        fig,
        h_title = 0.98,
        h_subtitle  = 0.93
    )

    add_execution_details_multiple_datasets(dfs.keys(), nsearches, nrepeats, fig, h_ex = 0.88)

    plt.subplots_adjust(right=0.85, top=0.75)  # Make room for legend and title

    return fig



In [None]:
# Getting the biggest avg_result_size instead of fixing kernel,radius so datasets with different densities 
# all have a lot of points found in the search

def tpp_max_avg_result_size(clouds_datasets, kernel, operation, operation_name, types_info=TYPES_INFO_OCTREE_ENCODER, data_path=TPP_DATA_PATH):
    """Plot the time per point found (ms/point) against dataset size, at each dataset's largest avg_result_size.

    The latest benchmark CSV of every cloud in `clouds_datasets` is loaded and
    filtered by `kernel` and `operation`. For each dataset, the row with the
    largest `avg_result_size` selects the radius. For each implementation in
    `types_info["available_types"]`, the `mean` runtime measured at that
    radius is divided by that `avg_result_size`. One line per implementation
    is drawn over a log-scaled x axis of dataset sizes (`npoints`).

    Datasets or implementations without matching rows are reported with a
    printed warning and skipped.

    Args:
        clouds_datasets: Mapping whose keys are the cloud names to load.
        kernel: Value of the 'kernel' column to keep.
        operation: Value of the 'operation' column to keep.
        operation_name: Human-readable operation name used in the title.
        types_info: Dict with 'type_parameters', 'available_types' and
            'palette' describing the implementations to compare.
        data_path: Root folder containing one sub-folder per cloud.

    Returns:
        The matplotlib figure.

    Raises:
        ValueError: If no dataset has rows for the requested kernel and
            operation.
    """
    dfs = read_multiple_datasets(clouds_datasets, data_path)
    type_columns = types_info["type_parameters"]
    fig, ax = plt.subplots(figsize=(12, 6))
    implementation_data = {}
    legend_handles, legend_labels = [], []
    x_labels, x_ticks = [], []
    # Taken from the datasets themselves; expected to be the same in all of them
    nsearches, nrepeats = None, None

    for df_name, df in dfs.items():
        # Filter the dataframe by kernel and operation
        df = df[(df['kernel'] == kernel) & (df['operation'] == operation)]
        # Reset the index so the label returned by idxmax is a clean 0..n-1 position
        df = df.reset_index(drop=True)
        if df.empty:
            print("Warning, no data with kernel = ", kernel,
                  " and operation = ", operation, " at dataset ", df_name)
            continue

        # Row with the largest avg_result_size: it decides the radius used for this dataset
        max_row = df.loc[df['avg_result_size'].idxmax()]
        avg_result_size, radius, dataset_size = max_row['avg_result_size'], max_row['radius'], max_row['npoints']
        nsearches, nrepeats = max_row['num_searches'], max_row['repeats']

        print(f"Dataset: {df_name}, radius taken: {radius}, max avg_result_size: {avg_result_size}")

        x_labels.append(f"{df_name}\nN = {dataset_size}")
        x_ticks.append(dataset_size)

        # Only runtimes measured at the chosen radius may be divided by its avg_result_size
        df_at_radius = df[df['radius'] == radius]

        for _, params in types_info["available_types"].iterrows():
            key = tuple(params[col] for col in type_columns)
            df_data = df_at_radius[
                (df_at_radius[type_columns] == pd.Series(key, index=type_columns)).all(axis=1)
            ]
            if df_data.empty:
                print("Warning, no data for ", key, " with kernel = ", kernel,
                      ", radius = ", radius, " and operation = ", operation, " at dataset ", df_name)
                # Nothing to plot for this implementation in this dataset
                continue

            norm_time = df_data['mean'].iloc[0] / avg_result_size  # ms / point
            series = implementation_data.setdefault(key, {'sizes': [], 'times': []})
            series['sizes'].append(dataset_size)
            series['times'].append(norm_time)

    if nsearches is None:
        # Do not leave an empty figure open when there is nothing to plot
        plt.close(fig)
        raise ValueError(f"No data found for kernel '{kernel}' and operation '{operation}'")

    # Plot a line for each implementation
    for key, data in implementation_data.items():
        # Sort by size to ensure proper line connection
        sizes = np.array(data['sizes'])
        times = np.array(data['times'])
        sort_idx = np.argsort(sizes)

        formatted_label = ", ".join(f"{value}" for value in key)
        line = ax.plot(sizes[sort_idx], times[sort_idx], 'o-',
                      label=formatted_label,
                      color=types_info["palette"][key],
                      linewidth=2, markersize=8)[0]

        if formatted_label not in legend_labels:
            legend_handles.append(line)
            legend_labels.append(formatted_label)

    ax.set_xscale('log')

    # One tick per dataset, labelled with its name and size
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_labels, rotation=45, ha='right')

    ax.set_ylabel('Average time per point found (ms/point)', fontsize=12)

    ax.grid(True, which="both", ls="-", alpha=0.2)

    add_octree_types_legend(legend_handles, legend_labels, "Octree type", fig)
    add_title_subtitle(
        f"{operation_name} performance analysis",
        f"Average time per point found using {kernel} kernel, using maximum radius available",
        fig,
        h_title = 0.98,
        h_subtitle  = 0.93
    )

    add_execution_details_multiple_datasets(dfs.keys(), nsearches, nrepeats, fig, h_ex = 0.88)

    plt.subplots_adjust(right=0.85, top=0.75)  # Make room for legend and title

    return fig

In [None]:
# Preview: neighbor search, normalised by each dataset's largest avg_result_size
fig = tpp_max_avg_result_size(CLOUDS_DATASETS_TPP, kernel="Sphere", operation="neighSearch", operation_name="Neighbor Search")

In [None]:
# Choosing, for each dataset, the radius whose avg_result_size is closest to a target value, so datasets
# with different densities are compared at a similar number of points found

def tpp_closest_to_target_avg(clouds_datasets, kernel, target_avg, operation, operation_name, types_info=TYPES_INFO_OCTREE_ENCODER, data_path=TPP_DATA_PATH):
    """Plot the time per point found (ms/point) against dataset size, at the radius closest to a target avg_result_size.

    The latest benchmark CSV of every cloud in `clouds_datasets` is loaded and
    filtered by `kernel` and `operation`. For each dataset, the row whose
    `avg_result_size` is closest to `target_avg` selects the radius. For each
    implementation in `types_info["available_types"]`, the `mean` runtime
    measured at that radius is divided by that `avg_result_size`. One line per
    implementation is drawn over a log-scaled x axis of dataset sizes
    (`npoints`).

    Datasets or implementations without matching rows are reported with a
    printed warning and skipped.

    Args:
        clouds_datasets: Mapping whose keys are the cloud names to load.
        kernel: Value of the 'kernel' column to keep.
        target_avg: Desired `avg_result_size`; the closest available one is
            used for each dataset.
        operation: Value of the 'operation' column to keep.
        operation_name: Human-readable operation name used in the title.
        types_info: Dict with 'type_parameters', 'available_types' and
            'palette' describing the implementations to compare.
        data_path: Root folder containing one sub-folder per cloud.

    Returns:
        The matplotlib figure.

    Raises:
        ValueError: If no dataset has rows for the requested kernel and
            operation.
    """
    dfs = read_multiple_datasets(clouds_datasets, data_path)
    type_columns = types_info["type_parameters"]
    fig, ax = plt.subplots(figsize=(12, 6))
    implementation_data = {}
    legend_handles, legend_labels = [], []
    x_labels, x_ticks = [], []
    # These values should be the same for all datasets, otherwise data is inconsistent
    nsearches, nrepeats = None, None

    for df_name, df in dfs.items():
        # Filter the dataframe by kernel and operation
        df = df[(df['kernel'] == kernel) & (df['operation'] == operation)]
        # Reset the index so the label returned by idxmin is a clean 0..n-1 position
        df = df.reset_index(drop=True)
        if df.empty:
            print("Warning, no data with kernel = ", kernel,
                  " and operation = ", operation, " at dataset ", df_name)
            continue

        # Find the row with avg_result_size closest to target_avg (idxmin returns a label, hence .loc)
        closest_row = df.loc[(df['avg_result_size'] - target_avg).abs().idxmin()]
        # Dataset size could have been taken from any row, but we take it from the closest row for convenience
        avg_size_found, chosen_radius, dataset_size = closest_row['avg_result_size'], closest_row['radius'], closest_row["npoints"]
        nsearches, nrepeats = closest_row['num_searches'], closest_row['repeats']
        print(f"Dataset: {df_name}, radius chosen: {chosen_radius}, found avg_result_size: {avg_size_found}, with diff. to target: {abs(avg_size_found - target_avg)}")

        x_labels.append(f"{df_name}\nN = {dataset_size}")
        x_ticks.append(dataset_size)

        # Only runtimes measured at the chosen radius may be divided by its avg_result_size
        df_at_radius = df[df['radius'] == chosen_radius]

        for _, params in types_info["available_types"].iterrows():
            key = tuple(params[col] for col in type_columns)
            # Filter the dataframe by octree type
            octree_impl_df = df_at_radius[
                (df_at_radius[type_columns] == pd.Series(key, index=type_columns)).all(axis=1)
            ]
            if octree_impl_df.empty:
                print("Warning, no data for ", key, " with kernel = ", kernel,
                      ", radius = ", chosen_radius, " and operation = ", operation, " at dataset ", df_name)
                # Nothing to plot for this implementation in this dataset
                continue

            norm_time = octree_impl_df['mean'].iloc[0] / avg_size_found  # ms / point
            series = implementation_data.setdefault(key, {'sizes': [], 'times': []})
            series['sizes'].append(dataset_size)
            series['times'].append(norm_time)

    if nsearches is None:
        # Do not leave an empty figure open when there is nothing to plot
        plt.close(fig)
        raise ValueError(f"No data found for kernel '{kernel}' and operation '{operation}'")

    # Plot a line for each implementation
    for key, data in implementation_data.items():
        # Sort by size to ensure proper line connection
        sizes = np.array(data['sizes'])
        times = np.array(data['times'])
        sort_idx = np.argsort(sizes)

        formatted_label = ", ".join(f"{value}" for value in key)
        line = ax.plot(sizes[sort_idx], times[sort_idx], 'o-',
                      label=formatted_label,
                      color=types_info["palette"][key],
                      linewidth=2, markersize=8)[0]

        if formatted_label not in legend_labels:
            legend_handles.append(line)
            legend_labels.append(formatted_label)

    ax.set_xscale('log')

    # One tick per dataset, labelled with its name and size
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_labels, rotation=45, ha='right')

    ax.set_ylabel('Average time per point found (ms/point)', fontsize=12)

    ax.grid(True, which="both", ls="-", alpha=0.2)

    add_octree_types_legend(legend_handles, legend_labels, "Octree type", fig)
    add_title_subtitle(
        f"{operation_name} performance analysis",
        f"Analysis of average time to find points over multiple datasets using {kernel} kernel\n"
        f"Radius taken is the one making avg. result size closest to {target_avg}",
        fig,
        h_title = 0.98,
        h_subtitle  = 0.93
    )

    add_execution_details_multiple_datasets(dfs.keys(), nsearches, nrepeats, fig, h_ex = 0.86)

    plt.subplots_adjust(right=0.85, top=0.75)  # Make room for legend and title

    return fig

In [None]:
# Preview: neighbor search at the radius whose avg_result_size is closest to 30000
fig = tpp_closest_to_target_avg(CLOUDS_DATASETS_TPP, kernel="Sphere", target_avg=30000, operation="neighSearch", operation_name="Neighbor Search")

In [None]:
# Export the multi-dataset tpp plots through output_fig.
# Each (method, operation) pair gets its own filename so no export overwrites another.
output_fig(tpp_max_avg_result_size(CLOUDS_DATASETS_TPP, kernel="Sphere", operation='neighSearch', operation_name='Neighbors Search'), "tpp_neigh_search_max_radius")
output_fig(tpp_max_avg_result_size(CLOUDS_DATASETS_TPP, kernel="Sphere", operation='numNeighSearch', operation_name='Num. of Neighbors Search'), "tpp_num_neigh_search_max_radius")
TARGET_AVG = 30000
output_fig(tpp_closest_to_target_avg(CLOUDS_DATASETS_TPP, kernel="Sphere", target_avg=TARGET_AVG, operation='neighSearch', operation_name='Neighbors Search'), f"tpp_neigh_search_avg_near_{TARGET_AVG}")
output_fig(tpp_closest_to_target_avg(CLOUDS_DATASETS_TPP, kernel="Sphere", target_avg=TARGET_AVG, operation='numNeighSearch', operation_name='Num. of Neighbors Search'), f"tpp_num_neigh_search_avg_near_{TARGET_AVG}")