# Benchmark Analysis
The benchmark CSV files are read from `/swann/benchmark`.

In [None]:
# Imports
import csv
import pandas as pd
import glob

In [None]:
# Helper functions

def csv_to_df(file: str, *, base_dir: str = '/swann/benchmark', header: int = 9) -> pd.DataFrame:
    """Load a benchmark CSV file as a pandas DataFrame.

    Args:
        file: Path of the CSV file, relative to ``base_dir``.
        base_dir: Directory the file is looked up in. Defaults to the
            benchmark folder used by this notebook.
        header: Row number holding the column names, passed straight to
            ``pd.read_csv``; the rows before it are skipped. The default of
            9 presumably matches the preamble written by the benchmark
            tool -- TODO confirm against an actual output file.

    Returns:
        The parsed table, without any column that contains a missing value.
    """
    df = pd.read_csv(f'{base_dir}/{file}', header=header)
    # axis=1 drops *columns* (not rows) that contain at least one NaN.
    return df.dropna(axis=1)

def get_benchmarks(p: str = "", *, base_dir: str = '/swann/benchmark') -> list:
    """Find the CSV files in the benchmark folder whose name starts with ``p``.

    Args:
        p: Prefix placed in front of ``*.csv`` in the glob pattern. It is
            inserted verbatim, so glob wildcards inside it are honoured.
        base_dir: Directory that is searched (not recursively). Defaults to
            the benchmark folder used by this notebook.

    Returns:
        The matching paths, sorted lexicographically. This equals date order
        only when the file names start with a sortable date -- the code does
        not look at file timestamps.
    """
    return sorted(glob.glob(f'{base_dir}/{p}*.csv'))

def filter_df_with_wildcard_name(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Return the rows whose ``name`` column contains ``name`` as a substring.

    Example:
        - name = "LoadDatasets"
    Will return rows where name is "*LoadDatasets*"

    Args:
        df: Table with a string column called ``name``.
        name: Text to look for. It is matched literally, so characters such
            as ``(``, ``+`` or ``<`` have no special meaning.

    Returns:
        An independent copy of the matching rows. The original index labels
        are kept; rows with a missing name never match.
    """
    # regex=False: plain substring search, as the "*name*" contract promises.
    # na=False: a missing name yields False instead of an NA the mask cannot hold.
    mask = df['name'].str.contains(name, regex=False, na=False)
    # Copy so that callers can modify the result without touching (or being
    # warned about) the frame it was taken from.
    return df.loc[mask].copy()

def add_empty_time(df: pd.DataFrame, name: str):
    """Return ``df`` with an all-zero measurement row placed in front of it.

    The extra row is named ``"<name>/-1"`` and has one iteration, zero real
    and CPU time, time unit "ms" and input size 0. The result gets a fresh
    0..n-1 index; ``df`` itself is left untouched.
    """
    placeholder = {
        "name": f"{name}/-1",
        "iterations": 1,
        "real_time": 0,
        "cpu_time": 0,
        "time_unit": "ms",
        "input_size": 0,
    }
    zero_row = pd.DataFrame([placeholder])
    return pd.concat((zero_row, df), ignore_index=True)

In [None]:
# Print one line per CSV file that get_benchmarks finds for the '/' prefix.
benchmark_paths = get_benchmarks('/')
for csv_path in benchmark_paths:
    print(f'[+] {csv_path}')

### How to load a dataset

In [None]:
# Load benchmark that has loading / parsing of datasets of size XS and S
# NOTE(review): '...' looks like a placeholder -- put the CSV file name here
# (relative to the benchmark folder) before running this cell.
df_bm_loadparse = csv_to_df('...')
# Bare expression: shows the table as the cell output when run in a notebook.
df_bm_loadparse

# Visualize data

### Benchmark Query

In [None]:
df_bm_query = csv_to_df("hasc/benchmark-optimal.csv")

# reset_index returns a fresh frame labelled 0..n-1, so the steps below do not
# depend on where the query rows happen to sit inside the benchmark file and
# do not write into a slice of df_bm_query.
df_query = filter_df_with_wildcard_name(
    df_bm_query, "QueryXPointsLSHForest"
).reset_index(drop=True)

# Dataset size behind each query benchmark row, in row order.
# NOTE(review): these sizes are hard-coded here, not read from the CSV.
query_dataset_sizes = [100_000, 300_000, 10_000_000]
if len(df_query) != len(query_dataset_sizes):
    raise ValueError(
        f"expected {len(query_dataset_sizes)} query benchmark rows, "
        f"found {len(df_query)}"
    )

# Set input size: a display label with '.' as thousands separator
# ("100.000", "300.000", "10.000.000") plus the numeric value.
# Whole columns are assigned so that an existing numeric 'input_size'
# column is replaced instead of being filled with strings cell by cell.
df_query['input_size'] = [
    f"{size:,}".replace(",", ".") for size in query_dataset_sizes
]
df_query['input_size_val'] = query_dataset_sizes

# Show in ms
# NOTE(review): assumes 'timePerQuery' is stored in seconds -- confirm.
df_query['timePerQuery'] *= 1_000

df_query

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bar chart of the per-query time (y) for each dataset size label (x),
# drawn on the current axes.
query_time_ax = plt.gca()
query_time_ax.bar(df_query['input_size'], df_query['timePerQuery'], color='teal')
query_time_ax.set_xlabel("Dataset size")
query_time_ax.set_ylabel("Avg. query time (ms)")
query_time_ax.set_title("Average query time compared to dataset size - (10 nn, 90% recall)")
plt.show()

In [None]:
# Bar chart of dataset size divided by the per-query time, per dataset.
# NOTE(review): the previous axis label and title described this ratio as a
# query time; they now name the quantity that is actually plotted. If the
# intent was time per point, invert the ratio instead.
points_per_ms = df_query['input_size_val'] / df_query['timePerQuery']
plt.bar(df_query['input_size'], points_per_ms, color='teal')
plt.xlabel("Dataset size")
plt.ylabel("Dataset size / avg. query time (points per ms)")
plt.title("Dataset size relative to average query time")
plt.show()

In [None]:
# Line chart of the 'cpu_time' column against the dataset size label.
# NOTE(review): the "ms" in the axis label assumes cpu_time is reported in
# milliseconds -- check the 'time_unit' column.
df_query_line = df_query.plot(
    kind='line',
    x='input_size',
    y='cpu_time',
    title='LSH Forest query - 10 nearest neighbors',
    legend=False,
)
df_query_line.set_xlabel("Number of points")
df_query_line.set_ylabel("Time (ms)")

### Loading of dataset

In [None]:
df_load = filter_df_with_wildcard_name(df_bm_loadparse, "LoadDatasets")
# Synthetic first row (input size 0, zero time) so the curve starts at the origin.
df_load_0 = pd.DataFrame(data={
    "name": ["LoadDatasets/000000"],
    "iterations": [1],
    "real_time": [0],
    "cpu_time": [0],
    "time_unit": ["ms"],
    "input_size": [0]
})
# ignore_index=True renumbers the rows 0..n-1; without it the synthetic row's
# label 0 can collide with a label carried over from the benchmark frame.
df_load = pd.concat([df_load_0, df_load], ignore_index=True)
df_load

In [None]:
# Line chart of the 'cpu_time' column against 'input_size' for the load benchmark.
# NOTE(review): the "ms" in the axis label assumes cpu_time is reported in
# milliseconds -- check the 'time_unit' column.
load_line = df_load.plot(
    kind='line',
    x='input_size',
    y='cpu_time',
    title='Load dataset',
    legend=False,
)
load_line.set_xlabel("Number of points")
load_line.set_ylabel("Time (ms)")