# Plotting notebook

A scratchpad for experimenting with different plotting ideas and visualizations of the LCS benchmark results.

In [None]:
# 1. Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt

# Optional display configuration:
#   - pandas: no cap on the number of columns shown when a frame is displayed
#   - matplotlib: use the bundled 'seaborn-v0_8' style sheet for all figures
pd.options.display.max_columns = None
plt.style.use('seaborn-v0_8')

### Read the data

Adjust paths if necessary.

In [None]:
# 2. Load Raw Benchmark Data
# The path is relative, so it is resolved against the kernel's current working directory.
raw_path = '../../Code/Results/raw_runs.csv'
raw_df = pd.read_csv(filepath_or_buffer=raw_path)

# Preview the first five rows as the cell output.
raw_df.head(n=5)

In [None]:
# 3. Select Scenario and Algorithm
# These two selections are reused by the plotting cells that follow (scenario) and by the filter below.
scenario = 'disjoint_alphabet'  # Change as needed
algorithm = 'Enhanced Suffix Array'  # Change as needed

# Build one boolean mask per criterion, then keep the rows satisfying both.
matches_scenario = raw_df['scenario'].eq(scenario)
matches_algorithm = raw_df['algorithm'].eq(algorithm)
filtered_df = raw_df[matches_scenario & matches_algorithm]
filtered_df.head()

In [None]:
# 4. Plot Build Time vs String Length
# Scatter of the raw (unaggregated) build-time measurements for the selected
# scenario, one marker series per algorithm found in raw_df.
fig, ax = plt.subplots(figsize=(8, 5))

# The scenario mask does not depend on the algorithm, so compute it once.
in_scenario = raw_df['scenario'] == scenario
for alg in raw_df['algorithm'].unique():
    df = raw_df[in_scenario & (raw_df['algorithm'] == alg)]
    # Every individual run is drawn as its own point; no per-length aggregation is applied.
    ax.plot(df['length'], df['build_time_ms'], 'o', label=f'{alg} (raw)')

ax.set_xlabel('String length')
ax.set_ylabel('Build time [ms]')
ax.set_title(f'Build Time vs Length ({scenario})')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

In [None]:
# 5. Plot Query Time vs String Length
# Raw query-time measurements for the selected scenario, one marker series per algorithm.
fig, ax = plt.subplots(figsize=(8, 5))

in_scenario = raw_df['scenario'] == scenario
for alg in raw_df['algorithm'].unique():
    df = raw_df[in_scenario & (raw_df['algorithm'] == alg)]
    ax.plot(df['length'], df['query_time_ms'], 'o', label=f'{alg} (raw)')

ax.set_xlabel('String length')
ax.set_ylabel('Query time [ms]')
ax.set_title(f'Query Time vs Length ({scenario})')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

In [None]:
# 6. Plot Total Time Statistics
# Raw total-time measurements for the selected scenario, one marker series per algorithm.
fig, ax = plt.subplots(figsize=(8, 5))

for alg in raw_df['algorithm'].unique():
    is_selected = (raw_df['scenario'] == scenario) & (raw_df['algorithm'] == alg)
    df = raw_df[is_selected]
    ax.plot(df['length'], df['total_time_ms'], 'o', label=f'{alg} (raw)')

# Axis labels and title are set in a single call.
ax.set(
    xlabel='String length',
    ylabel='Total time [ms]',
    title=f'Total Time vs Length ({scenario})',
)
ax.legend()
ax.grid(alpha=0.3)
plt.show()

In [None]:
# 7. Plot Memory Usage Statistics
# Raw peak-memory measurements for the selected scenario, one marker series per algorithm.
fig, ax = plt.subplots(figsize=(8, 5))

in_scenario = raw_df['scenario'] == scenario
for alg in raw_df['algorithm'].unique():
    df = raw_df[in_scenario & (raw_df['algorithm'] == alg)]
    ax.plot(df['length'], df['peak_memory_kib'], 'o', label=f'{alg} (raw)')

ax.set_xlabel('String length')
ax.set_ylabel('Peak memory [KiB]')
ax.set_title(f'Peak Memory vs Length ({scenario})')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

In [None]:
# 8. Plot Index Size and Build/Query Memory
# Three memory-related columns are drawn per algorithm, each with its own marker shape.
fig, ax = plt.subplots(figsize=(8, 5))

# (column in raw_df, marker, legend tag) -- drawn in this order for every algorithm.
series_specs = [
    ('index_size_kib', 'o', 'index size'),
    ('build_peak_memory_kib', 's', 'build peak'),
    ('query_extra_memory_kib', '^', 'query extra'),
]

in_scenario = raw_df['scenario'] == scenario
for alg in raw_df['algorithm'].unique():
    df = raw_df[in_scenario & (raw_df['algorithm'] == alg)]
    for column, marker, tag in series_specs:
        ax.plot(df['length'], df[column], marker, label=f'{alg} ({tag})')

ax.set_xlabel('String length')
ax.set_ylabel('Memory [KiB]')
ax.set_title(f'Index Size and Build/Query Memory vs Length ({scenario})')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

### Distribution Plot



In [None]:
# Histogram of a single metric at a fixed scenario and string length,
# one side-by-side panel per entry in `algorithms`.
length = 10000  # Change as needed (options are 100, 500, 1000, 5000, 10000 if going by raw_runs.csv in the repo)
scenario = 'disjoint_alphabet'  # Change as needed (options are "disjoint_alphabet", "mutated_implant", "repetitive_with_noise", "random_uniform", and "near_identical")

algorithms = ['Suffix Automaton', 'Enhanced Suffix Array']

metric = 'build_time_ms'  # Change as needed. For options, look at the columns in raw_runs.csv.
metric_label = 'Build Time (ms)'

# Per-algorithm presentation: (name shown in the panel title, bar colour).
# Algorithms missing from this table fall back to their full name and
# matplotlib's default colour.
panel_style = {
    'Suffix Automaton': ('Suffix Automaton', 'tab:blue'),
    'Enhanced Suffix Array': ('ESA', 'tab:orange'),
}

# squeeze=False keeps `axes` two-dimensional even if `algorithms` has one entry.
fig, axes = plt.subplots(1, len(algorithms), figsize=(12, 5), squeeze=False)

# The scenario/length part of the selection is the same for every panel.
at_scenario_and_length = (raw_df['scenario'] == scenario) & (raw_df['length'] == length)

for ax, alg in zip(axes[0], algorithms):
    title_name, color = panel_style.get(alg, (alg, None))
    df = raw_df[at_scenario_and_length & (raw_df['algorithm'] == alg)]
    ax.hist(df[metric], bins=20, alpha=0.7, color=color, label=f'{alg} ({metric_label})')
    ax.set_xlabel(metric_label)
    ax.set_ylabel('Frequency')
    ax.set_title(f'{title_name} {metric_label} Distribution')
    ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()

Clearly, the distributions are heavily skewed, so the median would probably be a better "average" than the mean.