# Evaluation of results
This notebook contains the evaluation of the results.

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import ast
from plotly.subplots import make_subplots
import warnings
import re

warnings.filterwarnings("ignore")

In [2]:
def clean_np_literals(s):
    if not isinstance(s, str):
        return s
    # Tisztítjuk a np.float64(…) és np.int64(…) hívásokat
    s = re.sub(r'np\.float64\(([^)]+)\)', r'\1', s)
    s = re.sub(r'np\.int64\(([^)]+)\)', r'\1', s)
    return s

# OnlineKMeans v2

In [4]:
df_onlinekmeans = pd.read_excel("./data/results/onlinekmeans_v4_from360clusters_final.xlsx")
df_onlinekmeans['metrics'] = (df_onlinekmeans['metrics'].apply(clean_np_literals).apply(ast.literal_eval))
df_onlinekmeans['metrics'] = df_onlinekmeans['metrics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

df_minibatchkmeans = pd.read_excel("./data/results/onlinekmeans_with_minibatchkmeans_v4_final.xlsx")
df_minibatchkmeans['metrics'] = (df_minibatchkmeans['metrics'].apply(clean_np_literals).apply(ast.literal_eval))
df_minibatchkmeans['metrics'] = df_minibatchkmeans['metrics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

df_kmeans = pd.read_excel("./data/results/onlinekmeans_with_kmeans_v4_final.xlsx")
df_kmeans['metrics'] = (df_kmeans['metrics'].apply(clean_np_literals).apply(ast.literal_eval))
df_kmeans['metrics'] = df_kmeans['metrics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

df_faiss = pd.read_excel("./data/results/onlinekmeans_with_faiss_v4_final.xlsx")
df_faiss['metrics'] = (df_faiss['metrics'].apply(clean_np_literals).apply(ast.literal_eval))
df_faiss['metrics'] = df_faiss['metrics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [5]:
df_onlinekmeans.loc[0, "metrics"]

{'doc_accuracy': 0.648,
 'chunk_accuracy': 0.5815,
 'doc_precision': 1.0,
 'doc_recall': 0.6422764227642277,
 'doc_f1': 0.7821782178217822,
 'chunk_precision': 1.0,
 'chunk_recall': 0.5746951219512195,
 'chunk_f1': 0.7299128751210068,
 'correct_chunk_accuracy': 0.11655,
 'doc_true_positives': 1264,
 'doc_true_negatives': 32,
 'doc_false_positives': 0,
 'doc_false_negatives': 704,
 'chunk_true_positives': 1131,
 'chunk_true_negatives': 32,
 'chunk_false_positives': 0,
 'chunk_false_negatives': 837}

In [9]:
groups = {
    "Accuracy": ["doc_accuracy", "chunk_accuracy"],
    "F1 Score": ["doc_f1", "chunk_f1"],
    "Precision": ["doc_precision", "chunk_precision"],
    "Recall": ["doc_recall", "chunk_recall"],
    "Correct chunk accuracy": ["correct_chunk_accuracy"],
}

for title, keys in groups.items():
    fig = go.Figure()

    for key in keys:
        fig.add_trace(go.Scatter(
            x=df['batch'],
            y=df['metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key
        ))

    fig.update_layout(
        title=title,
        xaxis_title='Batch',
        yaxis_title=title,
        template='plotly_white'
    )
    fig.update_yaxes(range=[0, 1.1])
    fig.show()

NameError: name 'df' is not defined

In [10]:
df_kmeans.columns

Index(['Unnamed: 0', 'batch', 'init_time', 'update_time', 'retrieval_time',
       'metrics', 'n_clusters'],
      dtype='object')

In [15]:
groups = {
    "Accuracy": ["doc_accuracy", "chunk_accuracy"],
    "F1 Score": ["doc_f1", "chunk_f1"],
    "Precision": ["doc_precision", "chunk_precision"],
    "Recall": ["doc_recall", "chunk_recall"],
    "Correct chunk accuracy": ["correct_chunk_accuracy"],
}

for title, keys in groups.items():
    fig = make_subplots(rows=1, cols=4, subplot_titles=(f"OnlineKMeans - {title}", f"KMeans - {title}", f"MiniBatchKMeans - {title}", f"Faiss - {title}"))

    for key in keys:
        fig.add_trace(go.Scatter(
            x=df_onlinekmeans['batch'],
            y=df_onlinekmeans['metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key
        ), row=1, col=1)

        fig.add_trace(go.Scatter(
            x=df_kmeans['batch'],
            y=df_kmeans['metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key
        ), row=1, col=2)

        fig.add_trace(go.Scatter(
            x=df_minibatchkmeans['batch'],
            y=df_minibatchkmeans['metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key
        ), row=1, col=3)

        fig.add_trace(go.Scatter(
            x=df_faiss['batch'],
            y=df_faiss['metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key
        ), row=1, col=4)

    fig.update_yaxes(range=[0, 1.1], row=1, col=1)
    fig.update_yaxes(range=[0, 1.1], row=1, col=2)
    fig.update_yaxes(range=[0, 1.1], row=1, col=3)
    fig.update_yaxes(range=[0, 1.1], row=1, col=4)

    fig.update_xaxes(title_text='batch', row=1, col=1)
    fig.update_xaxes(title_text='batch', row=1, col=2)
    fig.update_xaxes(title_text='batch', row=1, col=3)
    fig.update_xaxes(title_text='batch', row=1, col=4)

    fig.show()

In [7]:
groups = {
    "Accuracy": ["doc_accuracy", "chunk_accuracy"],
    "F1 Score": ["doc_f1", "chunk_f1"],
    "Precision": ["doc_precision", "chunk_precision"],
    "Recall": ["doc_recall", "chunk_recall"]
}

algorithms = {
    "OnlineKMeans": df_onlinekmeans,
    "KMeans": df_kmeans,
    "MiniBatchKMeans": df_minibatchkmeans,
    "FAISS": df_faiss
}

for title, keys in groups.items():
    fig = go.Figure()
    all_values = []  # collect all y-values to compute range
    
    for algo_name, df in algorithms.items():
        for key in keys:
            y_values = df['metrics'].apply(lambda m: m.get(key))
            all_values.extend(y_values.dropna())  # collect non-NaN values
            
            fig.add_trace(go.Scatter(
                x=df['batch'],
                y=y_values,
                mode='lines+markers',
                name=f"{algo_name} - {key}"
            ))

    # compute dynamic min/max with small padding
    if all_values:
        ymin, ymax = np.min(all_values), np.max(all_values)
        padding = (ymax - ymin) * 0.05 if ymax != ymin else 0.05
        yrange = [ymin - padding, ymax + padding]
    else:
        yrange = [0, 1]

    fig.update_layout(
        title=f"{title} Comparison Across Algorithms",
        xaxis_title="Batch",
        yaxis_title=title,
        yaxis=dict(range=yrange),
        legend_title="Algorithm + Metric",
        template="plotly_white",
        width=900,
        height=500
    )

    fig.show()


In [8]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np

groups = {
    "Accuracy": ["doc_accuracy", "chunk_accuracy"],
    "F1 Score": ["doc_f1", "chunk_f1"],
    "Precision": ["doc_precision", "chunk_precision"],
    "Recall": ["doc_recall", "chunk_recall"]
}

algorithms = {
    "OnlineKMeans": df_onlinekmeans,
    "KMeans": df_kmeans,
    "MiniBatchKMeans": df_minibatchkmeans,
    "FAISS": df_faiss
}

for title, keys in groups.items():
    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=(f"Document-level {title}", f"Chunk-level {title}")
    )

    all_values = []

    # --- Left plot: document-level metrics ---
    for algo_name, df in algorithms.items():
        y_values = df['metrics'].apply(lambda m: m.get(keys[0]))  # doc metric
        all_values.extend(y_values.dropna())
        fig.add_trace(
            go.Scatter(
                x=df['batch'],
                y=y_values,
                mode='lines+markers',
                name=f"{algo_name} - {keys[0]}"
            ),
            row=1, col=1
        )

    # --- Right plot: chunk-level metrics ---
    for algo_name, df in algorithms.items():
        y_values = df['metrics'].apply(lambda m: m.get(keys[1]))  # chunk metric
        all_values.extend(y_values.dropna())
        fig.add_trace(
            go.Scatter(
                x=df['batch'],
                y=y_values,
                mode='lines+markers',
                name=f"{algo_name} - {keys[1]}"
            ),
            row=1, col=2
        )

    # Compute dynamic y-axis range (optional)
    if all_values:
        ymin, ymax = np.min(all_values), np.max(all_values)
        padding = (ymax - ymin) * 0.05 if ymax != ymin else 0.05
        yrange = [ymin - padding, ymax + padding]
    else:
        yrange = [0.5, 1]

    fig.update_yaxes(title_text=title, range=yrange, row=1, col=1)
    fig.update_yaxes(title_text=title, range=yrange, row=1, col=2)
    fig.update_xaxes(title_text="Batch", row=1, col=1)
    fig.update_xaxes(title_text="Batch", row=1, col=2)

    fig.update_layout(
        title=f"{title} Comparison Across Algorithms",
        legend_title="Algorithm + Metric",
        template="plotly_white",
        width=1100,
        height=500
    )

    fig.show()


In [12]:
algorithms = {
    "OnlineKMeans": df_onlinekmeans,
    "KMeans": df_kmeans,
    "MiniBatchKMeans": df_minibatchkmeans,
    "FAISS": df_faiss
}

fig = go.Figure()
all_values = []

# Add traces for each algorithm
for algo_name, df in algorithms.items():
    y_values = df["update_time"]
    x_values = df["batch"] if "batch" in df.columns else np.arange(len(df))
    if algo_name == "OnlineKMeans":
        x_values = df["batch"] + 0.1
    if algo_name == "FAISS":
        x_values = df["batch"] + 0.2
    all_values.extend(y_values.dropna())

    fig.add_trace(go.Scatter(
        x=x_values,
        y=y_values,
        mode='lines+markers',
        name=algo_name
    ))

# Compute dynamic y-axis range with small padding
if all_values:
    ymin, ymax = np.min(all_values), np.max(all_values)
    padding = (ymax - ymin) * 0.05 if ymax != ymin else 0.05
    yrange = [ymin - padding, ymax + padding]
else:
    yrange = [0, 1]

# Layout configuration
fig.update_layout(
    title="Update Time Comparison Across Algorithms",
    xaxis_title="Batch",
    yaxis_title="Update Time (seconds)",
    yaxis=dict(range=yrange),
    legend_title="Algorithm",
    template="plotly_white",
    width=900,
    height=500
)

fig.show()


In [13]:
for algo_name, df in algorithms.items():
    print(f"{algo_name}: {df.loc[0, 'init_time']}")

OnlineKMeans: 77.97085237503052
KMeans: 106.6367199420929
MiniBatchKMeans: 17.00993609428406
FAISS: 10.33733534812927


# Centroid vs full
hyperparameter optimization, data v2, tensors v2

In [15]:
centroid_df = pd.read_excel("./data/results/hyperparameter_for_centroid_vs_full/centroid_results_kmeans500_v4_l2_final.xlsx")
centroid_df['centroid_metrics'] = (centroid_df['centroid_metrics'].apply(clean_np_literals).apply(ast.literal_eval))
centroid_df['centroid_metrics'] = centroid_df['centroid_metrics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
centroid_df['x_axis'] = centroid_df['top_k'].astype(str) + "_" + centroid_df['top_n_clusters'].astype(str)

full_df = pd.read_excel("./data/results/hyperparameter_for_centroid_vs_full/full_results_kmeans500_v4_l2_final.xlsx")
full_df['full_metrics'] = (full_df['full_metrics'].apply(clean_np_literals).apply(ast.literal_eval))
full_df['full_metrics'] = full_df['full_metrics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

faiss_df = pd.read_excel("./data/results/hyperparameter_for_centroid_vs_full/faiss_results_kmeans500_v4_l2_final_hnsw.xlsx")
faiss_df['faiss_metrics'] = (faiss_df['faiss_metrics'].apply(clean_np_literals).apply(ast.literal_eval))
faiss_df['faiss_metrics'] = faiss_df['faiss_metrics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)


In [16]:
centroid_df['common_x'] = centroid_df.index
full_df['common_x'] = full_df.index * 4
faiss_df['common_x'] = faiss_df.index * 4

In [17]:
groups = {
    "Accuracy": ["doc_accuracy", "chunk_accuracy"],
    "F1 Score": ["doc_f1", "chunk_f1"],
    "Precision": ["doc_precision", "chunk_precision"],
    "Recall": ["doc_recall", "chunk_recall"],
    "Correct chunk accuracy": ["correct_chunk_accuracy"],
}

for title, keys in groups.items():
    fig = make_subplots(rows=1, cols=3, subplot_titles=(f"Centroid - {title}", f"Full - {title}", f"FAISS - {title}"))

    for key in keys:
        fig.add_trace(go.Scatter(
            x=centroid_df['common_x'],
            y=centroid_df['centroid_metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key
        ), row=1, col=1)

        fig.add_trace(go.Scatter(
            x=full_df['common_x'],
            y=full_df['full_metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key
        ), row=1, col=2)

        fig.add_trace(go.Scatter(
            x=faiss_df['common_x'],
            y=faiss_df['faiss_metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key
        ), row=1, col=3)

    fig.update_yaxes(range=[0, 1.1], row=1, col=1)
    fig.update_yaxes(range=[0, 1.1], row=1, col=2)
    fig.update_yaxes(range=[0, 1.1], row=1, col=3)

    fig.update_xaxes(title_text='top k with top n clusters', row=1, col=1)
    fig.update_xaxes(title_text='top k', row=1, col=2)
    fig.update_xaxes(title_text='top k', row=1, col=3)

    fig.show()

In [18]:
groups = {
    "Accuracy": ["doc_accuracy", "chunk_accuracy"],
    "F1 Score": ["doc_f1", "chunk_f1"],
    "Precision": ["doc_precision", "chunk_precision"],
    "Recall": ["doc_recall", "chunk_recall"],
    "Correct chunk accuracy": ["correct_chunk_accuracy"],
}

for title, keys in groups.items():
    fig = go.Figure()

    for key in keys:
        fig.add_trace(go.Scatter(
            x=centroid_df['common_x'],
            y=centroid_df['centroid_metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=f"Centroid - {key}"
        ))

        fig.add_trace(go.Scatter(
            x=full_df['common_x'],
            y=full_df['full_metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=f"Full - {key}"
        ))

        fig.add_trace(go.Scatter(
            x=faiss_df['common_x'],
            y=faiss_df['faiss_metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=f"FAISS - {key}"
        ))

    fig.update_layout(
        title=f"{title} Comparison Across Methods",
        xaxis_title="Top k (or clusters)",
        yaxis_title=title,
        yaxis=dict(range=[0, 1.1]),
        template="plotly_white",
    )

    fig.show()


In [19]:
faiss_df

Unnamed: 0.1,Unnamed: 0,top_k,faiss_metrics,faiss_time,common_x
0,0,3,"{'doc_accuracy': 0.669, 'chunk_accuracy': 0.60...",443.810484,0
1,1,5,"{'doc_accuracy': 0.6885, 'chunk_accuracy': 0.6...",476.954523,4
2,2,12,"{'doc_accuracy': 0.7105, 'chunk_accuracy': 0.6...",452.80258,8
3,3,25,"{'doc_accuracy': 0.719, 'chunk_accuracy': 0.68...",450.713647,12


In [20]:
df_queries = pd.read_excel("./data/prepared/squad_train_v2_queries.xlsx")

In [29]:
df_queries.shape

(87599, 4)

In [22]:
centroid_df

Unnamed: 0.1,Unnamed: 0,top_k,top_n_clusters,centroid_metrics,centroid_time,x_axis,common_x
0,0,3,5,"{'doc_accuracy': 0.6355, 'chunk_accuracy': 0.5...",445.320294,3_5,0
1,1,3,10,"{'doc_accuracy': 0.653, 'chunk_accuracy': 0.59...",466.792119,3_10,1
2,2,3,20,"{'doc_accuracy': 0.667, 'chunk_accuracy': 0.60...",482.086512,3_20,2
3,3,3,35,"{'doc_accuracy': 0.675, 'chunk_accuracy': 0.61...",507.444328,3_35,3
4,4,5,5,"{'doc_accuracy': 0.6515, 'chunk_accuracy': 0.5...",467.517971,5_5,4
5,5,5,10,"{'doc_accuracy': 0.671, 'chunk_accuracy': 0.61...",489.392804,5_10,5
6,6,5,20,"{'doc_accuracy': 0.685, 'chunk_accuracy': 0.62...",503.066091,5_20,6
7,7,5,35,"{'doc_accuracy': 0.693, 'chunk_accuracy': 0.63...",517.023323,5_35,7
8,8,12,5,"{'doc_accuracy': 0.672, 'chunk_accuracy': 0.61...",474.814393,12_5,8
9,9,12,10,"{'doc_accuracy': 0.694, 'chunk_accuracy': 0.64...",477.631858,12_10,9


In [21]:
fig = make_subplots(rows=1, cols=3, subplot_titles=("Brute Force", "FAISS", "Clustering"))

#divisor = df_queries.shape[0]
divisor = 2000

fig.add_trace(go.Scatter(
    x=full_df['top_k'],
    y=full_df['full_time'] / divisor,
    mode='lines+markers',
    name="Brute Force"
), row=1, col=1)
fig.add_trace(go.Scatter(
    x=centroid_df['x_axis'],
    y=centroid_df['centroid_time'] / divisor,
    mode='lines+markers',
    name="Clustering"
), row=1, col=3)
fig.add_trace(go.Scatter(
    x=faiss_df['top_k'],
    y=faiss_df['faiss_time'] / divisor,
    mode='lines+markers',
    name="FAISS"
), row=1, col=2)
fig.update_layout(
    template='plotly_white'
)
fig.update_xaxes(title_text='top k chunks', row=1, col=1)
fig.update_xaxes(title_text='top k chunks with top n clusters', row=1, col=3)
fig.update_xaxes(title_text='top k chunks', row=1, col=2)
fig.update_yaxes(title_text='Time (s)', row=1, col=1)
fig.update_yaxes(title_text='Time (s)', row=1, col=2)
fig.update_yaxes(title_text='Time (s)', row=1, col=3)

fig.update_yaxes(range=[0, 15000 / divisor], row=1, col=1)
fig.update_yaxes(range=[0, 15000 / divisor], row=1, col=2)
fig.update_yaxes(range=[0, 15000 / divisor], row=1, col=3)

fig.update_layout(title="Time Comparison Across Methods")

fig.show()

In [None]:
centroid_df = centroid_df.join(pd.json_normalize(centroid_df.pop("centroid_metrics")))
full_df = full_df.join(pd.json_normalize(full_df.pop("full_metrics")))
faiss_df = faiss_df.join(pd.json_normalize(faiss_df.pop("faiss_metrics")))

In [53]:
faiss_df = np.round(faiss_df, 3)
centroid_df = np.round(centroid_df, 3)
full_df = np.round(full_df, 3)

In [54]:
faiss_df[['top_k', 'doc_accuracy', 'chunk_accuracy', 'doc_recall', 'chunk_recall']].to_excel("./faiss_summary.xlsx", index=False)
centroid_df[['top_k', 'top_n_clusters', 'doc_accuracy', 'chunk_accuracy', 'doc_recall', 'chunk_recall']].to_excel("./centroid_summary.xlsx", index=False)
full_df[['top_k', 'doc_accuracy', 'chunk_accuracy', 'doc_recall', 'chunk_recall']].to_excel("./full_summary.xlsx", index=False)

# Average speed up / accuracy decrease

In [41]:
chunk_num = 84007

centroid_df['time_per_chunk'] = centroid_df['centroid_time'] / chunk_num
full_df['time_per_chunk'] = full_df['full_time'] / chunk_num

top_ks = [3, 5, 12, 25]

average_ratios = {}
for top_k in top_ks:
    top_k_centroid_df = centroid_df[centroid_df['top_k'] == top_k]
    full_time = full_df[full_df['top_k'] == top_k]['time_per_chunk'].values[0]

    average_time = 0
    for idx, row in top_k_centroid_df.iterrows():
        time_ratio = full_time / row['time_per_chunk']
        average_time += time_ratio
    average_time /= len(top_k_centroid_df)
    average_ratios[top_k] = average_time

total_average_time = sum(average_ratios.values()) / len(average_ratios)
print(f"Total Average Time Ratio (Full / Centroid): {total_average_time:.4f}")

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
results = []
for top_k in top_ks:
    top_k_centroid_df = centroid_df[centroid_df['top_k'] == top_k]


    for title, keys in groups.items():
        for key in keys:
            average_metric_value = 0
            for idx, row in top_k_centroid_df.iterrows():
                metric_value = row['centroid_metrics'].get(key)
                average_metric_value += metric_value
            average_metric_value /= len(top_k_centroid_df)
            full_value = full_df[full_df['top_k'] == top_k]['full_metrics'].apply(lambda m: m.get(key)).values[0]
            ratio_metric_value = (average_metric_value - full_value) / full_value * 100
            results.append((top_k, key, ratio_metric_value, average_ratios[top_k]))


results_df = pd.DataFrame(results, columns=['top_k', 'Metric', 'Ratio of metrics (%)', 'Average Time Speed Up'])

In [None]:
metrics = results_df['Metric'].unique()

for metric in metrics:
    metric_df = results_df[results_df['Metric'] == metric]
    metric_df['Ratio of metrics (%)'] = np.log10(metric_df['Ratio of metrics (%)'] + 10 + 1)  # Shift to avoid log(0)
    metric_df['Average Time Speed Up'] = np.log10(metric_df['Average Time Speed Up'])

    fig = go.Figure()
    fig.update_layout(
        title=f"Metric: {metric}",
        xaxis_title='top k',
        yaxis_title='Log Scale',
        template='plotly_white'
    )
    fig.add_trace(go.Scatter(
        x=metric_df['top_k'],
        y=metric_df['Ratio of metrics (%)'],
        mode='lines+markers',
        name='Ratio of metrics (%)'
    ))
    fig.add_trace(go.Scatter(
        x=metric_df['top_k'],
        y=metric_df['Average Time Speed Up'],
        mode='lines+markers',
        name='Average Time Speed Up'
    ))
    fig.show()

# Centroid vs full
kmeans with 180 cluster on v2 data, v2 tensors

In [None]:
df = pd.read_excel("./data/results/kmeans180_v2_comparison_v2tensor.xlsx")

In [None]:
df

Unnamed: 0,doc_accuracy,chunk_accuracy,doc_precision,doc_recall,doc_f1,chunk_precision,chunk_recall,chunk_f1
0,0.768308,0.698741,1,0.768308,0.868975,1,0.909454,0.95258
1,0.852247,0.784096,1,0.852247,0.920231,1,0.920033,0.958351


# Clustering runtimes

In [58]:
v2_kmeans_df = pd.read_excel("./data/results/runtime_clustering/v2_kmeans.xlsx")
v4_kmeans_df = pd.read_excel("./data/results/runtime_clustering/v4_kmeans.xlsx")
v2_minibatchkmeans_df = pd.read_excel("./data/results/runtime_clustering/v2_minibatchkmeans.xlsx")
v4_minibatchkmeans_df = pd.read_excel("./data/results/runtime_clustering/v4_minibatchkmeans.xlsx")
v2_onlinekmeans_batch256_df = pd.read_excel("./data/results/runtime_clustering/v2_onlinekmeans_batch256.xlsx")
v4_onlinekmeans_batch256_df = pd.read_excel("./data/results/runtime_clustering/v4_onlinekmeans_batch256.xlsx")
v2_onlinekmeans_batch1000_df = pd.read_excel("./data/results/runtime_clustering/v2_onlinekmeans_batch1000.xlsx")
v4_onlinekmeans_batch1000_df = pd.read_excel("./data/results/runtime_clustering/v4_onlinekmeans_batch1000.xlsx")

In [64]:
dataframes = [v2_minibatchkmeans_df, v4_minibatchkmeans_df, v2_onlinekmeans_batch1000_df, v4_onlinekmeans_batch1000_df, v2_kmeans_df, v4_kmeans_df]
names = ['MiniBatchKMeans<br>(384 D embedding)', 'MiniBatchKMeans<br>(1024 D embedding)', 'OnlineKMeans<br>(384 D embedding)', 'OnlineKMeans<br>(1024 D embedding)', 'KMeans<br>(384 D embedding)', 'KMeans<br>(1024 D embedding)']

fig = go.Figure()
for df, name in zip(dataframes, names):
    fig.add_trace(go.Scatter(
        x=df['n_clusters'],
        y=df['time_sec'],
        mode='lines+markers',
        name=name
    ))

fig.update_layout(
    title="Clustering Time vs Number of Clusters",
    xaxis_title='Number of Clusters',
    yaxis_title='Time (seconds)',
    template='plotly_white'
)
fig.show()

In [63]:
dataframes = [v2_minibatchkmeans_df, v4_minibatchkmeans_df, v2_onlinekmeans_batch1000_df, v4_onlinekmeans_batch1000_df]
names = ['MiniBatchKMeans<br>(384 D embedding)', 'MiniBatchKMeans<br>(1024 D embedding)', 'OnlineKMeans<br>(384 D embedding)', 'OnlineKMeans<br>(1024 D embedding)']

fig = go.Figure()
for df, name in zip(dataframes, names):
    fig.add_trace(go.Scatter(
        x=df['n_clusters'],
        y=df['time_sec'],
        mode='lines+markers',
        name=name
    ))

fig.update_layout(
    title="Clustering Time vs Number of Clusters",
    xaxis_title='Number of Clusters',
    yaxis_title='Time (seconds)',
    template='plotly_white'
)
fig.show()

# Clustering accuracy on benchmark datasets

In [2]:
df = pd.read_excel("./data/results/clustering_accuracy_comparison.xlsx")

In [3]:
df

Unnamed: 0,n_clusters,cluster_std,time_sec,NMI,V_measure,ARI,method
0,200,2,4.689652,0.998174,0.998174,0.987992,KMeans
1,200,2,0.940467,0.997647,0.997647,0.979598,MiniBatchKMeans
2,200,2,0.237555,1.000000,1.000000,1.000000,OnlineKMeans
3,200,4,3.876991,0.997145,0.997145,0.981473,KMeans
4,200,4,1.008888,0.995043,0.995043,0.964542,MiniBatchKMeans
...,...,...,...,...,...,...,...
70,800,8,3.492713,0.655753,0.655753,0.018893,MiniBatchKMeans
71,800,8,0.985539,0.986283,0.986283,0.940101,OnlineKMeans
72,800,10,19.289603,0.980249,0.980249,0.562017,KMeans
73,800,10,3.470346,0.488414,0.488414,0.006756,MiniBatchKMeans


In [12]:
import pandas as pd

# Assume df is your DataFrame

# 1️⃣ Group by n_clusters and method, compute mean ARI and NMI across all cluster_std
agg = df.groupby(['n_clusters', 'method'])[['ARI', 'NMI']].mean().reset_index()

# 2️⃣ Create separate DataFrames for ARI and NMI, pivot so methods are columns
ari_df = agg.pivot(index='n_clusters', columns='method', values='ARI')
nmi_df = agg.pivot(index='n_clusters', columns='method', values='NMI')

ari_df.index = [f"ARI {i} clusters" for i in ari_df.index]
nmi_df.index = [f"NMI {i} clusters" for i in nmi_df.index]

final_df = pd.concat([ari_df, nmi_df])

final_df = final_df.reset_index().rename(columns={'index': 'metric_n_clusters'})

print(final_df)


method metric_n_clusters    KMeans  MiniBatchKMeans  OnlineKMeans
0       ARI 200 clusters  0.968956         0.897212      0.964422
1       ARI 300 clusters  0.957147         0.664325      0.963343
2       ARI 400 clusters  0.959660         0.500419      0.961720
3       ARI 500 clusters  0.950622         0.498221      0.963569
4       ARI 800 clusters  0.869723         0.013832      0.951842
5       NMI 200 clusters  0.995371         0.989518      0.987679
6       NMI 300 clusters  0.994991         0.957036      0.988512
7       NMI 400 clusters  0.995060         0.715969      0.988951
8       NMI 500 clusters  0.995302         0.815966      0.989938
9       NMI 800 clusters  0.992489         0.704166      0.988858


In [13]:
final_df = np.round(final_df, 3)

In [16]:
final_df[final_df['metric_n_clusters'].str.contains('ARI', na=False)].to_excel("./data/results/clustering_accuracy_comparison_pivoted_ARI.xlsx", index=False)

In [17]:
final_df[final_df['metric_n_clusters'].str.contains('NMI', na=False)].to_excel("./data/results/clustering_accuracy_comparison_pivoted_NMI.xlsx", index=False)