# Evaluation of results
This notebook contains the evaluation of the results.

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import ast
from plotly.subplots import make_subplots
import warnings
import re

warnings.filterwarnings("ignore")

# OnlineKMeans v2

In [30]:
df = pd.read_excel("./data/results/onlinekmeans_v2.xlsx")
df['metrics'] = df['metrics'].apply(ast.literal_eval)

In [31]:
df_semantic_train = pd.read_excel("./data/prepared/squad_train_v2_semantic_chunking.xlsx")

In [32]:
df.loc[0, "metrics"]

{'doc_accuracy': 0.7526476432390355,
 'chunk_accuracy': 0.6237990863838417,
 'doc_precision': 1.0,
 'doc_recall': 0.7526476432390355,
 'doc_f1': 0.8588693182482263,
 'chunk_precision': 1.0,
 'chunk_recall': 0.6237990863838417,
 'chunk_f1': 0.7683205288322043,
 'correct_chunk_accuracy': 0.21714156953520292}

In [33]:
top_k_total = 5
divisor = df_semantic_train.shape[0] / df_semantic_train['context_id'].nunique() / top_k_total

df['metrics'] = df['metrics'].apply(
    lambda x: {**x, 'correct_chunk_accuracy': x.get('correct_chunk_accuracy', 0) / divisor}
)


In [34]:
df.loc[0, "metrics"]

{'doc_accuracy': 0.7526476432390355,
 'chunk_accuracy': 0.6237990863838417,
 'doc_precision': 1.0,
 'doc_recall': 0.7526476432390355,
 'doc_f1': 0.8588693182482263,
 'chunk_precision': 1.0,
 'chunk_recall': 0.6237990863838417,
 'chunk_f1': 0.7683205288322043,
 'correct_chunk_accuracy': 0.24414759425342641}

In [35]:
groups = {
    "Accuracy": ["doc_accuracy", "chunk_accuracy"],
    "F1 Score": ["doc_f1", "chunk_f1"],
    "Precision": ["doc_precision", "chunk_precision"],
    "Recall": ["doc_recall", "chunk_recall"],
    "Correct chunk accuracy": ["correct_chunk_accuracy"],
}

for title, keys in groups.items():
    fig = go.Figure()

    for key in keys:
        fig.add_trace(go.Scatter(
            x=df['batch'],
            y=df['metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key
        ))

    fig.update_layout(
        title=title,
        xaxis_title='Batch',
        yaxis_title=title,
        template='plotly_white'
    )
    fig.update_yaxes(range=[0, 1.1])
    fig.show()

# Centroid vs full
hyperparameter optimization, data v2, tensors v2

In [36]:
def clean_np_literals(s):
    if not isinstance(s, str):
        return s
    # Tisztítjuk a np.float64(…) és np.int64(…) hívásokat
    s = re.sub(r'np\.float64\(([^)]+)\)', r'\1', s)
    s = re.sub(r'np\.int64\(([^)]+)\)', r'\1', s)
    return s

In [37]:
centroid_df = pd.read_excel("./data/results/hyperparameter_for_centroid_vs_full/centroid_results_kmeans500_v2_l2_1.xlsx")
centroid_df['centroid_metrics'] = (centroid_df['centroid_metrics'].apply(clean_np_literals).apply(ast.literal_eval))
full_df = pd.read_excel("./data/results/hyperparameter_for_centroid_vs_full/full_results_kmeans500_v2_l2_1.xlsx")
centroid_df['centroid_metrics'] = centroid_df['centroid_metrics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
full_df['full_metrics'] = full_df['full_metrics'].apply(ast.literal_eval)
centroid_df['x_axis'] = centroid_df['top_k'].astype(str) + "_" + centroid_df['top_n_clusters'].astype(str)


# centroid_df2 = pd.read_excel("./data/results/hyperparameter_for_centroid_vs_full/centroid_results_kmeans500_v2_l2_2.xlsx")
# centroid_df2['centroid_metrics'] = centroid_df2['centroid_metrics'].apply(ast.literal_eval)
# centroid_df2['x_axis'] = centroid_df2['top_k'].astype(str) + "_" + centroid_df2['top_n_clusters'].astype(str)
# full_df2 = pd.read_excel("./data/results/hyperparameter_for_centroid_vs_full/full_results_kmeans500_v2_l2_2.xlsx")
# full_df2['full_metrics'] = full_df2['full_metrics'].apply(ast.literal_eval)
# centroid_df = pd.concat([centroid_df, centroid_df2], ignore_index=True)
# full_df = pd.concat([full_df, full_df2], ignore_index=True)

In [38]:
divisor = df_semantic_train['context_id'].nunique() / df_semantic_train.shape[0]

centroid_df['centroid_metrics'] = centroid_df.apply(
    lambda row: {**row['centroid_metrics'], 
                 'correct_chunk_accuracy': row['centroid_metrics'].get('correct_chunk_accuracy', 0) / divisor / row['top_k']},
    axis=1
)

full_df['full_metrics'] = full_df.apply(
    lambda row: {**row['full_metrics'], 
                 'correct_chunk_accuracy': row['full_metrics'].get('correct_chunk_accuracy', 0) / divisor / row['top_k']},
    axis=1
)


In [39]:
groups = {
    "Accuracy": ["doc_accuracy", "chunk_accuracy"],
    "F1 Score": ["doc_f1", "chunk_f1"],
    "Precision": ["doc_precision", "chunk_precision"],
    "Recall": ["doc_recall", "chunk_recall"],
    "Correct chunk accuracy": ["correct_chunk_accuracy"],
}

for title, keys in groups.items():
    fig = make_subplots(rows=1, cols=2, subplot_titles=(f"Centroid - {title}", f"Full - {title}"))

    for key in keys:
        fig.add_trace(go.Scatter(
            x=centroid_df['x_axis'],
            y=centroid_df['centroid_metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key
        ), row=1, col=1)

        fig.add_trace(go.Scatter(
            x=full_df['top_k'],
            y=full_df['full_metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key
        ), row=1, col=2)

    fig.update_yaxes(range=[0, 1.1], row=1, col=1)
    fig.update_yaxes(range=[0, 1.1], row=1, col=2)

    fig.update_xaxes(title_text='top k with top n clusters', row=1, col=1)
    fig.update_xaxes(title_text='top k', row=1, col=2)

    fig.show()

In [40]:
fig = make_subplots(rows=1, cols=2, subplot_titles=("Full Time", "Centroid Time"))

fig.add_trace(go.Scatter(
    x=full_df['top_k'],
    y=full_df['full_time'],
    mode='lines+markers',
    name="Full Time"
), row=1, col=1)
fig.add_trace(go.Scatter(
    x=centroid_df['x_axis'],
    y=centroid_df['centroid_time'],
    mode='lines+markers',
    name="Centroid Time"
), row=1, col=2)
fig.update_layout(
    template='plotly_white'
)
fig.update_xaxes(title_text='top k', row=1, col=1)
fig.update_xaxes(title_text='top k with top n clusters', row=1, col=2)
fig.update_yaxes(title_text='Time (s)', row=1, col=1)
fig.update_yaxes(title_text='Time (s)', row=1, col=2)

fig.update_yaxes(range=[0, 15000], row=1, col=1)
fig.update_yaxes(range=[0, 15000], row=1, col=2)

fig.show()

# Average speed up / accuracy decrease

In [41]:
chunk_num = 84007

centroid_df['time_per_chunk'] = centroid_df['centroid_time'] / chunk_num
full_df['time_per_chunk'] = full_df['full_time'] / chunk_num

top_ks = [3, 5, 12, 25]

average_ratios = {}
for top_k in top_ks:
    top_k_centroid_df = centroid_df[centroid_df['top_k'] == top_k]
    full_time = full_df[full_df['top_k'] == top_k]['time_per_chunk'].values[0]

    average_time = 0
    for idx, row in top_k_centroid_df.iterrows():
        time_ratio = full_time / row['time_per_chunk']
        average_time += time_ratio
    average_time /= len(top_k_centroid_df)
    average_ratios[top_k] = average_time

total_average_time = sum(average_ratios.values()) / len(average_ratios)
print(f"Total Average Time Ratio (Full / Centroid): {total_average_time:.4f}")

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
results = []
for top_k in top_ks:
    top_k_centroid_df = centroid_df[centroid_df['top_k'] == top_k]


    for title, keys in groups.items():
        for key in keys:
            average_metric_value = 0
            for idx, row in top_k_centroid_df.iterrows():
                metric_value = row['centroid_metrics'].get(key)
                average_metric_value += metric_value
            average_metric_value /= len(top_k_centroid_df)
            full_value = full_df[full_df['top_k'] == top_k]['full_metrics'].apply(lambda m: m.get(key)).values[0]
            ratio_metric_value = (average_metric_value - full_value) / full_value * 100
            results.append((top_k, key, ratio_metric_value, average_ratios[top_k]))


results_df = pd.DataFrame(results, columns=['top_k', 'Metric', 'Ratio of metrics (%)', 'Average Time Speed Up'])

In [None]:
metrics = results_df['Metric'].unique()

for metric in metrics:
    metric_df = results_df[results_df['Metric'] == metric]
    metric_df['Ratio of metrics (%)'] = np.log10(metric_df['Ratio of metrics (%)'] + 10 + 1)  # Shift to avoid log(0)
    metric_df['Average Time Speed Up'] = np.log10(metric_df['Average Time Speed Up'])

    fig = go.Figure()
    fig.update_layout(
        title=f"Metric: {metric}",
        xaxis_title='top k',
        yaxis_title='Log Scale',
        template='plotly_white'
    )
    fig.add_trace(go.Scatter(
        x=metric_df['top_k'],
        y=metric_df['Ratio of metrics (%)'],
        mode='lines+markers',
        name='Ratio of metrics (%)'
    ))
    fig.add_trace(go.Scatter(
        x=metric_df['top_k'],
        y=metric_df['Average Time Speed Up'],
        mode='lines+markers',
        name='Average Time Speed Up'
    ))
    fig.show()

# Centroid vs full
kmeans with 180 cluster on v2 data, v2 tensors

In [None]:
df = pd.read_excel("./data/results/kmeans180_v2_comparison_v2tensor.xlsx")

In [None]:
df

Unnamed: 0,doc_accuracy,chunk_accuracy,doc_precision,doc_recall,doc_f1,chunk_precision,chunk_recall,chunk_f1
0,0.768308,0.698741,1,0.768308,0.868975,1,0.909454,0.95258
1,0.852247,0.784096,1,0.852247,0.920231,1,0.920033,0.958351


# Clustering runtimes

In [58]:
v2_kmeans_df = pd.read_excel("./data/results/runtime_clustering/v2_kmeans.xlsx")
v4_kmeans_df = pd.read_excel("./data/results/runtime_clustering/v4_kmeans.xlsx")
v2_minibatchkmeans_df = pd.read_excel("./data/results/runtime_clustering/v2_minibatchkmeans.xlsx")
v4_minibatchkmeans_df = pd.read_excel("./data/results/runtime_clustering/v4_minibatchkmeans.xlsx")
v2_onlinekmeans_batch256_df = pd.read_excel("./data/results/runtime_clustering/v2_onlinekmeans_batch256.xlsx")
v4_onlinekmeans_batch256_df = pd.read_excel("./data/results/runtime_clustering/v4_onlinekmeans_batch256.xlsx")
v2_onlinekmeans_batch1000_df = pd.read_excel("./data/results/runtime_clustering/v2_onlinekmeans_batch1000.xlsx")
v4_onlinekmeans_batch1000_df = pd.read_excel("./data/results/runtime_clustering/v4_onlinekmeans_batch1000.xlsx")

In [64]:
dataframes = [v2_minibatchkmeans_df, v4_minibatchkmeans_df, v2_onlinekmeans_batch1000_df, v4_onlinekmeans_batch1000_df, v2_kmeans_df, v4_kmeans_df]
names = ['MiniBatchKMeans<br>(384 D embedding)', 'MiniBatchKMeans<br>(1024 D embedding)', 'OnlineKMeans<br>(384 D embedding)', 'OnlineKMeans<br>(1024 D embedding)', 'KMeans<br>(384 D embedding)', 'KMeans<br>(1024 D embedding)']

fig = go.Figure()
for df, name in zip(dataframes, names):
    fig.add_trace(go.Scatter(
        x=df['n_clusters'],
        y=df['time_sec'],
        mode='lines+markers',
        name=name
    ))

fig.update_layout(
    title="Clustering Time vs Number of Clusters",
    xaxis_title='Number of Clusters',
    yaxis_title='Time (seconds)',
    template='plotly_white'
)
fig.show()

In [63]:
dataframes = [v2_minibatchkmeans_df, v4_minibatchkmeans_df, v2_onlinekmeans_batch1000_df, v4_onlinekmeans_batch1000_df]
names = ['MiniBatchKMeans<br>(384 D embedding)', 'MiniBatchKMeans<br>(1024 D embedding)', 'OnlineKMeans<br>(384 D embedding)', 'OnlineKMeans<br>(1024 D embedding)']

fig = go.Figure()
for df, name in zip(dataframes, names):
    fig.add_trace(go.Scatter(
        x=df['n_clusters'],
        y=df['time_sec'],
        mode='lines+markers',
        name=name
    ))

fig.update_layout(
    title="Clustering Time vs Number of Clusters",
    xaxis_title='Number of Clusters',
    yaxis_title='Time (seconds)',
    template='plotly_white'
)
fig.show()

# Clustering accuracy on benchmark datasets

In [2]:
df = pd.read_excel("./data/results/clustering_accuracy_comparison.xlsx")

In [3]:
df

Unnamed: 0,n_clusters,cluster_std,time_sec,NMI,V_measure,ARI,method
0,200,2,4.689652,0.998174,0.998174,0.987992,KMeans
1,200,2,0.940467,0.997647,0.997647,0.979598,MiniBatchKMeans
2,200,2,0.237555,1.000000,1.000000,1.000000,OnlineKMeans
3,200,4,3.876991,0.997145,0.997145,0.981473,KMeans
4,200,4,1.008888,0.995043,0.995043,0.964542,MiniBatchKMeans
...,...,...,...,...,...,...,...
70,800,8,3.492713,0.655753,0.655753,0.018893,MiniBatchKMeans
71,800,8,0.985539,0.986283,0.986283,0.940101,OnlineKMeans
72,800,10,19.289603,0.980249,0.980249,0.562017,KMeans
73,800,10,3.470346,0.488414,0.488414,0.006756,MiniBatchKMeans


In [12]:
import pandas as pd

# Assume df is your DataFrame

# 1️⃣ Group by n_clusters and method, compute mean ARI and NMI across all cluster_std
agg = df.groupby(['n_clusters', 'method'])[['ARI', 'NMI']].mean().reset_index()

# 2️⃣ Create separate DataFrames for ARI and NMI, pivot so methods are columns
ari_df = agg.pivot(index='n_clusters', columns='method', values='ARI')
nmi_df = agg.pivot(index='n_clusters', columns='method', values='NMI')

ari_df.index = [f"ARI {i} clusters" for i in ari_df.index]
nmi_df.index = [f"NMI {i} clusters" for i in nmi_df.index]

final_df = pd.concat([ari_df, nmi_df])

final_df = final_df.reset_index().rename(columns={'index': 'metric_n_clusters'})

print(final_df)


method metric_n_clusters    KMeans  MiniBatchKMeans  OnlineKMeans
0       ARI 200 clusters  0.968956         0.897212      0.964422
1       ARI 300 clusters  0.957147         0.664325      0.963343
2       ARI 400 clusters  0.959660         0.500419      0.961720
3       ARI 500 clusters  0.950622         0.498221      0.963569
4       ARI 800 clusters  0.869723         0.013832      0.951842
5       NMI 200 clusters  0.995371         0.989518      0.987679
6       NMI 300 clusters  0.994991         0.957036      0.988512
7       NMI 400 clusters  0.995060         0.715969      0.988951
8       NMI 500 clusters  0.995302         0.815966      0.989938
9       NMI 800 clusters  0.992489         0.704166      0.988858


In [13]:
final_df = np.round(final_df, 3)

In [16]:
final_df[final_df['metric_n_clusters'].str.contains('ARI', na=False)].to_excel("./data/results/clustering_accuracy_comparison_pivoted_ARI.xlsx", index=False)

In [17]:
final_df[final_df['metric_n_clusters'].str.contains('NMI', na=False)].to_excel("./data/results/clustering_accuracy_comparison_pivoted_NMI.xlsx", index=False)