# Retrieval

This notebook contains the code for the retrival pipeline.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import time
import warnings
import re

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans

from OnlineKMeans import OnlineKMeans

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# Classes and functions

In [2]:
# Encoder model
model = SentenceTransformer("all-MiniLM-L6-v2")

In [3]:
def compute_cluster_centroids(chunk_embeddings, cluster_labels):
    """
    Compute centroids once for all clusters.
    Returns:
        centroid_matrix: np.ndarray of shape (n_clusters, embedding_dim)
        centroid_ids: list of cluster IDs
    """
    unique_clusters = np.unique(cluster_labels)
    cluster_centroids = {
        cid: chunk_embeddings[cluster_labels == cid].mean(axis=0)
        for cid in unique_clusters
    }
    centroid_matrix = np.vstack(list(cluster_centroids.values()))
    centroid_ids = list(cluster_centroids.keys())
    return centroid_matrix, centroid_ids


def retrieve_top_chunks_by_cluster(
    query_embedding,
    chunk_embeddings,
    df_chunks,
    cluster_labels,
    centroid_matrix,
    centroid_ids,
    top_n_clusters=2,
    top_k_total=5
):
    # --- Use precomputed centroids ---
    cluster_sims = cosine_similarity([query_embedding], centroid_matrix)[0]
    top_n_idx = cluster_sims.argsort()[::-1][:top_n_clusters]
    selected_clusters = [centroid_ids[i] for i in top_n_idx]

    # Collect all chunks from selected clusters
    mask = np.isin(cluster_labels, selected_clusters)
    selected_chunk_embeddings = chunk_embeddings[mask]
    selected_df = df_chunks[mask].reset_index(drop=True)

    # Compute similarity for all these chunks
    sims = cosine_similarity([query_embedding], selected_chunk_embeddings)[0]

    # Get top-K chunks overall
    top_k_idx = sims.argsort()[::-1][:top_k_total]
    results = []

    for idx in top_k_idx:
        results.append({
            "cluster": cluster_labels[mask][idx],
            "context_id": selected_df.iloc[idx]["context_id"],
            "chunk_id": selected_df.iloc[idx]["chunk_id"],
            "title": selected_df.iloc[idx]["title"],
            "chunk_embed_text": selected_df.iloc[idx]["chunk_embed_text"],
            "chunk_start": selected_df.iloc[idx]["chunk_start"],
            "chunk_end": selected_df.iloc[idx]["chunk_end"],
            "similarity": sims[idx]
        })

    return pd.DataFrame(results).sort_values("similarity", ascending=False).reset_index(drop=True)


In [4]:
def retrieve_top_chunks_full(
    query_embedding,
    chunk_embeddings,
    df_chunks,
    top_k_chunks=10
):
    sims = cosine_similarity([query_embedding], chunk_embeddings)[0]
    top_idx = sims.argsort()[::-1][:top_k_chunks]
    
    results = []
    for idx in top_idx:
        results.append({
            "context_id": df_chunks.iloc[idx]["context_id"],
            "chunk_id": df_chunks.iloc[idx]["chunk_id"],
            "title": df_chunks.iloc[idx]["title"],
            "chunk_embed_text": df_chunks.iloc[idx]["chunk_embed_text"],
            "chunk_start": df_chunks.iloc[idx]["chunk_start"],
            "chunk_end": df_chunks.iloc[idx]["chunk_end"],
            "similarity": sims[idx]
        })
    
    return pd.DataFrame(results).sort_values("similarity", ascending=False)


In [5]:
# # ---------- Answer Containment ----------
# def is_answer_in_chunk(chunk_text, answer_text):
#     return answer_text.lower().strip() in chunk_text.lower()

# def is_answer_in_chunk(chunk_text, answer_text):
#     # Normalize
#     chunk_tokens = set(re.findall(r"\w+", chunk_text.lower()))
#     answer_tokens = set(re.findall(r"\w+", answer_text.lower()))

#     # Require that most/all answer tokens are present
#     return len(answer_tokens & chunk_tokens) / max(1, len(answer_tokens)) >= 0.8


# from rapidfuzz import fuzz

# def is_answer_in_chunk(chunk_text, answer_text, threshold=80):
#     score = fuzz.partial_ratio(answer_text.lower(), chunk_text.lower())
#     return score >= threshold

def is_answer_in_chunk(answer_start, chunk_start, chunk_length):
    if answer_start is None or chunk_start is None or chunk_length is None:
        return False
    return chunk_start <= answer_start < (chunk_start + chunk_length)

In [6]:
def evaluate_top_k_accuracy(
    df_queries,
    chunk_embeddings,
    df_chunks,
    cluster_labels,
    top_n_clusters=2,
    top_k_total=5
):
    # ✅ Compute centroids once
    centroid_matrix, centroid_ids = compute_cluster_centroids(chunk_embeddings, cluster_labels)

    y_true_doc = []
    y_pred_doc = []

    y_true_chunk = []
    y_pred_chunk = []

    for i, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
        query_emb = model.encode([row["question"]])[0]
        results = retrieve_top_chunks_by_cluster(
            query_embedding=query_emb,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            cluster_labels=cluster_labels,
            centroid_matrix=centroid_matrix,
            centroid_ids=centroid_ids,
            top_n_clusters=top_n_clusters,
            top_k_total=top_k_total
        )

        # Document-level
        found_doc_id = any(row["context_id"] == doc_id for doc_id in results["context_id"])
        y_true_doc.append(1)
        y_pred_doc.append(1 if found_doc_id else 0)

        if found_doc_id:
            found_chunk_context = any(
                is_answer_in_chunk(row["answer_start"], chunk['chunk_start'], chunk['chunk_end'] - chunk['chunk_start'])
                for _, chunk in results.iterrows()
            )
            y_true_chunk.append(1)
            y_pred_chunk.append(1 if found_chunk_context else 0)
        else:
            y_true_chunk.append(1)
            y_pred_chunk.append(0)

    # Compute metrics
    metrics = {
        "doc_accuracy": sum(y_pred_doc) / len(y_pred_doc),
        "chunk_accuracy": sum(y_pred_chunk) / len(y_pred_chunk),
        "doc_precision": precision_score(y_true_doc, y_pred_doc),
        "doc_recall": recall_score(y_true_doc, y_pred_doc),
        "doc_f1": f1_score(y_true_doc, y_pred_doc),
        "chunk_precision": precision_score(y_true_chunk, y_pred_chunk, zero_division=0),
        "chunk_recall": recall_score(y_true_chunk, y_pred_chunk),
        "chunk_f1": f1_score(y_true_chunk, y_pred_chunk)
    }

    return metrics

In [7]:
def evaluate_top_k_accuracy_full(df_queries, chunk_embeddings, df_chunks, top_k_chunks=5):
    y_true_doc = []
    y_pred_doc = []

    y_true_chunk = []
    y_pred_chunk = []

    for i, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
        query_emb = model.encode([row["question"]])[0]
        results = retrieve_top_chunks_full(
            query_embedding=query_emb,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            top_k_chunks=top_k_chunks
        )

        # Document-level
        found_doc_id = any(row["context_id"] == doc_id for doc_id in results["context_id"])
        y_true_doc.append(1)
        y_pred_doc.append(1 if found_doc_id else 0)

        # Chunk-level
        if found_doc_id:
            found_chunk_context = any(
                is_answer_in_chunk(row["answer_start"], chunk['chunk_start'], chunk['chunk_end'] - chunk['chunk_start'])
                for _, chunk in results.iterrows()
            )
            y_true_chunk.append(1)
            y_pred_chunk.append(1 if found_chunk_context else 0)
        else:
            y_true_chunk.append(1)
            y_pred_chunk.append(0)

    # Compute metrics
    metrics = {
        "doc_accuracy": sum(y_pred_doc) / len(y_pred_doc),
        "chunk_accuracy": sum(y_pred_chunk) / len(y_pred_chunk),
        "doc_precision": precision_score(y_true_doc, y_pred_doc, zero_division=0),
        "doc_recall": recall_score(y_true_doc, y_pred_doc, zero_division=0),
        "doc_f1": f1_score(y_true_doc, y_pred_doc, zero_division=0),
        "chunk_precision": precision_score(y_true_chunk, y_pred_chunk, zero_division=0),
        "chunk_recall": recall_score(y_true_chunk, y_pred_chunk, zero_division=0),
        "chunk_f1": f1_score(y_true_chunk, y_pred_chunk, zero_division=0)
    }

    return metrics

In [8]:
def minibatchkmeans_retrieval_evaluation(
    chunk_embeddings,
    df_chunks,
    df_queries,
    n_clusters=20,
    batch_size=500,
    top_k_total=5,
    init_fraction=0.1
):
    n_samples = chunk_embeddings.shape[0]
    n_batches = int(np.ceil(n_samples / batch_size))

    results = []

    # --- Inicializáló klaszterezés ---
    init_start = time.time()
    init_size = max(1, int(n_samples * init_fraction))
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=batch_size)
    kmeans.partial_fit(chunk_embeddings[:init_size])
    init_end = time.time()
    init_time = init_end - init_start

    print("Start batch processing...")
    for batch_idx in tqdm(range(1, n_batches + 1)):
        start_idx = (batch_idx - 1) * batch_size
        end_idx = min(batch_idx * batch_size, n_samples)
        X_batch = chunk_embeddings[start_idx:end_idx]

        # --- Online update ---
        update_start = time.time()
        kmeans.partial_fit(X_batch)
        update_end = time.time()
        update_time = update_end - update_start

        # --- Klasztercímkék frissítése ---
        labels = kmeans.predict(chunk_embeddings)

        # --- Retrieval + pontosság ---
        retrieval_start = time.time()
        metrics = evaluate_top_k_accuracy(
            df_queries=df_queries,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            cluster_labels=labels,
            top_n_clusters=5,
            top_k_total=top_k_total
        )
        retrieval_end = time.time()
        retrieval_time = retrieval_end - retrieval_start

        results.append({
            "batch": batch_idx,
            "init_time": init_time if batch_idx == 1 else 0,
            "update_time": update_time,
            "retrieval_time": retrieval_time,
            "metrics": metrics,
        })
        print(f"[Batch {batch_idx}/{n_batches}] Doc acc: {metrics['doc_accuracy']:.4f}, Chunk acc: {metrics['chunk_accuracy']:.4f}")

    return pd.DataFrame(results)


In [9]:
def online_kmeans_retrieval_evaluation(
    chunk_embeddings,
    df_chunks,
    df_queries,
    n_clusters=20,
    batch_size=500,
    top_k_total=5,
    init_fraction=0.5,  # fraction of data used for initialization
    max_clusters=None,
    metric="cosine",
    new_cluster_threshold=None,
    merge_threshold=None,
    decay=None
):
    """
    OnlineKMeans clustering + retrieval evaluation on growing dataset.
    Only evaluates on the chunks that have been clustered so far.
    """

    n_samples = chunk_embeddings.shape[0]
    init_size = int(n_samples * init_fraction)
    remaining_size = n_samples - init_size

    # --- Step 1: Initialization ---
    print(f"🔧 Using {init_fraction*100:.0f}% of data ({init_size} samples) for initialization")
    init_start = time.time()
    okm = OnlineKMeans(
        n_clusters=n_clusters,
        max_clusters=max_clusters,
        metric=metric,
        new_cluster_threshold=new_cluster_threshold,
        merge_threshold=merge_threshold,
        random_state=42,
        decay=decay
    )
    okm.partial_fit(chunk_embeddings[:init_size])
    init_end = time.time()
    init_time = init_end - init_start
    print(f"✅ Initialization done in {init_time:.4f} s")

    # --- Step 2: Online updates on the remaining data ---
    results = []
    for batch_idx in tqdm(range(1, int(np.ceil(remaining_size / batch_size)) + 1)):
        start_idx = (batch_idx - 1) * batch_size
        end_idx = min(batch_idx * batch_size, remaining_size)
        batch_embeddings = chunk_embeddings[init_size + start_idx : init_size + end_idx]

        # --- Online update ---
        update_start = time.time()
        okm.partial_fit(batch_embeddings)
        update_end = time.time()
        update_time = update_end - update_start

        # --- Only evaluate on seen data so far ---
        seen_end_idx = init_size + end_idx
        seen_embeddings = chunk_embeddings[:seen_end_idx]
        seen_df_chunks = df_chunks.iloc[:seen_end_idx].reset_index(drop=True)

        # --- Predict cluster labels for seen data ---
        labels_seen = okm.predict(seen_embeddings)

        # --- Filter queries to only those with seen context_ids ---
        df_queries_seen = df_queries[df_queries["context_id"].isin(seen_df_chunks["context_id"].unique())].reset_index(drop=True)
        print(f"df_queries_seen: {df_queries_seen.shape[0]}, seen_df_chunks: {seen_df_chunks.shape[0]}")

        # --- Retrieval accuracy ---
        retrieval_start = time.time()
        metrics = evaluate_top_k_accuracy(
            df_queries=df_queries_seen,
            chunk_embeddings=seen_embeddings,
            df_chunks=seen_df_chunks,
            cluster_labels=labels_seen,
            top_n_clusters=5,
            top_k_total=top_k_total
        )
        retrieval_end = time.time()
        retrieval_time = retrieval_end - retrieval_start

        results.append({
            "batch": batch_idx,
            "init_time": init_time if batch_idx == 1 else 0,
            "update_time": update_time,
            "retrieval_time": retrieval_time,
            "metrics": metrics,
            "n_clusters": len(okm.centroids)
        })

        print(f"[Batch {batch_idx}] Seen chunks: {seen_end_idx}, Doc acc: {metrics['doc_accuracy']:.4f}, Chunk acc: {metrics['chunk_accuracy']:.4f}, Clusters: {len(okm.centroids)}")

    return pd.DataFrame(results)

# Workflow

In [18]:
df_train = pd.read_excel("./data/labelled/squad_train_v2_semantic_chunking_clustered.xlsx")
df_val = pd.read_excel("./data/labelled/squad_val_v2_semantic_chunking_clustered.xlsx")
df_queries_train = pd.read_excel("./data/prepared/squad_train_v2_queries.xlsx")
df_queries_train = df_queries_train[df_queries_train["context_id"].isin(df_train["context_id"].unique())].reset_index(drop=True)

X_train = np.load("./data/labelled/squad_train_v2_semantic_chunking_clustered.npy")
X_val = np.load("./data/labelled/squad_val_v2_semantic_chunking_clustered.npy")
df_queries_val = pd.read_excel("./data/prepared/squad_val_v2_queries.xlsx")

labels_train = df_train["cluster"].values

In [19]:
df_train.shape

(2201, 10)

In [20]:
df_queries_train.shape

(2329, 4)

In [21]:
df_queries_train.loc[100, "question"]

'In what year did the team lead by Knute Rockne win the Rose Bowl?'

In [22]:
query = 'In what year did the team lead by Knute Rockne win the Rose Bowl?'
query_emb = model.encode([query])[0]

In [35]:
start_time = time.time()

top_chunks_cluster = retrieve_top_chunks_by_cluster(
    query_embedding=query_emb,
    chunk_embeddings=X_train,
    df_chunks=df_train,
    cluster_labels=labels_train,
    top_n_clusters=3,
    top_k_total=3
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time:.5f} seconds")
print("Cluster-based retrieval:")
print(top_chunks_cluster['chunk_embed_text'].tolist())

Runtime: 0.01423 seconds
Cluster-based retrieval:
["'96, Heisman Trophy winner Ty Detmer '90, and two-time Super Bowl winner Jim McMahon.", '23-year-old Candice Glover won the season with Kree Harrison taking the runner-up spot.', "BYU also claims notable professional football players including two-time NFL MVP and Super Bowl MVP and Pro Football Hall of Fame quarterback Steve Young '84 & J.D."]


In [36]:
start_time = time.time()

top_chunks_full = retrieve_top_chunks_full(
    query_embedding=query_emb,
    chunk_embeddings=X_train,
    df_chunks=df_train,
    top_k_chunks=3
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time:.5f} seconds")
print("Full retrieval:")
print(top_chunks_full['chunk_embed_text'].tolist())

Runtime: 0.00212 seconds
Full retrieval:
["'96, Heisman Trophy winner Ty Detmer '90, and two-time Super Bowl winner Jim McMahon.", '23-year-old Candice Glover won the season with Kree Harrison taking the runner-up spot.', "BYU also claims notable professional football players including two-time NFL MVP and Super Bowl MVP and Pro Football Hall of Fame quarterback Steve Young '84 & J.D."]


# Evaluation for retrival with cluster centroids vs full

In [11]:
# Load data
X_semantic_train = np.load("../data/labelled/squad_train_v2_semantic_chunking_clustered.npy")
df_semantic_train = pd.read_excel("../data/labelled/squad_train_v2_semantic_chunking_clustered.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v2_queries.xlsx")

labels_train = df_semantic_train["cluster"].values

In [12]:
top_ks = [3, 5, 12, 25]
top_n_clusters = [5, 10, 20, 35]

results_centroid = []
results_full = []
for top_k in top_ks:
    for top_n_cluster in top_n_clusters:
        print(f"Evaluating: Top-{top_k} chunks in Top-{top_n_cluster} clusters")
        
        start_centroid = time.time()
        centroid_metrics = evaluate_top_k_accuracy(df_queries_train, X_semantic_train, df_semantic_train, labels_train, top_n_clusters=top_n_cluster, top_k_total=top_k)
        end_centroid = time.time()
        
        results_centroid.append({
            "top_k": top_k,
            "top_n_clusters": top_n_cluster,
            "centroid_metrics": centroid_metrics,
            "centroid_time": end_centroid - start_centroid
        })
        
        results_df_centroid = pd.DataFrame(results_centroid)
        results_df_centroid.to_excel("../data/results/hyperparameter_tuning_centroid_vs_full/centroid_results.xlsx")
        
    start_full = time.time()
    full_metrics = evaluate_top_k_accuracy_full(df_queries_train, X_semantic_train, df_semantic_train, top_k_chunks=top_k)
    end_full = time.time()
    results_full.append({
        "top_k": top_k,
        "full_metrics": full_metrics,
        "full_time": end_full - start_full
    })
    results_df_full = pd.DataFrame(results_full)
    results_df_full.to_excel("../data/results/hyperparameter_tuning_centroid_vs_full/full_results.xlsx")

Evaluating: Top-3 chunks in Top-5 clusters


100%|██████████| 87599/87599 [31:29<00:00, 46.36it/s]  


Evaluating: Top-3 chunks in Top-10 clusters


100%|██████████| 87599/87599 [35:34<00:00, 41.03it/s]


Evaluating: Top-3 chunks in Top-20 clusters


100%|██████████| 87599/87599 [41:19<00:00, 35.32it/s]


Evaluating: Top-3 chunks in Top-35 clusters


100%|██████████| 87599/87599 [50:06<00:00, 29.14it/s]  
100%|██████████| 87599/87599 [3:02:51<00:00,  7.98it/s]  


Evaluating: Top-5 chunks in Top-5 clusters


100%|██████████| 87599/87599 [32:44<00:00, 44.60it/s]


Evaluating: Top-5 chunks in Top-10 clusters


100%|██████████| 87599/87599 [36:02<00:00, 40.51it/s]


Evaluating: Top-5 chunks in Top-20 clusters


100%|██████████| 87599/87599 [41:59<00:00, 34.77it/s]  


Evaluating: Top-5 chunks in Top-35 clusters


100%|██████████| 87599/87599 [50:30<00:00, 28.91it/s]  
100%|██████████| 87599/87599 [2:55:05<00:00,  8.34it/s]  


Evaluating: Top-12 chunks in Top-5 clusters


100%|██████████| 87599/87599 [36:41<00:00, 39.79it/s]


Evaluating: Top-12 chunks in Top-10 clusters


100%|██████████| 87599/87599 [40:05<00:00, 36.42it/s]  


Evaluating: Top-12 chunks in Top-20 clusters


100%|██████████| 87599/87599 [45:56<00:00, 31.78it/s]  


Evaluating: Top-12 chunks in Top-35 clusters


100%|██████████| 87599/87599 [58:24<00:00, 25.00it/s]  
100%|██████████| 87599/87599 [3:17:52<00:00,  7.38it/s]   


Evaluating: Top-25 chunks in Top-5 clusters


100%|██████████| 87599/87599 [55:07<00:00, 26.48it/s]  


Evaluating: Top-25 chunks in Top-10 clusters


100%|██████████| 87599/87599 [47:39<00:00, 30.64it/s]  


Evaluating: Top-25 chunks in Top-20 clusters


100%|██████████| 87599/87599 [1:04:50<00:00, 22.51it/s]


Evaluating: Top-25 chunks in Top-35 clusters


100%|██████████| 87599/87599 [1:11:56<00:00, 20.29it/s]
  8%|▊         | 7101/87599 [15:35<2:56:50,  7.59it/s]


KeyboardInterrupt: 

In [14]:
start_full = time.time()
full_metrics = evaluate_top_k_accuracy_full(df_queries_train, X_semantic_train, df_semantic_train, top_k_chunks=25)
end_full = time.time()
results_full.append({
    "top_k": 25,
    "full_metrics": full_metrics,
    "full_time": end_full - start_full
})
results_df_full = pd.DataFrame(results_full)

100%|██████████| 87599/87599 [3:28:17<00:00,  7.01it/s]   


In [16]:
results_df_full

Unnamed: 0,top_k,full_metrics,full_time
0,3,"{'doc_accuracy': 0.8036735579173279, 'chunk_ac...",10972.37997
1,5,"{'doc_accuracy': 0.8522357561159374, 'chunk_ac...",10506.523075
2,12,"{'doc_accuracy': 0.9135035788079773, 'chunk_ac...",11872.97477
3,25,"{'doc_accuracy': 0.9460039498167787, 'chunk_ac...",12497.922858


In [17]:
results_df_full.to_excel("../data/results/hyperparameter_tuning_centroid_vs_full/full_results.xlsx")

# Evaluate retrieval with MiniBatchKMeans

In [13]:
# Load data
X_semantic_train = np.load("./data/tensors/squad_train_v2_semantic_chunking.npy")
df_semantic_train = pd.read_excel("./data/prepared/squad_train_v2_semantic_chunking.xlsx")
df_queries_train = pd.read_excel("./data/prepared/squad_train_v2_queries.xlsx")

In [None]:
# --- Futtatás ---
results_df = minibatchkmeans_retrieval_evaluation(
    chunk_embeddings=X_semantic_train,
    df_chunks=df_semantic_train,
    df_queries=df_queries_train,
    n_clusters=160,
    batch_size=1000,
    top_k_total=5,
    init_fraction=0.5
)

In [None]:
# --- Plotolás: Pontosság ---
plt.figure(figsize=(10,5))
plt.plot(results_df["batch"], results_df["doc_accuracy"], label="Doc Accuracy", marker='o')
plt.plot(results_df["batch"], results_df["chunk_accuracy"], label="Chunk Accuracy", marker='s')
plt.xlabel("Batch")
plt.ylabel("Accuracy")
plt.title("Retrieval pontosság batchenként (Online KMeans)")
plt.legend()
plt.grid(True)
plt.show()

# --- Plotolás: Futásidők ---
plt.figure(figsize=(10,5))
plt.plot(results_df["batch"], results_df["init_time"], label="Init time", marker='o')
plt.plot(results_df["batch"], results_df["update_time"], label="Update time", marker='s')
plt.plot(results_df["batch"], results_df["retrieval_time"], label="Retrieval time", marker='^')
plt.xlabel("Batch")
plt.ylabel("Time (s)")
plt.title("Futásidők batchenként (Online KMeans)")
plt.legend()
plt.grid(True)
plt.show()

# Evaluate retrieval with online clustering

In [15]:
# Load data
X_semantic_train = np.load("../data/tensors/squad_train_v2_semantic_chunking.npy")
df_semantic_train = pd.read_excel("../data/prepared/squad_train_v2_semantic_chunking.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v2_queries.xlsx")

In [16]:
results_df = online_kmeans_retrieval_evaluation(
    chunk_embeddings=X_semantic_train,
    df_chunks=df_semantic_train,
    df_queries=df_queries_train,
    n_clusters=500,
    max_clusters=2000,
    batch_size=2000,
    top_k_total=5,
    metric="cosine",
    init_fraction=0.5,
    merge_threshold=0.08,    
    decay=1.0,
    new_cluster_threshold=0.8
)

🔧 Using 50% of data (42003 samples) for initialization
✅ Initialization done in 38.6944 s


  0%|          | 0/22 [00:00<?, ?it/s]

df_queries_seen: 48817, seen_df_chunks: 44003


100%|██████████| 48817/48817 [16:11<00:00, 50.23it/s]
  5%|▍         | 1/22 [16:12<5:40:28, 972.77s/it]

[Batch 1] Seen chunks: 44003, Doc acc: 0.7526, Chunk acc: 0.6831, Clusters: 524
df_queries_seen: 50578, seen_df_chunks: 46003


100%|██████████| 50578/50578 [19:55<00:00, 42.32it/s]
  9%|▉         | 2/22 [36:08<6:08:02, 1104.11s/it]

[Batch 2] Seen chunks: 46003, Doc acc: 0.7504, Chunk acc: 0.6803, Clusters: 524
df_queries_seen: 52477, seen_df_chunks: 48003


100%|██████████| 52477/52477 [19:34<00:00, 44.70it/s]
 14%|█▎        | 3/22 [55:43<5:59:53, 1136.52s/it]

[Batch 3] Seen chunks: 48003, Doc acc: 0.7481, Chunk acc: 0.6784, Clusters: 526
df_queries_seen: 54231, seen_df_chunks: 50003


100%|██████████| 54231/54231 [19:52<00:00, 45.48it/s]
 18%|█▊        | 4/22 [1:15:37<5:47:41, 1158.97s/it]

[Batch 4] Seen chunks: 50003, Doc acc: 0.7456, Chunk acc: 0.6742, Clusters: 544
df_queries_seen: 56119, seen_df_chunks: 52003


100%|██████████| 56119/56119 [20:09<00:00, 46.39it/s]
 23%|██▎       | 5/22 [1:35:47<5:33:39, 1177.62s/it]

[Batch 5] Seen chunks: 52003, Doc acc: 0.7438, Chunk acc: 0.6724, Clusters: 544
df_queries_seen: 58097, seen_df_chunks: 54003


100%|██████████| 58097/58097 [21:08<00:00, 45.82it/s]
 27%|██▋       | 6/22 [1:56:57<5:22:20, 1208.76s/it]

[Batch 6] Seen chunks: 54003, Doc acc: 0.7416, Chunk acc: 0.6703, Clusters: 544
df_queries_seen: 59930, seen_df_chunks: 56003


100%|██████████| 59930/59930 [22:23<00:00, 44.61it/s]
 32%|███▏      | 7/22 [2:19:22<5:13:18, 1253.26s/it]

[Batch 7] Seen chunks: 56003, Doc acc: 0.7393, Chunk acc: 0.6687, Clusters: 544
df_queries_seen: 61832, seen_df_chunks: 58003


100%|██████████| 61832/61832 [22:59<00:00, 44.82it/s]
 36%|███▋      | 8/22 [2:42:23<5:01:54, 1293.93s/it]

[Batch 8] Seen chunks: 58003, Doc acc: 0.7369, Chunk acc: 0.6663, Clusters: 544
df_queries_seen: 63816, seen_df_chunks: 60003


100%|██████████| 63816/63816 [23:57<00:00, 44.40it/s]
 41%|████      | 9/22 [3:06:21<4:50:09, 1339.16s/it]

[Batch 9] Seen chunks: 60003, Doc acc: 0.7358, Chunk acc: 0.6651, Clusters: 550
df_queries_seen: 65742, seen_df_chunks: 62003


100%|██████████| 65742/65742 [24:42<00:00, 44.33it/s]
 45%|████▌     | 10/22 [3:31:05<4:36:47, 1383.95s/it]

[Batch 10] Seen chunks: 62003, Doc acc: 0.7341, Chunk acc: 0.6637, Clusters: 550
df_queries_seen: 67503, seen_df_chunks: 64003


100%|██████████| 67503/67503 [25:50<00:00, 43.54it/s]
 50%|█████     | 11/22 [3:56:57<4:23:08, 1435.31s/it]

[Batch 11] Seen chunks: 64003, Doc acc: 0.7335, Chunk acc: 0.6626, Clusters: 551
df_queries_seen: 69330, seen_df_chunks: 66003


100%|██████████| 69330/69330 [26:16<00:00, 43.99it/s]
 55%|█████▍    | 12/22 [4:23:15<4:06:25, 1478.56s/it]

[Batch 12] Seen chunks: 66003, Doc acc: 0.7310, Chunk acc: 0.6606, Clusters: 551
df_queries_seen: 71487, seen_df_chunks: 68003


100%|██████████| 71487/71487 [29:58<00:00, 39.74it/s]
 59%|█████▉    | 13/22 [4:53:16<3:56:26, 1576.23s/it]

[Batch 13] Seen chunks: 68003, Doc acc: 0.7297, Chunk acc: 0.6592, Clusters: 551
df_queries_seen: 73342, seen_df_chunks: 70003


100%|██████████| 73342/73342 [28:31<00:00, 42.85it/s]
 64%|██████▎   | 14/22 [5:21:50<3:35:43, 1617.89s/it]

[Batch 14] Seen chunks: 70003, Doc acc: 0.7287, Chunk acc: 0.6580, Clusters: 552
df_queries_seen: 75361, seen_df_chunks: 72003


100%|██████████| 75361/75361 [32:28<00:00, 38.67it/s]
 68%|██████▊   | 15/22 [5:54:20<3:20:27, 1718.17s/it]

[Batch 15] Seen chunks: 72003, Doc acc: 0.7256, Chunk acc: 0.6553, Clusters: 552
df_queries_seen: 77458, seen_df_chunks: 74003


100%|██████████| 77458/77458 [31:24<00:00, 41.11it/s]
 73%|███████▎  | 16/22 [6:25:46<2:56:51, 1768.59s/it]

[Batch 16] Seen chunks: 74003, Doc acc: 0.7246, Chunk acc: 0.6548, Clusters: 554
df_queries_seen: 79607, seen_df_chunks: 76003


100%|██████████| 79607/79607 [31:25<00:00, 42.22it/s]
 77%|███████▋  | 17/22 [6:57:13<2:30:21, 1804.27s/it]

[Batch 17] Seen chunks: 76003, Doc acc: 0.7247, Chunk acc: 0.6551, Clusters: 554
df_queries_seen: 81585, seen_df_chunks: 78003


100%|██████████| 81585/81585 [33:05<00:00, 41.08it/s]
 82%|████████▏ | 18/22 [7:30:21<2:03:57, 1859.34s/it]

[Batch 18] Seen chunks: 78003, Doc acc: 0.7230, Chunk acc: 0.6536, Clusters: 554
df_queries_seen: 83407, seen_df_chunks: 80003


100%|██████████| 83407/83407 [33:33<00:00, 41.43it/s]
 86%|████████▋ | 19/22 [8:03:56<1:35:18, 1906.02s/it]

[Batch 19] Seen chunks: 80003, Doc acc: 0.7227, Chunk acc: 0.6532, Clusters: 555
df_queries_seen: 85432, seen_df_chunks: 82003


100%|██████████| 85432/85432 [34:35<00:00, 41.16it/s]
 91%|█████████ | 20/22 [8:38:33<1:05:15, 1957.57s/it]

[Batch 20] Seen chunks: 82003, Doc acc: 0.7207, Chunk acc: 0.6512, Clusters: 555
df_queries_seen: 87594, seen_df_chunks: 84003


100%|██████████| 87594/87594 [35:20<00:00, 41.30it/s]
 95%|█████████▌| 21/22 [9:13:56<33:27, 2007.11s/it]  

[Batch 21] Seen chunks: 84003, Doc acc: 0.7193, Chunk acc: 0.6502, Clusters: 555
df_queries_seen: 87599, seen_df_chunks: 84007


100%|██████████| 87599/87599 [34:47<00:00, 41.96it/s]
100%|██████████| 22/22 [9:48:45<00:00, 1605.72s/it]

[Batch 22] Seen chunks: 84007, Doc acc: 0.7193, Chunk acc: 0.6502, Clusters: 555





In [17]:
results_df.to_excel("../data/results/onlinekmeans_v2.xlsx")

In [18]:
# --- Plot Accuracy ---
plt.figure(figsize=(10, 5))
plt.plot(results_df["batch"], results_df["doc_accuracy"], marker="o", label="Doc Accuracy")
plt.plot(results_df["batch"], results_df["chunk_accuracy"], marker="s", label="Chunk Accuracy")
plt.xlabel("Batch")
plt.ylabel("Accuracy")
plt.title("📊 Retrieval Accuracy per Batch (OnlineKMeans)")
plt.legend()
plt.grid(True)
plt.show()

# --- Plot Runtimes ---
plt.figure(figsize=(10, 5))
plt.plot(results_df["batch"], results_df["update_time"], label="Update Time", marker='o')
plt.plot(results_df["batch"], results_df["retrieval_time"], label="Retrieval Time", marker='s')
plt.plot(results_df["batch"], results_df["prediction_time"], label="Prediction Time", marker='^')
plt.xlabel("Batch")
plt.ylabel("Time (s)")
plt.title("⚙️ Runtime per Batch (OnlineKMeans)")
plt.legend()
plt.grid(True)
plt.show()

# --- Plot number of clusters (optional) ---
plt.figure(figsize=(10, 5))
plt.plot(results_df["batch"], results_df["n_clusters"], label="Number of Clusters", marker='o', color='purple')
plt.xlabel("Batch")
plt.ylabel("# Clusters")
plt.title("📈 Cluster Count Evolution (OnlineKMeans)")
plt.legend()
plt.grid(True)
plt.show()

KeyError: 'doc_accuracy'

<Figure size 1000x500 with 0 Axes>