# Retrieval

This notebook contains the code for the retrival pipeline.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import time
import warnings
import re

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans

from OnlineKMeans import OnlineKMeans

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# Classes and functions

In [2]:
# Encoder model
model = SentenceTransformer("all-MiniLM-L6-v2")

In [3]:
def compute_cluster_centroids(chunk_embeddings, cluster_labels):
    """
    Compute centroids once for all clusters.
    Returns:
        centroid_matrix: np.ndarray of shape (n_clusters, embedding_dim)
        centroid_ids: list of cluster IDs
    """
    unique_clusters = np.unique(cluster_labels)
    cluster_centroids = {
        cid: chunk_embeddings[cluster_labels == cid].mean(axis=0)
        for cid in unique_clusters
    }
    centroid_matrix = np.vstack(list(cluster_centroids.values()))
    centroid_ids = list(cluster_centroids.keys())
    return centroid_matrix, centroid_ids


def retrieve_top_chunks_by_cluster(
    query_embedding,
    chunk_embeddings,
    df_chunks,
    cluster_labels,
    centroid_matrix,
    centroid_ids,
    top_n_clusters=2,
    top_k_total=5
):
    # --- Use precomputed centroids ---
    cluster_sims = cosine_similarity([query_embedding], centroid_matrix)[0]
    top_n_idx = cluster_sims.argsort()[::-1][:top_n_clusters]
    selected_clusters = [centroid_ids[i] for i in top_n_idx]

    # Collect all chunks from selected clusters
    mask = np.isin(cluster_labels, selected_clusters)
    selected_chunk_embeddings = chunk_embeddings[mask]
    selected_df = df_chunks[mask].reset_index(drop=True)

    # Compute similarity for all these chunks
    sims = cosine_similarity([query_embedding], selected_chunk_embeddings)[0]

    # Get top-K chunks overall
    top_k_idx = sims.argsort()[::-1][:top_k_total]
    results = []

    for idx in top_k_idx:
        results.append({
            "cluster": cluster_labels[mask][idx],
            "context_id": selected_df.iloc[idx]["context_id"],
            "chunk_id": selected_df.iloc[idx]["chunk_id"],
            "title": selected_df.iloc[idx]["title"],
            "chunk_embed_text": selected_df.iloc[idx]["chunk_embed_text"],
            "chunk_start": selected_df.iloc[idx]["chunk_start"],
            "chunk_end": selected_df.iloc[idx]["chunk_end"],
            "similarity": sims[idx]
        })

    return pd.DataFrame(results).sort_values("similarity", ascending=False).reset_index(drop=True)


In [4]:
def retrieve_top_chunks_full(
    query_embedding,
    chunk_embeddings,
    df_chunks,
    top_k_chunks=10
):
    sims = cosine_similarity([query_embedding], chunk_embeddings)[0]
    top_idx = sims.argsort()[::-1][:top_k_chunks]
    
    results = []
    for idx in top_idx:
        results.append({
            "context_id": df_chunks.iloc[idx]["context_id"],
            "chunk_id": df_chunks.iloc[idx]["chunk_id"],
            "title": df_chunks.iloc[idx]["title"],
            "chunk_embed_text": df_chunks.iloc[idx]["chunk_embed_text"],
            "chunk_start": df_chunks.iloc[idx]["chunk_start"],
            "chunk_end": df_chunks.iloc[idx]["chunk_end"],
            "similarity": sims[idx]
        })
    
    return pd.DataFrame(results).sort_values("similarity", ascending=False)


In [5]:
# # ---------- Answer Containment ----------
# def is_answer_in_chunk(chunk_text, answer_text):
#     return answer_text.lower().strip() in chunk_text.lower()

# def is_answer_in_chunk(chunk_text, answer_text):
#     # Normalize
#     chunk_tokens = set(re.findall(r"\w+", chunk_text.lower()))
#     answer_tokens = set(re.findall(r"\w+", answer_text.lower()))

#     # Require that most/all answer tokens are present
#     return len(answer_tokens & chunk_tokens) / max(1, len(answer_tokens)) >= 0.8


# from rapidfuzz import fuzz

# def is_answer_in_chunk(chunk_text, answer_text, threshold=80):
#     score = fuzz.partial_ratio(answer_text.lower(), chunk_text.lower())
#     return score >= threshold

def is_answer_in_chunk(answer_start, chunk_start, chunk_length):
    if answer_start is None or chunk_start is None or chunk_length is None:
        return False
    return chunk_start <= answer_start < (chunk_start + chunk_length)

In [6]:
def compute_metrics_for_query(results, query_row, similarity_threshold=0.6):
    # --- Check similarity threshold ---
    if results.empty or results["similarity"].max() < similarity_threshold:
        results_filtered = pd.DataFrame([])  # Treat as no answer
    else:
        results_filtered = results

    # --- Document-level ---
    answer_exists = pd.notna(query_row["answer_start"])
    found_doc_id = False if results_filtered.empty else any(
        query_row["context_id"] == doc_id for doc_id in results_filtered["context_id"]
    )
    y_true_doc = 1 if answer_exists else 0
    y_pred_doc = 1 if found_doc_id else 0

    # --- Chunk-level ---
    if results_filtered.empty:
        found_chunk_context = False
        good_chunks = 0
    else:
        correct_doc_chunks = results_filtered[results_filtered["context_id"] == query_row["context_id"]]
        found_chunk_context = any(
            is_answer_in_chunk(
                query_row["answer_start"],
                chunk["chunk_start"],
                chunk["chunk_end"] - chunk["chunk_start"]
            )
            for _, chunk in results_filtered.iterrows()
        )
        good_chunks = len(correct_doc_chunks)

    total_chunks = results_filtered.shape[0] if not results_filtered.empty else 1
    chunk_ratio = good_chunks / total_chunks

    y_true_chunk = 1 if answer_exists else 0
    y_pred_chunk = 1 if found_chunk_context else 0

    return y_true_doc, y_pred_doc, y_true_chunk, y_pred_chunk, chunk_ratio

In [7]:
# def evaluate_top_k_accuracy(
#     df_queries,
#     chunk_embeddings,
#     df_chunks,
#     cluster_labels,
#     top_n_clusters=2,
#     top_k_total=5
# ):
#     # ✅ Compute centroids once
#     centroid_matrix, centroid_ids = compute_cluster_centroids(chunk_embeddings, cluster_labels)

#     y_true_doc = []
#     y_pred_doc = []

#     y_true_chunk = []
#     y_pred_chunk = []

#     chunk_ratios = []

#     for i, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
#         query_emb = model.encode([row["question"]])[0]
#         results = retrieve_top_chunks_by_cluster(
#             query_embedding=query_emb,
#             chunk_embeddings=chunk_embeddings,
#             df_chunks=df_chunks,
#             cluster_labels=cluster_labels,
#             centroid_matrix=centroid_matrix,
#             centroid_ids=centroid_ids,
#             top_n_clusters=top_n_clusters,
#             top_k_total=top_k_total
#         )

#         # Document-level
#         found_doc_id = any(row["context_id"] == doc_id for doc_id in results["context_id"])
#         y_true_doc.append(1)
#         y_pred_doc.append(1 if found_doc_id else 0)

#         correct_doc_chunks = results[results["context_id"] == row["context_id"]]
#         found_chunk_context = any(
#             is_answer_in_chunk(
#                 row["answer_start"],
#                 chunk["chunk_start"],
#                 chunk["chunk_end"] - chunk["chunk_start"]
#             )
#             for _, chunk in correct_doc_chunks.iterrows()
#         )
#         good_chunks = len(correct_doc_chunks)
#         total_chunks = results.shape[0]
#         ratio = good_chunks / total_chunks
#         chunk_ratios.append(ratio)

#         y_true_chunk.append(1)
#         y_pred_chunk.append(1 if found_chunk_context else 0)

#     # Compute metrics
#     chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0
#     metrics = {
#         "doc_accuracy": sum(y_pred_doc) / len(y_pred_doc),
#         "chunk_accuracy": sum(y_pred_chunk) / len(y_pred_chunk),
#         "doc_precision": precision_score(y_true_doc, y_pred_doc, zero_division=0),
#         "doc_recall": recall_score(y_true_doc, y_pred_doc, zero_division=0),
#         "doc_f1": f1_score(y_true_doc, y_pred_doc, zero_division=0),
#         "chunk_precision": precision_score(y_true_chunk, y_pred_chunk, zero_division=0),
#         "chunk_recall": recall_score(y_true_chunk, y_pred_chunk, zero_division=0),
#         "chunk_f1": f1_score(y_true_chunk, y_pred_chunk, zero_division=0),
#         "correct_chunk_accuracy": chunk_accuracy
#     }

#     return metrics
def evaluate_top_k_accuracy(
    df_queries,
    chunk_embeddings,
    df_chunks,
    cluster_labels,
    top_n_clusters=2,
    top_k_total=5
):
    # Compute centroids once
    centroid_matrix, centroid_ids = compute_cluster_centroids(chunk_embeddings, cluster_labels)

    y_true_doc = []
    y_pred_doc = []

    y_true_chunk = []
    y_pred_chunk = []

    chunk_ratios = []

    for _, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
        query_emb = model.encode([row["question"]])[0]
        results = retrieve_top_chunks_by_cluster(
            query_embedding=query_emb,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            cluster_labels=cluster_labels,
            centroid_matrix=centroid_matrix,
            centroid_ids=centroid_ids,
            top_n_clusters=top_n_clusters,
            top_k_total=top_k_total
        )

        ytd, ypd, ytc, ypc, cr = compute_metrics_for_query(results, row)
        y_true_doc.append(ytd)
        y_pred_doc.append(ypd)
        y_true_chunk.append(ytc)
        y_pred_chunk.append(ypc)
        chunk_ratios.append(cr)

    # Convert to arrays
    y_true_doc_arr = np.array(y_true_doc)
    y_pred_doc_arr = np.array(y_pred_doc)
    y_true_chunk_arr = np.array(y_true_chunk)
    y_pred_chunk_arr = np.array(y_pred_chunk)

    # Compute metrics
    chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0

    metrics = {
        "doc_accuracy": (y_pred_doc_arr == y_true_doc_arr).mean(),
        "chunk_accuracy": (y_pred_chunk_arr == y_true_chunk_arr).mean(),
        "doc_precision": precision_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_recall": recall_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_f1": f1_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "chunk_precision": precision_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_recall": recall_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_f1": f1_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "correct_chunk_accuracy": chunk_accuracy,
        # True/False Positives/Negatives
        "doc_true_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 1)),
        "doc_true_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 0)),
        "doc_false_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 0)),
        "doc_false_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 1)),
        "chunk_true_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 1)),
        "chunk_true_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 0)),
        "chunk_false_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 0)),
        "chunk_false_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 1)),
    }

    return metrics


In [9]:
# def evaluate_top_k_accuracy_full(df_queries, chunk_embeddings, df_chunks, top_k_chunks=5, similarity_threshold=0.6):
    # y_true_doc = []
    # y_pred_doc = []

    # y_true_chunk = []
    # y_pred_chunk = []

    # chunk_ratios = []

    # for i, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
    #     query_emb = model.encode([row["question"]])[0]
    #     results = retrieve_top_chunks_full(
    #         query_embedding=query_emb,
    #         chunk_embeddings=chunk_embeddings,
    #         df_chunks=df_chunks,
    #         top_k_chunks=top_k_chunks
    #     )

    #     # Document-level
    #     found_doc_id = any(row["context_id"] == doc_id for doc_id in results["context_id"])
    #     y_true_doc.append(1)
    #     y_pred_doc.append(1 if found_doc_id else 0)

    #     correct_doc_chunks = results[results["context_id"] == row["context_id"]]
    #     found_chunk_context = any(
    #         is_answer_in_chunk(
    #             row["answer_start"],
    #             chunk["chunk_start"],
    #             chunk["chunk_end"] - chunk["chunk_start"]
    #         )
    #         for _, chunk in correct_doc_chunks.iterrows()
    #     )
    #     good_chunks = len(correct_doc_chunks)
    #     total_chunks = results.shape[0]
    #     ratio = good_chunks / total_chunks
    #     chunk_ratios.append(ratio)

    #     y_true_chunk.append(1)
    #     y_pred_chunk.append(1 if found_chunk_context else 0)

    # # Compute metrics
    # chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0
    # metrics = {
    #     "doc_accuracy": sum(y_pred_doc) / len(y_pred_doc),
    #     "chunk_accuracy": sum(y_pred_chunk) / len(y_pred_chunk),
    #     "doc_precision": precision_score(y_true_doc, y_pred_doc, zero_division=0),
    #     "doc_recall": recall_score(y_true_doc, y_pred_doc, zero_division=0),
    #     "doc_f1": f1_score(y_true_doc, y_pred_doc, zero_division=0),
    #     "chunk_precision": precision_score(y_true_chunk, y_pred_chunk, zero_division=0),
    #     "chunk_recall": recall_score(y_true_chunk, y_pred_chunk, zero_division=0),
    #     "chunk_f1": f1_score(y_true_chunk, y_pred_chunk, zero_division=0),
    #     "correct_chunk_accuracy": chunk_accuracy
    # }

    # return metrics
def evaluate_top_k_accuracy_full(df_queries, chunk_embeddings, df_chunks, top_k_chunks=5):
    y_true_doc = []
    y_pred_doc = []

    y_true_chunk = []
    y_pred_chunk = []

    chunk_ratios = []

    for _, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
        query_emb = model.encode([row["question"]])[0]
        results = retrieve_top_chunks_full(
            query_embedding=query_emb,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            top_k_chunks=top_k_chunks
        )

        ytd, ypd, ytc, ypc, cr = compute_metrics_for_query(results, row)
        y_true_doc.append(ytd)
        y_pred_doc.append(ypd)
        y_true_chunk.append(ytc)
        y_pred_chunk.append(ypc)
        chunk_ratios.append(cr)

    # Convert to arrays
    y_true_doc_arr = np.array(y_true_doc)
    y_pred_doc_arr = np.array(y_pred_doc)
    y_true_chunk_arr = np.array(y_true_chunk)
    y_pred_chunk_arr = np.array(y_pred_chunk)

    # Compute metrics
    chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0

    metrics = {
        "doc_accuracy": (y_pred_doc_arr == y_true_doc_arr).mean(),
        "chunk_accuracy": (y_pred_chunk_arr == y_true_chunk_arr).mean(),
        "doc_precision": precision_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_recall": recall_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_f1": f1_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "chunk_precision": precision_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_recall": recall_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_f1": f1_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "correct_chunk_accuracy": chunk_accuracy,
        # True/False Positives/Negatives
        "doc_true_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 1)),
        "doc_true_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 0)),
        "doc_false_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 0)),
        "doc_false_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 1)),
        "chunk_true_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 1)),
        "chunk_true_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 0)),
        "chunk_false_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 0)),
        "chunk_false_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 1)),
    }

    return metrics

In [10]:
def minibatchkmeans_retrieval_evaluation(
    chunk_embeddings,
    df_chunks,
    df_queries,
    n_clusters=20,
    batch_size=500,
    top_k_total=5,
    init_fraction=0.1
):
    n_samples = chunk_embeddings.shape[0]
    n_batches = int(np.ceil(n_samples / batch_size))

    results = []

    # --- Inicializáló klaszterezés ---
    init_start = time.time()
    init_size = max(1, int(n_samples * init_fraction))
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=batch_size)
    kmeans.partial_fit(chunk_embeddings[:init_size])
    init_end = time.time()
    init_time = init_end - init_start

    print("Start batch processing...")
    for batch_idx in tqdm(range(1, n_batches + 1)):
        start_idx = (batch_idx - 1) * batch_size
        end_idx = min(batch_idx * batch_size, n_samples)
        X_batch = chunk_embeddings[start_idx:end_idx]

        # --- Online update ---
        update_start = time.time()
        kmeans.partial_fit(X_batch)
        update_end = time.time()
        update_time = update_end - update_start

        # --- Klasztercímkék frissítése ---
        labels = kmeans.predict(chunk_embeddings)

        # --- Retrieval + pontosság ---
        retrieval_start = time.time()
        metrics = evaluate_top_k_accuracy(
            df_queries=df_queries,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            cluster_labels=labels,
            top_n_clusters=5,
            top_k_total=top_k_total
        )
        retrieval_end = time.time()
        retrieval_time = retrieval_end - retrieval_start

        results.append({
            "batch": batch_idx,
            "init_time": init_time if batch_idx == 1 else 0,
            "update_time": update_time,
            "retrieval_time": retrieval_time,
            "metrics": metrics,
        })
        print(f"[Batch {batch_idx}/{n_batches}] Doc acc: {metrics['doc_accuracy']:.4f}, Chunk acc: {metrics['chunk_accuracy']:.4f}")

    return pd.DataFrame(results)


In [11]:
def online_kmeans_retrieval_evaluation(
    chunk_embeddings,
    df_chunks,
    df_queries,
    n_clusters=20,
    batch_size=500,
    top_k_total=5,
    init_fraction=0.5,  # fraction of data used for initialization
    max_clusters=None,
    metric="cosine",
    new_cluster_threshold=None,
    merge_threshold=None,
    decay=None
):
    """
    OnlineKMeans clustering + retrieval evaluation on growing dataset.
    Only evaluates on the chunks that have been clustered so far.
    """

    n_samples = chunk_embeddings.shape[0]
    init_size = int(n_samples * init_fraction)
    remaining_size = n_samples - init_size

    # --- Step 1: Initialization ---
    print(f"🔧 Using {init_fraction*100:.0f}% of data ({init_size} samples) for initialization")
    init_start = time.time()
    okm = OnlineKMeans(
        n_clusters=n_clusters,
        max_clusters=max_clusters,
        metric=metric,
        new_cluster_threshold=new_cluster_threshold,
        merge_threshold=merge_threshold,
        random_state=42,
        decay=decay
    )
    okm.partial_fit(chunk_embeddings[:init_size])
    init_end = time.time()
    init_time = init_end - init_start
    print(f"✅ Initialization done in {init_time:.4f} s")

    # --- Step 2: Online updates on the remaining data ---
    results = []
    for batch_idx in tqdm(range(1, int(np.ceil(remaining_size / batch_size)) + 1)):
        start_idx = (batch_idx - 1) * batch_size
        end_idx = min(batch_idx * batch_size, remaining_size)
        batch_embeddings = chunk_embeddings[init_size + start_idx : init_size + end_idx]

        # --- Online update ---
        update_start = time.time()
        okm.partial_fit(batch_embeddings)
        update_end = time.time()
        update_time = update_end - update_start

        # --- Only evaluate on seen data so far ---
        seen_end_idx = init_size + end_idx
        seen_embeddings = chunk_embeddings[:seen_end_idx]
        seen_df_chunks = df_chunks.iloc[:seen_end_idx].reset_index(drop=True)

        # --- Predict cluster labels for seen data ---
        labels_seen = okm.predict(seen_embeddings)

        # --- Filter queries to only those with seen context_ids ---
        df_queries_seen = df_queries[df_queries["context_id"].isin(seen_df_chunks["context_id"].unique())].reset_index(drop=True)
        print(f"df_queries_seen: {df_queries_seen.shape[0]}, seen_df_chunks: {seen_df_chunks.shape[0]}")

        # --- Retrieval accuracy ---
        retrieval_start = time.time()
        metrics = evaluate_top_k_accuracy(
            df_queries=df_queries_seen,
            chunk_embeddings=seen_embeddings,
            df_chunks=seen_df_chunks,
            cluster_labels=labels_seen,
            top_n_clusters=5,
            top_k_total=top_k_total
        )
        retrieval_end = time.time()
        retrieval_time = retrieval_end - retrieval_start

        results.append({
            "batch": batch_idx,
            "init_time": init_time if batch_idx == 1 else 0,
            "update_time": update_time,
            "retrieval_time": retrieval_time,
            "metrics": metrics,
            "n_clusters": len(okm.centroids)
        })

        print(f"[Batch {batch_idx}] Seen chunks: {seen_end_idx}, Doc acc: {metrics['doc_accuracy']:.4f}, Chunk acc: {metrics['chunk_accuracy']:.4f}, Clusters: {len(okm.centroids)}")

    return pd.DataFrame(results)

# Workflow

In [13]:
df_train = pd.read_excel("../data/labelled/squad_train_v2_semantic_chunking_clustered.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v2_queries.xlsx")

X_train = np.load("../data/tensors/squad_train_v4_semantic_chunking_l2.npy")

labels_train = df_train["cluster"].values

In [14]:
df_train.shape

(84007, 10)

In [15]:
df_queries_train.shape

(87599, 4)

In [23]:
df_queries_train.loc[1000, "question"]

'How much did Beyonce initially contribute to the foundation?'

In [28]:
query = df_queries_train.loc[1000, "question"]
query_emb = model.encode([query])[0]

In [31]:
centroid_matrix, centroid_ids = compute_cluster_centroids(X_train, labels_train)

start_time = time.time()

top_chunks_cluster = retrieve_top_chunks_by_cluster(
    query_embedding=query_emb,
    chunk_embeddings=X_train,
    df_chunks=df_train,
    cluster_labels=labels_train,
    top_n_clusters=1,
    top_k_total=3,
    centroid_matrix=centroid_matrix,
    centroid_ids=centroid_ids
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time:.5f} seconds")
print("Cluster-based retrieval:")
print(top_chunks_cluster['chunk_embed_text'].tolist())

Runtime: 0.00597 seconds
Cluster-based retrieval:
['Rolling Stone reported that the music industry was urging them to return the money they earned for the concerts; a spokesperson for Beyoncé later confirmed to The Huffington Post that she donated the money to the Clinton Bush Haiti Fund.', 'After Hurricane Katrina in 2005, Beyoncé and Rowland founded the Survivor Foundation to provide transitional housing for victims in the Houston area, to which Beyoncé contributed an initial $250,000.', 'Beyoncé would later speak of her mother as the person who helped her fight it.']


In [30]:
start_time = time.time()

top_chunks_full = retrieve_top_chunks_full(
    query_embedding=query_emb,
    chunk_embeddings=X_train,
    df_chunks=df_train,
    top_k_chunks=3
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time:.5f} seconds")
print("Full retrieval:")
print(top_chunks_full['chunk_embed_text'].tolist())

Runtime: 0.26332 seconds
Full retrieval:
['Rolling Stone reported that the music industry was urging them to return the money they earned for the concerts; a spokesperson for Beyoncé later confirmed to The Huffington Post that she donated the money to the Clinton Bush Haiti Fund.', 'After Hurricane Katrina in 2005, Beyoncé and Rowland founded the Survivor Foundation to provide transitional housing for victims in the Houston area, to which Beyoncé contributed an initial $250,000.', 'See: List of wealthiest foundations.']


# Evaluation for retrival with cluster centroids vs full

In [13]:
# Load data
X_semantic_train = np.load("../data/tensors/squad_train_v2_semantic_chunking_l2.npy")
df_semantic_train = pd.read_excel("../data/labelled/squad_train_v2_semantic_chunking_clustered.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v2_queries.xlsx")

labels_train = df_semantic_train["cluster"].values

In [14]:
df_semantic_train['cluster'].nunique()

500

In [15]:
np.random.seed(42)
n_rows = len(df_queries_train)
random_indices = np.random.choice(n_rows, size=int(0.15 * n_rows), replace=False)
df_queries_train.loc[random_indices, 'answer_start'] = None

In [16]:
df_queries_train['answer_start'].isna().sum()

np.int64(13139)

In [93]:
X_semantic_train = X_semantic_train[:4000]
df_semantic_train = df_semantic_train.iloc[:4000]

df_queries_train = df_queries_train[df_queries_train["context_id"].isin(df_semantic_train["context_id"].unique())].reset_index(drop=True)

labels_train = df_semantic_train["cluster"].values

In [None]:
# Benchmark
top_ks = [3, 5, 12, 25]
top_n_clusters = [5, 10, 20, 35]

# top_ks = [5, 10, 20]
# top_n_clusters = [5, 10, 20]

results_centroid = []
results_full = []
for top_k in top_ks:
    # for top_n_cluster in top_n_clusters:
    #     print(f"Evaluating: Top-{top_k} chunks in Top-{top_n_cluster} clusters")
        
    #     start_centroid = time.time()
    #     centroid_metrics = evaluate_top_k_accuracy(df_queries_train, X_semantic_train, df_semantic_train, labels_train, top_n_clusters=top_n_cluster, top_k_total=top_k)
    #     end_centroid = time.time()
        
    #     results_centroid.append({
    #         "top_k": top_k,
    #         "top_n_clusters": top_n_cluster,
    #         "centroid_metrics": centroid_metrics,
    #         "centroid_time": end_centroid - start_centroid
    #     })
        
    #     results_df_centroid = pd.DataFrame(results_centroid)
    #     results_df_centroid.to_excel("../data/results/hyperparameter_tuning_centroid_vs_full/centroid_results_kmeans500_v2_l2.xlsx")
        
    start_full = time.time()
    full_metrics = evaluate_top_k_accuracy_full(df_queries_train, X_semantic_train, df_semantic_train, top_k_chunks=top_k)
    end_full = time.time()
    results_full.append({
        "top_k": top_k,
        "full_metrics": full_metrics,
        "full_time": end_full - start_full
    })
    results_df_full = pd.DataFrame(results_full)
    results_df_full.to_excel("../data/results/hyperparameter_tuning_centroid_vs_full/full_results_kmeans500_v2_l2_corrected.xlsx")

 44%|████▍     | 38692/87599 [1:39:50<2:06:29,  6.44it/s]

In [13]:
len(np.unique(labels_train)) / 500

1.0

In [39]:
4000 / 84000

0.047619047619047616

In [14]:
start_full = time.time()
full_metrics = evaluate_top_k_accuracy_full(df_queries_train, X_semantic_train, df_semantic_train, top_k_chunks=25)
end_full = time.time()
results_full.append({
    "top_k": 25,
    "full_metrics": full_metrics,
    "full_time": end_full - start_full
})
results_df_full = pd.DataFrame(results_full)

100%|██████████| 87599/87599 [3:28:17<00:00,  7.01it/s]   


In [16]:
results_df_full

Unnamed: 0,top_k,full_metrics,full_time
0,3,"{'doc_accuracy': 0.8036735579173279, 'chunk_ac...",10972.37997
1,5,"{'doc_accuracy': 0.8522357561159374, 'chunk_ac...",10506.523075
2,12,"{'doc_accuracy': 0.9135035788079773, 'chunk_ac...",11872.97477
3,25,"{'doc_accuracy': 0.9460039498167787, 'chunk_ac...",12497.922858


In [17]:
results_df_full.to_excel("../data/results/hyperparameter_tuning_centroid_vs_full/full_results.xlsx")

# Evaluate retrieval with MiniBatchKMeans

In [None]:
# Load data
X_semantic_train = np.load("./data/tensors/squad_train_v2_semantic_chunking_l2.npy")
df_semantic_train = pd.read_excel("./data/prepared/squad_train_v2_semantic_chunking.xlsx")
df_queries_train = pd.read_excel("./data/prepared/squad_train_v2_queries.xlsx")

In [None]:
# --- Futtatás ---
results_df = minibatchkmeans_retrieval_evaluation(
    chunk_embeddings=X_semantic_train,
    df_chunks=df_semantic_train,
    df_queries=df_queries_train,
    n_clusters=160,
    batch_size=1000,
    top_k_total=5,
    init_fraction=0.5
)

In [None]:
# --- Plotolás: Pontosság ---
plt.figure(figsize=(10,5))
plt.plot(results_df["batch"], results_df["doc_accuracy"], label="Doc Accuracy", marker='o')
plt.plot(results_df["batch"], results_df["chunk_accuracy"], label="Chunk Accuracy", marker='s')
plt.xlabel("Batch")
plt.ylabel("Accuracy")
plt.title("Retrieval pontosság batchenként (Online KMeans)")
plt.legend()
plt.grid(True)
plt.show()

# --- Plotolás: Futásidők ---
plt.figure(figsize=(10,5))
plt.plot(results_df["batch"], results_df["init_time"], label="Init time", marker='o')
plt.plot(results_df["batch"], results_df["update_time"], label="Update time", marker='s')
plt.plot(results_df["batch"], results_df["retrieval_time"], label="Retrieval time", marker='^')
plt.xlabel("Batch")
plt.ylabel("Time (s)")
plt.title("Futásidők batchenként (Online KMeans)")
plt.legend()
plt.grid(True)
plt.show()

# Evaluate retrieval with online clustering

In [107]:
# Load data
X_semantic_train = np.load("../data/tensors/squad_train_v2_semantic_chunking_l2.npy")
df_semantic_train = pd.read_excel("../data/prepared/squad_train_v2_semantic_chunking.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v2_queries.xlsx")

In [108]:
results_df = online_kmeans_retrieval_evaluation(
    chunk_embeddings=X_semantic_train,
    df_chunks=df_semantic_train,
    df_queries=df_queries_train,
    n_clusters=500,
    max_clusters=2000,
    batch_size=2000,
    top_k_total=5,
    metric="cosine",
    init_fraction=0.5,
    merge_threshold=0.08,    
    decay=1.0,
    new_cluster_threshold=0.8
)

results_df.to_excel("../data/results/onlinekmeans_v2.xlsx") # cluster500rol

🔧 Using 50% of data (42003 samples) for initialization
✅ Initialization done in 65.6504 s


  0%|          | 0/22 [00:00<?, ?it/s]

df_queries_seen: 48817, seen_df_chunks: 44003


100%|██████████| 48817/48817 [17:23<00:00, 46.79it/s]
  5%|▍         | 1/22 [17:24<6:05:30, 1044.31s/it]

[Batch 1] Seen chunks: 44003, Doc acc: 0.7526, Chunk acc: 0.6238, Clusters: 524
df_queries_seen: 50578, seen_df_chunks: 46003


100%|██████████| 50578/50578 [19:28<00:00, 43.27it/s]
  9%|▉         | 2/22 [36:54<6:12:44, 1118.23s/it]

[Batch 2] Seen chunks: 46003, Doc acc: 0.7504, Chunk acc: 0.6206, Clusters: 524
df_queries_seen: 52477, seen_df_chunks: 48003


100%|██████████| 52477/52477 [20:12<00:00, 43.27it/s]
 14%|█▎        | 3/22 [57:08<6:07:56, 1161.90s/it]

[Batch 3] Seen chunks: 48003, Doc acc: 0.7481, Chunk acc: 0.6187, Clusters: 526
df_queries_seen: 54231, seen_df_chunks: 50003


100%|██████████| 54231/54231 [22:14<00:00, 40.64it/s]
 18%|█▊        | 4/22 [1:19:23<6:09:09, 1230.54s/it]

[Batch 4] Seen chunks: 50003, Doc acc: 0.7456, Chunk acc: 0.6149, Clusters: 544
df_queries_seen: 56119, seen_df_chunks: 52003


100%|██████████| 56119/56119 [22:43<00:00, 41.15it/s]
 23%|██▎       | 5/22 [1:42:08<6:02:22, 1278.96s/it]

[Batch 5] Seen chunks: 52003, Doc acc: 0.7438, Chunk acc: 0.6129, Clusters: 544
df_queries_seen: 58097, seen_df_chunks: 54003


100%|██████████| 58097/58097 [23:27<00:00, 41.27it/s]
 27%|██▋       | 6/22 [2:05:37<5:52:51, 1323.20s/it]

[Batch 6] Seen chunks: 54003, Doc acc: 0.7416, Chunk acc: 0.6102, Clusters: 544
df_queries_seen: 59930, seen_df_chunks: 56003


100%|██████████| 59930/59930 [25:00<00:00, 39.94it/s]
 32%|███▏      | 7/22 [2:30:39<5:45:22, 1381.52s/it]

[Batch 7] Seen chunks: 56003, Doc acc: 0.7393, Chunk acc: 0.6087, Clusters: 544
df_queries_seen: 61832, seen_df_chunks: 58003


100%|██████████| 61832/61832 [25:24<00:00, 40.56it/s]
 36%|███▋      | 8/22 [2:56:05<5:33:05, 1427.50s/it]

[Batch 8] Seen chunks: 58003, Doc acc: 0.7369, Chunk acc: 0.6063, Clusters: 544
df_queries_seen: 63816, seen_df_chunks: 60003


100%|██████████| 63816/63816 [26:15<00:00, 40.51it/s]
 41%|████      | 9/22 [3:22:22<5:19:24, 1474.17s/it]

[Batch 9] Seen chunks: 60003, Doc acc: 0.7358, Chunk acc: 0.6046, Clusters: 550
df_queries_seen: 65742, seen_df_chunks: 62003


100%|██████████| 65742/65742 [28:18<00:00, 38.70it/s]
 45%|████▌     | 10/22 [3:50:42<5:08:48, 1544.04s/it]

[Batch 10] Seen chunks: 62003, Doc acc: 0.7341, Chunk acc: 0.6039, Clusters: 550
df_queries_seen: 67503, seen_df_chunks: 64003


100%|██████████| 67503/67503 [27:48<00:00, 40.46it/s]
 50%|█████     | 11/22 [4:18:32<4:50:08, 1582.56s/it]

[Batch 11] Seen chunks: 64003, Doc acc: 0.7335, Chunk acc: 0.6031, Clusters: 551
df_queries_seen: 69330, seen_df_chunks: 66003


100%|██████████| 69330/69330 [28:42<00:00, 40.24it/s]
 55%|█████▍    | 12/22 [4:47:17<4:30:57, 1625.78s/it]

[Batch 12] Seen chunks: 66003, Doc acc: 0.7310, Chunk acc: 0.6008, Clusters: 551
df_queries_seen: 71487, seen_df_chunks: 68003


100%|██████████| 71487/71487 [29:55<00:00, 39.82it/s]
 59%|█████▉    | 13/22 [5:17:14<4:11:39, 1677.67s/it]

[Batch 13] Seen chunks: 68003, Doc acc: 0.7297, Chunk acc: 0.5994, Clusters: 551
df_queries_seen: 73342, seen_df_chunks: 70003


100%|██████████| 73342/73342 [31:08<00:00, 39.25it/s]
 64%|██████▎   | 14/22 [5:48:24<3:51:26, 1735.86s/it]

[Batch 14] Seen chunks: 70003, Doc acc: 0.7287, Chunk acc: 0.5980, Clusters: 552
df_queries_seen: 75361, seen_df_chunks: 72003


100%|██████████| 75361/75361 [32:31<00:00, 38.62it/s]
 68%|██████▊   | 15/22 [6:20:57<3:30:09, 1801.30s/it]

[Batch 15] Seen chunks: 72003, Doc acc: 0.7256, Chunk acc: 0.5957, Clusters: 552
df_queries_seen: 77458, seen_df_chunks: 74003


100%|██████████| 77458/77458 [33:13<00:00, 38.85it/s]
 73%|███████▎  | 16/22 [6:54:13<3:05:58, 1859.78s/it]

[Batch 16] Seen chunks: 74003, Doc acc: 0.7246, Chunk acc: 0.5954, Clusters: 554
df_queries_seen: 79607, seen_df_chunks: 76003


100%|██████████| 79607/79607 [34:00<00:00, 39.01it/s]
 77%|███████▋  | 17/22 [7:28:15<2:39:33, 1914.68s/it]

[Batch 17] Seen chunks: 76003, Doc acc: 0.7247, Chunk acc: 0.5958, Clusters: 554
df_queries_seen: 81585, seen_df_chunks: 78003


100%|██████████| 81585/81585 [36:48<00:00, 36.94it/s]
 82%|████████▏ | 18/22 [8:05:06<2:13:34, 2003.63s/it]

[Batch 18] Seen chunks: 78003, Doc acc: 0.7230, Chunk acc: 0.5942, Clusters: 554
df_queries_seen: 83407, seen_df_chunks: 80003


100%|██████████| 83407/83407 [36:54<00:00, 37.66it/s]
 86%|████████▋ | 19/22 [8:42:02<1:43:22, 2067.65s/it]

[Batch 19] Seen chunks: 80003, Doc acc: 0.7227, Chunk acc: 0.5939, Clusters: 555
df_queries_seen: 85432, seen_df_chunks: 82003


100%|██████████| 85432/85432 [37:16<00:00, 38.20it/s]
 91%|█████████ | 20/22 [9:19:21<1:10:37, 2118.96s/it]

[Batch 20] Seen chunks: 82003, Doc acc: 0.7207, Chunk acc: 0.5916, Clusters: 555
df_queries_seen: 87594, seen_df_chunks: 84003


100%|██████████| 87594/87594 [38:46<00:00, 37.65it/s]
 95%|█████████▌| 21/22 [9:58:09<36:21, 2181.84s/it]  

[Batch 21] Seen chunks: 84003, Doc acc: 0.7193, Chunk acc: 0.5901, Clusters: 555
df_queries_seen: 87599, seen_df_chunks: 84007


100%|██████████| 87599/87599 [39:01<00:00, 37.41it/s]
100%|██████████| 22/22 [10:37:13<00:00, 1737.90s/it]

[Batch 22] Seen chunks: 84007, Doc acc: 0.7193, Chunk acc: 0.5901, Clusters: 555



