# Retrieval

This notebook contains the code for the retrival pipeline.

In [142]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import time
import warnings
import re

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans

from OnlineKMeans import OnlineKMeans

warnings.filterwarnings("ignore")

# Classes and functions

In [143]:
# Encoder model
model = SentenceTransformer("all-MiniLM-L6-v2")

In [144]:
def compute_cluster_centroids(chunk_embeddings, cluster_labels):
    """
    Compute centroids once for all clusters.
    Returns:
        centroid_matrix: np.ndarray of shape (n_clusters, embedding_dim)
        centroid_ids: list of cluster IDs
    """
    unique_clusters = np.unique(cluster_labels)
    cluster_centroids = {
        cid: chunk_embeddings[cluster_labels == cid].mean(axis=0)
        for cid in unique_clusters
    }
    centroid_matrix = np.vstack(list(cluster_centroids.values()))
    centroid_ids = list(cluster_centroids.keys())
    return centroid_matrix, centroid_ids


def retrieve_top_chunks_by_cluster(
    query_embedding,
    chunk_embeddings,
    df_chunks,
    cluster_labels,
    centroid_matrix,
    centroid_ids,
    top_n_clusters=2,
    top_k_total=5
):
    # --- Use precomputed centroids ---
    cluster_sims = cosine_similarity([query_embedding], centroid_matrix)[0]
    top_n_idx = cluster_sims.argsort()[::-1][:top_n_clusters]
    selected_clusters = [centroid_ids[i] for i in top_n_idx]

    # Collect all chunks from selected clusters
    mask = np.isin(cluster_labels, selected_clusters)
    selected_chunk_embeddings = chunk_embeddings[mask]
    selected_df = df_chunks[mask].reset_index(drop=True)

    # Compute similarity for all these chunks
    sims = cosine_similarity([query_embedding], selected_chunk_embeddings)[0]

    # Get top-K chunks overall
    top_k_idx = sims.argsort()[::-1][:top_k_total]
    results = []

    for idx in top_k_idx:
        results.append({
            "cluster": cluster_labels[mask][idx],
            "context_id": selected_df.iloc[idx]["context_id"],
            "chunk_id": selected_df.iloc[idx]["chunk_id"],
            "title": selected_df.iloc[idx]["title"],
            "chunk_embed_text": selected_df.iloc[idx]["chunk_embed_text"],
            "chunk_start": selected_df.iloc[idx]["chunk_start"],
            "chunk_end": selected_df.iloc[idx]["chunk_end"],
            "similarity": sims[idx]
        })

    return pd.DataFrame(results).sort_values("similarity", ascending=False).reset_index(drop=True)


In [145]:
def retrieve_top_chunks_full(
    query_embedding,
    chunk_embeddings,
    df_chunks,
    top_k_chunks=10
):
    sims = cosine_similarity([query_embedding], chunk_embeddings)[0]
    top_idx = sims.argsort()[::-1][:top_k_chunks]
    
    results = []
    for idx in top_idx:
        results.append({
            "context_id": df_chunks.iloc[idx]["context_id"],
            "chunk_id": df_chunks.iloc[idx]["chunk_id"],
            "title": df_chunks.iloc[idx]["title"],
            "chunk_embed_text": df_chunks.iloc[idx]["chunk_embed_text"],
            "chunk_start": df_chunks.iloc[idx]["chunk_start"],
            "chunk_end": df_chunks.iloc[idx]["chunk_end"],
            "similarity": sims[idx]
        })
    
    return pd.DataFrame(results).sort_values("similarity", ascending=False)


In [146]:
# # ---------- Answer Containment ----------
# def is_answer_in_chunk(chunk_text, answer_text):
#     return answer_text.lower().strip() in chunk_text.lower()

# def is_answer_in_chunk(chunk_text, answer_text):
#     # Normalize
#     chunk_tokens = set(re.findall(r"\w+", chunk_text.lower()))
#     answer_tokens = set(re.findall(r"\w+", answer_text.lower()))

#     # Require that most/all answer tokens are present
#     return len(answer_tokens & chunk_tokens) / max(1, len(answer_tokens)) >= 0.8


# from rapidfuzz import fuzz

# def is_answer_in_chunk(chunk_text, answer_text, threshold=80):
#     score = fuzz.partial_ratio(answer_text.lower(), chunk_text.lower())
#     return score >= threshold

def is_answer_in_chunk(answer_start, chunk_start, chunk_length):
    if answer_start is None or chunk_start is None or chunk_length is None:
        return False
    return chunk_start <= answer_start < (chunk_start + chunk_length)

In [147]:
def compute_metrics_for_query(results, query_row, similarity_threshold=0.75):
    # --- Check similarity threshold ---
    if results.empty or results["similarity"].max() < similarity_threshold:
        results_filtered = pd.DataFrame([])  # Treat as no answer
    else:
        results_filtered = results

    # --- Document-level ---
    answer_exists = pd.notna(query_row["answer_start"])
    found_doc_id = False if results_filtered.empty else any(
        query_row["context_id"] == doc_id for doc_id in results_filtered["context_id"]
    )
    y_true_doc = 1 if answer_exists else 0
    y_pred_doc = 1 if found_doc_id else 0

    # --- Chunk-level ---
    if results_filtered.empty:
        found_chunk_context = False
        good_chunks = 0
    else:
        correct_doc_chunks = results_filtered[results_filtered["context_id"] == query_row["context_id"]]
        found_chunk_context = any(
            is_answer_in_chunk(
                query_row["answer_start"],
                chunk["chunk_start"],
                chunk["chunk_end"] - chunk["chunk_start"]
            )
            for _, chunk in results_filtered.iterrows()
        )
        good_chunks = len(correct_doc_chunks)

    total_chunks = results_filtered.shape[0] if not results_filtered.empty else 1
    chunk_ratio = good_chunks / total_chunks

    y_true_chunk = 1 if answer_exists else 0
    y_pred_chunk = 1 if found_chunk_context else 0

    return y_true_doc, y_pred_doc, y_true_chunk, y_pred_chunk, chunk_ratio

In [148]:
# def evaluate_top_k_accuracy(
#     df_queries,
#     chunk_embeddings,
#     df_chunks,
#     cluster_labels,
#     top_n_clusters=2,
#     top_k_total=5
# ):
#     # ✅ Compute centroids once
#     centroid_matrix, centroid_ids = compute_cluster_centroids(chunk_embeddings, cluster_labels)

#     y_true_doc = []
#     y_pred_doc = []

#     y_true_chunk = []
#     y_pred_chunk = []

#     chunk_ratios = []

#     for i, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
#         query_emb = model.encode([row["question"]])[0]
#         results = retrieve_top_chunks_by_cluster(
#             query_embedding=query_emb,
#             chunk_embeddings=chunk_embeddings,
#             df_chunks=df_chunks,
#             cluster_labels=cluster_labels,
#             centroid_matrix=centroid_matrix,
#             centroid_ids=centroid_ids,
#             top_n_clusters=top_n_clusters,
#             top_k_total=top_k_total
#         )

#         # Document-level
#         found_doc_id = any(row["context_id"] == doc_id for doc_id in results["context_id"])
#         y_true_doc.append(1)
#         y_pred_doc.append(1 if found_doc_id else 0)

#         correct_doc_chunks = results[results["context_id"] == row["context_id"]]
#         found_chunk_context = any(
#             is_answer_in_chunk(
#                 row["answer_start"],
#                 chunk["chunk_start"],
#                 chunk["chunk_end"] - chunk["chunk_start"]
#             )
#             for _, chunk in correct_doc_chunks.iterrows()
#         )
#         good_chunks = len(correct_doc_chunks)
#         total_chunks = results.shape[0]
#         ratio = good_chunks / total_chunks
#         chunk_ratios.append(ratio)

#         y_true_chunk.append(1)
#         y_pred_chunk.append(1 if found_chunk_context else 0)

#     # Compute metrics
#     chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0
#     metrics = {
#         "doc_accuracy": sum(y_pred_doc) / len(y_pred_doc),
#         "chunk_accuracy": sum(y_pred_chunk) / len(y_pred_chunk),
#         "doc_precision": precision_score(y_true_doc, y_pred_doc, zero_division=0),
#         "doc_recall": recall_score(y_true_doc, y_pred_doc, zero_division=0),
#         "doc_f1": f1_score(y_true_doc, y_pred_doc, zero_division=0),
#         "chunk_precision": precision_score(y_true_chunk, y_pred_chunk, zero_division=0),
#         "chunk_recall": recall_score(y_true_chunk, y_pred_chunk, zero_division=0),
#         "chunk_f1": f1_score(y_true_chunk, y_pred_chunk, zero_division=0),
#         "correct_chunk_accuracy": chunk_accuracy
#     }

#     return metrics
def evaluate_top_k_accuracy(
    df_queries,
    chunk_embeddings,
    df_chunks,
    cluster_labels,
    top_n_clusters=2,
    top_k_total=5,
    similarity_threshold=0.6
):
    # Compute centroids once
    centroid_matrix, centroid_ids = compute_cluster_centroids(chunk_embeddings, cluster_labels)

    y_true_doc = []
    y_pred_doc = []

    y_true_chunk = []
    y_pred_chunk = []

    chunk_ratios = []

    for _, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
        query_emb = model.encode([row["question"]])[0]
        results = retrieve_top_chunks_by_cluster(
            query_embedding=query_emb,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            cluster_labels=cluster_labels,
            centroid_matrix=centroid_matrix,
            centroid_ids=centroid_ids,
            top_n_clusters=top_n_clusters,
            top_k_total=top_k_total
        )

        ytd, ypd, ytc, ypc, cr = compute_metrics_for_query(results, row, similarity_threshold)
        y_true_doc.append(ytd)
        y_pred_doc.append(ypd)
        y_true_chunk.append(ytc)
        y_pred_chunk.append(ypc)
        chunk_ratios.append(cr)

    # Convert to arrays
    y_true_doc_arr = np.array(y_true_doc)
    y_pred_doc_arr = np.array(y_pred_doc)
    y_true_chunk_arr = np.array(y_true_chunk)
    y_pred_chunk_arr = np.array(y_pred_chunk)

    # Compute metrics
    chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0

    metrics = {
        "doc_accuracy": (y_pred_doc_arr == y_true_doc_arr).mean(),
        "chunk_accuracy": (y_pred_chunk_arr == y_true_chunk_arr).mean(),
        "doc_precision": precision_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_recall": recall_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_f1": f1_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "chunk_precision": precision_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_recall": recall_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_f1": f1_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "correct_chunk_accuracy": chunk_accuracy,
        # True/False Positives/Negatives
        "doc_true_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 1)),
        "doc_true_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 0)),
        "doc_false_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 0)),
        "doc_false_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 1)),
        "chunk_true_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 1)),
        "chunk_true_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 0)),
        "chunk_false_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 0)),
        "chunk_false_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 1)),
    }

    return metrics


In [149]:
# def evaluate_top_k_accuracy_full(df_queries, chunk_embeddings, df_chunks, top_k_chunks=5, similarity_threshold=0.6):
    # y_true_doc = []
    # y_pred_doc = []

    # y_true_chunk = []
    # y_pred_chunk = []

    # chunk_ratios = []

    # for i, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
    #     query_emb = model.encode([row["question"]])[0]
    #     results = retrieve_top_chunks_full(
    #         query_embedding=query_emb,
    #         chunk_embeddings=chunk_embeddings,
    #         df_chunks=df_chunks,
    #         top_k_chunks=top_k_chunks
    #     )

    #     # Document-level
    #     found_doc_id = any(row["context_id"] == doc_id for doc_id in results["context_id"])
    #     y_true_doc.append(1)
    #     y_pred_doc.append(1 if found_doc_id else 0)

    #     correct_doc_chunks = results[results["context_id"] == row["context_id"]]
    #     found_chunk_context = any(
    #         is_answer_in_chunk(
    #             row["answer_start"],
    #             chunk["chunk_start"],
    #             chunk["chunk_end"] - chunk["chunk_start"]
    #         )
    #         for _, chunk in correct_doc_chunks.iterrows()
    #     )
    #     good_chunks = len(correct_doc_chunks)
    #     total_chunks = results.shape[0]
    #     ratio = good_chunks / total_chunks
    #     chunk_ratios.append(ratio)

    #     y_true_chunk.append(1)
    #     y_pred_chunk.append(1 if found_chunk_context else 0)

    # # Compute metrics
    # chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0
    # metrics = {
    #     "doc_accuracy": sum(y_pred_doc) / len(y_pred_doc),
    #     "chunk_accuracy": sum(y_pred_chunk) / len(y_pred_chunk),
    #     "doc_precision": precision_score(y_true_doc, y_pred_doc, zero_division=0),
    #     "doc_recall": recall_score(y_true_doc, y_pred_doc, zero_division=0),
    #     "doc_f1": f1_score(y_true_doc, y_pred_doc, zero_division=0),
    #     "chunk_precision": precision_score(y_true_chunk, y_pred_chunk, zero_division=0),
    #     "chunk_recall": recall_score(y_true_chunk, y_pred_chunk, zero_division=0),
    #     "chunk_f1": f1_score(y_true_chunk, y_pred_chunk, zero_division=0),
    #     "correct_chunk_accuracy": chunk_accuracy
    # }

    # return metrics
def evaluate_top_k_accuracy_full(df_queries, chunk_embeddings, df_chunks, top_k_chunks=5, similarity_threshold=0.7):
    y_true_doc = []
    y_pred_doc = []

    y_true_chunk = []
    y_pred_chunk = []

    chunk_ratios = []

    for _, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
        query_emb = model.encode([row["question"]])[0]
        results = retrieve_top_chunks_full(
            query_embedding=query_emb,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            top_k_chunks=top_k_chunks
        )

        ytd, ypd, ytc, ypc, cr = compute_metrics_for_query(results, row, similarity_threshold)
        y_true_doc.append(ytd)
        y_pred_doc.append(ypd)
        y_true_chunk.append(ytc)
        y_pred_chunk.append(ypc)
        chunk_ratios.append(cr)

    # Convert to arrays
    y_true_doc_arr = np.array(y_true_doc)
    y_pred_doc_arr = np.array(y_pred_doc)
    y_true_chunk_arr = np.array(y_true_chunk)
    y_pred_chunk_arr = np.array(y_pred_chunk)

    # Compute metrics
    chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0

    metrics = {
        "doc_accuracy": (y_pred_doc_arr == y_true_doc_arr).mean(),
        "chunk_accuracy": (y_pred_chunk_arr == y_true_chunk_arr).mean(),
        "doc_precision": precision_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_recall": recall_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_f1": f1_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "chunk_precision": precision_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_recall": recall_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_f1": f1_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "correct_chunk_accuracy": chunk_accuracy,
        # True/False Positives/Negatives
        "doc_true_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 1)),
        "doc_true_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 0)),
        "doc_false_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 0)),
        "doc_false_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 1)),
        "chunk_true_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 1)),
        "chunk_true_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 0)),
        "chunk_false_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 0)),
        "chunk_false_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 1)),
    }

    return metrics

In [150]:
def minibatchkmeans_retrieval_evaluation(
    chunk_embeddings,
    df_chunks,
    df_queries,
    n_clusters=20,
    batch_size=500,
    top_k_total=5,
    init_fraction=0.1
):
    n_samples = chunk_embeddings.shape[0]
    n_batches = int(np.ceil(n_samples / batch_size))

    results = []

    # --- Inicializáló klaszterezés ---
    init_start = time.time()
    init_size = max(1, int(n_samples * init_fraction))
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=batch_size)
    kmeans.partial_fit(chunk_embeddings[:init_size])
    init_end = time.time()
    init_time = init_end - init_start

    print("Start batch processing...")
    for batch_idx in tqdm(range(1, n_batches + 1)):
        start_idx = (batch_idx - 1) * batch_size
        end_idx = min(batch_idx * batch_size, n_samples)
        X_batch = chunk_embeddings[start_idx:end_idx]

        # --- Online update ---
        update_start = time.time()
        kmeans.partial_fit(X_batch)
        update_end = time.time()
        update_time = update_end - update_start

        # --- Klasztercímkék frissítése ---
        labels = kmeans.predict(chunk_embeddings)

        # --- Retrieval + pontosság ---
        retrieval_start = time.time()
        metrics = evaluate_top_k_accuracy(
            df_queries=df_queries,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            cluster_labels=labels,
            top_n_clusters=5,
            top_k_total=top_k_total
        )
        retrieval_end = time.time()
        retrieval_time = retrieval_end - retrieval_start

        results.append({
            "batch": batch_idx,
            "init_time": init_time if batch_idx == 1 else 0,
            "update_time": update_time,
            "retrieval_time": retrieval_time,
            "metrics": metrics,
        })
        print(f"[Batch {batch_idx}/{n_batches}] Doc acc: {metrics['doc_accuracy']:.4f}, Chunk acc: {metrics['chunk_accuracy']:.4f}")

    return pd.DataFrame(results)


In [151]:
def online_kmeans_retrieval_evaluation(
    chunk_embeddings,
    df_chunks,
    df_queries,
    n_clusters=20,
    batch_size=500,
    top_k_total=5,
    init_fraction=0.5,  # fraction of data used for initialization
    max_clusters=None,
    metric="cosine",
    new_cluster_threshold=None,
    merge_threshold=None,
    decay=None
):
    """
    OnlineKMeans clustering + retrieval evaluation on growing dataset.
    Only evaluates on the chunks that have been clustered so far.
    """

    n_samples = chunk_embeddings.shape[0]
    init_size = int(n_samples * init_fraction)
    remaining_size = n_samples - init_size

    # --- Step 1: Initialization ---
    print(f"🔧 Using {init_fraction*100:.0f}% of data ({init_size} samples) for initialization")
    init_start = time.time()
    okm = OnlineKMeans(
        n_clusters=n_clusters,
        max_clusters=max_clusters,
        metric=metric,
        new_cluster_threshold=new_cluster_threshold,
        merge_threshold=merge_threshold,
        random_state=42,
        decay=decay
    )
    okm.partial_fit(chunk_embeddings[:init_size])
    init_end = time.time()
    init_time = init_end - init_start
    print(f"✅ Initialization done in {init_time:.4f} s")

    # --- Step 2: Online updates on the remaining data ---
    results = []
    for batch_idx in tqdm(range(1, int(np.ceil(remaining_size / batch_size)) + 1)):
        start_idx = (batch_idx - 1) * batch_size
        end_idx = min(batch_idx * batch_size, remaining_size)
        batch_embeddings = chunk_embeddings[init_size + start_idx : init_size + end_idx]

        # --- Online update ---
        update_start = time.time()
        okm.partial_fit(batch_embeddings)
        update_end = time.time()
        update_time = update_end - update_start

        # --- Only evaluate on seen data so far ---
        seen_end_idx = init_size + end_idx
        seen_embeddings = chunk_embeddings[:seen_end_idx]
        seen_df_chunks = df_chunks.iloc[:seen_end_idx].reset_index(drop=True)

        # --- Predict cluster labels for seen data ---
        labels_seen = okm.predict(seen_embeddings)

        # --- Filter queries to only those with seen context_ids ---
        # df_queries_seen = df_queries[df_queries["context_id"].isin(seen_df_chunks["context_id"].unique())].reset_index(drop=True)
        # print(f"df_queries_seen: {df_queries_seen.shape[0]}, seen_df_chunks: {seen_df_chunks.shape[0]}")

        # --- Retrieval accuracy ---
        retrieval_start = time.time()
        metrics = evaluate_top_k_accuracy(
            df_queries=df_queries,
            chunk_embeddings=seen_embeddings,
            df_chunks=seen_df_chunks,
            cluster_labels=labels_seen,
            top_n_clusters=5,
            top_k_total=top_k_total
        )
        retrieval_end = time.time()
        retrieval_time = retrieval_end - retrieval_start

        results.append({
            "batch": batch_idx,
            "init_time": init_time if batch_idx == 1 else 0,
            "update_time": update_time,
            "retrieval_time": retrieval_time,
            "metrics": metrics,
            "n_clusters": len(okm.centroids)
        })

        results_df = pd.DataFrame(results)
        results_df.to_excel("./onlinekmeans_test.xlsx")

        print(f"[Batch {batch_idx}] Seen chunks: {seen_end_idx}, Doc acc: {metrics['doc_accuracy']:.4f}, Chunk acc: {metrics['chunk_accuracy']:.4f}, Clusters: {len(okm.centroids)}")

    return pd.DataFrame(results)

# Workflow

In [13]:
df_train = pd.read_excel("./data/labelled/squad_train_v2_semantic_chunking_clustered.xlsx")
df_val = pd.read_excel("./data/labelled/squad_val_v2_semantic_chunking_clustered.xlsx")
df_queries_train = pd.read_excel("./data/prepared/squad_train_v2_queries.xlsx")
df_queries_train = df_queries_train[df_queries_train["context_id"].isin(df_train["context_id"].unique())].reset_index(drop=True)

X_train = np.load("./data/labelled/squad_train_v2_semantic_chunking_clustered.npy")
X_val = np.load("./data/labelled/squad_val_v2_semantic_chunking_clustered.npy")
df_queries_val = pd.read_excel("./data/prepared/squad_val_v2_queries.xlsx")

labels_train = df_train["cluster"].values

In [14]:
df_train.shape

(84007, 10)

In [15]:
df_queries_train.shape

(87599, 4)

In [25]:
df_queries_train.loc[1, "question"]

'What is in front of the Notre Dame Main Building?'

In [43]:
query = df_queries_train.loc[50000, "question"]
query_emb = model.encode([query])[0]

In [None]:
start_time = time.time()

top_chunks_cluster = retrieve_top_chunks_by_cluster(
    query_embedding=query_emb,
    chunk_embeddings=X_train,
    df_chunks=df_train,
    cluster_labels=labels_train,
    top_n_clusters=3,
    top_k_total=3
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time:.5f} seconds")
print("Cluster-based retrieval:")
print(top_chunks_cluster['chunk_embed_text'].tolist())

In [51]:
start_time = time.time()

top_chunks_full = retrieve_top_chunks_full(
    query_embedding=query_emb,
    chunk_embeddings=X_train,
    df_chunks=df_train,
    top_k_chunks=10
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time:.5f} seconds")
print("Full retrieval:")
print(top_chunks_full['chunk_embed_text'].tolist())

[0.78553945 0.710261   0.69749516 0.67169    0.6706306  0.6444305
 0.6248773  0.6170142  0.6095041  0.5957897 ]
Runtime: 0.08458 seconds
Full retrieval:
['It was accepted as a territory of Australia, separate from any state, by the Norfolk Island Act 1913 (Cth), passed under the territories power (Constitution section 122) and made effective in 1914. In 1976 the High Court of Australia held unanimously that Norfolk Island is a part of the Commonwealth.', 'The Norfolk Island Act 1979, passed by the Parliament of Australia in 1979, is the Act under which the island was governed until the passing of the Norfolk Island Legislation Amendment Act 2015.', 'After the creation of the Commonwealth of Australia in 1901, Norfolk Island was placed under the authority of the new Commonwealth government to be administered as an external territory.', 'Norfolk Island was colonised by East Polynesians but was long unpeopled when it was settled by Great Britain as part of its settlement of Australia from

# Evaluation for retrival with cluster centroids vs full

In [65]:
# Load data
X_semantic_train = np.load("./data/tensors/squad_train_v2_semantic_chunking_l2.npy")
df_semantic_train = pd.read_excel("./data/labelled/squad_train_v2_semantic_chunking_clustered_kmeans180.xlsx")
df_queries_train = pd.read_excel("./data/prepared/squad_train_v2_queries.xlsx")

labels_train = df_semantic_train["cluster"].values

In [66]:
# Benchmark
# top_ks = [3, 5, 12, 25]
top_ks = [3]
top_n_clusters = [5, 10, 20]

# top_ks = [5, 10, 20]
# top_n_clusters = [5, 10, 20]
 

results_centroid = []
results_full = []

for top_k in top_ks:
    for top_n_cluster in top_n_clusters:

        print(f"Evaluating: Top-{top_k} chunks in Top-{top_n_cluster} clusters")
        start_centroid = time.time()
        centroid_metrics = evaluate_top_k_accuracy(df_queries_train, X_semantic_train, df_semantic_train, labels_train, top_n_clusters=top_n_cluster, top_k_total=top_k)
        end_centroid = time.time()

        results_centroid.append({
            "top_k": top_k,
            "top_n_clusters": top_n_cluster,
            "centroid_metrics": centroid_metrics,
            "centroid_time": end_centroid - start_centroid
        }) 
        results_df_centroid = pd.DataFrame(results_centroid)
        results_df_centroid.to_excel("./centroid_test.xlsx")

    start_full = time.time()
    full_metrics = evaluate_top_k_accuracy_full(df_queries_train, X_semantic_train, df_semantic_train, top_k_chunks=top_k)
    end_full = time.time()

    results_full.append({
        "top_k": top_k,
        "full_metrics": full_metrics,
        "full_time": end_full - start_full
    })
    results_df_full = pd.DataFrame(results_full)
    results_df_full.to_excel("./full_test.xlsx")

Evaluating: Top-3 chunks in Top-5 clusters


100%|██████████| 87599/87599 [16:20<00:00, 89.37it/s] 


Evaluating: Top-3 chunks in Top-10 clusters


100%|██████████| 87599/87599 [24:30<00:00, 59.58it/s]    


Evaluating: Top-3 chunks in Top-20 clusters


100%|██████████| 87599/87599 [31:50<00:00, 45.84it/s]    
 22%|██▏       | 19313/87599 [13:25<47:29, 23.96it/s]  


KeyboardInterrupt: 

In [38]:
relative_doc_acc_centroid = centroid_metrics['chunk_accuracy'] / centroid_metrics['doc_accuracy']
relative_doc_acc_full = full_metrics['chunk_accuracy'] / full_metrics['doc_accuracy']

print(f"Cluster-based:")
print(f"Doc accuracy: {centroid_metrics['doc_accuracy']:.4f}, Chunk accuracy: {centroid_metrics['chunk_accuracy']:.4f}, Relative chunk/doc accuracy: {relative_doc_acc_centroid:.4f}")
print(f"Full retrieval:")
print(f"Doc accuracy: {full_metrics['doc_accuracy']:.4f}, Chunk accuracy: {full_metrics['chunk_accuracy']:.4f}, Relative chunk/doc accuracy: {relative_doc_acc_full:.4f}")


Cluster-based:
Doc accuracy: 0.7683, Chunk accuracy: 0.6987, Relative chunk/doc accuracy: 0.9095
Full retrieval:
Doc accuracy: 0.8522, Chunk accuracy: 0.7841, Relative chunk/doc accuracy: 0.9200


# Evaluate retrieval with MiniBatchKMeans

In [29]:
# Load data
X_semantic_train = np.load("./data/tensors/squad_train_v2_semantic_chunking.npy")
df_semantic_train = pd.read_excel("./data/prepared/squad_train_v2_semantic_chunking.xlsx")
df_queries_train = pd.read_excel("./data/prepared/squad_train_v2_queries.xlsx")

In [None]:
# --- Futtatás ---
results_df = minibatchkmeans_retrieval_evaluation(
    chunk_embeddings=X_semantic_train,
    df_chunks=df_semantic_train,
    df_queries=df_queries_train,
    n_clusters=160,
    batch_size=1000,
    top_k_total=5,
    init_fraction=0.5
)

In [None]:
# --- Plotolás: Pontosság ---
plt.figure(figsize=(10,5))
plt.plot(results_df["batch"], results_df["doc_accuracy"], label="Doc Accuracy", marker='o')
plt.plot(results_df["batch"], results_df["chunk_accuracy"], label="Chunk Accuracy", marker='s')
plt.xlabel("Batch")
plt.ylabel("Accuracy")
plt.title("Retrieval pontosság batchenként (Online KMeans)")
plt.legend()
plt.grid(True)
plt.show()

# --- Plotolás: Futásidők ---
plt.figure(figsize=(10,5))
plt.plot(results_df["batch"], results_df["init_time"], label="Init time", marker='o')
plt.plot(results_df["batch"], results_df["update_time"], label="Update time", marker='s')
plt.plot(results_df["batch"], results_df["retrieval_time"], label="Retrieval time", marker='^')
plt.xlabel("Batch")
plt.ylabel("Time (s)")
plt.title("Futásidők batchenként (Online KMeans)")
plt.legend()
plt.grid(True)
plt.show()

# Evaluate retrieval with online clustering

In [152]:
# Load data
X_semantic_train = np.load("./data/tensors/squad_train_v2_semantic_chunking_l2.npy")
df_semantic_train = pd.read_excel("./data/prepared/squad_train_v2_semantic_chunking.xlsx")
df_queries_train = pd.read_excel("./data/prepared/squad_train_v2_queries.xlsx")

In [153]:
# df_queries_train.loc[25000:50000, 'answer_start'] = None

np.random.seed(42)
n_rows = len(df_queries_train)
random_indices = np.random.choice(n_rows, size=int(0.15 * n_rows), replace=False)
df_queries_train.loc[random_indices, 'answer_start'] = None

In [None]:
results_df = online_kmeans_retrieval_evaluation(
    chunk_embeddings=X_semantic_train,
    df_chunks=df_semantic_train,
    df_queries=df_queries_train,
    n_clusters=500,
    max_clusters=2000,
    batch_size=2000,
    top_k_total=5,
    metric="cosine",
    init_fraction=0.5,
    merge_threshold=0.08,    
    decay=1.0,
    new_cluster_threshold=0.8
)