# Retrieval

This notebook contains the code for the retrieval pipeline.

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import time
import warnings
import re
import faiss

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans, KMeans
from river import cluster

from OnlineKMeans import OnlineKMeans

warnings.filterwarnings("ignore")

# Classes and functions

In [65]:
# Encoder model
# Sentence-embedding model shared by the evaluate_* functions below, which call
# `model.encode` on every query question.
# NOTE(review): the chunk embeddings passed to those functions are presumably
# produced with this same model -- confirm before switching encoders.
# Smaller alternative encoder, kept for reference:
# model = SentenceTransformer("all-MiniLM-L6-v2")
model = SentenceTransformer("Snowflake/snowflake-arctic-embed-l-v2.0")

In [66]:
def compute_cluster_centroids(chunk_embeddings, cluster_labels):
    """
    Average the embeddings of each cluster to obtain one centroid per cluster.

    Args:
        chunk_embeddings: array of shape (n_chunks, embedding_dim).
        cluster_labels: cluster ID of every chunk, aligned with the rows of
            ``chunk_embeddings``.

    Returns:
        centroid_matrix: np.ndarray of shape (n_clusters, embedding_dim)
        centroid_ids: list of cluster IDs; ``centroid_ids[i]`` labels row ``i``
            of ``centroid_matrix`` (ascending order, as produced by np.unique)
    """
    centroid_ids = []
    centroid_rows = []
    for cluster_id in np.unique(cluster_labels):
        # Boolean mask selects exactly the chunks assigned to this cluster.
        members = chunk_embeddings[cluster_labels == cluster_id]
        centroid_ids.append(cluster_id)
        centroid_rows.append(members.mean(axis=0))

    return np.vstack(centroid_rows), centroid_ids


def retrieve_top_chunks_by_cluster(
    query_embedding,
    chunk_embeddings,
    df_chunks,
    cluster_labels,
    centroid_matrix,
    centroid_ids,
    top_n_clusters=2,
    top_k_total=5
):
    """
    Two-stage retrieval: select the clusters whose centroids are closest to
    the query, then rank only the chunks that belong to those clusters.

    Args:
        query_embedding: 1-D embedding of the query.
        chunk_embeddings: array of chunk embeddings, row-aligned with
            ``df_chunks`` and ``cluster_labels``.
        df_chunks: DataFrame providing the columns ``context_id``,
            ``chunk_id``, ``title``, ``chunk_embed_text``, ``chunk_start``
            and ``chunk_end``.
        cluster_labels: cluster ID of every chunk (ndarray or plain list).
        centroid_matrix: precomputed centroids, one row per cluster.
        centroid_ids: cluster ID of each row of ``centroid_matrix``.
        top_n_clusters: number of closest clusters whose chunks are searched.
        top_k_total: maximum number of chunks returned.

    Returns:
        pd.DataFrame with the columns ``cluster``, ``context_id``,
        ``chunk_id``, ``title``, ``chunk_embed_text``, ``chunk_start``,
        ``chunk_end`` and ``similarity`` (cosine similarity to the query),
        sorted by descending similarity with a fresh 0..k-1 index.
    """
    result_columns = [
        "cluster", "context_id", "chunk_id", "title",
        "chunk_embed_text", "chunk_start", "chunk_end", "similarity",
    ]

    # Boolean-mask indexing below needs an ndarray; a plain list would raise.
    labels = np.asarray(cluster_labels)

    # --- Use precomputed centroids ---
    cluster_sims = cosine_similarity([query_embedding], centroid_matrix)[0]
    top_n_idx = cluster_sims.argsort()[::-1][:top_n_clusters]
    selected_clusters = [centroid_ids[i] for i in top_n_idx]

    # Collect all chunks from selected clusters
    mask = np.isin(labels, selected_clusters)
    selected_chunk_embeddings = chunk_embeddings[mask]
    # Computed once here instead of once per returned row.
    selected_labels = labels[mask]
    selected_df = df_chunks[mask].reset_index(drop=True)

    # Compute similarity for all these chunks
    sims = cosine_similarity([query_embedding], selected_chunk_embeddings)[0]

    # Get top-K chunks overall
    top_k_idx = sims.argsort()[::-1][:top_k_total]

    results = []
    for idx in top_k_idx:
        row = selected_df.iloc[idx]  # single positional lookup per result
        results.append({
            "cluster": selected_labels[idx],
            "context_id": row["context_id"],
            "chunk_id": row["chunk_id"],
            "title": row["title"],
            "chunk_embed_text": row["chunk_embed_text"],
            "chunk_start": row["chunk_start"],
            "chunk_end": row["chunk_end"],
            "similarity": sims[idx]
        })

    # Passing `columns` keeps the "similarity" column present even when no
    # rows were collected (e.g. top_k_total=0), so the sort cannot KeyError.
    return (
        pd.DataFrame(results, columns=result_columns)
        .sort_values("similarity", ascending=False)
        .reset_index(drop=True)
    )


In [67]:
def retrieve_top_chunks_full(
    query_embedding,
    chunk_embeddings,
    df_chunks,
    top_k_chunks=10
):
    """
    Brute-force retrieval: score every chunk against the query and keep the
    ``top_k_chunks`` with the highest cosine similarity.

    Returns a DataFrame with the columns ``context_id``, ``chunk_id``,
    ``title``, ``chunk_embed_text``, ``chunk_start``, ``chunk_end`` and
    ``similarity``, sorted by descending similarity.
    """
    copied_fields = (
        "context_id",
        "chunk_id",
        "title",
        "chunk_embed_text",
        "chunk_start",
        "chunk_end",
    )

    scores = cosine_similarity([query_embedding], chunk_embeddings)[0]
    # Positions of the best-scoring chunks, highest score first.
    ranked_positions = np.argsort(scores)[::-1][:top_k_chunks]

    records = []
    for position in ranked_positions:
        chunk_row = df_chunks.iloc[position]
        record = {field: chunk_row[field] for field in copied_fields}
        record["similarity"] = scores[position]
        records.append(record)

    ranked = pd.DataFrame(records)
    return ranked.sort_values("similarity", ascending=False)


In [68]:
# # ---------- Answer Containment ----------
# def is_answer_in_chunk(chunk_text, answer_text):
#     return answer_text.lower().strip() in chunk_text.lower()

# def is_answer_in_chunk(chunk_text, answer_text):
#     # Normalize
#     chunk_tokens = set(re.findall(r"\w+", chunk_text.lower()))
#     answer_tokens = set(re.findall(r"\w+", answer_text.lower()))

#     # Require that most/all answer tokens are present
#     return len(answer_tokens & chunk_tokens) / max(1, len(answer_tokens)) >= 0.8


# from rapidfuzz import fuzz

# def is_answer_in_chunk(chunk_text, answer_text, threshold=80):
#     score = fuzz.partial_ratio(answer_text.lower(), chunk_text.lower())
#     return score >= threshold

def is_answer_in_chunk(answer_start, chunk_start, chunk_length):
    """
    Tell whether an answer begins inside a chunk's character span.

    Args:
        answer_start: start offset of the answer, or a missing value.
        chunk_start: start offset of the chunk, or a missing value.
        chunk_length: length of the chunk, or a missing value.

    Returns:
        bool: True when ``chunk_start <= answer_start < chunk_start +
        chunk_length``. False when any argument is missing (``None``, NaN or
        ``pd.NA``).
    """
    # pd.isna covers None, float NaN and pd.NA alike; comparing against pd.NA
    # directly would raise "boolean value of NA is ambiguous".
    if pd.isna(answer_start) or pd.isna(chunk_start) or pd.isna(chunk_length):
        return False
    # Half-open interval: the chunk's end offset itself is outside the chunk.
    return bool(chunk_start <= answer_start < (chunk_start + chunk_length))

In [69]:
def compute_metrics_for_query(results, query_row, similarity_threshold=0.6):
    """
    Turn the retrieval results of a single query into binary labels.

    Args:
        results: DataFrame of retrieved chunks with at least the columns
            ``context_id``, ``chunk_start``, ``chunk_end`` and ``similarity``.
        query_row: query record exposing ``answer_start`` and ``context_id``.
        similarity_threshold: when the best similarity is below this value the
            retrieval is treated as "no answer found".

    Returns:
        Tuple ``(y_true_doc, y_pred_doc, y_true_chunk, y_pred_chunk,
        chunk_ratio)``:
        * both ``y_true_*`` are 1 when the query has a non-null
          ``answer_start``, else 0;
        * ``y_pred_doc`` is 1 when a retrieved chunk shares the query's
          ``context_id``;
        * ``y_pred_chunk`` is 1 when one of those chunks contains
          ``answer_start``;
        * ``chunk_ratio`` is the share of retrieved chunks that come from the
          query's context.
    """
    # --- Check similarity threshold ---
    treated_as_no_answer = (
        results.empty or results["similarity"].max() < similarity_threshold
    )

    has_answer = pd.notna(query_row["answer_start"])
    truth = 1 if has_answer else 0

    if treated_as_no_answer:
        # Nothing counts as retrieved: both predictions are 0 and the ratio
        # is 0 good chunks out of a nominal single chunk.
        return truth, 0, truth, 0, 0 / 1

    # --- Document-level ---
    from_query_context = results["context_id"] == query_row["context_id"]
    same_context_chunks = results[from_query_context]
    doc_prediction = 1 if from_query_context.any() else 0

    # --- Chunk-level ---
    chunk_prediction = 0
    for _, chunk in same_context_chunks.iterrows():
        span_length = chunk["chunk_end"] - chunk["chunk_start"]
        if is_answer_in_chunk(query_row["answer_start"], chunk["chunk_start"], span_length):
            chunk_prediction = 1
            break

    chunk_ratio = len(same_context_chunks) / results.shape[0]

    return truth, doc_prediction, truth, chunk_prediction, chunk_ratio

In [70]:
# def evaluate_top_k_accuracy(
#     df_queries,
#     chunk_embeddings,
#     df_chunks,
#     cluster_labels,
#     top_n_clusters=2,
#     top_k_total=5
# ):
#     # ✅ Compute centroids once
#     centroid_matrix, centroid_ids = compute_cluster_centroids(chunk_embeddings, cluster_labels)

#     y_true_doc = []
#     y_pred_doc = []

#     y_true_chunk = []
#     y_pred_chunk = []

#     chunk_ratios = []

#     for i, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
#         query_emb = model.encode([row["question"]])[0]
#         results = retrieve_top_chunks_by_cluster(
#             query_embedding=query_emb,
#             chunk_embeddings=chunk_embeddings,
#             df_chunks=df_chunks,
#             cluster_labels=cluster_labels,
#             centroid_matrix=centroid_matrix,
#             centroid_ids=centroid_ids,
#             top_n_clusters=top_n_clusters,
#             top_k_total=top_k_total
#         )

#         # Document-level
#         found_doc_id = any(row["context_id"] == doc_id for doc_id in results["context_id"])
#         y_true_doc.append(1)
#         y_pred_doc.append(1 if found_doc_id else 0)

#         correct_doc_chunks = results[results["context_id"] == row["context_id"]]
#         found_chunk_context = any(
#             is_answer_in_chunk(
#                 row["answer_start"],
#                 chunk["chunk_start"],
#                 chunk["chunk_end"] - chunk["chunk_start"]
#             )
#             for _, chunk in correct_doc_chunks.iterrows()
#         )
#         good_chunks = len(correct_doc_chunks)
#         total_chunks = results.shape[0]
#         ratio = good_chunks / total_chunks
#         chunk_ratios.append(ratio)

#         y_true_chunk.append(1)
#         y_pred_chunk.append(1 if found_chunk_context else 0)

#     # Compute metrics
#     chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0
#     metrics = {
#         "doc_accuracy": sum(y_pred_doc) / len(y_pred_doc),
#         "chunk_accuracy": sum(y_pred_chunk) / len(y_pred_chunk),
#         "doc_precision": precision_score(y_true_doc, y_pred_doc, zero_division=0),
#         "doc_recall": recall_score(y_true_doc, y_pred_doc, zero_division=0),
#         "doc_f1": f1_score(y_true_doc, y_pred_doc, zero_division=0),
#         "chunk_precision": precision_score(y_true_chunk, y_pred_chunk, zero_division=0),
#         "chunk_recall": recall_score(y_true_chunk, y_pred_chunk, zero_division=0),
#         "chunk_f1": f1_score(y_true_chunk, y_pred_chunk, zero_division=0),
#         "correct_chunk_accuracy": chunk_accuracy
#     }

#     return metrics
def evaluate_top_k_accuracy(
    df_queries,
    chunk_embeddings,
    df_chunks,
    cluster_labels,
    top_n_clusters=2,
    top_k_total=5,
    similarity_threshold=0.6,
    sample_size=2000
):
    """
    Evaluate cluster-based retrieval over a set of queries.

    Every question is encoded with the module-level ``model``, retrieved with
    ``retrieve_top_chunks_by_cluster`` and scored with
    ``compute_metrics_for_query``.

    Args:
        df_queries: DataFrame with the columns ``question``, ``context_id``
            and ``answer_start``.
        chunk_embeddings: chunk embeddings, row-aligned with ``df_chunks`` and
            ``cluster_labels``.
        df_chunks: chunk metadata DataFrame.
        cluster_labels: cluster ID of every chunk.
        top_n_clusters: number of closest clusters searched per query.
        top_k_total: number of chunks retrieved per query.
        similarity_threshold: forwarded to ``compute_metrics_for_query``; a
            best similarity below it counts as "no answer found".
        sample_size: maximum number of queries evaluated. Larger query sets
            are down-sampled reproducibly (``random_state=42``). ``None``
            evaluates every query.

    Returns:
        dict with document- and chunk-level accuracy, precision, recall and
        F1, the mean ratio of retrieved chunks from the correct context
        (``correct_chunk_accuracy``), and the confusion-matrix counts.
    """
    # Compute centroids once
    centroid_matrix, centroid_ids = compute_cluster_centroids(chunk_embeddings, cluster_labels)

    y_true_doc = []
    y_pred_doc = []

    y_true_chunk = []
    y_pred_chunk = []

    chunk_ratios = []

    # Cap the number of evaluated queries (introduced for the 1024D embeddings).
    if sample_size is not None and len(df_queries) > sample_size:
        df_queries = df_queries.sample(n=sample_size, random_state=42).reset_index(drop=True)
    else:
        df_queries = df_queries.reset_index(drop=True)

    for _, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
        query_emb = model.encode([row["question"]])[0]
        results = retrieve_top_chunks_by_cluster(
            query_embedding=query_emb,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            cluster_labels=cluster_labels,
            centroid_matrix=centroid_matrix,
            centroid_ids=centroid_ids,
            top_n_clusters=top_n_clusters,
            top_k_total=top_k_total
        )

        ytd, ypd, ytc, ypc, cr = compute_metrics_for_query(results, row, similarity_threshold)
        y_true_doc.append(ytd)
        y_pred_doc.append(ypd)
        y_true_chunk.append(ytc)
        y_pred_chunk.append(ypc)
        chunk_ratios.append(cr)

    # Convert to arrays
    y_true_doc_arr = np.array(y_true_doc)
    y_pred_doc_arr = np.array(y_pred_doc)
    y_true_chunk_arr = np.array(y_true_chunk)
    y_pred_chunk_arr = np.array(y_pred_chunk)

    # Compute metrics
    chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0

    metrics = {
        "doc_accuracy": (y_pred_doc_arr == y_true_doc_arr).mean(),
        "chunk_accuracy": (y_pred_chunk_arr == y_true_chunk_arr).mean(),
        "doc_precision": precision_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_recall": recall_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_f1": f1_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "chunk_precision": precision_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_recall": recall_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_f1": f1_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "correct_chunk_accuracy": chunk_accuracy,
        # True/False Positives/Negatives
        "doc_true_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 1)),
        "doc_true_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 0)),
        "doc_false_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 0)),
        "doc_false_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 1)),
        "chunk_true_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 1)),
        "chunk_true_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 0)),
        "chunk_false_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 0)),
        "chunk_false_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 1)),
    }

    return metrics


In [71]:
# def evaluate_top_k_accuracy_full(df_queries, chunk_embeddings, df_chunks, top_k_chunks=5, similarity_threshold=0.6):
    # y_true_doc = []
    # y_pred_doc = []

    # y_true_chunk = []
    # y_pred_chunk = []

    # chunk_ratios = []

    # for i, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
    #     query_emb = model.encode([row["question"]])[0]
    #     results = retrieve_top_chunks_full(
    #         query_embedding=query_emb,
    #         chunk_embeddings=chunk_embeddings,
    #         df_chunks=df_chunks,
    #         top_k_chunks=top_k_chunks
    #     )

    #     # Document-level
    #     found_doc_id = any(row["context_id"] == doc_id for doc_id in results["context_id"])
    #     y_true_doc.append(1)
    #     y_pred_doc.append(1 if found_doc_id else 0)

    #     correct_doc_chunks = results[results["context_id"] == row["context_id"]]
    #     found_chunk_context = any(
    #         is_answer_in_chunk(
    #             row["answer_start"],
    #             chunk["chunk_start"],
    #             chunk["chunk_end"] - chunk["chunk_start"]
    #         )
    #         for _, chunk in correct_doc_chunks.iterrows()
    #     )
    #     good_chunks = len(correct_doc_chunks)
    #     total_chunks = results.shape[0]
    #     ratio = good_chunks / total_chunks
    #     chunk_ratios.append(ratio)

    #     y_true_chunk.append(1)
    #     y_pred_chunk.append(1 if found_chunk_context else 0)

    # # Compute metrics
    # chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0
    # metrics = {
    #     "doc_accuracy": sum(y_pred_doc) / len(y_pred_doc),
    #     "chunk_accuracy": sum(y_pred_chunk) / len(y_pred_chunk),
    #     "doc_precision": precision_score(y_true_doc, y_pred_doc, zero_division=0),
    #     "doc_recall": recall_score(y_true_doc, y_pred_doc, zero_division=0),
    #     "doc_f1": f1_score(y_true_doc, y_pred_doc, zero_division=0),
    #     "chunk_precision": precision_score(y_true_chunk, y_pred_chunk, zero_division=0),
    #     "chunk_recall": recall_score(y_true_chunk, y_pred_chunk, zero_division=0),
    #     "chunk_f1": f1_score(y_true_chunk, y_pred_chunk, zero_division=0),
    #     "correct_chunk_accuracy": chunk_accuracy
    # }

    # return metrics
def evaluate_top_k_accuracy_full(
    df_queries,
    chunk_embeddings,
    df_chunks,
    top_k_chunks=5,
    similarity_threshold=0.6,
    sample_size=2000
):
    """
    Evaluate brute-force (no clustering) retrieval over a set of queries.

    Every question is encoded with the module-level ``model``, retrieved with
    ``retrieve_top_chunks_full`` and scored with ``compute_metrics_for_query``.

    Args:
        df_queries: DataFrame with the columns ``question``, ``context_id``
            and ``answer_start``.
        chunk_embeddings: chunk embeddings, row-aligned with ``df_chunks``.
        df_chunks: chunk metadata DataFrame.
        top_k_chunks: number of chunks retrieved per query.
        similarity_threshold: forwarded to ``compute_metrics_for_query``; a
            best similarity below it counts as "no answer found".
        sample_size: maximum number of queries evaluated. Larger query sets
            are down-sampled reproducibly (``random_state=42``). ``None``
            evaluates every query.

    Returns:
        dict with document- and chunk-level accuracy, precision, recall and
        F1, the mean ratio of retrieved chunks from the correct context
        (``correct_chunk_accuracy``), and the confusion-matrix counts.
    """
    y_true_doc = []
    y_pred_doc = []

    y_true_chunk = []
    y_pred_chunk = []

    chunk_ratios = []

    # Cap the number of evaluated queries (introduced for the 1024D embeddings).
    if sample_size is not None and len(df_queries) > sample_size:
        df_queries = df_queries.sample(n=sample_size, random_state=42).reset_index(drop=True)
    else:
        df_queries = df_queries.reset_index(drop=True)

    for _, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
        query_emb = model.encode([row["question"]])[0]
        results = retrieve_top_chunks_full(
            query_embedding=query_emb,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            top_k_chunks=top_k_chunks
        )

        ytd, ypd, ytc, ypc, cr = compute_metrics_for_query(results, row, similarity_threshold)
        y_true_doc.append(ytd)
        y_pred_doc.append(ypd)
        y_true_chunk.append(ytc)
        y_pred_chunk.append(ypc)
        chunk_ratios.append(cr)

    # Convert to arrays
    y_true_doc_arr = np.array(y_true_doc)
    y_pred_doc_arr = np.array(y_pred_doc)
    y_true_chunk_arr = np.array(y_true_chunk)
    y_pred_chunk_arr = np.array(y_pred_chunk)

    # Compute metrics
    chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0

    metrics = {
        "doc_accuracy": (y_pred_doc_arr == y_true_doc_arr).mean(),
        "chunk_accuracy": (y_pred_chunk_arr == y_true_chunk_arr).mean(),
        "doc_precision": precision_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_recall": recall_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_f1": f1_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "chunk_precision": precision_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_recall": recall_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_f1": f1_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "correct_chunk_accuracy": chunk_accuracy,
        # True/False Positives/Negatives
        "doc_true_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 1)),
        "doc_true_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 0)),
        "doc_false_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 0)),
        "doc_false_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 1)),
        "chunk_true_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 1)),
        "chunk_true_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 0)),
        "chunk_false_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 0)),
        "chunk_false_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 1)),
    }

    return metrics

In [72]:
def minibatchkmeans_retrieval_evaluation(
    chunk_embeddings,
    df_chunks,
    df_queries,
    n_clusters=20,
    batch_size=2000,
    top_k_total=10,
    init_fraction=0.5,  # fraction of data used for initialization
    top_n_clusters=10,
    results_path="../data/results/onlinekmeans_with_minibatchkmeans_v4.xlsx"
):
    """
    MiniBatchKMeans clustering + retrieval evaluation on a growing dataset.
    Simulates online learning via incremental partial_fit updates.
    Evaluates only on the chunks that have been clustered so far.

    Args:
        chunk_embeddings: chunk embeddings, row-aligned with ``df_chunks``.
        df_chunks: chunk metadata DataFrame (needs ``context_id``).
        df_queries: query DataFrame (needs ``context_id``).
        n_clusters: number of clusters for MiniBatchKMeans.
        batch_size: number of chunks added per online update.
        top_k_total: number of chunks retrieved per query.
        init_fraction: share of the chunks used for the initial partial_fit.
        top_n_clusters: number of closest clusters searched per query.
        results_path: Excel file rewritten after every batch with the results
            accumulated so far.

    Returns:
        pd.DataFrame with one row per batch: ``batch``, ``init_time`` (only
        reported on the first batch, 0 afterwards), ``update_time``,
        ``retrieval_time``, ``metrics`` (dict from
        ``evaluate_top_k_accuracy``) and ``n_clusters``.
    """
    queries_context_ids = df_queries['context_id'].unique()
    chunk_context_ids = df_chunks['context_id'].unique()
    missing_context_ids = set(queries_context_ids) - set(chunk_context_ids)
    # Queries whose context has no chunk at all in df_chunks. The selection
    # does not depend on the batch, so it is computed once up front.
    # NOTE(review): presumably these serve as unanswerable queries -- confirm.
    queries_not_in_seen = df_queries[df_queries["context_id"].isin(missing_context_ids)]

    n_samples = chunk_embeddings.shape[0]
    init_size = int(n_samples * init_fraction)
    remaining_size = n_samples - init_size
    n_batches = int(np.ceil(remaining_size / batch_size))

    # --- Step 1: Initialization ---
    print(f"üîß Using {init_fraction*100:.0f}% of data ({init_size} samples) for initialization")
    init_start = time.time()
    kmeans = MiniBatchKMeans(
        n_clusters=n_clusters,
        random_state=42,
        batch_size=batch_size
    )
    kmeans.partial_fit(chunk_embeddings[:init_size])
    init_end = time.time()
    init_time = init_end - init_start
    print(f"‚úÖ Initialization done in {init_time:.4f} s")

    # --- Step 2: Online updates on the remaining data ---
    results = []
    for batch_idx in tqdm(range(1, n_batches + 1)):
        start_idx = (batch_idx - 1) * batch_size
        end_idx = min(batch_idx * batch_size, remaining_size)
        batch_embeddings = chunk_embeddings[init_size + start_idx : init_size + end_idx]

        # --- Online update (MiniBatchKMeans incremental fit) ---
        update_start = time.time()
        kmeans.partial_fit(batch_embeddings)
        update_end = time.time()
        update_time = update_end - update_start

        # --- Only evaluate on seen data so far ---
        seen_end_idx = init_size + end_idx
        seen_embeddings = chunk_embeddings[:seen_end_idx]
        seen_df_chunks = df_chunks.iloc[:seen_end_idx].reset_index(drop=True)

        # --- Predict cluster labels for seen data ---
        labels_seen = kmeans.predict(seen_embeddings)

        # --- Progressive query inclusion ---
        # Queries whose context is already indexed are always evaluated; the
        # share of context-less queries grows linearly with the batch number.
        progress = batch_idx / n_batches
        queries_in_seen = df_queries[df_queries["context_id"].isin(seen_df_chunks["context_id"].unique())]
        n_to_sample = int(len(queries_not_in_seen) * progress)
        queries_sampled = (
            queries_not_in_seen.sample(n=n_to_sample, random_state=42)
            if n_to_sample > 0 else pd.DataFrame(columns=df_queries.columns)
        )
        df_queries_seen = pd.concat([queries_in_seen, queries_sampled]).reset_index(drop=True)

        # --- Retrieval accuracy ---
        retrieval_start = time.time()
        metrics = evaluate_top_k_accuracy(
            df_queries=df_queries_seen,
            chunk_embeddings=seen_embeddings,
            df_chunks=seen_df_chunks,
            cluster_labels=labels_seen,
            top_n_clusters=top_n_clusters,
            top_k_total=top_k_total
        )
        retrieval_end = time.time()
        retrieval_time = retrieval_end - retrieval_start

        # --- Log results ---
        results.append({
            "batch": batch_idx,
            "init_time": init_time if batch_idx == 1 else 0,
            "update_time": update_time,
            "retrieval_time": retrieval_time,
            "metrics": metrics,
            "n_clusters": n_clusters
        })
        # Checkpoint after every batch so a long run can be inspected or
        # recovered if it is interrupted.
        results_df = pd.DataFrame(results)
        results_df.to_excel(results_path)

    return pd.DataFrame(results)


In [73]:
def online_kmeans_retrieval_evaluation(
    chunk_embeddings,
    df_chunks,
    df_queries,
    n_clusters=20,
    batch_size=500,
    top_k_total=5,
    init_fraction=0.5,  # fraction of data used for initialization
    max_clusters=None,
    metric="cosine",
    new_cluster_threshold=None,
    merge_threshold=None,
    decay=None,
    top_n_clusters=10,
    results_path="../data/results/onlinekmeans_v4_from.xlsx"
):
    """
    OnlineKMeans clustering + retrieval evaluation on growing dataset.
    Only evaluates on the chunks that have been clustered so far.

    Args:
        chunk_embeddings: chunk embeddings, row-aligned with ``df_chunks``.
        df_chunks: chunk metadata DataFrame (needs ``context_id``).
        df_queries: query DataFrame (needs ``context_id``).
        n_clusters, max_clusters, metric, new_cluster_threshold,
        merge_threshold, decay: forwarded unchanged to ``OnlineKMeans``.
        batch_size: number of chunks added per online update.
        top_k_total: number of chunks retrieved per query.
        init_fraction: share of the chunks used for the initial partial_fit.
        top_n_clusters: number of closest clusters searched per query.
        results_path: Excel file rewritten after every batch with the results
            accumulated so far.

    Returns:
        pd.DataFrame with one row per batch: ``batch``, ``init_time`` (only
        reported on the first batch, 0 afterwards), ``update_time``,
        ``retrieval_time``, ``metrics`` (dict from
        ``evaluate_top_k_accuracy``) and ``n_clusters`` (number of centroids
        the model holds after the update).
    """
    queries_context_ids = df_queries['context_id'].unique()
    chunk_context_ids = df_chunks['context_id'].unique()
    missing_context_ids = set(queries_context_ids) - set(chunk_context_ids)
    # Queries whose context has no chunk at all in df_chunks. The selection
    # does not depend on the batch, so it is computed once up front.
    # NOTE(review): presumably these serve as unanswerable queries -- confirm.
    queries_not_in_seen = df_queries[df_queries["context_id"].isin(missing_context_ids)]

    n_samples = chunk_embeddings.shape[0]
    init_size = int(n_samples * init_fraction)
    remaining_size = n_samples - init_size
    n_batches = int(np.ceil(remaining_size / batch_size))

    # --- Step 1: Initialization ---
    print(f"üîß Using {init_fraction*100:.0f}% of data ({init_size} samples) for initialization")
    init_start = time.time()
    okm = OnlineKMeans(
        n_clusters=n_clusters,
        max_clusters=max_clusters,
        metric=metric,
        new_cluster_threshold=new_cluster_threshold,
        merge_threshold=merge_threshold,
        random_state=42,
        decay=decay
    )
    okm.partial_fit(chunk_embeddings[:init_size])
    init_end = time.time()
    init_time = init_end - init_start
    print(f"‚úÖ Initialization done in {init_time:.4f} s")

    # --- Step 2: Online updates on the remaining data ---
    results = []
    for batch_idx in tqdm(range(1, n_batches + 1)):
        start_idx = (batch_idx - 1) * batch_size
        end_idx = min(batch_idx * batch_size, remaining_size)
        batch_embeddings = chunk_embeddings[init_size + start_idx : init_size + end_idx]

        # --- Online update ---
        update_start = time.time()
        okm.partial_fit(batch_embeddings)
        update_end = time.time()
        update_time = update_end - update_start

        # --- Only evaluate on seen data so far ---
        seen_end_idx = init_size + end_idx
        seen_embeddings = chunk_embeddings[:seen_end_idx]
        seen_df_chunks = df_chunks.iloc[:seen_end_idx].reset_index(drop=True)

        # --- Predict cluster labels for seen data ---
        labels_seen = okm.predict(seen_embeddings)

        # --- Progressive query inclusion ---
        # Queries whose context is already indexed are always evaluated; the
        # share of context-less queries grows linearly with the batch number.
        progress = batch_idx / n_batches
        queries_in_seen = df_queries[df_queries["context_id"].isin(seen_df_chunks["context_id"].unique())]
        n_to_sample = int(len(queries_not_in_seen) * progress)
        queries_sampled = (
            queries_not_in_seen.sample(n=n_to_sample, random_state=42)
            if n_to_sample > 0 else pd.DataFrame(columns=df_queries.columns)
        )
        df_queries_seen = pd.concat([queries_in_seen, queries_sampled]).reset_index(drop=True)

        # --- Retrieval accuracy ---
        retrieval_start = time.time()
        metrics = evaluate_top_k_accuracy(
            df_queries=df_queries_seen,
            chunk_embeddings=seen_embeddings,
            df_chunks=seen_df_chunks,
            cluster_labels=labels_seen,
            top_n_clusters=top_n_clusters,
            top_k_total=top_k_total
        )
        retrieval_end = time.time()
        retrieval_time = retrieval_end - retrieval_start

        results.append({
            "batch": batch_idx,
            "init_time": init_time if batch_idx == 1 else 0,
            "update_time": update_time,
            "retrieval_time": retrieval_time,
            "metrics": metrics,
            "n_clusters": len(okm.centroids)
        })
        # Checkpoint after every batch so a long run can be inspected or
        # recovered if it is interrupted.
        results_df = pd.DataFrame(results)
        results_df.to_excel(results_path)

        #print(f"[Batch {batch_idx}] Seen chunks: {seen_end_idx}, Doc acc: {metrics['doc_accuracy']:.4f}, Chunk acc: {metrics['chunk_accuracy']:.4f}, Clusters: {len(okm.centroids)}")

    return pd.DataFrame(results)

In [74]:
def kmeans_retrieval_evaluation(
    chunk_embeddings,
    df_chunks,
    df_queries,
    n_clusters=20,
    batch_size=500,
    top_k_total=5,
    init_fraction=0.5,  # fraction of data used for initialization
    top_n_clusters=10,
    results_path="../data/results/onlinekmeans_with_kmeans_v4.xlsx"
):
    """
    Batch KMeans clustering + retrieval evaluation on growing dataset.
    Re-fits KMeans from scratch after each batch to simulate online learning.

    Args:
        chunk_embeddings: chunk embeddings, row-aligned with ``df_chunks``.
        df_chunks: chunk metadata DataFrame (needs ``context_id``).
        df_queries: query DataFrame (needs ``context_id``).
        n_clusters: number of clusters for KMeans.
        batch_size: number of chunks added before each re-fit.
        top_k_total: number of chunks retrieved per query.
        init_fraction: share of the chunks used for the initial fit.
        top_n_clusters: number of closest clusters searched per query.
        results_path: Excel file rewritten after every batch with the results
            accumulated so far.

    Returns:
        pd.DataFrame with one row per batch: ``batch``, ``init_time`` (only
        reported on the first batch, 0 afterwards), ``update_time`` (duration
        of the full re-fit), ``retrieval_time``, ``metrics`` (dict from
        ``evaluate_top_k_accuracy``) and ``n_clusters``.
    """
    queries_context_ids = df_queries['context_id'].unique()
    chunk_context_ids = df_chunks['context_id'].unique()
    missing_context_ids = set(queries_context_ids) - set(chunk_context_ids)
    # Queries whose context has no chunk at all in df_chunks. The selection
    # does not depend on the batch, so it is computed once up front.
    # NOTE(review): presumably these serve as unanswerable queries -- confirm.
    queries_not_in_seen = df_queries[df_queries["context_id"].isin(missing_context_ids)]

    n_samples = chunk_embeddings.shape[0]
    init_size = int(n_samples * init_fraction)
    remaining_size = n_samples - init_size
    n_batches = int(np.ceil(remaining_size / batch_size))

    # --- Step 1: Initialization ---
    # This first model is replaced by the per-batch re-fits below; the fit is
    # kept because its duration is what gets reported as `init_time`.
    print(f"üîß Using {init_fraction*100:.0f}% of data ({init_size} samples) for initialization")
    init_start = time.time()
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(chunk_embeddings[:init_size])
    init_end = time.time()
    init_time = init_end - init_start
    print(f"‚úÖ Initialization done in {init_time:.4f} s")

    # --- Step 2: Batch updates (re-fit each time on seen data) ---
    results = []
    for batch_idx in tqdm(range(1, n_batches + 1)):
        end_idx = min(batch_idx * batch_size, remaining_size)
        seen_end_idx = init_size + end_idx

        # --- Fit KMeans on all seen data so far ---
        seen_embeddings = chunk_embeddings[:seen_end_idx]
        seen_df_chunks = df_chunks.iloc[:seen_end_idx].reset_index(drop=True)

        update_start = time.time()
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        kmeans.fit(seen_embeddings)
        update_end = time.time()
        update_time = update_end - update_start

        labels_seen = kmeans.labels_

        # --- Progress based on batch number ---
        progress = batch_idx / n_batches

        # --- Query selection ---
        # Queries whose context is already indexed are always evaluated; the
        # share of context-less queries grows linearly with the batch number.
        queries_in_seen = df_queries[df_queries["context_id"].isin(seen_df_chunks["context_id"].unique())]
        n_to_sample = int(len(queries_not_in_seen) * progress)
        queries_sampled = (
            queries_not_in_seen.sample(n=n_to_sample, random_state=42)
            if n_to_sample > 0 else pd.DataFrame(columns=df_queries.columns)
        )
        df_queries_seen = pd.concat([queries_in_seen, queries_sampled]).reset_index(drop=True)

        # --- Retrieval accuracy ---
        retrieval_start = time.time()
        metrics = evaluate_top_k_accuracy(
            df_queries=df_queries_seen,
            chunk_embeddings=seen_embeddings,
            df_chunks=seen_df_chunks,
            cluster_labels=labels_seen,
            top_n_clusters=top_n_clusters,
            top_k_total=top_k_total
        )
        retrieval_end = time.time()
        retrieval_time = retrieval_end - retrieval_start

        results.append({
            "batch": batch_idx,
            "init_time": init_time if batch_idx == 1 else 0,
            "update_time": update_time,
            "retrieval_time": retrieval_time,
            "metrics": metrics,
            "n_clusters": n_clusters
        })
        # Checkpoint after every batch so a long run can be inspected or
        # recovered if it is interrupted.
        results_df = pd.DataFrame(results)
        results_df.to_excel(results_path)

    return pd.DataFrame(results)


In [75]:
# def river_retrieval_evaluation(
#     chunk_embeddings,
#     df_chunks,
#     df_queries,
#     n_clusters=10,
#     batch_size=2000,
#     top_k_total=10,
#     init_fraction=0.5,
#     top_n_clusters=10
# ):
#     """
#     Uses River's online clustering (streaming KMeans) + retrieval evaluation on growing dataset.
#     Evaluates progressively as more chunks are clustered.
#     """
#     queries_context_ids = df_queries['context_id'].unique()
#     chunk_context_ids = df_chunks['context_id'].unique()
#     missing_context_ids = set(queries_context_ids) - set(chunk_context_ids)
    
#     n_samples = chunk_embeddings.shape[0]
#     init_size = int(n_samples * init_fraction)
#     remaining_size = n_samples - init_size
#     n_batches = int(np.ceil(remaining_size / batch_size))

#     print(f"üîß Using {init_fraction*100:.0f}% of data ({init_size} samples) for initialization")

#     # ✅ Convert embeddings to dict once for speed
#     print("‚öôÔ∏è Converting embeddings to River-friendly dict format...")
#     emb_dicts = [{i: float(v) for i, v in enumerate(emb)} for emb in chunk_embeddings]
#     print("‚úÖ Conversion done.")

#     # Initialize River KMeans
#     stream_clusterer = cluster.KMeans(n_clusters=n_clusters, halflife=0.001, sigma=3, seed=42)

#     # --- Initialization ---
#     init_start = time.time()
#     for emb_dict in emb_dicts[:init_size]:
#         stream_clusterer.learn_one(emb_dict)
#     init_end = time.time()
#     init_time = init_end - init_start
#     print(f"‚úÖ Initialization done in {init_time:.4f} s")

#     results = []
#     for batch_idx in tqdm(range(1, n_batches + 1), desc="Batches"):
#         start_idx = (batch_idx - 1) * batch_size
#         end_idx = min(batch_idx * batch_size, remaining_size)
#         batch_dicts = emb_dicts[init_size + start_idx : init_size + end_idx]

#         # --- Online update ---
#         update_start = time.time()
#         for emb_dict in batch_dicts:
#             stream_clusterer.learn_one(emb_dict)
#         update_end = time.time()
#         update_time = update_end - update_start

#         # --- Evaluate on seen data ---
#         seen_end_idx = init_size + end_idx
#         seen_embeddings = chunk_embeddings[:seen_end_idx]
#         seen_df_chunks = df_chunks.iloc[:seen_end_idx].reset_index(drop=True)

#         # --- Predict cluster labels for seen data ---
#         labels_seen = [stream_clusterer.predict_one(emb_dict) for emb_dict in emb_dicts[:seen_end_idx]]

#         # --- Progressive inclusion of unseen queries ---
#         progress = batch_idx / n_batches
#         queries_in_seen = df_queries[df_queries["context_id"].isin(seen_df_chunks["context_id"].unique())]
#         queries_not_in_seen = df_queries[df_queries["context_id"].isin(missing_context_ids)]
#         n_to_sample = int(len(queries_not_in_seen) * progress)
#         queries_sampled = (
#             queries_not_in_seen.sample(n=n_to_sample, random_state=42)
#             if n_to_sample > 0 else pd.DataFrame(columns=df_queries.columns)
#         )
#         df_queries_seen = pd.concat([queries_in_seen, queries_sampled]).reset_index(drop=True)

#         # --- Retrieval accuracy ---
#         retrieval_start = time.time()
#         metrics = evaluate_top_k_accuracy(
#             df_queries=df_queries_seen,
#             chunk_embeddings=seen_embeddings,
#             df_chunks=seen_df_chunks,
#             cluster_labels=labels_seen,
#             top_n_clusters=top_n_clusters,
#             top_k_total=top_k_total
#         )
#         retrieval_end = time.time()
#         retrieval_time = retrieval_end - retrieval_start

#         # --- Log results ---
#         results.append({
#             "batch": batch_idx,
#             "init_time": init_time if batch_idx == 1 else 0,
#             "update_time": update_time,
#             "retrieval_time": retrieval_time,
#             "metrics": metrics,
#             "n_clusters": len(stream_clusterer.centers)
#         })

#         results_df = pd.DataFrame(results)
#         results_df.to_excel("../data/results/river_kmeans_stream_v4.xlsx", index=False)

#     return pd.DataFrame(results)


In [76]:
def retrieve_top_chunks_faiss(
    query_embedding,
    faiss_index,
    df_chunks,
    top_k_chunks=10
):
    """
    Retrieve the nearest chunks for a single query from a FAISS index.

    Args:
        query_embedding: 1-D query vector; converted to a float32 array of
            shape (1, dim) before searching.
        faiss_index: FAISS index to search. The ids it returns are used as
            row positions into df_chunks.
        df_chunks: DataFrame providing the columns context_id, chunk_id,
            title, chunk_embed_text, chunk_start and chunk_end.
        top_k_chunks: number of neighbours requested from the index.

    Returns:
        pd.DataFrame with one row per hit (chunk metadata plus "similarity"),
        sorted by similarity in descending order. The frame is empty, but
        keeps the same columns, when the index returns no valid hit.
    """
    meta_columns = [
        "context_id",
        "chunk_id",
        "title",
        "chunk_embed_text",
        "chunk_start",
        "chunk_end",
    ]

    query_vec = np.array(query_embedding).astype('float32').reshape(1, -1)

    distances, indices = faiss_index.search(query_vec, top_k_chunks)

    # Any L2 index (flat, HNSW, IVF, ...) reports distances where smaller is
    # better, so negate them to get a "larger is better" score. Checking the
    # metric type instead of one concrete class covers all of them.
    if getattr(faiss_index, "metric_type", None) == faiss.METRIC_L2:
        similarities = -distances[0]
    else:
        similarities = distances[0]

    # Collect results
    results = []
    for sim, idx in zip(similarities, indices[0]):
        # FAISS pads the result with id -1 when fewer than k neighbours are
        # found; iloc[-1] would silently return the last chunk instead.
        if idx < 0:
            continue
        chunk = df_chunks.iloc[idx]
        record = {col: chunk[col] for col in meta_columns}
        record["similarity"] = float(sim)
        results.append(record)

    # Passing the columns explicitly keeps sort_values working on an empty hit list.
    return pd.DataFrame(results, columns=meta_columns + ["similarity"]).sort_values(
        "similarity", ascending=False
    )

In [77]:
def evaluate_top_k_accuracy_faiss(df_queries, faiss_index, df_chunks, top_k_chunks=5, similarity_threshold=0.6, sample_size=2000):
    """
    Evaluate FAISS retrieval quality over a set of queries.

    Each query's "question" is encoded with the module-level `model`, the
    top chunks are fetched with retrieve_top_chunks_faiss, and the per-query
    labels come from compute_metrics_for_query.

    Args:
        df_queries: DataFrame of queries; must contain a "question" column
            (the whole row is also handed to compute_metrics_for_query).
        faiss_index: FAISS index searched for every query.
        df_chunks: chunk metadata aligned with the index rows.
        top_k_chunks: number of chunks retrieved per query.
        similarity_threshold: forwarded unchanged to compute_metrics_for_query.
        sample_size: maximum number of queries evaluated. Larger query sets
            are subsampled with a fixed seed (42); pass None to evaluate
            every query.

    Returns:
        dict with document- and chunk-level accuracy, precision, recall, F1,
        the mean per-query chunk ratio ("correct_chunk_accuracy") and the
        confusion-matrix counts.
    """
    y_true_doc = []
    y_pred_doc = []

    y_true_chunk = []
    y_pred_chunk = []

    chunk_ratios = []

    # Encoding every query is slow with 1024-D embeddings, so cap the number
    # of evaluated queries unless the caller disables it.
    if sample_size is not None and len(df_queries) > sample_size:
        df_queries = df_queries.sample(n=sample_size, random_state=42).reset_index(drop=True)
    else:
        df_queries = df_queries.reset_index(drop=True)

    for _, row in tqdm(df_queries.iterrows(), total=len(df_queries)):
        query_emb = model.encode([row["question"]])[0]
        results = retrieve_top_chunks_faiss(
            query_embedding=query_emb,
            faiss_index=faiss_index,
            df_chunks=df_chunks,
            top_k_chunks=top_k_chunks
        )

        # compute_metrics_for_query is defined elsewhere in the notebook; its
        # five return values are presumably 0/1 labels plus a ratio (the
        # counts below compare the labels against 0 and 1) — TODO confirm.
        ytd, ypd, ytc, ypc, cr = compute_metrics_for_query(results, row, similarity_threshold)
        y_true_doc.append(ytd)
        y_pred_doc.append(ypd)
        y_true_chunk.append(ytc)
        y_pred_chunk.append(ypc)
        chunk_ratios.append(cr)

    # Convert to arrays
    y_true_doc_arr = np.array(y_true_doc)
    y_pred_doc_arr = np.array(y_pred_doc)
    y_true_chunk_arr = np.array(y_true_chunk)
    y_pred_chunk_arr = np.array(y_pred_chunk)

    # Compute metrics
    chunk_accuracy = sum(chunk_ratios) / len(chunk_ratios) if len(chunk_ratios) > 0 else 0

    metrics = {
        "doc_accuracy": (y_pred_doc_arr == y_true_doc_arr).mean(),
        "chunk_accuracy": (y_pred_chunk_arr == y_true_chunk_arr).mean(),
        "doc_precision": precision_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_recall": recall_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "doc_f1": f1_score(y_true_doc_arr, y_pred_doc_arr, zero_division=0),
        "chunk_precision": precision_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_recall": recall_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "chunk_f1": f1_score(y_true_chunk_arr, y_pred_chunk_arr, zero_division=0),
        "correct_chunk_accuracy": chunk_accuracy,
        # True/False Positives/Negatives
        "doc_true_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 1)),
        "doc_true_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 0)),
        "doc_false_positives": np.sum((y_pred_doc_arr == 1) & (y_true_doc_arr == 0)),
        "doc_false_negatives": np.sum((y_pred_doc_arr == 0) & (y_true_doc_arr == 1)),
        "chunk_true_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 1)),
        "chunk_true_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 0)),
        "chunk_false_positives": np.sum((y_pred_chunk_arr == 1) & (y_true_chunk_arr == 0)),
        "chunk_false_negatives": np.sum((y_pred_chunk_arr == 0) & (y_true_chunk_arr == 1)),
    }

    return metrics

In [78]:
def faiss_retrieval_evaluation(
    chunk_embeddings,
    df_chunks,
    df_queries,
    batch_size=2000,
    top_k_total=10,
    init_fraction=0.5,   # fraction of data used for initialization
    metric="cosine",
    results_path="../data/results/onlinekmeans_with_faiss_v4.xlsx"
):
    """
    Simulate online indexing with FAISS and evaluate retrieval after each batch.

    An HNSW inner-product index is built from the first `init_fraction` of the
    embeddings; the remaining rows are added in batches of `batch_size`, and
    retrieval quality is measured with evaluate_top_k_accuracy_faiss after
    every addition.

    Args:
        chunk_embeddings: 2-D array of chunk embeddings, row-aligned with
            df_chunks. It is copied, so the caller's array is never modified.
        df_chunks: chunk metadata; must contain "context_id".
        df_queries: query table; must contain "context_id".
        batch_size: number of vectors added per update step.
        top_k_total: number of chunks retrieved per query during evaluation.
        init_fraction: fraction of rows used to build the initial index.
        metric: "cosine" L2-normalizes the embeddings before indexing; any
            other value indexes the raw vectors. Both use inner product.
        results_path: Excel file rewritten after every batch with the results
            so far; pass None to skip these intermediate saves.

    Returns:
        pd.DataFrame with one row per batch: batch number, init/add/retrieval
        timings, the metrics dict and the index size.
    """
    # --- Step 0: Setup ---
    queries_context_ids = df_queries['context_id'].unique()
    chunk_context_ids = df_chunks['context_id'].unique()
    missing_context_ids = set(queries_context_ids) - set(chunk_context_ids)

    # FAISS needs C-contiguous float32 input and normalize_L2 works in place,
    # so operate on a private copy instead of mutating the caller's array.
    chunk_embeddings = np.array(chunk_embeddings, dtype=np.float32, order="C")

    n_samples = chunk_embeddings.shape[0]
    init_size = int(n_samples * init_fraction)
    remaining_size = n_samples - init_size
    n_batches = int(np.ceil(remaining_size / batch_size))

    # --- Step 1: Initialization ---
    print(f"üîß Using {init_fraction*100:.0f}% of data ({init_size} samples) for FAISS initialization")

    d = chunk_embeddings.shape[1]
    if metric == "cosine":
        # Inner product on unit-length vectors equals cosine similarity.
        faiss.normalize_L2(chunk_embeddings)

    M = 32
    index = faiss.IndexHNSWFlat(d, M, faiss.METRIC_INNER_PRODUCT)
    index.hnsw.efConstruction = 150
    index.hnsw.efSearch = 16

    init_start = time.time()
    index.add(chunk_embeddings[:init_size])
    init_end = time.time()
    init_time = init_end - init_start

    print(f"‚úÖ FAISS index initialized with {index.ntotal} vectors in {init_time:.4f}s")

    # context_ids of every chunk currently in the index
    seen_context_ids = set(df_chunks.iloc[:init_size]["context_id"].tolist())

    # Queries whose context has no chunk at all; this set does not change
    # between batches, so compute it once.
    queries_not_in_seen = df_queries[df_queries["context_id"].isin(missing_context_ids)]

    results = []

    # --- Step 2: Batch updates ---
    for batch_idx in tqdm(range(1, n_batches + 1)):
        start_idx = (batch_idx - 1) * batch_size
        end_idx = min(batch_idx * batch_size, remaining_size)

        # Get new batch of embeddings
        batch_embeddings = chunk_embeddings[init_size + start_idx:init_size + end_idx]
        batch_df_chunks = df_chunks.iloc[init_size + start_idx:init_size + end_idx]

        # --- Add new data to FAISS ---
        add_start = time.time()
        index.add(batch_embeddings)
        add_end = time.time()
        add_time = add_end - add_start

        # Extend the set of indexed contexts
        seen_context_ids.update(batch_df_chunks["context_id"].tolist())

        # --- Determine which queries to include ---
        # Unanswerable queries are mixed in proportionally to the progress.
        progress = batch_idx / n_batches
        queries_in_seen = df_queries[df_queries["context_id"].isin(seen_context_ids)]
        n_to_sample = int(len(queries_not_in_seen) * progress)
        queries_sampled = (
            queries_not_in_seen.sample(n=n_to_sample, random_state=42)
            if n_to_sample > 0
            # An empty slice keeps the column dtypes intact through concat.
            else queries_not_in_seen.iloc[:0]
        )
        df_queries_seen = pd.concat([queries_in_seen, queries_sampled]).reset_index(drop=True)

        # --- Retrieval evaluation ---
        # The full df_chunks is passed: FAISS ids are row positions, and only
        # rows added so far can be returned.
        retrieval_start = time.time()
        metrics = evaluate_top_k_accuracy_faiss(
            df_queries=df_queries_seen,
            df_chunks=df_chunks,
            faiss_index=index,
            top_k_chunks=top_k_total
        )
        retrieval_end = time.time()
        retrieval_time = retrieval_end - retrieval_start

        # --- Store results ---
        results.append({
            "batch": batch_idx,
            "init_time": init_time if batch_idx == 1 else 0,
            "add_time": add_time,
            "retrieval_time": retrieval_time,
            "metrics": metrics,
            "total_vectors": index.ntotal
        })

        # Save intermediate results so a long run can be inspected or resumed
        if results_path is not None:
            pd.DataFrame(results).to_excel(results_path)

    return pd.DataFrame(results)


# Workflow

In [13]:
# Load the clustered chunk table, the query table and the precomputed chunk embeddings.
# NOTE(review): the tables are "v2" files while the embeddings are a "v4" file —
# confirm that the rows of df_train and X_train are aligned.
df_train = pd.read_excel("../data/labelled/squad_train_v2_semantic_chunking_clustered.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v2_queries.xlsx")

X_train = np.load("../data/tensors/squad_train_v4_semantic_chunking_l2.npy")

# Per-chunk cluster assignment taken from the "cluster" column.
labels_train = df_train["cluster"].values

In [14]:
df_train.shape

(84007, 10)

In [15]:
df_queries_train.shape

(87599, 4)

In [23]:
df_queries_train.loc[1000, "question"]

'How much did Beyonce initially contribute to the foundation?'

In [28]:
# Pick one example question and embed it; encode() takes a list, so unwrap the single vector.
query = df_queries_train.loc[1000, "question"]
query_emb = model.encode([query])[0]

In [31]:
# Centroids are computed before the timer starts, so the measured runtime
# covers only the retrieval call itself.
centroid_matrix, centroid_ids = compute_cluster_centroids(X_train, labels_train)

start_time = time.time()

# Search only the single closest cluster and keep the 3 best chunks.
top_chunks_cluster = retrieve_top_chunks_by_cluster(
    query_embedding=query_emb,
    chunk_embeddings=X_train,
    df_chunks=df_train,
    cluster_labels=labels_train,
    top_n_clusters=1,
    top_k_total=3,
    centroid_matrix=centroid_matrix,
    centroid_ids=centroid_ids
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time:.5f} seconds")
print("Cluster-based retrieval:")
print(top_chunks_cluster['chunk_embed_text'].tolist())

Runtime: 0.00597 seconds
Cluster-based retrieval:
['Rolling Stone reported that the music industry was urging them to return the money they earned for the concerts; a spokesperson for Beyonc√© later confirmed to The Huffington Post that she donated the money to the Clinton Bush Haiti Fund.', 'After Hurricane Katrina in 2005, Beyonc√© and Rowland founded the Survivor Foundation to provide transitional housing for victims in the Houston area, to which Beyonc√© contributed an initial $250,000.', 'Beyonc√© would later speak of her mother as the person who helped her fight it.']


In [30]:
# Same query as the cluster-based demo, but retrieved with retrieve_top_chunks_full
# (defined elsewhere in this notebook) for a runtime and result comparison.
start_time = time.time()

top_chunks_full = retrieve_top_chunks_full(
    query_embedding=query_emb,
    chunk_embeddings=X_train,
    df_chunks=df_train,
    top_k_chunks=3
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time:.5f} seconds")
print("Full retrieval:")
print(top_chunks_full['chunk_embed_text'].tolist())

Runtime: 0.26332 seconds
Full retrieval:
['Rolling Stone reported that the music industry was urging them to return the money they earned for the concerts; a spokesperson for Beyonc√© later confirmed to The Huffington Post that she donated the money to the Clinton Bush Haiti Fund.', 'After Hurricane Katrina in 2005, Beyonc√© and Rowland founded the Survivor Foundation to provide transitional housing for victims in the Houston area, to which Beyonc√© contributed an initial $250,000.', 'See: List of wealthiest foundations.']


# Evaluation of retrieval with cluster centroids vs. full search

In [79]:
# Load data

# The FULL and CENTROID evaluations work correctly with this (v4) dataset.

X_semantic_train = np.load("../data/tensors/squad_train_v4_semantic_chunking_l2_missing.npy")
df_semantic_train = pd.read_excel("../data/labelled/squad_train_v4_semantic_chunking_clustered_missing.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v4_queries_missing.xlsx")
# Prebuilt FAISS index read from disk; its row ids are used as positions into df_semantic_train.
index = faiss.read_index("../data/faiss/squad_train_v4_semantic_chunking_l2_faiss_hnsw2.index")

# Per-chunk cluster assignment taken from the "cluster" column.
labels_train = df_semantic_train["cluster"].values

In [19]:
# The FAISS evaluation will work correctly with this (v2) dataset.
# NOTE(review): this cell reassigns index, df_semantic_train and df_queries_train,
# which the v4 loading cell above also sets; which version is live for the
# benchmark depends on execution order — confirm before re-running.

import faiss
index = faiss.read_index("../data/faiss/squad_train_v2_semantic_chunking_l2_faiss_hnsw2.index")
df_semantic_train = pd.read_excel("../data/labelled/squad_train_v2_semantic_chunking_clustered_missing.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v2_queries_missing.xlsx")

In [80]:
df_semantic_train['cluster'].nunique()

500

In [81]:
df_queries_train['answer_start'].isna().sum()

np.int64(13134)

In [None]:
# X_semantic_train = X_semantic_train[:4000]
# df_semantic_train = df_semantic_train.iloc[:4000]

# df_queries_train = df_queries_train[df_queries_train["context_id"].isin(df_semantic_train["context_id"].unique())].reset_index(drop=True)

# labels_train = df_semantic_train["cluster"].values

In [82]:
# Benchmark: centroid-based vs. full vs. FAISS retrieval.
# Grid: every top_k is combined with every top_n_clusters for the centroid
# method; the full and FAISS methods do not use clusters, so they run once per top_k.
top_ks = [3, 5, 12, 25]
top_n_clusters = [5, 10, 20, 35]

results_centroid = []
results_full = []
results_faiss = []
for top_k in top_ks:
    for top_n_cluster in top_n_clusters:
        print(f"Evaluating: Top-{top_k} chunks in Top-{top_n_cluster} clusters")
        
        start_centroid = time.time()
        centroid_metrics = evaluate_top_k_accuracy(df_queries_train, X_semantic_train, df_semantic_train, labels_train, top_n_clusters=top_n_cluster, top_k_total=top_k)
        end_centroid = time.time()
        
        results_centroid.append({
            "top_k": top_k,
            "top_n_clusters": top_n_cluster,
            "centroid_metrics": centroid_metrics,
            "centroid_time": end_centroid - start_centroid
        })
        
        # Rewrite the results file after every configuration so partial
        # results are on disk if the run is interrupted.
        results_df_centroid = pd.DataFrame(results_centroid)
        results_df_centroid.to_excel("../data/results/hyperparameter_tuning_centroid_vs_full/centroid_results_kmeans500_v4_l2_final.xlsx")
        
    # Full retrieval baseline for this top_k.
    start_full = time.time()
    full_metrics = evaluate_top_k_accuracy_full(df_queries_train, X_semantic_train, df_semantic_train, top_k_chunks=top_k)
    end_full = time.time()
    results_full.append({
        "top_k": top_k,
        "full_metrics": full_metrics,
        "full_time": end_full - start_full
    })
    results_df_full = pd.DataFrame(results_full)
    results_df_full.to_excel("../data/results/hyperparameter_tuning_centroid_vs_full/full_results_kmeans500_v4_l2_final.xlsx")
    
    
    
    # FAISS retrieval for this top_k, using the index loaded from disk.
    start_faiss = time.time()
    faiss_metrics = evaluate_top_k_accuracy_faiss(df_queries_train, index, df_semantic_train, top_k_chunks=top_k)
    end_faiss = time.time()
    results_faiss.append({
        "top_k": top_k,
        "faiss_metrics": faiss_metrics,
        "faiss_time": end_faiss - start_faiss
    })
    results_df_faiss = pd.DataFrame(results_faiss)
    results_df_faiss.to_excel("../data/results/hyperparameter_tuning_centroid_vs_full/faiss_results_kmeans500_v4_l2_final_hnsw.xlsx")

Evaluating: Top-3 chunks in Top-5 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:25<00:00,  4.49it/s]


Evaluating: Top-3 chunks in Top-10 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:46<00:00,  4.29it/s]


Evaluating: Top-3 chunks in Top-20 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:01<00:00,  4.15it/s]


Evaluating: Top-3 chunks in Top-35 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:27<00:00,  3.94it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [14:40<00:00,  2.27it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:23<00:00,  4.51it/s]


Evaluating: Top-5 chunks in Top-5 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:47<00:00,  4.28it/s]


Evaluating: Top-5 chunks in Top-10 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:09<00:00,  4.09it/s]


Evaluating: Top-5 chunks in Top-20 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:22<00:00,  3.98it/s]


Evaluating: Top-5 chunks in Top-35 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:36<00:00,  3.87it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [14:35<00:00,  2.28it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:56<00:00,  4.19it/s]


Evaluating: Top-12 chunks in Top-5 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:54<00:00,  4.21it/s]


Evaluating: Top-12 chunks in Top-10 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:57<00:00,  4.19it/s]


Evaluating: Top-12 chunks in Top-20 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:15<00:00,  4.03it/s]


Evaluating: Top-12 chunks in Top-35 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:57<00:00,  3.72it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [15:20<00:00,  2.17it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:32<00:00,  4.42it/s]


Evaluating: Top-25 chunks in Top-5 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:58<00:00,  4.18it/s]


Evaluating: Top-25 chunks in Top-10 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:07<00:00,  4.11it/s]


Evaluating: Top-25 chunks in Top-20 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:29<00:00,  3.93it/s]


Evaluating: Top-25 chunks in Top-35 clusters


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:53<00:00,  3.75it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [14:52<00:00,  2.24it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:30<00:00,  4.44it/s]


In [13]:
len(np.unique(labels_train)) / 500

1.0

In [14]:
# Stand-alone full-retrieval evaluation for top_k = 25.
# NOTE(review): this appends to the existing results_full list, so running it
# after the benchmark loop above would add a second top_k=25 row — confirm intent.
start_full = time.time()
full_metrics = evaluate_top_k_accuracy_full(df_queries_train, X_semantic_train, df_semantic_train, top_k_chunks=25)
end_full = time.time()
results_full.append({
    "top_k": 25,
    "full_metrics": full_metrics,
    "full_time": end_full - start_full
})
results_df_full = pd.DataFrame(results_full)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 87599/87599 [3:28:17<00:00,  7.01it/s]   


In [16]:
results_df_full

Unnamed: 0,top_k,full_metrics,full_time
0,3,"{'doc_accuracy': 0.8036735579173279, 'chunk_ac...",10972.37997
1,5,"{'doc_accuracy': 0.8522357561159374, 'chunk_ac...",10506.523075
2,12,"{'doc_accuracy': 0.9135035788079773, 'chunk_ac...",11872.97477
3,25,"{'doc_accuracy': 0.9460039498167787, 'chunk_ac...",12497.922858


In [17]:
results_df_full.to_excel("../data/results/hyperparameter_tuning_centroid_vs_full/full_results.xlsx")

# Evaluate retrieval with MiniBatchKMeans

In [53]:
# Load data
X_semantic_train = np.load("../data/tensors/squad_train_v4_semantic_chunking_l2_missing.npy")
df_semantic_train = pd.read_excel("../data/prepared/squad_train_v4_semantic_chunking_missing.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v4_queries_missing.xlsx")

In [54]:
# --- Run ---
# NOTE(review): the output file name says "smallinitsize" although
# init_fraction is 0.5 here — confirm the name matches this configuration.
results_df = minibatchkmeans_retrieval_evaluation(
    chunk_embeddings=X_semantic_train,
    df_chunks=df_semantic_train,
    df_queries=df_queries_train,
    n_clusters=500,
    batch_size=2000,
    top_k_total=10,
    init_fraction=0.5, # 0.07 -> 5000 sample
    top_n_clusters=10
)
results_df.to_excel("../data/results/onlinekmeans_with_minibatchkmeans_v4_final_smallinitsize.xlsx")


üîß Using 50% of data (35713 samples) for initialization
‚úÖ Initialization done in 17.0099 s


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:06<00:00,  4.11it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:40<00:00,  4.34it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:49<00:00,  4.26it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:48<00:00,  4.27it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:50<00:00,  4.25it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:47<00:00,  4.28it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:46<00:00,  4.29it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:45<00:00,  4.29it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:52<00:00,  4.24it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:48<00:00,  4.27it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:52<00:00,  4.24it/s]]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:59<00:00,  4.17it/s] 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:53<00:00,  4.22it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:57<00:00,  

In [111]:
# Load data
X_semantic_train = np.load("../data/tensors/squad_train_v2_semantic_chunking_l2_missing.npy")
df_semantic_train = pd.read_excel("../data/prepared/squad_train_v2_semantic_chunking_missing.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v2_queries_missing.xlsx")

In [120]:
# Run the River-based evaluation and save the final results.
# The recorded output of this cell ends in a KeyboardInterrupt, i.e. that run was aborted.
results_df = river_retrieval_evaluation(
    chunk_embeddings=X_semantic_train,
    df_chunks=df_semantic_train,
    df_queries=df_queries_train,
    n_clusters=500,
    batch_size=2000,
    top_k_total=10,
    init_fraction=0.5,
    top_n_clusters=10
)

results_df.to_excel("../data/results/onlinekmeans_with_river_v2_final.xlsx")

üîß Using 50% of data (35713 samples) for initialization
‚öôÔ∏è Converting embeddings to River-friendly dict format...
‚úÖ Conversion done.


KeyboardInterrupt: 

# Evaluate retrieval with online clustering

In [55]:
# Load data
X_semantic_train = np.load("../data/tensors/squad_train_v4_semantic_chunking_l2_missing.npy")
df_semantic_train = pd.read_excel("../data/prepared/squad_train_v4_semantic_chunking_missing.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v4_queries_missing.xlsx")

In [56]:
# Run the KMeans-based evaluation with the same settings as the other
# clustering runs (500 clusters, batches of 2000, top-10 chunks from the
# top-10 clusters) and save the final results.
results_df = kmeans_retrieval_evaluation(
    chunk_embeddings=X_semantic_train,
    df_chunks=df_semantic_train,
    df_queries=df_queries_train,
    n_clusters=500,
    batch_size=2000,
    top_k_total=10,
    init_fraction=0.5,
    top_n_clusters=10)

results_df.to_excel("../data/results/onlinekmeans_with_kmeans_v4_final.xlsx")

üîß Using 50% of data (35713 samples) for initialization
‚úÖ Initialization done in 106.6367 s


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:35<00:00,  4.39it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:55<00:00,  4.21it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:56<00:00,  4.19it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:47<00:00,  4.28it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:49<00:00,  4.26it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:51<00:00,  4.24it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:56<00:00,  4.20it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:01<00:00,  4.16it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:09<00:00,  4.08it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:50<00:00,  4.25it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:57<00:00,  4.19it/s]]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [08:07<00:00,  4.10it/s]]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:54<00:00,  4.21it/s]]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:56<00:00, 

In [18]:
df_queries_train['context_id'].nunique(), df_semantic_train['context_id'].nunique()

(18891, 16058)

In [57]:
# Run the OnlineKMeans evaluation, starting from 360 clusters with an upper
# bound of 2000, and save the final results.
# The merge/decay/new-cluster thresholds are passed through unchanged to
# online_kmeans_retrieval_evaluation (defined elsewhere in this notebook).
results_df = online_kmeans_retrieval_evaluation(
    chunk_embeddings=X_semantic_train,
    df_chunks=df_semantic_train,
    df_queries=df_queries_train,
    n_clusters=360,
    max_clusters=2000,
    batch_size=2000,
    top_k_total=10,
    metric="cosine",
    init_fraction=0.5,
    merge_threshold=0.08,    
    decay=1.0,
    new_cluster_threshold=0.8,
    top_n_clusters=10
)

results_df.to_excel("../data/results/onlinekmeans_v4_from360clusters_final.xlsx")

üîß Using 50% of data (35713 samples) for initialization
‚úÖ Initialization done in 77.9709 s


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:40<00:00,  4.35it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:48<00:00,  4.27it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:50<00:00,  4.25it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:49<00:00,  4.26it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:46<00:00,  4.29it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:45<00:00,  4.30it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:54<00:00,  4.22it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:50<00:00,  4.25it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:50<00:00,  4.25it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:46<00:00,  4.28it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:55<00:00,  4.20it/s]]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:54<00:00,  4.21it/s] 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:51<00:00,  4.24it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:48<00:00,  

# Evaluate online FAISS

In [58]:
# Load data
X_semantic_train = np.load("../data/tensors/squad_train_v4_semantic_chunking_l2_missing.npy")
df_semantic_train = pd.read_excel("../data/prepared/squad_train_v4_semantic_chunking_missing.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v4_queries_missing.xlsx")

In [59]:
# Run the online FAISS evaluation. faiss_retrieval_evaluation already writes
# intermediate results to onlinekmeans_with_faiss_v4.xlsx after every batch;
# the final table is saved separately below.
results_df = faiss_retrieval_evaluation(
    chunk_embeddings=X_semantic_train,
    df_chunks=df_semantic_train,
    df_queries=df_queries_train,
    batch_size=2000,
    top_k_total=10,
    init_fraction=0.5,
    metric="cosine"
)

results_df.to_excel("../data/results/onlinekmeans_with_faiss_v4_final.xlsx")

üîß Using 50% of data (35713 samples) for FAISS initialization
‚úÖ FAISS index initialized with 35713 vectors in 10.3373s


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:14<00:00,  4.60it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:21<00:00,  4.53it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:15<00:00,  4.59it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:08<00:00,  4.66it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:15<00:00,  4.59it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:04<00:00,  4.71it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:08<00:00,  4.67it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:10<00:00,  4.64it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:15<00:00,  4.60it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:06<00:00,  4.69it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:08<00:00,  4.67it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:12<00:00,  4.63it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:09<00:00,  4.66it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [07:11<00:00,  4.