# Retrieval

This notebook contains the code for the retrival pipeline.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import time
import warnings

from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# Classes and functions

In [12]:
# Encoder model
model = SentenceTransformer("all-MiniLM-L6-v2")

In [13]:
def retrieve_top_chunks_by_cluster(
    query_embedding,
    chunk_embeddings,
    df_chunks,
    cluster_labels,
    top_n_clusters=2,
    top_k_total=5
):
    # Compute cluster centroids
    unique_clusters = np.unique(cluster_labels)
    cluster_centroids = {
        cid: chunk_embeddings[cluster_labels == cid].mean(axis=0)
        for cid in unique_clusters
    }

    centroid_matrix = np.vstack(list(cluster_centroids.values()))
    centroid_ids = list(cluster_centroids.keys())

    # Get top-N clusters by similarity to query
    cluster_sims = cosine_similarity([query_embedding], centroid_matrix)[0]
    top_n_idx = cluster_sims.argsort()[::-1][:top_n_clusters]
    selected_clusters = [centroid_ids[i] for i in top_n_idx]

    # Collect all chunks from selected clusters
    mask = np.isin(cluster_labels, selected_clusters)
    selected_chunk_embeddings = chunk_embeddings[mask]
    selected_df = df_chunks[mask].reset_index(drop=True)

    # Compute similarity for all these chunks
    sims = cosine_similarity([query_embedding], selected_chunk_embeddings)[0]

    # Get top-K chunks overall
    top_k_idx = sims.argsort()[::-1][:top_k_total]
    results = []

    for idx in top_k_idx:
        results.append({
            "cluster": cluster_labels[mask][idx],
            "context_id": selected_df.iloc[idx]["context_id"],
            "chunk_id": selected_df.iloc[idx]["chunk_id"],
            "title": selected_df.iloc[idx]["title"],
            "chunk_embed_text": selected_df.iloc[idx]["chunk_embed_text"],
            "chunk_start": selected_df.iloc[idx]["chunk_start"],
            "similarity": sims[idx]
        })

    return pd.DataFrame(results).sort_values("similarity", ascending=False).reset_index(drop=True)

In [14]:
def retrieve_top_chunks_full(
    query_embedding,
    chunk_embeddings,
    df_chunks,
    top_k_chunks=10
):
    sims = cosine_similarity([query_embedding], chunk_embeddings)[0]
    top_idx = sims.argsort()[::-1][:top_k_chunks]
    
    results = []
    for idx in top_idx:
        results.append({
            "context_id": df_chunks.iloc[idx]["context_id"],
            "chunk_id": df_chunks.iloc[idx]["chunk_id"],
            "title": df_chunks.iloc[idx]["title"],
            "chunk_embed_text": df_chunks.iloc[idx]["chunk_embed_text"],
            "chunk_start": df_chunks.iloc[idx]["chunk_start"],
            "similarity": sims[idx]
        })
    
    return pd.DataFrame(results).sort_values("similarity", ascending=False)


In [15]:
# ---------- Answer Containment ----------
def is_answer_in_chunk(chunk_text, answer_text):
    return answer_text.lower().strip() in chunk_text.lower()

In [16]:
def evaluate_top_k_accuracy(df_queries, chunk_embeddings, df_chunks, cluster_labels, top_n_clusters=2, top_k_total=5):
    correct_doc_id = 0
    correct_chunk_context = 0
    total = len(df_queries)
    
    for i, row in tqdm(df_queries.iterrows()):
        query_emb = model.encode([row["question"]])[0]
        results = retrieve_top_chunks_by_cluster(
            query_embedding=query_emb,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            cluster_labels=cluster_labels,
            top_n_clusters=top_n_clusters,
            top_k_total=top_k_total
        )
        

        found_doc_id = any(row["context_id"] == doc_id for doc_id in results["context_id"])
        if found_doc_id:
            correct_doc_id += 1

            found_chunk_context = any(is_answer_in_chunk(row["answer_text"], chunk) for chunk in results["chunk_embed_text"])
            if found_chunk_context:
                correct_chunk_context += 1

    accuracy_doc_id = correct_doc_id / total
    accuracy_chunk_context = correct_chunk_context / total
    return accuracy_doc_id, accuracy_chunk_context


In [17]:
def evaluate_top_k_accuracy_full(df_queries, chunk_embeddings, df_chunks, top_k_chunks=5):
    correct_doc_id = 0
    correct_chunk_context = 0
    total = len(df_queries)
    
    for i, row in tqdm(df_queries.iterrows()):
        query_emb = model.encode([row["question"]])[0]
        results = retrieve_top_chunks_full(
            query_embedding=query_emb,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            top_k_chunks=top_k_chunks
        )
        found_doc_id = any(row["context_id"] == doc_id for doc_id in results["context_id"])
        if found_doc_id:
            correct_doc_id += 1

            found_chunk_context = any(is_answer_in_chunk(row["answer_text"], chunk) for chunk in results["chunk_embed_text"])
            if found_chunk_context:
                correct_chunk_context += 1
    
    accuracy_doc_id = correct_doc_id / total
    accuracy_chunk_context = correct_chunk_context / total
    return accuracy_doc_id, accuracy_chunk_context


# Workflow

In [18]:
df_train = pd.read_excel("./data/labelled/squad_train_v2_semantic_chunking_clustered.xlsx")
df_val = pd.read_excel("./data/labelled/squad_val_v2_semantic_chunking_clustered.xlsx")
df_queries_train = pd.read_excel("./data/prepared/squad_train_v2_queries.xlsx")
df_queries_train = df_queries_train[df_queries_train["context_id"].isin(df_train["context_id"].unique())].reset_index(drop=True)

X_train = np.load("./data/labelled/squad_train_v2_semantic_chunking_clustered.npy")
X_val = np.load("./data/labelled/squad_val_v2_semantic_chunking_clustered.npy")
df_queries_val = pd.read_excel("./data/prepared/squad_val_v2_queries.xlsx")

labels_train = df_train["cluster"].values

In [19]:
df_train.shape

(2201, 10)

In [20]:
df_queries_train.shape

(2329, 4)

In [21]:
df_queries_train.loc[100, "question"]

'In what year did the team lead by Knute Rockne win the Rose Bowl?'

In [22]:
query = 'In what year did the team lead by Knute Rockne win the Rose Bowl?'
query_emb = model.encode([query])[0]

In [35]:
start_time = time.time()

top_chunks_cluster = retrieve_top_chunks_by_cluster(
    query_embedding=query_emb,
    chunk_embeddings=X_train,
    df_chunks=df_train,
    cluster_labels=labels_train,
    top_n_clusters=3,
    top_k_total=3
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time:.5f} seconds")
print("Cluster-based retrieval:")
print(top_chunks_cluster['chunk_embed_text'].tolist())

Runtime: 0.01423 seconds
Cluster-based retrieval:
["'96, Heisman Trophy winner Ty Detmer '90, and two-time Super Bowl winner Jim McMahon.", '23-year-old Candice Glover won the season with Kree Harrison taking the runner-up spot.', "BYU also claims notable professional football players including two-time NFL MVP and Super Bowl MVP and Pro Football Hall of Fame quarterback Steve Young '84 & J.D."]


In [36]:
start_time = time.time()

top_chunks_full = retrieve_top_chunks_full(
    query_embedding=query_emb,
    chunk_embeddings=X_train,
    df_chunks=df_train,
    top_k_chunks=3
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time:.5f} seconds")
print("Full retrieval:")
print(top_chunks_full['chunk_embed_text'].tolist())

Runtime: 0.00212 seconds
Full retrieval:
["'96, Heisman Trophy winner Ty Detmer '90, and two-time Super Bowl winner Jim McMahon.", '23-year-old Candice Glover won the season with Kree Harrison taking the runner-up spot.', "BYU also claims notable professional football players including two-time NFL MVP and Super Bowl MVP and Pro Football Hall of Fame quarterback Steve Young '84 & J.D."]


# Evaluation

In [21]:
# Use the validation set for evaluation
top_k = 5
cluster_acc_doc_id, cluster_acc_chunk_context = evaluate_top_k_accuracy(df_queries_train, X_train, df_train, labels_train, top_n_clusters=5, top_k_total=top_k)
full_acc_doc_id, full_acc_chunk_context = evaluate_top_k_accuracy_full(df_queries_train, X_train, df_train, top_k_chunks=top_k)

2329it [00:25, 90.28it/s] 
2329it [00:23, 100.22it/s]


In [22]:
print(f"Cluster-based:")
print(f"Doc accuracy: {cluster_acc_doc_id:.4f}, Chunk accuracy: {cluster_acc_chunk_context:.4f}")
print(f"Full retrieval:")
print(f"Doc accuracy: {full_acc_doc_id:.4f}, Chunk accuracy: {full_acc_chunk_context:.4f}")

Cluster-based:
Doc accuracy: 0.9373, Chunk accuracy: 0.0013
Full retrieval:
Doc accuracy: 0.9721, Chunk accuracy: 0.0017
