# Retrieval

This notebook contains the code for the retrieval pipeline.

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import time
import warnings

from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

warnings.filterwarnings("ignore")

# Classes and functions

In [24]:
def retrieve_top_chunks_by_cluster(
    query_embedding,
    chunk_embeddings,
    df_chunks,
    cluster_labels,
    top_n_clusters=2,
    top_k_chunks=5
):
    """Two-stage retrieval: pick the closest clusters, then the closest chunks in them.

    Cluster centroids (mean embedding per label) are recomputed on every call.
    The query is compared with each centroid by cosine similarity, the
    ``top_n_clusters`` most similar clusters are kept, and within each of them
    the ``top_k_chunks`` most similar chunks are returned. Rows are not
    de-duplicated: if ``df_chunks`` contains the same chunk text on several
    rows, each of those rows can appear in the result.

    Parameters
    ----------
    query_embedding : 1-D array-like
        Embedding vector of the query.
    chunk_embeddings : 2-D array-like
        One embedding per row; row ``i`` corresponds to ``df_chunks.iloc[i]``
        and ``cluster_labels[i]``.
    df_chunks : pandas.DataFrame
        Must provide the columns ``context_id``, ``chunk_embed_text``,
        ``question`` and ``answer_text``.
    cluster_labels : 1-D array-like
        Cluster id of every row of ``chunk_embeddings``.
    top_n_clusters : int, default 2
        Number of clusters to search.
    top_k_chunks : int, default 5
        Maximum number of chunks taken from each selected cluster.

    Returns
    -------
    pandas.DataFrame
        Columns ``cluster``, ``context_id``, ``chunk_embed_text``, ``question``,
        ``answer_text`` and ``similarity``, sorted by ``similarity`` descending.
        Holds at most ``top_n_clusters * top_k_chunks`` rows and is empty (but
        still has these columns) when nothing is selected.
    """
    result_columns = [
        "cluster", "context_id", "chunk_embed_text",
        "question", "answer_text", "similarity",
    ]

    # The boolean masks below rely on element-wise comparison, which plain
    # Python lists do not provide.
    chunk_embeddings = np.asarray(chunk_embeddings)
    cluster_labels = np.asarray(cluster_labels)

    # Compute cluster centroids (np.unique returns the distinct ids, sorted)
    centroid_ids = np.unique(cluster_labels)
    centroid_matrix = np.vstack([
        chunk_embeddings[cluster_labels == cid].mean(axis=0)
        for cid in centroid_ids
    ])

    # Top-N relevant clusters
    cluster_sims = cosine_similarity([query_embedding], centroid_matrix)[0]
    top_n_idx = cluster_sims.argsort()[::-1][:top_n_clusters]
    selected_clusters = [centroid_ids[i] for i in top_n_idx]

    # Top-K chunks within selected clusters
    results = []
    for cid in selected_clusters:
        mask = cluster_labels == cid
        sims = cosine_similarity([query_embedding], chunk_embeddings[mask])[0]
        # Slicing already clamps to the cluster size, so small clusters simply
        # contribute fewer than top_k_chunks rows.
        top_k_idx = sims.argsort()[::-1][:top_k_chunks]
        # Map positions inside the cluster back to positions in the full table.
        chunk_indices = np.where(mask)[0][top_k_idx]
        for idx, sim in zip(chunk_indices, sims[top_k_idx]):
            row = df_chunks.iloc[idx]  # look the row up once, not per column
            results.append({
                "cluster": cid,
                "context_id": row["context_id"],
                "chunk_embed_text": row["chunk_embed_text"],
                "question": row["question"],
                "answer_text": row["answer_text"],
                "similarity": sim
            })

    # Passing explicit columns keeps sort_values from raising KeyError when
    # `results` is empty.
    return pd.DataFrame(results, columns=result_columns).sort_values("similarity", ascending=False)


In [25]:
def retrieve_top_chunks_full(
    query_embedding,
    chunk_embeddings,
    df_chunks,
    top_k_chunks=10
):
    """Exhaustive retrieval: rank every chunk by cosine similarity to the query.

    Rows are not de-duplicated: if ``df_chunks`` contains the same chunk text
    on several rows, each of those rows can appear in the result.

    Parameters
    ----------
    query_embedding : 1-D array-like
        Embedding vector of the query.
    chunk_embeddings : 2-D array-like
        One embedding per row; row ``i`` corresponds to ``df_chunks.iloc[i]``.
    df_chunks : pandas.DataFrame
        Must provide the columns ``context_id``, ``chunk_embed_text``,
        ``question`` and ``answer_text``.
    top_k_chunks : int, default 10
        Number of best-matching chunks to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``context_id``, ``chunk_embed_text``, ``question``,
        ``answer_text`` and ``similarity``, sorted by ``similarity`` descending.
        Empty (but still with these columns) when nothing is selected.
    """
    result_columns = [
        "context_id", "chunk_embed_text", "question", "answer_text", "similarity",
    ]

    sims = cosine_similarity([query_embedding], chunk_embeddings)[0]
    # argsort is ascending, so reverse it before taking the first k positions.
    top_idx = sims.argsort()[::-1][:top_k_chunks]

    results = []
    for idx in top_idx:
        row = df_chunks.iloc[idx]  # look the row up once, not per column
        results.append({
            "context_id": row["context_id"],
            "chunk_embed_text": row["chunk_embed_text"],
            "question": row["question"],
            "answer_text": row["answer_text"],
            "similarity": sims[idx]
        })

    # Passing explicit columns keeps sort_values from raising KeyError when
    # `results` is empty (e.g. top_k_chunks=0).
    return pd.DataFrame(results, columns=result_columns).sort_values("similarity", ascending=False)


In [49]:
def is_answer_in_chunk(answer, chunk_text):
    """Return True when ``answer`` occurs as a substring of ``chunk_text``.

    The match is exact and case-sensitive. Missing values (``None``, NaN,
    ``pd.NA``) on either side yield False instead of raising ``TypeError``,
    and other non-string values are compared through their ``str()`` form.
    """
    if pd.isna(answer) or pd.isna(chunk_text):
        return False
    return str(answer) in str(chunk_text)

In [58]:
def evaluate_top_k_accuracy(df_queries, chunk_embeddings, df_chunks, cluster_labels, top_n_clusters=2, top_k_chunks=5, encoder=None):
    """Measure how often cluster-based retrieval returns a chunk containing the answer.

    Each row's ``question`` is embedded and passed to
    ``retrieve_top_chunks_by_cluster``; the query counts as correct when its
    ``answer_text`` is found in at least one retrieved ``chunk_embed_text``.

    Parameters
    ----------
    df_queries : pandas.DataFrame
        Must provide the columns ``question`` and ``answer_text``.
    chunk_embeddings, df_chunks, cluster_labels, top_n_clusters, top_k_chunks
        Forwarded unchanged to ``retrieve_top_chunks_by_cluster``.
    encoder : object with an ``encode(list_of_str)`` method, optional
        Model used to embed the questions. When omitted, the notebook-level
        global ``model`` is used, so that variable must exist before calling.

    Returns
    -------
    float
        Fraction of queries answered, or NaN when ``df_queries`` is empty.
        The value is also printed.
    """
    if encoder is None:
        encoder = model  # notebook global, resolved at call time

    correct = 0
    total = len(df_queries)

    for _, row in df_queries.iterrows():
        query_emb = encoder.encode([row["question"]])[0]
        results = retrieve_top_chunks_by_cluster(
            query_embedding=query_emb,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            cluster_labels=cluster_labels,
            top_n_clusters=top_n_clusters,
            top_k_chunks=top_k_chunks
        )

        if any(is_answer_in_chunk(row["answer_text"], chunk) for chunk in results["chunk_embed_text"]):
            correct += 1

    # Accuracy over zero queries is undefined; report NaN rather than dividing by zero.
    accuracy = correct / total if total else float("nan")
    print(f"Top-{top_k_chunks} cluster-based retrieval accuracy: {accuracy:.4f}")
    return accuracy


In [59]:
def evaluate_top_k_accuracy_full(df_queries, chunk_embeddings, df_chunks, top_k_chunks=5, encoder=None):
    """Measure how often exhaustive retrieval returns a chunk containing the answer.

    Each row's ``question`` is embedded and passed to
    ``retrieve_top_chunks_full``; the query counts as correct when its
    ``answer_text`` is found in at least one retrieved ``chunk_embed_text``.

    Parameters
    ----------
    df_queries : pandas.DataFrame
        Must provide the columns ``question`` and ``answer_text``.
    chunk_embeddings, df_chunks, top_k_chunks
        Forwarded unchanged to ``retrieve_top_chunks_full``.
    encoder : object with an ``encode(list_of_str)`` method, optional
        Model used to embed the questions. When omitted, the notebook-level
        global ``model`` is used, so that variable must exist before calling.

    Returns
    -------
    float
        Fraction of queries answered, or NaN when ``df_queries`` is empty.
        The value is also printed.
    """
    if encoder is None:
        encoder = model  # notebook global, resolved at call time

    correct = 0
    total = len(df_queries)

    for _, row in df_queries.iterrows():
        query_emb = encoder.encode([row["question"]])[0]
        results = retrieve_top_chunks_full(
            query_embedding=query_emb,
            chunk_embeddings=chunk_embeddings,
            df_chunks=df_chunks,
            top_k_chunks=top_k_chunks
        )
        if any(is_answer_in_chunk(row["answer_text"], chunk) for chunk in results["chunk_embed_text"]):
            correct += 1

    # Accuracy over zero queries is undefined; report NaN rather than dividing by zero.
    accuracy = correct / total if total else float("nan")
    print(f"Top-{top_k_chunks} full retrieval accuracy: {accuracy:.4f}")
    return accuracy


# Workflow

In [9]:
# Load the clustered chunk tables for the train and validation splits.
df_train = pd.read_excel("./data/labelled/squad_train_v1_semantic_chunking_clustered.xlsx")
df_val = pd.read_excel("./data/labelled/squad_val_v1_semantic_chunking_clustered.xlsx")

# Arrays saved next to the tables. X_train is later passed as `chunk_embeddings`
# together with df_train, so it is presumably row-aligned with it — TODO confirm
# against the notebook that wrote these files.
X_train = np.load("./data/labelled/squad_train_v1_semantic_chunking_clustered.npy")
X_val = np.load("./data/labelled/squad_val_v1_semantic_chunking_clustered.npy")

# Cluster id of every training row, as a NumPy array.
labels_train = df_train["cluster"].values

In [11]:
# Look up the question stored at index label 100 of the training table.
df_train.loc[100, "question"]

'Which prize did Frederick Buechner create?'

In [12]:
# Sentence-embedding model. The evaluate_top_k_accuracy* functions read this
# global `model`, so this cell has to run before they are called.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Same text as df_train.loc[100, "question"].
query = 'Which prize did Frederick Buechner create?'
# encode() is given a one-element list, so take the single resulting embedding.
query_emb = model.encode([query])[0]

In [64]:
# Time one cluster-based retrieval for the example query.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is a wall clock and can be too coarse here.
start_time = time.perf_counter()

top_chunks_cluster = retrieve_top_chunks_by_cluster(
    query_embedding=query_emb,
    chunk_embeddings=X_train,
    df_chunks=df_train,
    cluster_labels=labels_train,
    top_n_clusters=1,
    top_k_chunks=3
)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time:.5f} seconds")
print("Cluster-based retrieval:")
print(top_chunks_cluster['chunk_embed_text'].tolist())

Runtime: 0.06483 seconds
Cluster-based retrieval:
['From Germany came Anton-Hermann Chroust (1907–1982) in classics and law, and Waldemar Gurian a German Catholic intellectual of Jewish descent.', 'From Germany came Anton-Hermann Chroust (1907–1982) in classics and law, and Waldemar Gurian a German Catholic intellectual of Jewish descent.', 'From Germany came Anton-Hermann Chroust (1907–1982) in classics and law, and Waldemar Gurian a German Catholic intellectual of Jewish descent.']


In [42]:
# Time one exhaustive retrieval for the example query.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is a wall clock and can be too coarse here.
start_time = time.perf_counter()

top_chunks_full = retrieve_top_chunks_full(
    query_embedding=query_emb,
    chunk_embeddings=X_train,
    df_chunks=df_train,
    top_k_chunks=6
)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time:.5f} seconds")
print("Full retrieval:")
print(top_chunks_full['chunk_embed_text'].tolist())

Runtime: 0.00651 seconds
Full retrieval:
['The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching.', 'The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching.', 'The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching.', 'The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching.', 'The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catho

# Evaluation

In [63]:
# NOTE(review): this cell evaluates on the TRAINING set, not the validation set:
# queries come from df_train and retrieval searches X_train / df_train.
# df_val and X_val are loaded above but not used here.
# Queries are the rows flagged answer_in_chunk == True.
df_queries = df_train[df_train['answer_in_chunk']==True].reset_index(drop=True)
top_k = 5
# Compare both retrieval strategies with the same top_k on the same queries.
accuracy_cluster = evaluate_top_k_accuracy(df_queries, X_train, df_train, labels_train, top_n_clusters=2, top_k_chunks=top_k)
accuracy_full = evaluate_top_k_accuracy_full(df_queries, X_train, df_train, top_k_chunks=top_k)

Top-5 cluster-based retrieval accuracy: 0.5143
Top-5 full retrieval accuracy: 0.5018


In [67]:
# Load the prepared chunk table to inspect the rows around a retrieved chunk.
df_squad = pd.read_excel("./data/prepared/squad_train_v1_semantic_chunking.xlsx")

In [71]:
# Show the rows with index 1212 through 1231 (both comparisons are strict).
df_squad[(df_squad.index > 1211) & (df_squad.index < 1232)]

Unnamed: 0,context_id,chunk_id,chunk_text,chunk_embed_text,chunk_start,chunk_end,question,answer_text,answer_start,answer_in_chunk,coref,chunking_type
1212,180,180_1,From Germany came Anton-Hermann Chroust (1907–...,From Germany came Anton-Hermann Chroust (1907–...,157,299,What caused many intellectual Catholics to lea...,The rise of Hitler and other dictators,0,False,False,semantic_chunks
1213,180,180_2,Positivism dominated American intellectual lif...,Positivism dominated American intellectual lif...,300,490,What caused many intellectual Catholics to lea...,The rise of Hitler and other dictators,0,False,False,semantic_chunks
1214,180,180_3,"Ivan Meštrović (1883–1962), a renowned sculpto...","Ivan Meštrović (1883–1962), a renowned sculpto...",491,584,What caused many intellectual Catholics to lea...,The rise of Hitler and other dictators,0,False,False,semantic_chunks
1215,180,180_4,"Yves Simon (1903–61), brought to ND in the 194...","Yves Simon (1903–61), brought to ND in the 194...",585,796,What caused many intellectual Catholics to lea...,The rise of Hitler and other dictators,0,False,False,semantic_chunks
1216,181,181_0,The rise of Hitler and other dictators in the ...,The rise of Hitler and other dictators in the ...,0,156,From where did Anton-Hermann Chroust come to r...,Germany,162,False,False,semantic_chunks
1217,181,181_1,From Germany came Anton-Hermann Chroust (1907–...,From Germany came Anton-Hermann Chroust (1907–...,157,299,From where did Anton-Hermann Chroust come to r...,Germany,162,True,False,semantic_chunks
1218,181,181_2,Positivism dominated American intellectual lif...,Positivism dominated American intellectual lif...,300,490,From where did Anton-Hermann Chroust come to r...,Germany,162,False,False,semantic_chunks
1219,181,181_3,"Ivan Meštrović (1883–1962), a renowned sculpto...","Ivan Meštrović (1883–1962), a renowned sculpto...",491,584,From where did Anton-Hermann Chroust come to r...,Germany,162,False,False,semantic_chunks
1220,181,181_4,"Yves Simon (1903–61), brought to ND in the 194...","Yves Simon (1903–61), brought to ND in the 194...",585,796,From where did Anton-Hermann Chroust come to r...,Germany,162,False,False,semantic_chunks
1221,182,182_0,The rise of Hitler and other dictators in the ...,The rise of Hitler and other dictators in the ...,0,156,What field of study did Anton-Hermann Chroust ...,classics and law,212,False,False,semantic_chunks


In [66]:
# List every training row whose chunk text equals the chunk returned by the
# cluster-based retrieval example. The same text appears on several rows, each
# paired with a different question, which is why the retrieval output repeats it.
df_train[df_train['chunk_embed_text']=='From Germany came Anton-Hermann Chroust (1907–1982) in classics and law, and Waldemar Gurian a German Catholic intellectual of Jewish descent.']

Unnamed: 0,context_id,chunk_id,chunk_text,chunk_embed_text,chunk_start,chunk_end,question,answer_text,answer_start,answer_in_chunk,coref,chunking_type,cluster
1212,180,180_1,From Germany came Anton-Hermann Chroust (1907–...,From Germany came Anton-Hermann Chroust (1907–...,157,299,What caused many intellectual Catholics to lea...,The rise of Hitler and other dictators,0,False,False,semantic_chunks,14
1217,181,181_1,From Germany came Anton-Hermann Chroust (1907–...,From Germany came Anton-Hermann Chroust (1907–...,157,299,From where did Anton-Hermann Chroust come to r...,Germany,162,True,False,semantic_chunks,14
1222,182,182_1,From Germany came Anton-Hermann Chroust (1907–...,From Germany came Anton-Hermann Chroust (1907–...,157,299,What field of study did Anton-Hermann Chroust ...,classics and law,212,True,False,semantic_chunks,14
1227,183,183_1,From Germany came Anton-Hermann Chroust (1907–...,From Germany came Anton-Hermann Chroust (1907–...,157,299,Who did Waldemar Gurian receive his tutelage u...,Max Scheler,478,False,False,semantic_chunks,14
1232,184,184_1,From Germany came Anton-Hermann Chroust (1907–...,From Germany came Anton-Hermann Chroust (1907–...,157,299,What was Ivan Meštrović known for being?,a renowned sculptor,519,False,False,semantic_chunks,14
