In [1]:
import json

import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np

In [2]:
# embedding model
model = SentenceTransformer("all-MiniLM-L12-v2")

In [3]:
# create vector store
from langchain_core.vectorstores import InMemoryVectorStore


class SentenceTransformerEmbeddings:
    def __init__(self, st_model):
        self.model = st_model

    def embed_documents(self, texts):
        return self.model.encode(texts)

    def embed_query(self, text):
        return self.model.encode(text)


# Create an instance of the embeddings class
embeddings = SentenceTransformerEmbeddings(model)
vector_store = InMemoryVectorStore(embeddings)

In [14]:
from pathlib import Path
from langchain_core.documents import Document

# load the dataset
txt_dir = Path("../data/txt")
qa_dir = Path("../data/generated")

document_chunks = []
doc_question_answers = {}

# load documents together with their question-answer pairs (and references)
for doc_file in txt_dir.iterdir():
    print(f"Processing {doc_file.name}...")
    if doc_file.suffix == ".txt":
        doc_name = doc_file.stem
        with open(doc_file, "r") as f:
            text = f.read()

        # load the corresponding queries
        qa_file = qa_dir / f"{doc_name}_qa_pairs_with_refs.json"
        if not qa_file.exists():
            print(f"Warning: {qa_file} does not exist. Skipping...")
            continue
        with open(qa_file, "r") as f:
            qa_pairs = json.load(f)
            qa_chunks = []
            for qa_pair in qa_pairs:
                question = qa_pair["question"]
                answer = qa_pair["answer"]
                chunk_start = qa_pair["reference"]["char_start"]
                chunk_end = qa_pair["reference"]["char_end"]
                chunk_id = qa_pair["reference"]["chunk_id"]
                chunk_text = text[chunk_start:chunk_end]
                # create a document chunk
                doc_chunk = Document(
                    id=f"{doc_name}_{chunk_id}",
                    page_content=chunk_text,
                    metadata={"doc_name": doc_name}
                )
                document_chunks.append(doc_chunk)
                
                # create a question-answer chunk
                qac = {
                    "question": question,
                    "answer": answer,
                    "chunk_text": chunk_text
                }
                qa_chunks.append(qac)
            doc_question_answers[doc_name] = qa_chunks

Processing DE000DFK0LN6.txt...
Processing DE000DK0EZN4_FT.txt...
Processing DE000HVB1VD1.txt...
Processing DE000LB3NU97.txt...
Processing DE000NLB26U6.txt...
Processing DE000NLB27B4.txt...
Processing DE000HLB2912.txt...
Processing DE000DK0T0W0.txt...
Processing DE000RLP1015.txt...
Processing DE000NLB3UQ5.txt...
Processing DE000A2AAVQ6.txt...
Processing DE000DG4UF19.txt...
Processing DE000HLB2WJ6.txt...
Processing DE000MHB4784.txt...
Processing DE000CB0F4C1.txt...
Processing DE000DDA0T84.txt...
Processing DE000LB1P7Y6.txt...
Processing DE000A3CLG08.txt...
Processing DE000LB2CJQ2.txt...
Processing DE000LB2CTL2.txt...
Processing rbi-basis-2018-final-terms-series-154-01072018.txt...
Processing DE000DK0E303_FT.txt...
Processing DE000HLB32J9.txt...
Processing DE000LB2ZX59.txt...
Processing DE000DDA0VN9.txt...
Processing DE000BLB9Q18.txt...
Processing DE000HLB3019.txt...
Processing rbi-basis-2014-final-terms-series-73-25092014.txt...
Processing DE000DDA0NU1.txt...
Processing DE000HLB7556.txt.

In [16]:
# add document chunks to the vector store
print(f"Adding {len(document_chunks)} document chunks to the vector store...")
vector_store.add_documents(document_chunks)

Adding 2372 document chunks to the vector store...


['DE000DFK0LN6_0',
 'DE000DFK0LN6_1',
 'DE000DFK0LN6_2',
 'DE000DFK0LN6_3',
 'DE000DFK0LN6_4',
 'DE000DFK0LN6_5',
 'DE000DK0EZN4_FT_0',
 'DE000DK0EZN4_FT_1',
 'DE000DK0EZN4_FT_2',
 'DE000DK0EZN4_FT_3',
 'DE000DK0EZN4_FT_4',
 'DE000DK0EZN4_FT_5',
 'DE000DK0EZN4_FT_6',
 'DE000DK0EZN4_FT_7',
 'DE000DK0EZN4_FT_8',
 'DE000DK0EZN4_FT_9',
 'DE000DK0EZN4_FT_10',
 'DE000DK0EZN4_FT_11',
 'DE000DK0EZN4_FT_12',
 'DE000DK0EZN4_FT_13',
 'DE000DK0EZN4_FT_14',
 'DE000DK0EZN4_FT_15',
 'DE000DK0EZN4_FT_16',
 'DE000DK0EZN4_FT_17',
 'DE000DK0EZN4_FT_18',
 'DE000DK0EZN4_FT_19',
 'DE000DK0EZN4_FT_20',
 'DE000DK0EZN4_FT_21',
 'DE000DK0EZN4_FT_22',
 'DE000DK0EZN4_FT_23',
 'DE000DK0EZN4_FT_24',
 'DE000DK0EZN4_FT_25',
 'DE000DK0EZN4_FT_26',
 'DE000DK0EZN4_FT_27',
 'DE000DK0EZN4_FT_28',
 'DE000DK0EZN4_FT_29',
 'DE000DK0EZN4_FT_30',
 'DE000DK0EZN4_FT_31',
 'DE000DK0EZN4_FT_32',
 'DE000DK0EZN4_FT_33',
 'DE000DK0EZN4_FT_34',
 'DE000DK0EZN4_FT_35',
 'DE000DK0EZN4_FT_36',
 'DE000DK0EZN4_FT_37',
 'DE000DK0EZN4_FT_38',

In [33]:
# helper function to check retrieval accuracy

def check_query_retrieval(query: str, vector_store: InMemoryVectorStore, expected_chunk_text: str, k: int = 5):
    """
    Check if the retrieval returns the expected chunk text.
    
    Args:
        query (str): The query to search for.
        vector_store (InMemoryVectorStore): The vector store to search in.
        expected_chunk_text (str): The expected chunk text to find.
        k (int): The number of results to return.
        
    Returns:
        bool: True if the expected chunk text is found, False otherwise.
    """
    results = vector_store.similarity_search(query=query, k=k)
    for doc in results:
        if expected_chunk_text in doc.page_content:
            return True
    return False

# function to check retrieval for all questions
def check_all_retrievals(doc_question_answers: dict, vector_store: InMemoryVectorStore, k=5):
    """
    Check retrieval for all questions in the document-question-answer pairs.
    
    Args:
        doc_question_answers: Dictionary containing document names and their question-answer pairs.
        vector_store: The vector store to search in.
        k: The number of results to return.
        
    Returns:
        dict: A dictionary with document names as keys and lists of booleans indicating retrieval success for each question.
    """
    retrieval_results = {}
    for qa_pairs in tqdm(doc_question_answers.values()):
        for qa_pair in qa_pairs:
            question = qa_pair["question"]
            answer = qa_pair["answer"]
            chunk_text = qa_pair["chunk_text"]
            success = check_query_retrieval(question, vector_store, chunk_text, k)
            retrieval_results[question] = {
                "success": success,
                "question": question,
                "answer": answer,
                "expected_chunk_text": chunk_text
            }
    return retrieval_results

# compute top-k retrieval accuracy
def compute_top_k_accuracy(retrieval_results: dict):
    """
    Compute the top-k retrieval accuracy.
    
    Args:
        retrieval_results: Dictionary with questions as keys and retrieval success as values.
        
    Returns:
        float: The top-k retrieval accuracy.
    """
    total_queries = len(retrieval_results)
    successful_retrievals = sum(v["success"] for v in retrieval_results.values())
    accuracy = successful_retrievals / total_queries
    return accuracy

In [34]:
# compute top-5 retrieval accuracy
retrieval_results_top5 = check_all_retrievals(doc_question_answers, vector_store, k=5)
top_k_accuracy = compute_top_k_accuracy(retrieval_results_top5)
print(f"Top-5 Retrieval Accuracy: {top_k_accuracy:.2%}")

100%|██████████| 113/113 [00:57<00:00,  1.98it/s]

Top-5 Retrieval Accuracy: 10.24%





In [35]:
# compute top-1 retrieval accuracy
retrieval_results_top1 = check_all_retrievals(doc_question_answers, vector_store, k=1)
top_k_accuracy = compute_top_k_accuracy(retrieval_results_top1)
print(f"Top-1 Retrieval Accuracy: {top_k_accuracy:.2%}")

100%|██████████| 113/113 [00:56<00:00,  2.00it/s]

Top-1 Retrieval Accuracy: 5.23%





In [38]:
import random
def show_random_retrievals(retrieval_results: dict, is_success: bool = True, n: int = 20):
    """
    Show random retrieval results.
    
    Args:
        retrieval_results: Dictionary with questions as keys and retrieval success as values.
        is_success: If True, show successful retrievals, otherwise show failed retrievals.
        n: The number of random results to show.
    """
    filtered_results = [v for v in retrieval_results.values() if v["success"] == is_success]
    random_results = random.sample(filtered_results, min(n, len(filtered_results)))
    
    for result in random_results:
        print(f"Question: {result['question']}")
        print(f"Answer: {result['answer']}")
        print(f"Expected Chunk Text: {result['expected_chunk_text']}\n")
        print("-" * 80)

In [39]:
# Show random successful retrievals
show_random_retrievals(retrieval_results_top1, is_success=True, n=20)

Question: What rights do Holders have regarding the Permanent Global Note according to the Swiss Federal Intermediated Securities Act?
Answer: Each Holder shall have a proportionate co-ownership interest in the Permanent Global Note to the extent of his claim against the Issuer.
Expected Chunk Text: he 
Swiss Principal Paying Agent with SIX SIS AG, Olten, 
Switzerland  or  any  other  intermediary  in  Switzerland 
recognised  for  such  purposes  by  the  SIX  Swiss 
Exchange  Ltd  ("SIX  SIS  AG"  or  any  such  other 
intermediary, the "Intermediary"). Once the Permanent 
Global  Note  is  deposited  with  the  Intermediary  and 
entered into the accounts of one or more participants of 

 

 

- 24 - 

 

 

 

stellen 

wurde, 

Teilnehmer 

der  Verwahrungsstelle 
mehrerer 
die 
gutgeschrieben 
Schuldverschreibungen 
Bucheffekten 
("Bucheffekten")  gemäß  den  Bestimmungen  des 
schweizerischen  Bucheffektengesetzes  dar.  Die 
Unterlagen  der  Verwahrungsstelle  bestimmen  die 


In [41]:
# Show random failed retrievals
show_random_retrievals(retrieval_results_top5, is_success=False, n=20)

Question: Which bank is responsible for managing the securities issuance mentioned in the document?
Answer: DZ BANK AG (Deutsche Zentral-Genossenschaftsbank, Frankfurt am Main) is responsible for managing the securities issuance.
Expected Chunk Text: L II/1: ZUSÄTZLICHE ANGABEN BEZOGEN AUF SCHULDVERSCHREIBUNGEN MIT EINER 

FESTGELEGTEN STÜCKELUNG VON MINDESTENS EUR 100.000 

nicht anwendbar 

ESSENTIAL INFORMATION 
GRUNDLEGENDE ANGABEN 

A. 
A. 
Interests of natural and legal persons involved in the issue / offer 
Interessen von Seiten natürlicher und juristischer Personen, die an 
der Emission / dem Angebot beteiligt sind 
  not applicable 
 
  Certain of the Dealers appointed under the Programme and their affiliates have engaged, and may 
in  future  engage,  in  investment  banking  and/or  commercial  banking  transactions  with,  and  may 
perform  services  for,  the  Issuer  in  the  ordinary  course  of  business.  Save  as  discussed  in  the 
previous sentence, so far as th