In [1]:
import sys
import os
sys.path.append(os.path.abspath('../..'))

from financerag.tasks import FinanceBench

import numpy as np 
import pandas as pd
import torch

# For retrieval
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from sentence_transformers import CrossEncoder
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chunkers import TableSplitter, SummarizeSplitter, KeyConceptSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings

warnings.filterwarnings('ignore')

## Read Data

In [3]:
# Instantiate the FinanceBench task. Its `queries` and `corpus` attributes feed
# the data-loading cells and its `evaluate` method scores the final rankings.
task = FinanceBench()

A Hugging Face repository is provided. This will override the data_folder, prefix, and *_file arguments.


In [4]:
# Mapping exposed by the task (presumably query id -> query text; verify
# against the FinanceBench loader).
queries = task.queries

# One row per query, indexed by the mapping's keys, with a single "query" column.
query_df = pd.DataFrame(
    {"query": list(queries.values())},
    index=list(queries.keys()),
)
query_df.shape

(150, 1)

In [5]:
documents = task.corpus

# Each corpus entry supplies a "title" and a "text" field. Prepend the title to
# the text (separated by a space) and keep only the combined "text" column, so
# a single column carries everything that is later chunked and scored.
documents_df = (
    pd.DataFrame(
        list(documents.values()),
        index=list(documents.keys()),
        columns=["title", "text"],
    )
    .assign(text=lambda frame: frame["title"] + " " + frame["text"])
    .drop(columns=["title"])
)

## Initialize Vector Store

In [6]:
# Splitters applied to every document when the index is built. Each splitter
# contributes its own chunks, tagged in the chunk metadata with its class name.
text_splitters = [RecursiveCharacterTextSplitter()]

In [7]:
# Embedding model handed to Chroma as its embedding function.
embedder = HuggingFaceEmbeddings(model_name="msmarco-distilbert-base-v4")

# On-disk location of the persisted Chroma collection.
persist_directory = ".chroma"

# Chunk every document with every configured splitter. Each chunk records the id
# of its parent document so retrieval hits can be mapped back to corpus entries.
# A comprehension is used so that no loop variable leaks into the notebook
# namespace (the loop variable was previously named `id`, which shadowed the
# builtin for every later cell).
docs = [
    Document(
        page_content=chunk,
        metadata={
            "id": str(doc_id),
            "splitter": type(splitter).__name__,
            "chunk_index": chunk_index,
        },
    )
    for doc_id, text in documents_df["text"].items()
    for splitter in text_splitters
    for chunk_index, chunk in enumerate(splitter.split_text(text))
]

# NOTE: the existence of the directory is the only cache validation. If the
# corpus, the splitters or the embedding model change, delete the directory so
# the store is rebuilt; otherwise the previously persisted index is loaded as is.
if os.path.exists(persist_directory):
    chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedder)
    print(f"Loaded existing ChromaDB from {persist_directory}")
else:
    chroma_db = Chroma.from_documents(
        documents=docs,
        embedding=embedder,
        persist_directory=persist_directory,
    )
    print(f"Created new ChromaDB and saved to {persist_directory}")

Created new ChromaDB and saved to .chroma


## Retrieve

In [8]:
# Wrap the vector store as a retriever; `k` = 100 results are requested per
# query. Results are chunks, so several of them can share one parent document.
retriever = chroma_db.as_retriever(search_kwargs={"k": 100})

In [9]:
# Placeholder frame: one row per query (same index as `query_df`), each row
# holding its own empty dict in the "Documents" column.
retrieved_df = pd.DataFrame(
    {"Documents": [dict() for _ in range(len(query_df.index))]},
    index=query_df.index,
)

In [10]:
# For every query, collect the parent documents of the retrieved chunks.
# Several chunks can belong to the same document, so using the document id as
# the dict key de-duplicates them; the value 1 is a placeholder score.
retrieved_results = {}
for idx, query in query_df["query"].items():
    hits = retriever.invoke(query)
    retrieved_results[idx] = {str(hit.metadata["id"]): 1 for hit in hits}

# Fill the column with a single assignment. Chained indexing of the form
# `retrieved_df.loc[idx]["Documents"] = value` assigns into a temporary row
# object and is not guaranteed to update the frame (it is a no-op under pandas
# Copy-on-Write), which would silently leave every row empty.
retrieved_df["Documents"] = [retrieved_results[idx] for idx in retrieved_df.index]

## Re-Rank

In [11]:
# Cross-encoder used to re-score (query, document text) pairs during re-ranking.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def sigmoid(x):
    """Apply the logistic function element-wise, mapping scores into [0, 1].

    Args:
        x: A Python or NumPy scalar, a NumPy array, or a ``torch.Tensor``.

    Returns:
        torch.Tensor: ``1 / (1 + exp(-x))`` for every element of ``x``.
    """
    # as_tensor reuses an existing tensor instead of copy-constructing it;
    # torch.tensor() would copy and emit a UserWarning when x is already a tensor.
    scores = torch.as_tensor(x)
    if not (scores.is_floating_point() or scores.is_complex()):
        # Integer/bool input: promote explicitly so the result is a float tensor.
        scores = scores.to(torch.get_default_dtype())
    # Built-in, numerically stable replacement for the hand-written formula.
    return torch.sigmoid(scores)

# Re-score every retrieved document against its query with the cross-encoder and
# order each query's candidates from most to least relevant.
# NOTE(review): the full parent document text is scored, not the retrieved
# chunk; long documents are presumably truncated by the model's tokenizer —
# confirm this is intended.
for idx, query in query_df["query"].items():
    candidate_ids = list(retrieved_results[idx])

    if candidate_ids:
        # Score all candidates of the query in a single predict() call so the
        # model works on batches instead of one forward pass per pair.
        pairs = [(query, documents_df.loc[doc_id, "text"]) for doc_id in candidate_ids]
        raw_scores = cross_encoder.predict(pairs)

        # Squash the raw scores into [0, 1] and store them per document.
        normalized_scores = sigmoid(raw_scores).tolist()
        for doc_id, normalized_score in zip(candidate_ids, normalized_scores):
            retrieved_results[idx][doc_id] = normalized_score

    # Highest score first; sorted() is stable, so ties keep their retrieval order.
    retrieved_results[idx] = dict(
        sorted(retrieved_results[idx].items(), key=lambda item: item[1], reverse=True)
    )

## Evaluate Retrieval

In [12]:
# Load the relevance judgments from a tab-separated file. The `query_id`,
# `corpus_id` and `score` columns are read when the qrels dictionary is built.
qrels = pd.read_csv('../../data/resources/finance_bench_qrels.tsv', sep='\t')

In [13]:
# Nested mapping passed to the evaluator: query_id -> {corpus_id: score}.
# Iterating the three columns in parallel avoids DataFrame.iterrows(), which
# builds a Series per row and does not preserve column dtypes.
qrels_dict = {}
for query_id, corpus_id, score in zip(qrels["query_id"], qrels["corpus_id"], qrels["score"]):
    qrels_dict.setdefault(query_id, {})[corpus_id] = score

In [14]:
# Rank cut-offs at which every metric is reported.
k_values = [5, 10, 30]
# `results` is read positionally when the metrics table is built:
# [0] holds "NDCG@k", [1] "MAP@k", [2] "Recall@k" and [3] "P@k" entries.
results = task.evaluate(qrels=qrels_dict, results=retrieved_results, k_values=k_values)

In [15]:
# Summary table: one row per cut-off in `k_values`, one column per metric.
# `results` is indexed by position: 0 = NDCG, 1 = MAP, 2 = Recall, 3 = Precision.
metrics_df = pd.DataFrame(
    {
        "MAP": [results[1][f"MAP@{k}"] for k in k_values],
        "NDCG": [results[0][f"NDCG@{k}"] for k in k_values],
        "P@K": [results[3][f"P@{k}"] for k in k_values],
        "R@K": [results[2][f"Recall@{k}"] for k in k_values],
    },
    index=k_values,
)

metrics_df

Unnamed: 0,MAP,NDCG,P@K,R@K
5,0.62426,0.68502,0.21333,0.83333
10,0.63167,0.69779,0.11111,0.86667
30,0.63561,0.71482,0.04074,0.93333
