In [1]:
import sys
import os
sys.path.append(os.path.abspath('../..'))

from financerag.tasks import FinDER

import numpy as np 
import pandas as pd
import torch

# For retrieval
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from sentence_transformers import CrossEncoder
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chunkers import TableSplitter, SummarizeSplitter, KeyConceptSplitter

# For generation
from langchain.vectorstores import Chroma
from langchain import hub
from langchain_openai import OpenAI
from langchain.agents import Tool, create_react_agent, AgentExecutor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings

warnings.filterwarnings('ignore')

## Read Data

In [3]:
task = FinDER()

A Hugging Face repository is provided. This will override the data_folder, prefix, and *_file arguments.


In [4]:
queries = task.queries
query_df = pd.DataFrame(queries.values(), index=queries.keys(), columns=["query"])

In [5]:
documents = task.corpus
documents_df = pd.DataFrame(documents.values(), index=documents.keys(), columns=["title", "text"])
documents_df["text"] = documents_df["title"] + " " + documents_df["text"]
documents_df.drop(columns=["title"], inplace=True)

## Initiliaze Database

In [6]:
text_splitter = RecursiveCharacterTextSplitter()

In [7]:
embedder = HuggingFaceEmbeddings(model_name="msmarco-distilbert-base-v4")

persist_directory = ".chroma"

docs = []

for id, text in documents_df.text.items():
    # Split the document into chunks
    chunks = text_splitter.split_text(text)
    
    for i, chunk in enumerate(chunks):
        doc = Document(page_content=chunk, metadata={"id": str(id), "chunk_index": i})
        docs.append(doc)

if os.path.exists(persist_directory):
    # Load the existing ChromaDB
    chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedder)
    print("Loaded existing ChromaDB from .chroma")
else:
    # Create ChromaDB and store the documents
    chroma_db = Chroma.from_documents(
        documents=docs,
        embedding=embedder,
        persist_directory=persist_directory,  
    )
    print("Created new ChromaDB and saved to .chroma")

Created new ChromaDB and saved to .chroma


## Retrieve

In [8]:
retriever = chroma_db.as_retriever(search_kwargs={"k": 100})

In [9]:
retrieved_df = pd.DataFrame([[{} ] for _ in query_df.index], index=query_df.index, columns=["Documents"])

In [10]:
for idx, query in query_df["query"].items():

    retrieved = retriever.invoke(query)

    retrieved = {
        str(doc.metadata["id"]):  1
        for doc in retrieved
    }
    retrieved_df.loc[idx]["Documents"] = retrieved

retrieved_results = retrieved_df["Documents"].to_dict()

## Re-Rank

In [11]:
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def sigmoid(x):
    return 1 / (1 + torch.exp(-torch.tensor(x)))

In [12]:
for idx, query in query_df["query"].items():
    for doc_id in retrieved_results[idx]:

        raw_score = cross_encoder.predict((query, documents_df.loc[doc_id].text))
        normalized_score = sigmoid(raw_score).item()

        retrieved_results[idx][doc_id] = normalized_score
    
    retrieved_results[idx] = dict(sorted(retrieved_results[idx].items(), key=lambda item: item[1], reverse=True))

## Evaluate Retrieval

In [13]:
qrels = pd.read_csv('../../data/resources/finder_qrels.tsv', sep='\t')

In [14]:
qrels_dict = {}
for index, row in qrels.iterrows():
    key = row['query_id']
    if key not in qrels_dict:
        qrels_dict[key] = {}
    qrels_dict[key][row['corpus_id']] = row['score']

In [15]:
k_values = [10, 50, 100]
results = task.evaluate(qrels=qrels_dict, results=retrieved_results, k_values=k_values)

In [16]:
metrics_df = pd.DataFrame(index=k_values, columns=["MAP", "NDCG", "P@K", "R@K"])

metrics_df["MAP"] = [results[1][f"MAP@{k}"] for k in k_values]
metrics_df["NDCG"] = [results[0][f"NDCG@{k}"] for k in k_values]
metrics_df["P@K"] = [results[3][f"P@{k}"] for k in k_values]
metrics_df["R@K"] = [results[2][f"Recall@{k}"] for k in k_values]

metrics_df

Unnamed: 0,MAP,NDCG,P@K,R@K
10,0.23395,0.2732,0.04844,0.36953
50,0.24841,0.32962,0.01688,0.59609
100,0.24956,0.33817,0.00953,0.63672


WİTH ONLY RETRİEVAL

| K   | MAP     | NDCG    | P@K     | R@K    |
|-----|---------|---------|---------|--------|
| 10  | 0.00781 | 0.00986 | 0.00156 | 0.01562|
| 50  | 0.01750 | 0.06159 | 0.00656 | 0.24844|
| 100 | 0.02224 | 0.11573 | 0.00781 | 0.5609 |

WİTH RE-RANKİNG - NO CHUNKİNG

| K   | MAP     | NDCG    | P@K     | R@K    |
|-----|---------|---------|---------|--------|
| 10  | 0.22589 | 0.25574 | 0.04063 | 0.32812|
| 50  | 0.23863 | 0.30929 | 0.01469 | 0.54531|
| 100 | 0.23915 | 0.31293 | 0.00781 | 0.56094|

WİTH RECURSİVE CHUNKİNG

| K   | MAP     | NDCG    | P@K     | R@K    |
|-----|---------|---------|---------|--------|
| 10  | 0.24140 | 0.27449 | 0.04375 | 0.35938|
| 50  | 0.25273 | 0.32490 | 0.01500 | 0.56484|
| 100 | 0.25273 | 0.32490 | 0.00750 | 0.56484|

With titles concatanated to tbe texts beginning

| K   | MAP     | NDCG    | P@K     | R@K    |
|-----|---------|---------|---------|--------|
| 10  | 0.30675 | 0.34873 | 0.06563 | 0.44219|
| 50  | 0.32086 | 0.40100 | 0.01938 | 0.65234|
| 100 | 0.32086 | 0.40100 | 0.00969 | 0.65234|

## Generate

In [17]:
# Helper Methods

def format_retrieved_docs(docs):
    docs = docs[:5]
    if docs:
        return "\n\n".join([f"\n{doc.page_content}" for doc, score in docs])
    else:
        return "No relevant documents found."
    
def re_rank_docs(query, docs, cross_encoder):
    re_ranked_docs = []
    for doc in docs:
        raw_score = cross_encoder.predict((query, doc.page_content))
        normalized_score = sigmoid(raw_score).item()
        re_ranked_docs.append((doc, normalized_score))
    return sorted(re_ranked_docs, key=lambda item: item[1], reverse=True)

def retrieve_action(query, retriever, cross_encoder):
    retrieved = retriever.invoke(query)
    re_ranked = re_rank_docs(query, retrieved, cross_encoder)
    formatted_docs = format_retrieved_docs(re_ranked)
    return formatted_docs


In [18]:
# Wrap the retrieval tool
retrieve_tool = Tool(
    name="Document Retriever",
    func=lambda query: retrieve_action(query, retriever, cross_encoder),
    description="Retrieve documents relevant to the query."
)

# Step 5: Create the ReAct Agent using the LLM and retrieval tool
tools = [retrieve_tool]

prompt = hub.pull("hwchase17/react")


# Choose the LLM to use
llm = OpenAI()

# Construct the ReAct agent
agent = create_react_agent(llm, tools, prompt)

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
agent_executor.invoke({"input": query_df.iloc[10]["query"]})