In [2]:
import sys
import os
sys.path.append(os.path.abspath('../..'))

from financerag.tasks import TATQA, MultiHiertt

import numpy as np 
import pandas as pd
import torch

# For retrieval
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from sentence_transformers import CrossEncoder
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chunkers import TableExtractor
# For generation
from langchain.vectorstores import Chroma
from langchain import hub
from langchain_openai import OpenAI
from langchain.agents import Tool, create_react_agent, AgentExecutor
from typing import Optional, Union, List, Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import warnings

# Suppress every warning category for the rest of the session. This keeps cell
# output short, but it also hides any deprecation notices that would be raised.
warnings.filterwarnings('ignore')

## Read Data

In [5]:
# Instantiate the TAT-QA task with its default arguments; its `queries` and
# `corpus` attributes are read in the following cells.
task = TATQA()

A Hugging Face repository is provided. This will override the data_folder, prefix, and *_file arguments.


In [6]:
# Turn the {query_id: query_text} mapping into a one-column frame indexed by id.
queries = task.queries
query_ids = list(queries)
query_texts = list(queries.values())
query_df = pd.DataFrame(query_texts, index=query_ids, columns=["query"])
query_df.shape

(1663, 1)

In [8]:
# Turn the corpus mapping into a frame indexed by corpus id, keeping only the
# "title" and "text" fields of each entry.
documents = task.corpus
corpus_ids = list(documents)
corpus_records = list(documents.values())
documents_df = pd.DataFrame(corpus_records, index=corpus_ids, columns=["title", "text"])
documents_df.shape

(2756, 2)

## Initialize Database

In [7]:
import json

# Tables looked up by stringified corpus id when the chunk metadata is built.
# A context manager closes the file handle deterministically, and the explicit
# encoding avoids depending on the platform default.
with open("tables.json", encoding="utf-8") as tables_file:
    tables = json.load(tables_file)

In [8]:
def extract_paragraph(text):
    """Return the prose lines of ``text`` with table markup removed.

    A line is treated as table markup and dropped when it either contains a
    pipe (``|``) or is a horizontal rule made only of dashes, colons and
    spaces (for example ``-----``). Every other line is kept unchanged, so
    ordinary sentences containing hyphenated words such as "non-current" are
    no longer discarded.

    Args:
        text: Document text with lines separated by ``"\\n"``.

    Returns:
        The remaining lines joined with ``"\\n"``, in their original order.
    """
    prose_lines = []
    for line in text.split("\n"):
        if "|" in line:
            continue  # table row, header row or piped separator row
        stripped = line.strip()
        if "-" in stripped and not stripped.strip("-: "):
            continue  # rule consisting solely of dashes / colons / spaces
        prose_lines.append(line)
    return "\n".join(prose_lines)

In [9]:
# Embedding model used both to index the chunks and to embed queries.
embedder = HuggingFaceEmbeddings(model_name="msmarco-distilbert-base-v4")

persist_directory = ".chroma"

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=32,
)

# Build one Document per text chunk. Each chunk records the id of the corpus
# entry it came from and, when a table exists for that id, the table as JSON.
docs = []
for doc_id, text in documents_df.text.items():
    doc_key = str(doc_id)
    # Serialise the table once per document instead of once per chunk. A
    # missing table is an expected case, so it is tested for explicitly
    # rather than caught with a blanket `except`.
    table_json = json.dumps(tables[doc_key]) if doc_key in tables else None

    paragraph_text = extract_paragraph(text)
    for chunk_index, chunk in enumerate(text_splitter.split_text(paragraph_text)):
        metadata = {"id": doc_key, "chunk_index": chunk_index}
        if table_json is not None:
            metadata["table"] = table_json
        docs.append(Document(page_content=chunk, metadata=metadata))

# Reuse the persisted index when the directory is present; otherwise embed all
# chunks and persist the new index. NOTE: an existing directory is loaded
# as-is, so delete it to re-index after changing the chunking or the embedder.
if os.path.exists(persist_directory):
    chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedder)
    print("Loaded existing ChromaDB from .chroma")
else:
    chroma_db = Chroma.from_documents(
        documents=docs,
        embedding=embedder,
        persist_directory=persist_directory,
    )
    print("Created new ChromaDB and saved to .chroma")


Loaded existing ChromaDB from .chroma


## Retrieve

In [10]:
# Retriever view over the vector store; `k=100` is passed as a search argument,
# i.e. up to 100 matches are requested per query. Several matches may carry
# the same document id in their metadata.
retriever = chroma_db.as_retriever(search_kwargs={"k": 100})

In [11]:
# One row per query, each starting with its own empty result dict.
empty_results = [{} for _ in query_df.index]
retrieved_df = pd.DataFrame({"Documents": empty_results}, index=query_df.index)

In [None]:
# Collect, for every query, the ids of the documents whose chunks were retrieved.
retrieved_results = {}
for idx, query in query_df["query"].items():
    hits = retriever.invoke(query)

    # Several chunks may share a document id; the dict collapses them into a
    # single entry. The value 1 is only a placeholder score at this stage.
    retrieved_results[idx] = {str(doc.metadata["id"]): 1 for doc in hits}

# Mirror the results into the frame with a single whole-column assignment.
# The previous chained form `retrieved_df.loc[idx]["Documents"] = ...` writes
# to a temporary row object and is silently lost under pandas Copy-on-Write.
retrieved_df["Documents"] = [retrieved_results[idx] for idx in retrieved_df.index]

## Re-Rank

In [15]:
# Cross-encoder used to score (query, passage) pairs during re-ranking; its raw
# scores are passed through `sigmoid` before being stored.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def sigmoid(x):
    """Apply the logistic function element-wise and return a torch tensor.

    Args:
        x: A number, sequence or array accepted by ``torch.tensor``.

    Returns:
        A tensor holding ``1 / (1 + exp(-x))`` for every element of ``x``.
    """
    logits = torch.tensor(x)
    denominator = 1 + torch.exp(-logits)
    return 1 / denominator

In [None]:
# Replace each placeholder score with a cross-encoder relevance score and
# order every query's candidates from most to least relevant.
for idx, query in query_df["query"].items():
    candidate_ids = list(retrieved_results[idx])
    if not candidate_ids:
        # Nothing was retrieved for this query; keep its empty entry.
        continue

    # Score all candidates of a query in one batched call instead of invoking
    # the model once per (query, document) pair.
    pairs = [(query, documents_df.loc[doc_id].text) for doc_id in candidate_ids]
    raw_scores = cross_encoder.predict(pairs)
    normalized_scores = sigmoid(raw_scores).tolist()

    # `sorted` is stable, so equally scored documents keep retrieval order.
    ranked = sorted(
        zip(candidate_ids, normalized_scores),
        key=lambda item: item[1],
        reverse=True,
    )
    retrieved_results[idx] = dict(ranked)

## Evaluate Retrieval

In [None]:
# Tab-separated relevance judgements. The `query_id`, `corpus_id` and `score`
# columns are read when `qrels_dict` is built.
qrels = pd.read_csv('../../data/resources/tatqa_qrels.tsv', sep='\t')

In [None]:
# Nested mapping {query_id: {corpus_id: score}} built from the judgements.
# Iterating the three columns directly keeps each column's own dtype and
# avoids constructing a Series per row, which `iterrows()` does.
qrels_dict = {}
for query_id, corpus_id, score in zip(qrels["query_id"], qrels["corpus_id"], qrels["score"]):
    qrels_dict.setdefault(query_id, {})[corpus_id] = score

In [None]:
# Cut-offs at which the retrieval metrics are reported.
k_values = [10, 50, 100]
# `results` is indexed positionally by the next cell:
# [0] NDCG@k, [1] MAP@k, [2] Recall@k, [3] P@k.
results = task.evaluate(qrels=qrels_dict, results=retrieved_results, k_values=k_values)

In [None]:
# Summarise the evaluation as one row per cut-off k and one column per metric.
ndcg_scores = results[0]
map_scores = results[1]
recall_scores = results[2]
precision_scores = results[3]

metrics_df = pd.DataFrame(
    {
        "MAP": [map_scores[f"MAP@{k}"] for k in k_values],
        "NDCG": [ndcg_scores[f"NDCG@{k}"] for k in k_values],
        "P@K": [precision_scores[f"P@{k}"] for k in k_values],
        "R@K": [recall_scores[f"Recall@{k}"] for k in k_values],
    },
    index=k_values,
)

metrics_df

## Generate

In [40]:
# Helper Methods

def format_retrieved_docs(docs, top_k=5):
    """Render the best-ranked documents as one text block.

    Each document with a ``"table"`` entry in its metadata is rendered as its
    text followed by that table; a document without one is rendered as its
    text only. A single missing table therefore no longer removes the tables
    of every other document.

    Args:
        docs: Sequence of ``(document, score)`` pairs, best first. Every
            document must expose ``page_content`` and a ``metadata`` mapping.
        top_k: Maximum number of leading pairs to include (default 5).

    Returns:
        The rendered documents separated by blank lines, or the string
        "No relevant documents found." when ``docs`` is empty.
    """
    docs = docs[:top_k]
    if not docs:
        return "No relevant documents found."

    sections = []
    missing_table = False
    for doc, _score in docs:
        if "table" in doc.metadata:
            # The table is pulled into a local first: reusing double quotes
            # inside a double-quoted f-string only parses on Python 3.12+.
            table = doc.metadata["table"]
            sections.append(f"\n{doc.page_content}\n{table}")
        else:
            missing_table = True
            sections.append(doc.page_content)

    if missing_table:
        # Kept from the original: report once when any table is missing.
        print("No table found")
    return "\n\n".join(sections)
    
def re_rank_docs(query, docs, cross_encoder):
    """Score retrieved documents against the query and sort them best first.

    Args:
        query: The query string.
        docs: Iterable of documents exposing ``page_content``.
        cross_encoder: Model whose ``predict`` scores ``(query, text)`` pairs.

    Returns:
        A list of ``(document, score)`` tuples ordered by descending score,
        where each score is the sigmoid of the model's raw output. The list
        is empty when ``docs`` is empty.
    """
    docs = list(docs)
    if not docs:
        return []

    # One batched call for all candidates instead of one call per document.
    pairs = [(query, doc.page_content) for doc in docs]
    raw_scores = cross_encoder.predict(pairs)
    normalized_scores = sigmoid(raw_scores).tolist()

    return sorted(zip(docs, normalized_scores), key=lambda item: item[1], reverse=True)

def retrieve_action(query, retriever, cross_encoder):
    """Retrieve, re-rank and format documents for ``query``.

    Args:
        query: The query string.
        retriever: Object whose ``invoke(query)`` returns candidate documents.
        cross_encoder: Model handed to ``re_rank_docs`` for scoring.

    Returns:
        The string produced by ``format_retrieved_docs`` for the re-ranked
        candidates.
    """
    candidates = retriever.invoke(query)
    return format_retrieved_docs(re_rank_docs(query, candidates, cross_encoder))


In [41]:
# Expose the retrieve -> re-rank -> format pipeline as a tool the agent can
# call. The lambda binds the module-level `retriever` and `cross_encoder`.
retrieve_tool = Tool(
    name="Document Retriever",
    func=lambda query: retrieve_action(query, retriever, cross_encoder),
    description="Retrieve documents relevant to the query."
)

# The retriever is the only tool made available to the agent.
tools = [retrieve_tool]

# Prompt template pulled from the hub under the name "hwchase17/react".
prompt = hub.pull("hwchase17/react")


# LLM built with library defaults; no model name or credentials are passed
# here, so they presumably come from the environment -- TODO confirm.
llm = OpenAI()

# Combine the LLM, the tools and the prompt into a ReAct agent.
agent = create_react_agent(llm, tools, prompt)

# Executor that runs the agent with the tools; verbose=True is set to show the
# agent's intermediate output.
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

In [None]:
# Run the agent on one example query, looked up by its id.
sample_query = query_df.loc["q1a741ac6", "query"]
agent_executor.invoke({"input": sample_query})