In [1]:
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import pandas as pd 

from datasets import load_dataset
import voyageai

from langchain.vectorstores import Chroma
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

from langchain_core.runnables import RunnableLambda
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _read_api_key(path):
    """Return the contents of the text file at ``path`` with surrounding whitespace removed."""
    with open(path, "r") as handle:
        return handle.read().strip()


# Keys live in local text files next to the notebook rather than in the code.
openai_key = _read_api_key("openai_api_key.txt")
voyage_api_key = _read_api_key("voyage_api_key.txt")

# Exported through the environment; the Voyage key is passed to its client explicitly.
os.environ["OPENAI_API_KEY"] = openai_key

## Dataset

In [3]:
# Fetch FinQA.
# NOTE(review): trust_remote_code=True permits running the dataset repository's
# own loading script locally - confirm the source is trusted.
dataset = load_dataset("ibm/finqa", trust_remote_code=True)

# Materialise the three splits as DataFrames.
train_data, validation_data, test_data = (
    dataset[split].to_pandas() for split in ("train", "validation", "test")
)

# Stack all splits into one frame with a fresh 0..n-1 index and keep only the
# columns used downstream.
data = pd.concat([train_data, validation_data, test_data], ignore_index=True)[
    ["id", "pre_text", "table", "post_text", "question", "answer"]
]

In [4]:
# Ids look like "AAL/2014/page_89.pdf-3": the first two "/"-separated fields
# are stored as Company and Year. Split the column once and reuse the pieces
# for both derived columns instead of splitting it twice.
id_parts = data.id.str.split("/").tolist()
data["Company"] = [parts[0] for parts in id_parts]
data["Year"] = [parts[1] for parts in id_parts]

In [5]:
# Scratch peek at the first record's table as a plain list; `table` is not
# referenced again in the visible cells.
first_record = data.iloc[0]
table = list(first_record.table)

In [6]:
def json_to_markdown(json_data):
    """Render a table given as a sequence of rows into a Markdown table.

    Args:
        json_data: Sequence of rows; the first row supplies the column
            headers and the remaining rows supply the cells.

    Returns:
        The Markdown text produced by ``DataFrame.to_markdown(index=False)``,
        or an empty string when ``json_data`` is ``None`` or has no rows
        (previously this raised ``IndexError`` on the header lookup).
    """
    # len() rather than truthiness so array-like inputs are handled as well.
    if json_data is None or len(json_data) == 0:
        return ""

    header, rows = json_data[0], json_data[1:]
    df = pd.DataFrame(rows, columns=header)

    return df.to_markdown(index=False)

In [7]:
def _compose_full_text(row):
    """Join a row's pre-text, Markdown-rendered table and post-text into one string."""
    sections = [
        "\n".join(row.pre_text),
        json_to_markdown(list(row.table)),
        "\n".join(row.post_text),
    ]
    # Four newlines separate the three sections.
    return "\n\n\n\n".join(sections)


data["full_text"] = data.apply(_compose_full_text, axis=1)

# The raw pieces are folded into full_text, so the source columns are removed.
data.drop(columns=["pre_text", "table", "post_text"], inplace=True)

In [8]:
# Show the assembled text of the record labelled 1 as a sanity check.
sample_text = data.full_text[1]
print(sample_text)

abiomed , inc .
and subsidiaries notes to consolidated financial statements 2014 ( continued ) note 8 .
stock award plans and stock-based compensation ( continued ) restricted stock and restricted stock units the following table summarizes restricted stock and restricted stock unit activity for the fiscal year ended march 31 , 2012 : number of shares ( in thousands ) weighted average grant date fair value ( per share ) .



|                                                                  | number of shares ( in thousands )   | weighted average grant date fair value ( per share )   |
|:-----------------------------------------------------------------|:------------------------------------|:-------------------------------------------------------|
| restricted stock and restricted stock units at beginning of year | 407                                 | $ 9.84                                                 |
| granted                                                          | 607        

In [9]:
# Restrict the experiment to a single filing: company "AAL", year "2014"
# (Year is compared as a string because it was parsed out of the id).
is_aal = data.Company == "AAL"
is_2014 = data.Year == "2014"
data = data[is_aal & is_2014]
data

Unnamed: 0,id,question,answer,Company,Year,full_text
15,AAL/2014/page_89.pdf-3,what percentage of total reorganization items ...,65.6%,AAL,2014,"table of contents interest expense , net of ca..."
105,AAL/2014/page_80.pdf-1,what was the percent of the labor-related deem...,65.3%,AAL,2014,table of contents extinguishment costs incurre...
161,AAL/2014/page_89.pdf-2,in 2013 what was the percent of the profession...,7.5%,AAL,2014,"table of contents interest expense , net of ca..."
266,AAL/2014/page_18.pdf-4,what were total mainline operating expenses in...,31381,AAL,2014,table of contents respect to the mainline amer...
908,AAL/2014/page_59.pdf-2,what was the growth rate on the amex airline i...,49%,AAL,2014,table of contents stock performance graph the ...
1164,AAL/2014/page_92.pdf-4,by how much did aircraft and facility financin...,-83.6%,AAL,2014,"table of contents interest expense , net of ca..."
1696,AAL/2014/page_18.pdf-1,in 2014 what was the total mainline operating ...,31903.6,AAL,2014,table of contents respect to the mainline amer...
1753,AAL/2014/page_219.pdf-3,what was the unrecognized tax benefit at decem...,$ 5,AAL,2014,table of contents notes to consolidated financ...
2065,AAL/2014/page_89.pdf-1,what percentage of total reorganization items ...,12.1%,AAL,2014,"table of contents interest expense , net of ca..."
2088,AAL/2014/page_92.pdf-1,by how much did total reorganization items net...,21.2%,AAL,2014,"table of contents interest expense , net of ca..."


## Indexing

In [10]:
# Splitter used by process_document to chunk each full_text. No arguments are
# passed, so chunk size and overlap are whatever the library defaults to.
text_splitter = RecursiveCharacterTextSplitter()

In [11]:
# Voyage AI client shared by the Embedder class; authenticated with the key
# read from voyage_api_key.txt.
vo = voyageai.Client(api_key=voyage_api_key)

class Embedder:
    """Minimal embedding adapter around a Voyage AI client.

    Provides ``embed_documents`` and ``embed_query`` (plus a single-text
    ``embed_document`` helper). Each returns data taken from the
    ``embeddings`` attribute of the client's ``embed(...)`` response.

    Args:
        batch_size: Maximum number of texts sent per ``embed`` request by
            ``embed_documents``. Must be at least 1.
        model: Embedding model name forwarded with every request.
        client: Object exposing ``embed(texts, model=..., input_type=...)``.
            When omitted, the module-level ``vo`` client is looked up at call
            time, which is what this class always did before.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value
            used to make ``embed_documents`` silently return nothing.
    """

    def __init__(self, batch_size=128, model="voyage-3", client=None):
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        self.batch_size = batch_size
        self.model = model
        self._client = client

    def _resolve_client(self):
        """Return the injected client, falling back to the module-level ``vo``."""
        return vo if self._client is None else self._client

    def _embed(self, texts, input_type):
        """Send one ``embed`` request and return its ``embeddings`` list."""
        response = self._resolve_client().embed(
            texts, model=self.model, input_type=input_type
        )
        return response.embeddings

    def embed_document(self, text):
        """Embed a single text with ``input_type="document"``."""
        return self._embed([text], "document")[0]

    def embed_documents(self, texts):
        """Embed ``texts`` in order, at most ``batch_size`` texts per request.

        Returns a flat list with one embedding per input text; an empty input
        yields an empty list without contacting the client.
        """
        texts = list(texts)  # allow any iterable, and slice it safely below
        embeddings = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start:start + self.batch_size]
            embeddings.extend(self._embed(batch, "document"))
        return embeddings

    def embed_query(self, query):
        """Embed a single query string with ``input_type="query"``."""
        return self._embed([query], "query")[0]
    
# Shared instance handed to Chroma as its embedding function.
embedder = Embedder()

In [12]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Directory whose existence decides whether documents are (re)chunked below.
persist_directory = ".chroma"

# Accumulates the chunk-level Documents produced by process_document.
docs = []

def process_document(id, text):
    """Split one document into chunk-level ``Document`` objects.

    Args:
        id: Index label of the source row in the global ``data`` frame
            (not the FinQA id string).
        text: Full text to split with the global ``text_splitter``.

    Returns:
        A list with one ``Document`` per chunk. Each carries the metadata
        keys ``id``, ``chunk`` (position within the document), ``company``
        and ``year``. If anything raises, the error is printed and whatever
        was built so far (possibly nothing) is returned.
    """
    local_docs = []
    try:
        chunks = text_splitter.split_text(text)

        if chunks:
            # Look the source row up once per document instead of three
            # times per chunk; the values are the same for every chunk.
            row = data.loc[id]
            doc_id, company, year = row.id, row.Company, row.Year

            for i, chunk in enumerate(chunks):
                doc = Document(
                    page_content=chunk,
                    metadata={"id": doc_id, "chunk": i, "company": company, "year": year},
                )
                local_docs.append(doc)

    except Exception as e:
        # Deliberately best-effort: one bad document must not stop the batch.
        print(f"Error processing document {id}: {e}")

    return local_docs

# Chunk every document on a thread pool, but only when no persisted index
# directory exists yet; otherwise `docs` stays empty.
if not os.path.exists(persist_directory):
    with ThreadPoolExecutor() as executor:
        futures = {}
        for row_label, full_text in data.full_text.items():
            submitted = executor.submit(process_document, row_label, full_text)
            futures[submitted] = row_label

        # Gather chunk lists in completion order, so the order of `docs`
        # is not tied to the row order of `data`.
        for finished in as_completed(futures):
            docs.extend(finished.result())

In [13]:
from math import ceil

persist_directory = ".chroma"

if not os.path.exists(persist_directory):

    # No persisted store yet: create one and fill it from `docs`.
    chroma_db = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedder,
    )

    print("Created new ChromaDB and saved to .chroma")

    # Insert in slices of at most `batch_size` documents per add_texts call.
    batch_size = 5000
    num_batches = ceil(len(docs) / batch_size)

    for batch_number, start_idx in enumerate(range(0, len(docs), batch_size), start=1):
        batch_docs = docs[start_idx:start_idx + batch_size]
        batch_texts = [doc.page_content for doc in batch_docs]
        batch_metadatas = [doc.metadata for doc in batch_docs]

        chroma_db.add_texts(texts=batch_texts, metadatas=batch_metadatas)

        print(f"Batch {batch_number} of {num_batches} added to ChromaDB.")

else:

    # The directory already exists: reuse it as-is without re-embedding.
    chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedder)
    print("Loaded existing ChromaDB from .chroma")

  chroma_db = Chroma(


Created new ChromaDB and saved to .chroma
Batch 1 of 1 added to ChromaDB.


## Retrieve and Generate

In [14]:
# Prompt for the answer step: the model sees only the retrieved context and
# the question, and is asked for a very short answer.
PROMPT = PromptTemplate(
    input_variables=["query", "context"],
    template="""
    Answer the following question based solely on the following context. Give a short answer, 2-3 words at most.

    Context: {context}

    Question: {query}
    """)

# temperature=0 makes the evaluation answers as repeatable as the API allows;
# the library's default sampling temperature made every re-run of the
# notebook produce different "Generated Answer" values.
# NOTE(review): the model is still the library default - consider pinning it.
llm = ChatOpenAI(temperature=0)

In [15]:
# Default retriever over the whole store (no search_kwargs, so no k or
# metadata filter is set here). retrieve_chain below is built from this object.
retriever = chroma_db.as_retriever()

def format_context(context):
    """Concatenate the ``page_content`` of each document, each followed by a blank line.

    Returns an empty string when ``context`` contains no documents.
    """
    return "".join(doc.page_content + "\n\n" for doc in context)

# retrieve_chain holds the retriever *object* that exists when this cell runs;
# rebinding the name `retriever` later does not change what it searches with.
retrieve_chain = retriever | format_context


def _attach_context(input):
    """Turn ``{"query": ...}`` into the prompt variables ``context`` and ``query``."""
    question = input["query"]
    return {
        "context": retrieve_chain.invoke(question),  # the retriever takes the plain query string
        "query": question,
    }


generation_chain = RunnableLambda(_attach_context) | PROMPT | llm

In [16]:
# Evaluate every question: retrieve once with a company/year filter, then
# generate the answer from exactly those retrieved chunks.
#
# Previously the ids stored in "Retrieved Context" came from the filtered k=5
# retriever, while generation_chain answered from the unfiltered default
# retriever captured when it was built. The logged context was therefore not
# the context the answer was based on, and every question was retrieved
# (and embedded) twice.
results = pd.DataFrame(columns=["Retrieved Context","Correct Documents", "Generated Answer", "Correct Answer"], index=data.index)

answer_chain = PROMPT | llm

for idx, query, company, year in data[["question", "Company", "Year"]].itertuples(name=None):
    scoped_retriever = chroma_db.as_retriever(
        search_kwargs={"k": 5, "filter": {"$and": [{"company": company}, {"year": year}]}}
    )
    retrieved_docs = scoped_retriever.invoke(query)

    # .at is the scalar setter, so the list is stored as one cell value.
    results.at[idx, "Retrieved Context"] = [doc.metadata["id"] for doc in retrieved_docs]
    results.at[idx, "Generated Answer"] = answer_chain.invoke(
        {"query": query, "context": format_context(retrieved_docs)}
    ).content

In [17]:
# Attach the ground truth next to the predictions (aligned on the shared index).
results = results.assign(**{
    "Correct Answer": data.answer,
    "Correct Documents": data.id,
})
results

Unnamed: 0,Retrieved Context,Correct Documents,Generated Answer,Correct Answer
15,"[AAL/2014/page_89.pdf-3, AAL/2014/page_89.pdf-...",AAL/2014/page_89.pdf-3,Labor-related deemed claim.,65.6%
105,"[AAL/2014/page_80.pdf-4, AAL/2014/page_80.pdf-...",AAL/2014/page_80.pdf-1,Approximately 100%,65.3%
161,"[AAL/2014/page_92.pdf-1, AAL/2014/page_92.pdf-...",AAL/2014/page_89.pdf-2,7.5%,7.5%
266,"[AAL/2014/page_18.pdf-3, AAL/2014/page_18.pdf-...",AAL/2014/page_18.pdf-4,"$11,109 million",31381
908,"[AAL/2014/page_59.pdf-3, AAL/2014/page_59.pdf-...",AAL/2014/page_59.pdf-2,52%,49%
1164,"[AAL/2014/page_92.pdf-1, AAL/2014/page_92.pdf-...",AAL/2014/page_92.pdf-4,$1630 million,-83.6%
1696,"[AAL/2014/page_18.pdf-4, AAL/2014/page_18.pdf-...",AAL/2014/page_18.pdf-1,$10592,31903.6
1753,"[AAL/2014/page_219.pdf-3, AAL/2014/page_219.pd...",AAL/2014/page_219.pdf-3,$5 million,$ 5
2065,"[AAL/2014/page_80.pdf-2, AAL/2014/page_80.pdf-...",AAL/2014/page_89.pdf-1,12.26%,12.1%
2088,"[AAL/2014/page_92.pdf-3, AAL/2014/page_92.pdf-...",AAL/2014/page_92.pdf-1,$461 million,21.2%


In [23]:
# Retrieval hit rate: a row counts as a hit when its own id appears among the
# ids of the retrieved chunks.
# NOTE(review): the value is hits / number of rows, so it can only be a
# multiple of 1/len(results); a saved output that is not (e.g. left over from
# a run on a different subset) should be regenerated.
hits = [
    expected_id in retrieved_ids
    for expected_id, retrieved_ids in zip(results["Correct Documents"], results["Retrieved Context"])
]
correct = sum(hits)

print(correct / len(results["Retrieved Context"]))

0.88
