In [1]:
import os
import torch
import pandas as pd 

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from concurrent.futures import ThreadPoolExecutor, as_completed

from pdf2image import convert_from_path
import pytesseract

from datasets import load_dataset
import voyageai

from langchain.vectorstores import Chroma
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field

In [2]:
def _read_key_file(path):
    """Return the contents of the key file at ``path`` with surrounding whitespace stripped."""
    with open(path, "r") as handle:
        return handle.read().strip()


# Both keys live in plain-text files under keys/ rather than in the notebook itself.
openai_key = _read_key_file("keys/openai_api_key.txt")
voyage_api_key = _read_key_file("keys/voyage_api_key.txt")

# The OpenAI key is exported through the environment; the Voyage key is kept in a variable.
os.environ["OPENAI_API_KEY"] = openai_key

## Dataset

In [3]:
# Fetch FinQA and convert each split to a pandas frame.
dataset = load_dataset("ibm/finqa", trust_remote_code=True)

train_data, validation_data, test_data = (
    dataset[split_name].to_pandas()
    for split_name in ("train", "validation", "test")
)

# Stack the three splits into one frame with a fresh 0..n-1 index.
data = pd.concat([train_data, validation_data, test_data]).reset_index(drop=True)

In [4]:
# Each id is "/"-separated; the first two components are used as company and year.
# The split is done once on the id column, and the new columns are attached with
# .assign so nothing is written onto a column-sliced frame (which is what triggers
# SettingWithCopyWarning).
id_parts = data["id"].str.split("/")

data = data[["id", "question", "answer", "gold_inds"]].assign(
    Company=id_parts.str[0],
    Year=id_parts.str[1],  # stays a string, e.g. "2014"
)

In [5]:
# Map every company to the years that appear in the questions (one groupby pass
# instead of re-filtering the frame once per company).
needed_years = {
    company: list(years)
    for company, years in data.groupby("Company")["Year"].unique().items()
}
unique_companies = set(needed_years)

# Count the files available under docs/<company>/<year>/ and report the folders
# that are missing. Only "directory is not there" errors are tolerated; anything
# else (including KeyboardInterrupt) now propagates instead of being swallowed.
file_count = 0
missing_dirs = []

for company, years in needed_years.items():
    for year in years:
        folder = f"docs/{company}/{year}/"
        try:
            file_count += len(os.listdir(folder))
        except (FileNotFoundError, NotADirectoryError):
            missing_dirs.append(folder)
            print(folder)

file_count

docs/AAP/2006/


29159

In [6]:
# Restrict the questions to a single filing: company AAL, year 2014.
is_target_filing = data.Company.eq("AAL") & data.Year.eq("2014")
data = data[is_target_filing]
data.shape

(25, 6)

## Indexing

In [7]:
import pickle
 
def process_page(company, year, page):
    """OCR a single page file and return ``(page_key, text)``.

    Args:
        company: Company folder name under ``docs/``.
        year: Year folder name under ``docs/<company>/``.
        page: File name inside ``docs/<company>/<year>/``.

    Returns:
        A tuple of the key ``"<company>/<year>/<page>"`` and the text that
        pytesseract extracted from the rendered image.
    """
    # Only the first rendered image is OCR'd — presumably each file holds one page; TODO confirm.
    image = convert_from_path(f"docs/{company}/{year}/{page}")[0]
    text = pytesseract.image_to_string(image)
    return f"{company}/{year}/{page}", text


if os.path.exists("documents.pkl"):
    # NOTE(security): pickle.load can execute arbitrary code; only load a cache
    # file that this notebook produced itself.
    with open("documents.pkl", "rb") as file:
        documents = pickle.load(file)

else:
    documents = {}
    tables = {}  # not populated in this cell

    # Companies and years to OCR (currently limited to one filing).
    companies = ["AAL"]

    with ThreadPoolExecutor() as executor:

        futures = []

        for company in companies:
            years = ["2014"]

            for year in years:
                # Sorted so the insertion order of `documents` is reproducible.
                pages = sorted(os.listdir(f"docs/{company}/{year}/"))

                # Submit one OCR task per page
                for page in pages:
                    futures.append(executor.submit(process_page, company, year, page))

        # Gather results in submission order
        for future in futures:
            page_key, text = future.result()
            documents[page_key] = text

    # Persist the OCR output so the cache check above succeeds on the next run;
    # previously the cache file was read but never written.
    with open("documents.pkl", "wb") as file:
        pickle.dump(documents, file)

In [8]:
# Shared splitter used to chunk the OCR text of each page. It is constructed
# with no arguments, so the library's default chunking settings apply.
text_splitter = RecursiveCharacterTextSplitter()

In [9]:
# Voyage AI client authenticated with the key read from keys/voyage_api_key.txt;
# the notebook uses it for both embedding and reranking calls.
vo = voyageai.Client(api_key=voyage_api_key)

class Embedder:
    """Embedding adapter around a Voyage client.

    Exposes ``embed_documents`` / ``embed_query`` so an instance can be passed
    as a vector store's embedding function.

    Args:
        batch_size: Maximum number of texts sent per ``embed`` request.
        model: Name of the embedding model to request.
        client: Object exposing ``embed(texts, model=..., input_type=...)``.
            When ``None`` (the default) the module-level ``vo`` client is
            looked up at call time, which is the original behaviour.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    def __init__(self, batch_size=128, model="voyage-3", client=None):
        # Fail at construction time instead of later inside range(..., step=0).
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.batch_size = batch_size
        self.model = model
        self._client = client

    def _embed(self, texts, input_type):
        """Send one embed request and return its list of embeddings."""
        client = self._client if self._client is not None else vo
        return client.embed(texts, model=self.model, input_type=input_type).embeddings

    def embed_document(self, text):
        """Return the embedding of a single document text."""
        return self._embed([text], "document")[0]

    def embed_documents(self, texts):
        """Return one embedding per text, requesting them in batches.

        An empty ``texts`` list yields an empty list without any request.
        """
        embeddings = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start:start + self.batch_size]
            embeddings.extend(self._embed(batch, "document"))
        return embeddings

    def embed_query(self, query):
        """Return the embedding of a query string (``input_type="query"``)."""
        return self._embed([query], "query")[0]
    
# Module-level embedder instance; it is handed to Chroma as the embedding function.
embedder = Embedder()

In [10]:
# Directory of the persisted Chroma store; chunking below only runs when it does not exist yet.
persist_directory = ".chroma"

# Chunked Document objects collected from all pages.
docs = []

def process_document(id, text):
    """Split one page's text into chunks wrapped as ``Document`` objects.

    Args:
        id: Page key whose first two "/"-separated parts are the company and year.
        text: Text of the page.

    Returns:
        The list of ``Document`` chunks built so far. Any exception is printed
        rather than raised, so the list may be empty or partial.
    """
    collected = []
    try:
        pieces = text_splitter.split_text(text)

        id_parts = id.split("/")
        company, year = id_parts[0], id_parts[1]

        for position, piece in enumerate(pieces):
            metadata = {"id": id, "chunk": position, "company": company, "year": year}
            collected.append(Document(page_content=piece, metadata=metadata))

    except Exception as e:
        print(f"Error processing document {id}: {e}")

    return collected

# Chunk every page in parallel, but only when no persisted store exists yet.
if not os.path.exists(persist_directory):
    with ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(process_document, page_key, page_text)
            for page_key, page_text in documents.items()
        ]

        # Collect chunk lists in completion order.
        for finished in as_completed(pending):
            docs.extend(finished.result())

In [11]:
from math import ceil

persist_directory = ".chroma"

if os.path.exists(persist_directory):

    # Load the existing ChromaDB
    chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedder)
    print("Loaded existing ChromaDB from .chroma")

else:

    # Create ChromaDB and store the documents
    chroma_db = Chroma(
        embedding_function=embedder,
        persist_directory=persist_directory,
    )

    print("Created new ChromaDB and saved to .chroma")

    # Add the chunks in fixed-size batches so a single add_texts call stays bounded.
    batch_size = 5000
    num_batches = ceil(len(docs) / batch_size)

    for i in range(num_batches):
        # Slicing past the end of the list is safe, so no explicit clamp is needed.
        batch_docs = docs[i * batch_size:(i + 1) * batch_size]

        chroma_db.add_texts(
            texts=[doc.page_content for doc in batch_docs],
            metadatas=[doc.metadata for doc in batch_docs]
        )

        print(f"Batch {i+1} of {num_batches} added to ChromaDB.")

    # Re-open the store after indexing. This statement used to sit at a
    # two-space indent that matched no enclosing block (an IndentationError);
    # the load branch above already holds a freshly opened handle, so the
    # re-open is only needed here.
    chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedder)


Loaded existing ChromaDB from .chroma


## Retrieve and Generate

In [12]:
# Prompt for answer generation: the model is told to answer from the supplied
# context only, give a very short answer first, then explain its steps.
# Placeholders: {context} (retrieved text) and {query} (the question).
PROMPT = PromptTemplate(
    input_variables=["query", "context"],
    template="""
    Answer the following question based solely on the following context. Give a short answer, 2-3 words at most. Then explain the steps you took to arrive at your answer.

    Context: {context}

    Question: {query}
    """)

# Chat model built with no arguments, so the model name and sampling settings are the library defaults.
llm = ChatOpenAI()

In [13]:
# Default retriever over the whole store (no search_kwargs); it feeds the generation chain.
retriever = chroma_db.as_retriever()

def format_context(context):
    """Concatenate the ``page_content`` of each document, each followed by a blank line.

    Args:
        context: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string; empty when ``context`` is empty.
    """
    return "".join(doc.page_content + "\n\n" for doc in context)

# Retrieval step: fetch documents for a question and flatten them into one string.
retrieve_chain = retriever | format_context


def _attach_context(payload):
    """Build the prompt inputs: the retrieved context plus the original query."""
    question = payload["query"]
    return {
        "context": retrieve_chain.invoke(question),
        "query": question,
    }


# Full pipeline: {"query": ...} -> prompt inputs -> prompt -> chat model.
generation_chain = RunnableLambda(_attach_context) | PROMPT | llm

In [14]:
def sigmoid(x):
    """Return the logistic function of ``x`` as a torch tensor."""
    return 1 / (1 + torch.exp(-torch.tensor(x)))

def rerank(query, documents, ids, top_k=1, client=None):
    """Rerank ``documents`` against ``query`` and return the best ids with scores.

    Args:
        query: The question text.
        documents: Candidate texts, in their original (pre-rerank) order.
        ids: Identifier of each candidate, parallel to ``documents``. Several
            candidates may share an id (e.g. chunks of the same page).
        top_k: Maximum number of distinct ids to return.
        client: Object exposing ``rerank(query=, documents=, model=, top_k=)``.
            When ``None`` (the default) the module-level ``vo`` client is used.

    Returns:
        A dict mapping id -> sigmoid-squashed relevance score, ordered from
        highest to lowest score, holding at most ``top_k`` entries. Empty when
        ``documents`` is empty (no request is made in that case).
    """
    if not documents:
        return {}

    reranker = client if client is not None else vo
    reranking = reranker.rerank(query=query, documents=documents, model="rerank-2", top_k=len(documents))

    scores = {}
    for r in reranking.results:
        # Results come back ordered by relevance, so the position in this list
        # is NOT the position in `documents`; r.index points at the original input.
        doc_id = ids[r.index]
        normalized_score = sigmoid(r.relevance_score).item()
        # Chunks of one page share an id: keep the best score rather than the last one seen.
        if doc_id not in scores or normalized_score > scores[doc_id]:
            scores[doc_id] = normalized_score

    top_scorers = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_k]
    return dict(top_scorers)

In [15]:
# One row per question (same index as `data`); the cells are filled in as the work items below complete.
results = pd.DataFrame(columns=["Retrieved Context","Correct Documents", "Generated Answer", "Correct Answer"], index=data.index)

# Define a function to process each item
def process_item(idx):
    """Run retrieval, reranking and answer generation for one question.

    Args:
        idx: Index label of the question row in ``data``.

    Returns:
        A tuple ``(idx, retrieved_context, generated_answer)`` where
        ``retrieved_context`` is the id of the top reranked page, or ``None``
        when the filtered retrieval returned no documents.
    """
    query = data.loc[idx, "question"]
    company = data.loc[idx, "Company"]
    year = data.loc[idx, "Year"]

    # Retriever restricted to chunks of the question's own company and year.
    retriever = chroma_db.as_retriever(search_kwargs={"k": 20, "filter": {"$and": [{"company": company}, {"year": year}]}})

    # Retrieve and rerank
    retrieved_docs = retriever.invoke(query)

    if retrieved_docs:
        retrieved = rerank(query, [doc.page_content for doc in retrieved_docs], [doc.metadata["id"] for doc in retrieved_docs])
        # First key is the best-scoring id; None when the reranker returned nothing.
        retrieved_context = next(iter(retrieved), None)
    else:
        # Nothing matched the filter: skip reranking instead of indexing into an empty result.
        retrieved_context = None

    # NOTE(review): generation_chain performs its own retrieval, so the answer is
    # not conditioned on the reranked page recorded above — confirm this is intended.
    generated_answer = generation_chain.invoke(input={"query": query}).content

    return idx, retrieved_context, generated_answer

# Evaluate every question concurrently, then write the outcomes back row by row.
with ThreadPoolExecutor() as pool:
    pending = [pool.submit(process_item, row_label) for row_label in data.index]

    # Results are consumed in submission order.
    for task in pending:
        row_label, page_id, answer = task.result()
        results.loc[row_label, "Retrieved Context"] = page_id
        results.loc[row_label, "Generated Answer"] = answer

In [16]:
# Attach the ground-truth columns (aligned on the shared index) and write the table to disk.
gold_columns = {
    "Correct Answer": data.answer,
    "Correct Documents": data.id,
    "Golden Context": data.gold_inds,
}

for column_name, column_values in gold_columns.items():
    results[column_name] = column_values

results.to_csv("vanilla.csv")