In [None]:
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import pandas as pd 
import base64
from PIL import Image
from io import BytesIO
from IPython.display import display

from datasets import load_dataset

from byaldi import RAGMultiModalModel

from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI

In [None]:
# Credentials are kept out of the notebook: each key lives in its own local
# text file and is stripped of surrounding whitespace after reading.
with open("hf_key.txt", "r") as hf_handle:
    hf_key = hf_handle.read().strip()

with open("openai_api_key.txt", "r") as openai_handle:
    openai_key = openai_handle.read().strip()

# Export the keys (and switch off tokenizer parallelism) through environment
# variables so the libraries used later can pick them up.
os.environ.update(
    {
        "HF_TOKEN": hf_key,
        "OPENAI_API_KEY": openai_key,
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [None]:
import warnings

warnings.filterwarnings('ignore')

## Dataset 

In [None]:
# Fetch FinQA by name.
# NOTE(review): trust_remote_code=True lets the dataset repository run its own
# loading script locally — confirm the source is trusted before re-running.
dataset = load_dataset("ibm/finqa", trust_remote_code=True)

# Convert each split to a DataFrame
data = dataset['train'].to_pandas()
validation_data = dataset['validation'].to_pandas()
test_data = dataset['test'].to_pandas()

# Stack the three splits under a fresh 0..n-1 index and keep only the
# columns used further down.
kept_columns = ["id", "question", "answer", "gold_inds"]
data = pd.concat([data, validation_data, test_data], ignore_index=True)[kept_columns]

In [None]:
# The id is "/"-separated; its first segment is stored as the company and its
# second as the year (both remain strings).
id_segments = data.id.str.split("/")

data["Company"] = [segments[0] for segments in id_segments]
data["Year"] = [segments[1] for segments in id_segments]

In [None]:
# Sanity check of the local document store: for every (company, year) pair
# referenced by the questions, count the entries under docs/<company>/<year>/
# and print the path of any directory that is missing.
unique_companies = set(data.Company.unique())

# company -> list of the distinct years that appear for it in `data`
needed_years = {
    company: list(data.loc[data.Company == company, "Year"].unique())
    for company in unique_companies
}

file_count = 0

for company, years in needed_years.items():
    for year in years:
        year_dir = f"docs/{company}/{year}/"
        try:
            file_count += len(os.listdir(year_dir))
        except (FileNotFoundError, NotADirectoryError):
            # Only a missing directory is an expected condition here; anything
            # else (permissions, interrupts, typos) should surface as an error
            # instead of being silently reported as a missing path.
            print(year_dir)

# Total number of directory entries found across all needed directories
file_count

In [None]:
# Keep only the questions whose Company is "AAL" and whose Year is "2014"
# (Year is compared as a string because it was split out of the id).
is_target_row = (data.Company == "AAL") & (data.Year == "2014")
data = data[is_target_row]

data.head(1)

## Indexing 

In [None]:
# Retriever checkpoint and the device it is loaded on.
# NOTE(review): "mps" is hard-coded — presumably this only works on machines
# that expose that backend; confirm before running elsewhere.
RETRIEVER_CHECKPOINT = "vidore/colqwen2-v1.0"
RETRIEVER_DEVICE = "mps"

RAG = RAGMultiModalModel.from_pretrained(RETRIEVER_CHECKPOINT, device=RETRIEVER_DEVICE)

In [None]:
# Create the index from whatever is in docs/temp/. overwrite=True is passed,
# so presumably an existing index with the same name is replaced — confirm
# before re-running against an index you want to keep.
INDEX_NAME = "finqa"

RAG.index(input_path="docs/temp/", index_name=INDEX_NAME, overwrite=True)

In [None]:
# Add every page file of the selected company/year folders to the index,
# tagging each one with metadata so searches can later be filtered by
# Company and Year.
companies = ["AAL"]
years = ["2014"]

for company in companies:
    for year in years:
        year_dir = f"docs/{company}/{year}/"

        # os.listdir returns entries in arbitrary order and includes hidden
        # files; sort for a reproducible insertion order and skip dotfiles
        # (e.g. .DS_Store) so only real page files reach the indexer.
        pages = sorted(
            entry for entry in os.listdir(year_dir) if not entry.startswith(".")
        )

        for page in pages:
            RAG.add_to_index(
                input_item=f"{year_dir}{page}",
                # Ask the index to store the page alongside it; the generation
                # step later reads the stored base64 payload from search hits.
                store_collection_with_index=True,
                metadata={"Company": company, "Year": year, "Page": page},
            )

## Retrieve and Generate

In [None]:
# Chat model that receives each retrieved page image together with its
# question during generation.
CHAT_MODEL_NAME = "gpt-4o"

model = ChatOpenAI(model=CHAT_MODEL_NAME)

In [None]:
def image_prompt(image, question, mime_type="image/png"):
    """Build a multimodal chat message asking ``question`` about one image.

    Args:
        image: Base64-encoded image payload as a string, without any
            ``data:`` prefix; it is embedded verbatim in a data URL.
        question: The query the model should answer from the image.
        mime_type: Media type written into the data URL. Defaults to
            ``"image/png"``. NOTE(review): this assumes the retriever stores
            page images as PNG — confirm, or pass the actual type.

    Returns:
        A ``HumanMessage`` whose content is a text part (instructions plus
        the query) followed by an ``image_url`` part holding the data URL.
    """
    # The prompt is sent exactly as written, leading indentation included.
    query = f"""
    Answer the following query based solely on the provided image,  Give a short answer, 2-3 words at most. Then explain the steps you took to arrive at your answer.

    Query: {question}
    """

    # "image/pdf" is not an image media type; declare the real encoding.
    image_url = f"data:{mime_type};base64,{image}"

    return HumanMessage(
        content=[
            {"type": "text", "text": query},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    )

In [16]:
# One result row per question, aligned on the index of `data`. Cells that are
# never filled in (e.g. when retrieval returns nothing) stay NaN.
results = pd.DataFrame(
    columns=["Retrieved Context", "Correct Documents", "Generated Answer", "Correct Answer"],
    index=data.index,
)

for row in data.itertuples():
    # Retrieve the single best page, restricted to the question's own
    # company and year via the metadata attached at indexing time.
    retrieved = RAG.search(
        row.question,
        k=1,
        filter_metadata={"Company": row.Company, "Year": row.Year},
    )

    if not retrieved:
        # Nothing matched the filter: report it and move on rather than
        # aborting the whole run with an IndexError.
        print(f"No page retrieved for {row.id}")
        continue

    top_hit = retrieved[0]

    results.loc[row.Index, "Retrieved Context"] = (
        row.Company + "/" + row.Year + "/" + top_hit.metadata["Page"]
    )
    # Send the retrieved page's base64 payload plus the question to the chat
    # model and keep only the text of its reply.
    results.loc[row.Index, "Generated Answer"] = model.invoke(
        [image_prompt(top_hit["base64"], row.question)]
    ).content

In [17]:
# Copy the reference fields from `data` next to the generated output; the
# assignment aligns on the shared index.
gold_columns = {
    "Correct Answer": "answer",
    "Correct Documents": "id",
    "Golden Context": "gold_inds",
}
for result_column, source_column in gold_columns.items():
    results[result_column] = data[source_column]

# Persist the full table (index included) for later evaluation.
results.to_csv("colpali.csv")