In [None]:
import base64
import os
import warnings
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from io import BytesIO

import pandas as pd 
import torch
from PIL import Image
from IPython.display import display

from datasets import load_dataset

from byaldi import RAGMultiModalModel

from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI

In [None]:
# Read the Hugging Face and OpenAI credentials from local text files so the
# secrets themselves never appear in the notebook.
with open('hf_key.txt', 'r') as key_file:
    hf_key = key_file.read().strip()

with open("openai_api_key.txt", "r") as key_file:
    openai_key = key_file.read().strip()

# Export both credentials plus the tokenizers flag in a single update.
# NOTE(review): TOKENIZERS_PARALLELISM="false" is presumably here to avoid the
# Hugging Face tokenizers fork warning — confirm.
os.environ.update(
    {
        "HF_TOKEN": hf_key,
        "OPENAI_API_KEY": openai_key,
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [None]:
# Silence every warning category for the rest of the session.
# `warnings` is imported in the notebook's import cell rather than here, so
# all dependencies are visible in one place.
warnings.filterwarnings('ignore')

## Dataset 

In [None]:
# Fetch FinQA; trust_remote_code=True lets the loader run the dataset's own script.
dataset = load_dataset("ibm/finqa", trust_remote_code=True)

# Materialise each split as a DataFrame, in train / validation / test order.
train_split, validation_data, test_data = (
    dataset[split_name].to_pandas() for split_name in ('train', 'validation', 'test')
)

# Stack the three splits under a fresh 0..n-1 index and keep only the columns
# used downstream. From here on `train_data` holds ALL splits, not just train.
train_data = pd.concat([train_split, validation_data, test_data], ignore_index=True)
train_data = train_data[["id", "question", "answer"]]

In [None]:
# Each id is split on "/": the first piece becomes the company, the second the year.
id_parts = train_data.id.str.split("/")

train_data["Company"] = [parts[0] for parts in id_parts]
train_data["Year"] = [parts[1] for parts in id_parts]

In [16]:
# Count the files available on disk for every (company, year) pair that the
# questions reference, printing each directory that cannot be listed.
unique_companies = set(train_data.Company.unique())

# company -> years it appears with (order of first appearance). A single
# groupby replaces one boolean-mask scan per company, and its sorted keys make
# the iteration (and therefore the printed output) deterministic.
needed_years = {
    company: list(years)
    for company, years in train_data.groupby("Company")["Year"].unique().items()
}

file_count = 0

for company, years in needed_years.items():
    for year in years:
        year_dir = f"docs/{company}/{year}/"
        try:
            file_count += len(os.listdir(year_dir))
        except OSError:
            # Only filesystem errors (missing or unreadable directory) are
            # reported; a bare `except:` would also hide KeyboardInterrupt
            # and genuine programming errors.
            print(year_dir)

file_count

docs/AAP/2006/


29159

In [None]:
# Take the first AAPL row as the worked example for the rest of the notebook.
aapl_example = train_data[train_data["Company"] == "AAPL"].iloc[0]
query = aapl_example["question"]
answer = aapl_example["answer"]

print(f"Query: {query}", f"Answer: {answer}", sep="\n")

## Indexing 

In [None]:
# Pick the best available accelerator instead of hard-coding Apple's MPS
# backend, so the notebook also runs on CUDA or CPU-only machines. MPS is
# checked first, so behaviour is unchanged wherever it is available.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

RAG = RAGMultiModalModel.from_pretrained("vidore/colqwen2-v1.0", device=device)

In [None]:
# Build the "finqa" index from everything under docs/temp/ (overwrite=True).
# NOTE(review): store_collection_with_index is left at its default here, while
# the add_to_index calls later pass True — confirm that the mix is intended.
index_settings = {
    "input_path": "docs/temp/",
    "index_name": "finqa",
    "overwrite": True,
}
RAG.index(**index_settings)

In [None]:
# Add every page of every yearly filing for the selected companies to the
# index, tagging each page with its company, year and file name.
companies = ["AAPL"]

for company in companies:
    company_dir = f"docs/{company}/"
    # os.listdir returns entries in arbitrary order; sorting makes the
    # indexing order reproducible between runs.
    for year in sorted(os.listdir(company_dir)):
        year_dir = f"{company_dir}{year}/"
        # Skip hidden entries and stray files (e.g. macOS .DS_Store) that
        # would otherwise crash the nested listdir.
        if year.startswith(".") or not os.path.isdir(year_dir):
            continue
        for page in sorted(os.listdir(year_dir)):
            if page.startswith("."):
                continue
            RAG.add_to_index(
                input_item=f"{year_dir}{page}",
                store_collection_with_index=True,
                metadata={"Company": company, "Year": year, "Page" : page},
            )

## Retrieval 

In [None]:
# Retrieve the single best-matching page, restricted by metadata to AAPL / 2013.
# NOTE(review): the year is hard-coded; confirm it matches the example query's filing year.
metadata_filter = {"Company": "AAPL", "Year": "2013"}
results = RAG.search(query, k=1, filter_metadata=metadata_filter)
results

## Generation

In [None]:
# Ask GPT-4o the question, attaching the retrieved page image as context.
model = ChatOpenAI(model="gpt-4o")

# Fail with a clear message when retrieval returned nothing or the index was
# built without stored page images.
if not results or not results[0]['base64']:
    raise ValueError("No retrieved page with base64 image data is available for generation.")

page_b64 = results[0]['base64']

# The data URL must declare a real image media type ("image/pdf" is not one).
# Derive it from the decoded bytes; fall back to PNG if Pillow cannot name the format.
page_format = Image.open(BytesIO(base64.b64decode(page_b64))).format
page_mime = Image.MIME.get(page_format, "image/png")

message = HumanMessage(
    content=[
        {"type": "text", "text": query},
        {
            "type": "image_url",
            "image_url": {"url": f"data:{page_mime};base64,{page_b64}"},
        },
    ],
)

response = model.invoke([message])
response.content

In [None]:
# Show the retrieved page inline: base64 text -> raw bytes -> PIL image.
top_hit = results[0]
image_data = base64.b64decode(top_hit['base64'])

image_buffer = BytesIO(image_data)
image = Image.open(image_buffer)

display(image)