# Experiments

In [2]:
from dotenv import load_dotenv

load_dotenv()

True

## Access question json

In [2]:
import requests
import json

url = "https://rag.timetoact.at/data/r2.0-test/questions.json"
response = requests.get(url)
response.raise_for_status()  # Raise an error if the request failed

data = response.json()  # Parse JSON into a Python dict/list

# store data
with open("../data/questions/test.json", "w") as file:
    json.dump(data, file, indent=4)

## Docling - strong!

In [4]:
# company = "reit"
# company = "TSX_Y_2022"
company = "NASDAQ_CLXT_2022"

test_pdf_path = f"../data/pdfs/{company}.pdf"

In [3]:
# Potential for parallelization for files?
from docling.document_converter import DocumentConverter

# source = "https://arxiv.org/pdf/2408.09869"  # document per local path or URL
converter = DocumentConverter()
result = converter.convert(test_pdf_path)
print(result.document.export_to_markdown())

# REIT: execution for 7.4MB with 97 pages -> 5 min
# Yellow Pages: execution for 1.4MB with 77 pages -> 4 min
# Calyxt: execution for 0.7MB with 88 pages -> 3min

KeyboardInterrupt: 

In [None]:
# domain knwoledge
# brackets in tables indicate negative values
# parsing issues (see current system prompt)

In [22]:
# STORE as md with correct utf-8 encoding
with open(f"../data/docling_md/dev/docling_{company}.md", "w", encoding="utf-8") as file:
    file.write(result.document.export_to_markdown())

## Google Gemini

In [7]:
import os

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

from google import genai

model_id = "gemini-2.0-flash"  # or "gemini-2.0-flash-lite-preview-02-05"  , "gemini-2.0-pro-exp-02-05", at least 1M token input context window

client = genai.Client(api_key=GEMINI_API_KEY)
response = client.models.generate_content(
    model=model_id, contents="What is your maximum input window size? What are the free limits for parsing PDFs?"
)

print(response.text)

Okay, let's break down my input limitations and how I handle PDFs.

**Maximum Input Window Size**

My maximum input window size, or context window, is currently **200,000 tokens.**

*   **What does this mean?**  This refers to the amount of text I can "remember" and consider at one time when processing a request. A "token" is roughly equivalent to a word or part of a word. So, 200,000 tokens translates to roughly 150,000 words, but this can vary depending on the specific text.
*   **Why is there a limit?** Processing large amounts of text requires significant computational resources.  The context window limit balances the ability to handle complex tasks with efficiency.

**Free Limits for Parsing PDFs**

There are no inherent limits on the size or number of PDFs that you can ask me questions about as long as the content of the PDFs, when extracted as text, fits within my 200,000 token context window.

*   **How it works:** I don't directly "read" PDFs in their native binary format.  In

In [5]:
# Upload PDF
invoice_pdf = client.files.upload(file=test_pdf_path, config={'display_name': f'{company}_annual_report'})

In [8]:
file_size = client.models.count_tokens(model=model_id, contents=invoice_pdf)
print(f'File: {invoice_pdf.display_name} equals to {file_size.total_tokens} tokens')

File: calyxt_annual_report equals to 103341 tokens


In [22]:
from pydantic import BaseModel, Field
from typing import List, Optional
from enum import Enum


class Metric(str, Enum):
    total_revenue = "total revenue"
    net_income = "net income"
    total_liabilities = "total liabilities"
    total_assets = "total assets"

    # rad_expenses = "research and development expenses"
    # risk_management_spending = "risk management spending"
    # debt_to_equity_ratio = "Debt-To-Equity ratio"
    # number_of_stores = "Number of stores"
    # return_on_assets = "Return on Assets (ROA)"
    # return_on_equity = "Return on Equity (ROE)"
    # customer_acquisition_spending = "customer acquisition spending"
    # operating_margin = "operating margin"
    # market_capitalization = "market capitalization"
    # sustainability_initiatives_spending = "sustainability initiatives spending"
    # gross_profit_margin = "gross profit margin"
    # net_profit_margin = "net profit margin"
    # intangible_assets = "intangible assets"
    # marketing_spending = "marketing spending"
    # free_cash_flow = "free cash flow"
    # earnings_per_share = "earnings per share (EPS)"
    # accounts_receivable = "accounts_receivable"
    # acquisition_costs = "acquisition costs"
    # shareholders_equity = "shareholders' equity"
    # operating_cash_flow = "operating cash flow"
    # quick_ratio = "Quick Ratio"
    # inventory = "inventory"


class DocumentDataPoint(BaseModel):
    metric_type: Metric = Field(..., title="One of the possible metric types")
    chain_of_thought: str = Field(..., title="Chain of thought leading to the metric")
    value: float = Field(..., title="value according")
    point_in_time_as_iso_date: Optional[str] = Field(..., title="Point in time of the metric as ISO date")


class DocumentContent(BaseModel):
    data_points: list[DocumentDataPoint] = Field(..., title="Data points extracted from the document")  # according to guidelines

# Define the prompt
prompt = "You are an assistant with the task of extracting precise information from long documents. "
"You will be prompted with the contents of a document. Your task is to extract various metrics "
# "as well as company role assignments "
"from this document. With each metric, supply the point in "
"time when the metric was measured according to the document."
# "as well as the currency (if applicable). "
"If the metric is given in some unit, convert it to the exact amount (e.g. "
"if the amount in the document is given as '100 (in thousands)' "
"or '100k', insert the full value '100000')."
# "With each role assignment, supply when the role assignment started and ended, if possible."
"\n\n"
"Do your best to include as many metrics for as many points in time as possible!"

'Do your best to include as many metrics for as many points in time as possible!'

In [20]:
# Simple start
class DocumentContent(BaseModel):
    company_name: str = Field(..., title="Name of the company")
    year: int = Field(..., title="Year of the annual report")

prompt = "You are an assistant to find out the company name and year of the document provided. "

In [26]:
response.parsed.data_points[0]

DocumentDataPoint(metric_type=<Metric.total_assets: 'total assets'>, chain_of_thought='The consolidated balance sheets list the total assets. The value of the total assets is the last value of the column for asset side.', value=22.421, point_in_time_as_iso_date='2022-12-31')

In [23]:
# Generate a response using the Person model
# response = client.models.generate_content(model=model_id, contents=prompt,
#                                           config={'response_mime_type': 'application/json',
#                                                   'response_schema': DocumentContent})

response = client.models.generate_content(model=model_id, contents=[prompt, invoice_pdf], config={'response_mime_type': 'application/json', 'response_schema': DocumentContent})

for data_point in response.parsed.data_points:
    print(data_point)

metric_type=<Metric.total_assets: 'total assets'> chain_of_thought='The consolidated balance sheets list the total assets. The value of the total assets is the last value of the column for asset side.' value=22.421 point_in_time_as_iso_date='2022-12-31'
metric_type=<Metric.total_liabilities: 'total liabilities'> chain_of_thought='The consolidated balance sheets list the total liabilities. The value of the total liabilities is the last value of the liabilities side.' value=15.188 point_in_time_as_iso_date='2022-12-31'
metric_type=<Metric.net_income: 'net income'> chain_of_thought='The consolidated statements of operations list the net loss which will be converted to net income. From the image I found the value for net loss and change it from negative to positive to get net income.' value=16.891 point_in_time_as_iso_date='2022-12-31'
metric_type=<Metric.total_revenue: 'total revenue'> chain_of_thought='The consolidated statements of operations list the revenue. The total revenue is prese

## Extracting list of companies

In [14]:
import json

with open(r"C:\Users\felix.krause\code\trustbit\enterprise-rag-challenge\dataset_v2.json", "r") as file:
    dataset = json.load(file)

In [20]:
company_ls = {}
for idx, data in dataset.items():
    if "sha1" not in data:
        continue
    if "meta" not in data:
        continue
    company_name = data["meta"]["company_name"]
    sha = data["sha1"]
    company_ls[company_name] = {"id": idx[:-4], "sha1": sha}

company_ls

{'Anixa Biosciences, Inc.': {'id': 'NASDAQ_ANIX_2022.pdf',
  'sha1': 'ce9e5024041b2ece2bafa2a9d9516bb174ee8949'},
 'Maravai LifeSciences Holdings, Inc.': {'id': 'NASDAQ_MRVI_2022.pdf',
  'sha1': 'f71415f9ca0cff70e5fa193616b6197f361130ed'},
 'KLA Corporation': {'id': 'NASDAQ_KLAC_2022.pdf',
  'sha1': '4a9d2b853e05970776121a810460f0962a18c5a1'},
 'Ameresco, Inc.': {'id': 'NYSE_AMRC_2022.pdf',
  'sha1': 'f973dd219c534accb0d4e72d8e12f51284d48d10'},
 'Battery Minerals Limited': {'id': 'ASX_BAT_2022.pdf',
  'sha1': '4e27f4c3402c657d548760cb3a164b036cefaabb'},
 'Caesars Entertainment, Inc.': {'id': 'NASDAQ_CZR_2022.pdf',
  'sha1': 'ac9921e0f4ff34f1a9adb1807b88675adf4b6091'},
 'Northfield Bancorp': {'id': 'NASDAQ_NFBK_2022.pdf',
  'sha1': '8564a818c6467b2116d60c17d4cc9476544e9f82'},
 'Sonic Automotive, Inc.': {'id': 'NASDAQ_SAH_2022.pdf',
  'sha1': '682de8e45fd9688f3452bc0e18257132a8f3cff6'},
 'Charles River Laboratories International, Inc.': {'id': 'NYSE_CRL_2022.pdf',
  'sha1': 'ce72b016beb8

In [9]:
len(list(dataset.keys()))
company_names = list(dataset.keys())

7747

## Experimenting with IBM Watson API

In [17]:
import os

import requests
from dotenv import load_dotenv

load_dotenv()
IBM_API_KEY = os.getenv("IBM_API_KEY")

In [27]:
# 1. Get Balance
balance_url = "https://rag.timetoact.at/ibm/balance"
balance_headers = {
    "Authorization": f"Bearer {IBM_API_KEY}"
}
balance_response = requests.get(balance_url, headers=balance_headers)
print("Balance Response:", balance_response.json())

Balance Response: {'balance': 49.9998944}


In [10]:
# 2. Get Available Models
models_url = "https://rag.timetoact.at/ibm/foundation_model_specs"
models_response = requests.get(models_url)
print("Models Response:", models_response.json())

Models Response: ['codellama/codellama-34b-instruct-hf', 'cross-encoder/ms-marco-minilm-l-12-v2', 'google/flan-t5-xl', 'google/flan-t5-xxl', 'google/flan-ul2', 'ibm/granite-13b-instruct-v2', 'ibm/granite-20b-code-instruct', 'ibm/granite-20b-multilingual', 'ibm/granite-3-2-8b-instruct-preview-rc', 'ibm/granite-3-2b-instruct', 'ibm/granite-3-8b-instruct', 'ibm/granite-34b-code-instruct', 'ibm/granite-3b-code-instruct', 'ibm/granite-8b-code-instruct', 'ibm/granite-embedding-107m-multilingual', 'ibm/granite-embedding-278m-multilingual', 'ibm/granite-guardian-3-2b', 'ibm/granite-guardian-3-8b', 'ibm/granite-ttm-1024-96-r2', 'ibm/granite-ttm-1536-96-r2', 'ibm/granite-ttm-512-96-r2', 'ibm/slate-125m-english-rtrvr', 'ibm/slate-125m-english-rtrvr-v2', 'ibm/slate-30m-english-rtrvr', 'ibm/slate-30m-english-rtrvr-v2', 'intfloat/multilingual-e5-large', 'meta-llama/llama-2-13b-chat', 'meta-llama/llama-3-1-70b-instruct', 'meta-llama/llama-3-1-8b-instruct', 'meta-llama/llama-3-2-11b-vision-instruct', 

In [26]:
# 3. Generate Text
text_generation_url = "https://rag.timetoact.at/ibm/text_generation"
text_generation_headers = {
    "Authorization": f"Bearer {IBM_API_KEY}",
    "Content-Type": "application/json"
}

payload = {
    "input": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the date today?"}
    ],
    # "model_id": "ibm/granite-34b-code-instruct",
    "model_id": "deepseek/deepseek-r1-distill-llama-70b",
    "parameters": {
        "max_new_tokens": 100,
        "min_new_tokens": 1
    }
}
text_response = requests.post(text_generation_url, headers=text_generation_headers, json=payload)
print("Text Generation Response:", text_response.json())

Text Generation Response: {'model_id': 'deepseek/deepseek-r1-distill-llama-70b', 'created_at': '2025-02-24T10:25:50Z', 'results': [{'generated_text': "Hi there! I suggest getting online to get real-time information. If you have any other questions, please don't hesitate to let me know!", 'generated_token_count': 33, 'input_token_count': 15, 'stop_reason': 'stop'}]}


In [None]:
# 3. Generate Text
text_generation_url = "https://rag.timetoact.at/ibm/text_generation"
text_generation_headers = {
    "Authorization": f"Bearer {IBM_API_KEY}",
    "Content-Type": "application/json"
}

payload = {
    "input": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the date today?"}
    ],
    "model_id": "ibm/granite-34b-code-instruct",
    # "model_id": "deepseek/deepseek-r1-distill-llama-70b",
    # "model_id": "ibm/granite-embedding-107m-multilingual",
    # "model_id": "ibm/granite-embedding-278m-multilingual",
    "parameters": {
        "max_new_tokens": 100,
        "temperature": 0.0,
        "min_new_tokens": 1
    }
}

try:
    text_response = requests.post(text_generation_url, headers=text_generation_headers, json=payload)
    text_response.raise_for_status()
    print("Text Generation Response:", text_response.json())
except requests.HTTPError as err:
    print(err)

## Not working well

### PyMuPDF4LLM - tables bad

In [None]:
## Observations
# tables are pretty bad
# format in general seems ok

In [6]:
# Store as file
import pathlib

pathlib.Path("../data/docling_md/dev/pymupdf4llm_output.md").write_bytes(md_text.encode())

261957

In [5]:
md_text[:1000]

'-----\n\n# Table of Contents\n\nManagement’s Discussion and Analysis........................................................................................................ 2\n\nIndependent Auditor’s Report ................................................................................................................. 29-32\n\nConsolidated Statements of Income and Other Comprehensive Income ..................................................... 33\n\nConsolidated Statements of Financial Position ............................................................................................ 34\n\nConsolidated Statements of Changes in Equity ...................................................................................... 35-36\n\nConsolidated Statements of Cash Flows ..................................................................................................... 37\n\nNotes To The Consolidated Financial Statements .................................................................

### Camelot (table detection) - Not working well

In [None]:
import camelot

# Extract tables from all pages of the PDF
tables = camelot.read_pdf(test_pdf_path, pages="all")

# Export each table to CSV (or process further as needed)
for i, table in enumerate(tables):
    csv_filename = f"table_{i}.csv"
    table.to_csv(csv_filename)
    print(f"Exported table {i} to {csv_filename}")
