In [4]:
pip install openai python-dotenv pypdf ipywidgets

Collecting pypdf
  Downloading pypdf-6.1.1-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.1.1-py3-none-any.whl (323 kB)
Installing collected packages: pypdf
Successfully installed pypdf-6.1.1
Note: you may need to restart the kernel to use updated packages.


In [16]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load API key from .env
load_dotenv()
api_key = os.getenv("OPENROUTER_API_KEY")
if not api_key:
    raise ValueError("OPENROUTER_API_KEY not found in .env")

# OpenRouter via OpenAI SDK
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key
)

model_name = "deepseek/deepseek-chat-v3.1:free"

In [7]:
from pathlib import Path

PROMPT_PATH = Path("app/extract_data_prompt.md")

if not PROMPT_PATH.exists():
    raise FileNotFoundError(f"Prompt file not found: {PROMPT_PATH.resolve()}")

prompt_md = PROMPT_PATH.read_text(encoding="utf-8")
print("Loaded prompt (first 400 chars):\n")
print(prompt_md[:100])

Loaded prompt (first 400 chars):

# Extract Data Prompt

## System
You are a financial data extraction expert.

## Template
You are a 


In [11]:
from IPython.display import display
import ipywidgets as widgets
from pypdf import PdfReader
import tempfile
from pathlib import Path

# 3a) Upload widget
uploader = widgets.FileUpload(accept='.pdf', multiple=False)
display(uploader)

FileUpload(value=(), accept='.pdf', description='Upload')

In [12]:
if not uploader.value:
    raise RuntimeError("Please upload a PDF above, then re-run this cell.")

# uploader.value is a list of dicts
file_info = uploader.value[0]
file_name = file_info["name"]

# Save uploaded PDF to a temp file
tmp_pdf_path = Path(tempfile.gettempdir()) / file_name
with open(tmp_pdf_path, "wb") as f:
    f.write(file_info["content"])

# Extract text
reader = PdfReader(str(tmp_pdf_path))
pages_text = []
for page in reader.pages:
    try:
        pages_text.append(page.extract_text() or "")
    except Exception:
        pages_text.append("")
pdf_text = "\n".join(pages_text).strip()

# Trim to avoid token overflow
MAX_CHARS = 12000
pdf_text = pdf_text[:MAX_CHARS]

print(f"PDF saved to: {tmp_pdf_path}")
print("Extracted text (first 800 chars):\n")
print(pdf_text[:800])

PDF saved to: /var/folders/kw/lmb59rgj02b48m52vmvg26f00000gn/T/Glint.pdf
Extracted text (first 800 chars):

Glints Pte. Ltd and Its Subsidiaries 
Incorporated in Singapore, Registration Number: 201323539H 
 
 
 
 
 
 
 
       
 
ANNUAL 
REPORT 
 
For the financial year ended 
31 December 2023 
  
 
 
 
  
Docusign Envelope ID: 75DE3EA9-9A24-46B4-86DE-29F323A47575
Authentication no.: Y25498752G
3942a94f5dba4b4fd3a5fa06106106d1b134d5a6979813dfe6e280fcdd173602
 
 
 
 
Contents 
  Page 
   
 Directors’ Statement 1 
   
 Independent Auditor’s Report 4 
   
 Consolidated Statement of Comprehensive 
Income 
8 
   
 Consolidated Statement of Financial Position - 
Group 
9 
   
 Statement of Financial Position - Company 10 
   
 Consolidated Statement of Changes in Equity 11 
   
 Consolidated Statement of Cash Flows 12 
   
 Notes to the Financial Statements 14 
Docusign Envelope ID: 75DE3EA9-9A24-46B4


In [18]:
# Send a quick test input to check the prompt works
resp = client.chat.completions.create(
    model=model_name,
    messages=[
        {"role": "system", "content": prompt_md},
    ],
    max_tokens=500,
    temperature=0
)

print("Model output:\n")
print(resp.choices[0].message.content.strip())

Model output:


