In [31]:
# --- Setup ---
import fitz
import openai
import os
import io
from dotenv import load_dotenv
from ipywidgets import FileUpload
from IPython.display import display

# Load the API key from key.env into the environment and build the client.
load_dotenv(dotenv_path="key.env")
openai.api_key = os.getenv("OPENAI_API_KEY")
# Only the `openai` module is imported above, so the client class must be
# reached through it; a bare `OpenAI()` raises NameError on a fresh kernel.
client = openai.OpenAI(api_key=openai.api_key)

# --- Upload Widget ---
# Single-file picker restricted to .pdf. display() only renders the widget:
# uploader.value is still empty at this point, so code that reads it has to
# run after the user has actually picked a file.
uploader = FileUpload(accept='.pdf', multiple=False)
display(uploader)

# --- Extract Text from Uploaded PDF ---
def extract_text_from_uploaded_pdf(uploader_widget):
    """Return the text of every page of the uploaded PDF, concatenated.

    Parameters
    ----------
    uploader_widget : ipywidgets.FileUpload
        Widget whose ``value`` holds the upload. Two layouts are accepted:
        a sequence of file dicts (ipywidgets 8) and a dict keyed by
        filename (ipywidgets 7). Only the first file is read.

    Returns
    -------
    str or None
        The page texts joined in page order, or ``None`` (after printing a
        notice) when nothing has been uploaded yet.
    """
    uploads = uploader_widget.value
    if not uploads:
        print("❌ No file uploaded.")
        return None

    # ipywidgets 7 exposes {filename: {...}}; ipywidgets 8 exposes ({...},).
    if isinstance(uploads, dict):
        uploaded_file = next(iter(uploads.values()))
    else:
        uploaded_file = uploads[0]

    # "content" may be bytes or a memoryview; BytesIO accepts either.
    file_stream = io.BytesIO(uploaded_file['content'])

    with fitz.open(stream=file_stream, filetype="pdf") as doc:
        # join() avoids rebuilding the string once per page.
        return "".join(page.get_text() for page in doc)

# --- Extract Data with GPT ---
def extract_data_with_gpt(pdf_text, prompt_instruction, model="gpt-4", temperature=0.0):
    """Send the document text and the instruction to the chat API.

    Parameters
    ----------
    pdf_text : str
        Text of the document; appended after the instruction in the user
        message.
    prompt_instruction : str
        Instruction describing which fields to extract.
    model : str, optional
        Chat model name passed to the API. Defaults to ``"gpt-4"``.
    temperature : float, optional
        Sampling temperature passed to the API. Defaults to ``0.0``.

    Returns
    -------
    The ``content`` of the first choice's message, exactly as the API
    returned it; it is not parsed or validated as JSON here.
    """
    # `client` is the module-level OpenAI client created in the setup code.
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a document parser that returns clean JSON."},
            {"role": "user", "content": f"{prompt_instruction}\n\nDocument:\n{pdf_text}"}
        ],
        temperature=temperature
    )
    return response.choices[0].message.content

# --- Prompt for GPT ---
# Instruction text passed as `prompt_instruction` to extract_data_with_gpt().
# It asks for three groups of output: the header fields, the
# "detalle_entrada_almacen" rows, and the summary totals, all as JSON.
# NOTE(review): the prompt calls the input "OCR text", but in this notebook the
# text comes from page.get_text() in extract_text_from_uploaded_pdf() — confirm
# whether the wording should change.
prompt = """You will be given the raw OCR text of an inventory entry document in Spanish.

Please extract the following fields and return them in clean JSON format:

- "entrada_almacen_no": (Entrada de Almacén No.)
- "empresa": (Buyer Company Name)
- "nit_empresa": (Buyer Tax ID)
- "direccion_empresa": (Buyer Address)
- "ciudad_empresa": (Buyer City)
- "proveedor": (Vendor Name)
- "direccion_proveedor": (Vendor Address)
- "nit_proveedor": (Vendor Tax ID)
- "proyecto": (Project Name)
- "orden_compra_no": (Purchase Order Number)
- "fecha_factura": (Invoice Date)
- "fecha_remision": (Remission Date)

Then extract the "detalle_entrada_almacen" table:
Return it as a list of rows with this format:
  - "concepto" (item name)
  - "vr_iva" (VAT amount)
  - "vr_total" (Total amount)

Lastly, return the summary:
- "subtotal"
- "iva_total"
- "total_general"

All numeric fields should be parsed cleanly without formatting characters (no $ or commas).
Return valid JSON only.
"""

# --- Run Extraction ---
# This runs in the same cell that renders the upload widget, so on the first
# pass no file has been chosen yet and only the "no file" notice is printed.
raw_text = extract_text_from_uploaded_pdf(uploader)

if raw_text:
    # Show just the first 1000 characters as a sanity check of the extraction.
    print("📄 PDF Text Preview:\n", raw_text[:1000], sep="\n")

    print("\n🧠 Sending to GPT...")
    result = extract_data_with_gpt(pdf_text=raw_text, prompt_instruction=prompt)

    print("\n📦 Extracted Structured Data:", result, sep="\n")


FileUpload(value=(), accept='.pdf', description='Upload')

❌ No file uploaded.


In [33]:
# Re-run of the extraction once a file has been chosen in the upload widget.
raw_text = extract_text_from_uploaded_pdf(uploader)

if raw_text:
    # Show just the first 1000 characters as a sanity check of the extraction.
    print("📄 PDF Text Preview:\n", raw_text[:1000], sep="\n")

    print("\n🧠 Sending to GPT...")
    result = extract_data_with_gpt(pdf_text=raw_text, prompt_instruction=prompt)

    print("\n📦 Extracted Structured Data:", result, sep="\n")

📄 PDF Text Preview:

Cuenta
Valor
143
PISCINAS
142121
119,469.73
170
COMUNAL
142064
41,000.93
171
PORTERÍA
142064
136,267.56
2
CIMENTACION
142002
653,448.59
3
ESTRUCTURA
142003
850,033.65
IVA Separable
13553001
342,041.89
Empresa
CONSTRUCCIONES OBYCON S.A.S.
196239
NIT
860527800
Dirección
Calle 93 b No. 13 - 92
Teléfono
6228080
Ciudad
BOGOTÁ D.C.
Fecha y hora de impresión: 23/06/2025 06:11 pm
ENTRADA DE ALMACÉN NO. 16300815
Proveedor
Proveedor:
ULTRACEM ZF S.A.S
NIT/CC:
900984889
Dirección:
ZF PERMANENTE PQUE CENTR
Teléfono:
3105923536
Entrada de almacén
Sucursal:
02063101 - BÁLTICO
Fecha:
09/06/2025
Proyecto:
02063101 - BÁLTICO
Factura No:
FEPZ55198
Bodega:
Bodega Principal
Fecha Factura:
09/06/2025
Orden de Compra No:
16300088
Remisión No:
30259723
Sitio de entrega:
Parque Heredia Báltico
Fecha Remisión:
07/06/2025
Descripción OC:
Pedidos Obra
Obervaciones OC:
Concreto para la construcción de las unidades estructurales de edificio de parqueaderos, comunal, portería y piscina
s. ,Conc