# Prova de conceito utilizando o modelo **winninghealth/olmOCR-2-7B-1025-INT4**


---
---


Observação: este notebook foi inteiramente projetado para ser executado no ambiente Google Colab!


---


## Instalação de dependências

Instalação das bibliotecas necessárias para o projeto


In [None]:
# System package providing the poppler command-line tools that pdf2image relies on to render PDF pages.
!apt-get install -y poppler-utils


In [None]:
# NOTE(review): auto-round is never imported directly in this notebook; presumably it is
# required to load the INT4-quantized checkpoint — confirm before removing.
!pip install auto-round


In [None]:
# json-repair provides repair_json and pdf2image provides convert_from_path, both imported below.
!pip install json-repair pdf2image


## Importação de bibliotecas

Importação de todas as bibliotecas necessárias para processamento de imagens, PDFs e inferência do modelo

- `Transformers`: para carregar o modelo pré-treinado e o tokenizador
- `Torch`: para manipulação de tensores e operações relacionadas a deep learning
- `PIL (Python Imaging Library)`: para manipulação e processamento de imagens
- `pdf2image`: para converter páginas de PDF em imagens
- `json_repair`: para reparar strings JSON malformadas (como as geradas pelo modelo)
- `json`: para manipulação de arquivos JSON


In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
from pdf2image import convert_from_path
from json_repair import repair_json
from pathlib import Path
from io import BytesIO
from PIL import Image

import base64
import torch
import json
import time


## Carregando o modelo

Carregando o modelo olmOCR-2-7B quantizado e seu processador


In [None]:
# Hub identifier shared by the processor and the model weights.
MODEL_ID = 'winninghealth/olmOCR-2-7B-1025-INT4'

# The processor is used later to build the chat prompt and the model inputs.
processor = AutoProcessor.from_pretrained(MODEL_ID)
print('Processor loaded successfully!')

# device_map='auto' leaves weight placement to the library.
model = AutoModelForVision2Seq.from_pretrained(MODEL_ID, device_map='auto')
print('Model loaded successfully!')

# Device string handed to the processing functions so inputs can be moved to it.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')


## Funções de pré-processamento

Funções para redimensionar imagens, converter PDFs em imagens e codificar imagens em base64


In [None]:
def resize_image(image, max_size = 1200):
    """
    Resize image maintaining aspect ratio with maximum dimension of max_size pixels.

    Images whose width and height both already fit within max_size are
    returned unchanged (the same object, not a copy). Larger images are
    shrunk so that their longer side equals max_size.

    Args:
        image: PIL Image object
        max_size: Maximum dimension in pixels (default: 1200)

    Returns:
        PIL Image object resized

    Raises:
        ValueError: If the image needs to be shrunk but max_size is smaller than 1.
    """
    width, height = image.size

    if width <= max_size and height <= max_size:
        return image

    if max_size < 1:
        raise ValueError(f'max_size must be at least 1 pixel, got {max_size}')

    # The longer side becomes exactly max_size; the shorter side is scaled by
    # the same ratio. int() truncates, so for very elongated images the short
    # side could become 0, which the resize call rejects - clamp it to 1 pixel.
    if width > height:
        new_width = max_size
        new_height = max(1, int((max_size / width) * height))
    else:
        new_height = max_size
        new_width = max(1, int((max_size / height) * width))

    return image.resize((new_width, new_height), Image.LANCZOS)


def convert_pdf_to_images(pdf_path, max_size = 1200):
    """
    Render every page of a PDF as an image and shrink each one to fit max_size.

    Pages are rendered with pdf2image's convert_from_path and then passed,
    one by one, through resize_image.

    Args:
        pdf_path: Path to PDF file
        max_size: Maximum dimension in pixels for each page (default: 1200)

    Returns:
        List of PIL Image objects, one per page, in page order
    """
    rendered_pages = convert_from_path(pdf_path)

    pages = []
    for page in rendered_pages:
        pages.append(resize_image(page, max_size))

    return pages


def image_to_base64(image):
    """
    Serialize an image as PNG and return the bytes as base64 text.

    Args:
        image: PIL Image object (anything exposing save(fp, format=...))

    Returns:
        Base64 encoded string of the PNG data
    """
    # Write the PNG into memory instead of touching the filesystem.
    with BytesIO() as png_buffer:
        image.save(png_buffer, format='PNG')
        png_bytes = png_buffer.getvalue()

    encoded = base64.b64encode(png_bytes)
    return encoded.decode('utf-8')


## Função de inferência

Função principal de inferência, que processa imagens e extrai informações estruturadas, e função auxiliar que interpreta a saída JSON do modelo


In [None]:
def extract_structured_info(image, processor, model, device):
    """
    Run the OCR model on one image and return its answer parsed as JSON.

    The image is embedded in a single-turn chat message as a base64 PNG data
    URL, the processor turns the chat into a prompt and model inputs, and the
    newly generated tokens are decoded and handed to parse_json_output.

    Args:
        image: PIL Image object
        processor: Model processor
        model: Vision2Seq model
        device: Device to run inference on

    Returns:
        Dictionary with extracted information
    """
    encoded_image = image_to_base64(image)
    data_url = f'data:image/png;base64,{encoded_image}'

    instruction_part = {'type': 'text', 'text': 'Extract structured information from this image in JSON format.'}
    image_part = {'type': 'image_url', 'image_url': {'url': data_url}}
    messages = [{'role': 'user', 'content': [instruction_part, image_part]}]

    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The pixels given to the processor come from the same PNG payload that
    # was embedded in the chat message.
    decoded_image = Image.open(BytesIO(base64.b64decode(encoded_image)))

    encoded_inputs = processor(
        text=[prompt],
        images=[decoded_image],
        padding=True,
        return_tensors='pt',
    )

    # Move every input tensor to the requested device.
    inputs = {}
    for name, tensor in encoded_inputs.items():
        inputs[name] = tensor.to(device)

    generated = model.generate(
        **inputs,
        temperature=0.1,
        # This value NEEDS to be adjusted based on expected output size!
        max_new_tokens=8192,
        num_return_sequences=1,
        do_sample=True,
    )

    # Keep only the positions after the prompt so the prompt is not decoded.
    prompt_token_count = inputs['input_ids'].shape[1]
    completion_tokens = generated[:, prompt_token_count:]
    decoded_texts = processor.tokenizer.batch_decode(completion_tokens, skip_special_tokens=True)

    first_answer = decoded_texts[0]
    return parse_json_output(first_answer)


def parse_json_output(raw_output):
    """
    Parse and repair JSON output from model.

    Markdown code-fence markers are stripped, the remaining text is passed
    through json_repair's repair_json, and the repaired string is decoded
    with json.loads.

    Args:
        raw_output: Raw string output from model

    Returns:
        Parsed JSON value (normally a dictionary). An empty dictionary is
        returned, with a printed warning, when no JSON could be recovered.
    """
    raw_text = raw_output.replace('```json', '').replace('```', '').strip()
    fixed_json = repair_json(raw_text)

    # repair_json can hand back an empty string when the text is beyond
    # repair; json.loads('') would raise and abort the whole batch, so report
    # the problem and return an empty result instead.
    if not fixed_json.strip():
        print('Warning: model output could not be parsed as JSON; returning an empty result')
        return {}

    return json.loads(fixed_json)


## Função principal de processamento

Função que processa diferentes tipos de documentos (imagens e PDFs)


In [None]:
def process_document(file_path, processor, model, device, max_size = 1200):
    """
    Process a document (image or PDF) and extract structured information.

    PDFs are converted to one image per page and every page is processed
    separately. Image files (.jpg, .jpeg, .png) are resized and processed as
    a single page. Progress and timing information is printed along the way.

    Args:
        file_path: Path to document file
        processor: Model processor
        model: Vision2Seq model
        device: Device to run inference on
        max_size: Maximum image dimension in pixels

    Returns:
        Dictionary with extracted information. For PDFs the shape is
        {'pdf': {'page_01': ..., 'page_02': ...}}; for images it is the
        extraction result itself.

    Raises:
        ValueError: If the file extension is not a supported format.
    """
    file_path = Path(file_path)
    file_extension = file_path.suffix.lower()

    start_time = time.time()

    if file_extension == '.pdf':
        print(f'Processing PDF: {file_path.name}')
        images = convert_pdf_to_images(str(file_path), max_size)

        results = {}
        for idx, image in enumerate(images, 1):
            print(f'  Processing page {idx}/{len(images)}...')
            page_start = time.time()

            page_data = extract_structured_info(image, processor, model, device)
            results[f"page_{idx:02d}"] = page_data

            page_time = time.time() - page_start
            print(f'  Page {idx} completed in {page_time:.2f} seconds')

        result = {'pdf': results}

    elif file_extension in ('.jpg', '.jpeg', '.png'):
        print(f'Processing image: {file_path.name}')

        # Keep the file open only while the pixels are needed; the context
        # manager closes the underlying file handle even if inference fails.
        with Image.open(file_path) as image:
            resized_image = resize_image(image, max_size)
            result = extract_structured_info(resized_image, processor, model, device)

    else:
        raise ValueError(f'Unsupported file format: {file_extension}')

    total_time = time.time() - start_time
    print(f'Total processing time: {total_time:.2f} seconds\n')

    return result


## Processamento dos documentos

Processamento de todos os documentos fornecidos para o desafio técnico


In [None]:
# Input documents for the challenge, all expected inside the data folder.
data_folder = Path('data_challenge')
documents = [
    data_folder / 'case_01_drivers_license.jpeg',
    data_folder / 'case_02_bill.jpg',
    data_folder / 'case_03_large_document.pdf'
]

# Extraction results keyed by file name without extension.
all_results = {}
missing_documents = []

for doc_path in documents:
    if not doc_path.exists():
        print(f'File not found: {doc_path}')
        missing_documents.append(doc_path)
        continue

    print(f'{"="*60}')
    result = process_document(doc_path, processor, model, device)
    all_results[doc_path.stem] = result

print(f'{"="*60}')
# Only claim full success when no input file was skipped.
if missing_documents:
    print(f'Processed {len(all_results)} of {len(documents)} documents; {len(missing_documents)} not found.')
else:
    print('All documents processed successfully!')


## Visualização dos resultados

Exibição dos resultados extraídos de cada documento


In [None]:
# Pretty-print each document's extraction result between separator rules.
for doc_name, result in all_results.items():
    rule = '=' * 60
    print('\n' + rule)
    print(f'Document: {doc_name}')
    print(rule)
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    print(json.dumps(result, ensure_ascii=False, indent=4))
    print()


## Salvando os resultados

Salvando os resultados em arquivos JSON individuais


In [None]:
output_folder = Path('data_challenge/results')
# parents=True also creates 'data_challenge' when it is missing, so this cell
# does not fail with FileNotFoundError on a fresh environment.
output_folder.mkdir(parents = True, exist_ok = True)

# Write one '<document>_result.json' file per processed document.
for doc_name, result in all_results.items():
    output_file = output_folder / f'{doc_name}_result.json'
    with open(output_file, 'w', encoding = 'utf-8') as f:
        json.dump(result, f, indent = 4, ensure_ascii = False)
    print(f'Saved: {output_file}')

print('\nAll results saved successfully!')
