## PDF Preprocessing and Embedding Pipeline

Replica do fluxo de `main.py` com geração adicional de embeddings usando `sentence-transformers/all-MiniLM-L6-v2`. Execute as células sequencialmente.

In [4]:
# (Opcional) Instale dependências necessárias
%pip install --quiet "unstructured[pdf]" sentence-transformers pymupdf4llm pdfminer.six pi-heif

Note: you may need to restart the kernel to use updated packages.


In [5]:
import json
import re
from pathlib import Path

from sentence_transformers import SentenceTransformer
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json
import pymupdf4llm

file_path = Path('documents')
base_file_name = 'Edital-Processo-Seletivo-Inteli_-Graduacao-2026_AJUSTADO'
pdf_path = file_path / f'{base_file_name}.pdf'
json_output_path = file_path / f'{base_file_name}-output.json'
chunks_output_path = file_path / f'{base_file_name}-chunks.json'
embeddings_output_path = file_path / f'{base_file_name}-embeddings.json'
print(f'PDF path: {pdf_path}')
print(f'JSON output: {json_output_path}')
print(f'Chunks output: {chunks_output_path}')
print(f'Embeddings output: {embeddings_output_path}')


PDF path: documents/Edital-Processo-Seletivo-Inteli_-Graduacao-2026_AJUSTADO.pdf
JSON output: documents/Edital-Processo-Seletivo-Inteli_-Graduacao-2026_AJUSTADO-output.json
Chunks output: documents/Edital-Processo-Seletivo-Inteli_-Graduacao-2026_AJUSTADO-chunks.json
Embeddings output: documents/Edital-Processo-Seletivo-Inteli_-Graduacao-2026_AJUSTADO-embeddings.json


In [6]:
def clean_text(text: str) -> str:
    """Clean and normalize text content."""
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('\ufb01', 'fi')
    text = text.replace('\ue009', 'tt')
    text = re.sub(r'^\d+$', '', text)
    text = re.sub(r'[•◦▪▫]', '•', text)
    return text.strip()

def determine_hierarchy_level(element: dict) -> str:
    element_type = element.get('type')
    text = element.get('text', '')
    if element_type == 'Title':
        if re.match(r'^\d+\.', text):
            level = len(text.split('.')[0])
            return f'level_{level}'
        return 'title_main'
    if element_type == 'ListItem':
        return 'list_item'
    return 'body'

def extract_section_info(element: dict) -> str:
    text = element.get('text', '')
    element_type = element.get('type')
    if element_type == 'Title' and re.match(r'^\d+\.', text):
        return text.split('.')[0] + '.' + text.split('.')[1].strip() if '.' in text else text
    if element_type == 'ListItem' and re.match(r'^\d+\.', text):
        return text
    return 'general'


In [8]:
def get_chunk_metadata(chunk_elements):
    if not chunk_elements:
        return {}
    return {
        'chunk_size': len(' '.join(chunk_elements)),
        'element_count': len(chunk_elements),
        'chunk_type': 'mixed'
    }

def extract_enhanced_metadata(element: dict) -> dict:
    metadata = element.get('metadata', {})
    return {
        'element_id': element.get('element_id'),
        'element_type': element.get('type'),
        'page_number': metadata.get('page_number'),
        'parent_id': metadata.get('parent_id'),
        'text_length': len(element.get('text', '')),
        'is_header': element.get('type') in ['Title'],
        'is_list_item': element.get('type') == 'ListItem',
        'is_table_content': element.get('type') in ['Table', 'TableRow'],
        'hierarchy_level': determine_hierarchy_level(element),
        'section': extract_section_info(element)
    }

def preprocess_elements(elements):
    processed_elements = []
    for element in elements:
        text = element.get('text', '').strip()
        if not text or len(text) < 10:
            continue
        if element.get('type') == 'Footer':
            continue
        cleaned_text = clean_text(text)
        if not cleaned_text:
            continue
        processed_elements.append({
            'text': cleaned_text,
            'metadata': extract_enhanced_metadata(element),
            'original_element': element
        })
    return processed_elements

def create_contextual_chunks(processed_elements, max_tokens=400):
    chunks = []
    current_section = 'Introduction'
    current_subsection = ''
    for element in processed_elements:
        text = element['text']
        element_type = element['metadata']['element_type']
        if element_type == 'Title' and any(char.isdigit() for char in text[:5]):
            current_section = text
            current_subsection = ''
        elif element_type == 'Title':
            current_subsection = text
        chunk_metadata = {
            **element['metadata'],
            'section': current_section,
            'subsection': current_subsection,
            'document_type': 'admission_notice',
            'language': 'portuguese'
        }
        chunks.append({'text': text, 'metadata': chunk_metadata})
    return chunks

def optimize_chunks(chunks, target_size=300):
    optimized = []
    i = 0
    while i < len(chunks):
        current_chunk = chunks[i]
        current_text = current_chunk['text']
        current_metadata = current_chunk['metadata']
        j = i + 1
        while (j < len(chunks) and len(current_text) < target_size and
               chunks[j]['metadata']['section'] == current_metadata['section']):
            combined_text = current_text + ' ' + chunks[j]['text']
            if len(combined_text) <= target_size * 1.5:
                current_text = combined_text
                j += 1
            else:
                break
        optimized.append({'text': current_text, 'metadata': current_metadata})
        i = j
    return optimized

def create_semantic_chunks(elements, max_chunk_size=512):
    chunks = []
    current_chunk = []
    current_size = 0
    for element in elements:
        element_text = element.get('text', '').strip()
        if not element_text:
            continue
        element_type = element.get('type')
        if element_type in ['Title'] and current_chunk:
            chunks.append({'text': ' '.join(current_chunk), 'metadata': get_chunk_metadata(current_chunk)})
            current_chunk = [element_text]
            current_size = len(element_text)
        else:
            if current_size + len(element_text) > max_chunk_size and current_chunk:
                chunks.append({'text': ' '.join(current_chunk), 'metadata': get_chunk_metadata(current_chunk)})
                current_chunk = [element_text]
                current_size = len(element_text)
            else:
                current_chunk.append(element_text)
                current_size += len(element_text)
    if current_chunk:
        chunks.append({'text': ' '.join(current_chunk), 'metadata': get_chunk_metadata(current_chunk)})
    return chunks

def preprocess_for_embedding(json_file_path):
    with open(json_file_path, 'r', encoding='utf-8') as f:
        elements = json.load(f)
    print(f'Loaded {len(elements)} elements from JSON')
    processed_elements = preprocess_elements(elements)
    print(f'Preprocessed {len(processed_elements)} elements')
    chunks = create_contextual_chunks(processed_elements)
    print(f'Created {len(chunks)} initial chunks')
    optimized_chunks = optimize_chunks(chunks)
    print(f'Optimized to {len(optimized_chunks)} chunks')
    embedding_ready_chunks = []
    for i, chunk in enumerate(optimized_chunks):
        embedding_ready_chunks.append({
            'id': f'chunk_{i}',
            'content': chunk['text'],
            'metadata': chunk['metadata']
        })
    return embedding_ready_chunks


In [9]:
def ensure_partitioned_pdf(pdf_path: Path, output_path: Path) -> None:
    if output_path.exists():
        print('JSON file already exists, skipping PDF extraction')
        return
    if not pdf_path.exists():
        raise FileNotFoundError(f'PDF file not found: {pdf_path}')
    print('Extracting elements from PDF...')
    elements = partition_pdf(filename=str(pdf_path))
    elements_to_json(elements=elements, filename=str(output_path))
    print('PDF extraction completed')

def generate_embeddings(chunks, model_name='sentence-transformers/all-MiniLM-L6-v2', normalize=True):
    if not chunks:
        print('No chunks available for embedding.')
        return []
    print(f'Loading embedding model: {model_name}')
    model = SentenceTransformer(model_name)
    texts = [chunk['content'] for chunk in chunks]
    embeddings = model.encode(
        texts,
        batch_size=32,
        show_progress_bar=True,
        normalize_embeddings=normalize
    )
    chunks_with_embeddings = []
    for chunk, vector in zip(chunks, embeddings):
        enriched = dict(chunk)
        enriched['embedding'] = vector.tolist()
        chunks_with_embeddings.append(enriched)
    print(f'Generated embeddings for {len(chunks_with_embeddings)} chunks')
    return chunks_with_embeddings


In [10]:
# Pipeline execution
ensure_partitioned_pdf(pdf_path, json_output_path)
print('Starting preprocessing pipeline...')
embedding_chunks = preprocess_for_embedding(json_output_path)
with open(chunks_output_path, 'w', encoding='utf-8') as f:
    json.dump(embedding_chunks, f, ensure_ascii=False, indent=2)
print(f'✅ Created {len(embedding_chunks)} chunks ready for embedding')
print(f'✅ Saved chunks to: {chunks_output_path}')
embedded_chunks = generate_embeddings(embedding_chunks)
with open(embeddings_output_path, 'w', encoding='utf-8') as f:
    json.dump(embedded_chunks, f, ensure_ascii=False, indent=2)
print(f'✅ Saved chunk embeddings to: {embeddings_output_path}')


JSON file already exists, skipping PDF extraction
Starting preprocessing pipeline...
Loaded 1050 elements from JSON
Preprocessed 922 elements
Created 922 initial chunks
Optimized to 269 chunks
✅ Created 269 chunks ready for embedding
✅ Saved chunks to: documents/Edital-Processo-Seletivo-Inteli_-Graduacao-2026_AJUSTADO-chunks.json
Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


Batches: 100%|██████████| 9/9 [00:05<00:00,  1.65it/s]


Generated embeddings for 269 chunks
✅ Saved chunk embeddings to: documents/Edital-Processo-Seletivo-Inteli_-Graduacao-2026_AJUSTADO-embeddings.json


In [11]:
# Preview first chunks
for i, chunk in enumerate(embedded_chunks[:3]):
    print(f'\n--- Chunk {i + 1} ---')
    print(f"ID: {chunk['id']}")
    print(f"Content: {chunk['content'][:200]}...")
    print(f"Content Length: {len(chunk['content'])}")
    print(f"Section: {chunk['metadata'].get('section', 'N/A')}")
    print(f"Element Type: {chunk['metadata'].get('element_type', 'N/A')}")
    print(f"Page: {chunk['metadata'].get('page_number', 'N/A')}")
    print(f"Embedding dims: {len(chunk['embedding'])}")
total_chars = sum(len(chunk['content']) for chunk in embedded_chunks)
avg_chunk_size = total_chars / len(embedded_chunks) if embedded_chunks else 0
print(f'\nTotal chunks: {len(embedded_chunks)}')
print(f'Total characters: {total_chars}')
print(f'Average chunk size: {avg_chunk_size:.1f} characters')



--- Chunk 1 ---
ID: chunk_0
Content: 1. Curso 2. Público-Alvo 3. Calendário 4. Inscrições 4.1.Taxa de inscrição 4.1.1.Solicitação de Isenção da taxa de inscrição 4.2.Política de reembolso 4.3.Confirmação de Inscrição...
Content Length: 179
Section: Introduction
Element Type: ListItem
Page: 2
Embedding dims: 384

--- Chunk 2 ---
ID: chunk_1
Content: 5. Sobre o Processo Seletivo 5.1.Eixo Prova 5.1.1.Quantidade de questões e formato 5.1.2.Uso de materiais de apoio 5.1.3.Duração Eixo Prova 5.1.4.Dinâmica da Prova Inteli 5.1.5.Eixo Prova - Formato On...
Content Length: 514
Section: Introduction
Element Type: ListItem
Page: 2
Embedding dims: 384

--- Chunk 3 ---
ID: chunk_2
Content: 6. Critérios de Avaliação e Desclassificação 6.1.Eixo Prova 6.1.1.Formato Online 6.1.2.Formato Presencial 6.2.Eixo Perfil 6.2.1.Redações e Atividades Extracurriculares 6.3.Eixo Projeto 7. Bolsas e Fin...
Content Length: 284
Section: Introduction
Element Type: ListItem
Page: 2
Embedding dims: 384

Total chunks: 2

In [12]:
# Section distribution overview
sections = {}
for chunk in embedded_chunks:
    section = chunk['metadata'].get('section', 'Unknown')
    sections[section] = sections.get(section, 0) + 1
for section, count in sorted(sections.items()):
    print(f'{section}: {count} chunks')


10 acer- tos: 1 chunks
10 acertos: 9 chunks
10.1. Calendário de Matrícula: 1 chunks
10.3. Vagas remanescentes: 17 chunks
10.5. Indeferimento de matrícula: 3 chunks
10.6. Cancelamento de matrícula: 26 chunks
100 pontos: 4 chunks
11 acer- tos: 1 chunks
11 acertos: 6 chunks
12 acer- tos: 1 chunks
12 acertos: 9 chunks
12/10/2025, solicitando nova confirmação.: 2 chunks
13 acertos: 6 chunks
14 acertos: 6 chunks
14 acertos 20 acertos: 1 chunks
15 acertos: 6 chunks
16 acertos: 1 chunks
18 acertos: 1 chunks
19 acer- tos: 1 chunks
19/10/2025 das 9h às 11h Realização do Eixo Prova Formato Remoto: 4 chunks
1ª chamada: 2 chunks
20 acer- tos: 4 chunks
20 acertos: 5 chunks
2ª chamada: 2 chunks
3ª chamada: 8 chunks
4.1. Taxa de Inscrição: 2 chunks
4.1.1. Solicitação de isenção da taxa de inscrição: 6 chunks
4.3. Confirmação de inscrição: 2 chunks
5.1. Eixo Prova: 4 chunks
5.1.1. Quantidade de questões e formato: 4 chunks
5.1.2. Uso de materiais de apoio: 2 chunks
5.1.3. Duração Eixo Prova: 1 chunks
5

In [13]:
# Convert the document to markdown using pymupdf4llm
md_text = pymupdf4llm.to_markdown(str(pdf_path))
Path('output.md').write_bytes(md_text.encode('utf-8'))
print('✅ Markdown export saved to output.md')


✅ Markdown export saved to output.md
