# The "Visual Logic" Challenge (PDFs & Tables)


In [1]:
# %pip install pymupdf4llm pypdf unstructured pdf2image pytesseract pillow
# %pip install langchain langchain-community langchain-openai
# %pip install chromadb sentence-transformers
# %pip install pandas numpy
# %pip install python-dotenv

In [2]:
import pymupdf4llm
import pathlib
import fitz  # PyMuPDF
import pandas as pd
from typing import List, Dict
from IPython.display import display, Markdown

def extract_pdf_with_structure(pdf_path: str) -> dict:
    """
    Extract a PDF's content while preserving its structure.

    The document is rendered to Markdown with PyMuPDF4LLM, then opened with
    PyMuPDF to collect document metadata and per-page text, image counts and
    detected tables.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        dict with three keys:
            'markdown_text': value returned by ``pymupdf4llm.to_markdown``.
            'metadata': dict with 'filename', 'num_pages', 'title', 'author'
                and 'creation_date'. Missing text fields are returned as ''.
            'pages': list with one dict per page (1-based 'page_number',
                'text', 'images' count, 'num_tables', and 'tables'). Each
                table entry holds 'table_id' (``page_<n>_table_<k>``),
                'data' (result of ``table.to_pandas()``) and 'bbox'.

    Raises:
        Whatever PyMuPDF / PyMuPDF4LLM raise for unreadable input; the
        document handle is closed before the exception propagates.
    """
    # Extract markdown (global document structure)
    md_text = pymupdf4llm.to_markdown(pdf_path)

    # The context manager releases the file handle even when table detection
    # or text extraction fails part-way through the document.
    with fitz.open(pdf_path) as doc:
        # NOTE(review): PyMuPDF documents `metadata` as possibly None (e.g. for
        # encrypted files) and individual fields may be None — normalise both
        # so callers always receive strings.
        doc_meta = doc.metadata or {}

        metadata = {
            'filename': pathlib.Path(pdf_path).name,
            'num_pages': len(doc),
            'title': doc_meta.get('title') or '',
            'author': doc_meta.get('author') or '',
            'creation_date': doc_meta.get('creationDate') or ''
        }

        pages = []

        for page_num, page in enumerate(doc, start=1):
            # Table ids are 1-based and scoped to the page they were found on.
            tables = [
                {
                    'table_id': f'page_{page_num}_table_{i}',
                    'data': table.to_pandas(),
                    'bbox': table.bbox
                }
                for i, table in enumerate(page.find_tables().tables, start=1)
            ]

            pages.append({
                'page_number': page_num,
                'text': page.get_text(),
                'images': len(page.get_images()),
                'num_tables': len(tables),
                'tables': tables
            })

    return {
        'markdown_text': md_text,
        'metadata': metadata,
        'pages': pages
    }

Consider using the pymupdf_layout package for a greatly improved page layout analysis.


In [3]:
# Run the extractor on the benchmark PDF; `result` is reused by later cells.
result = extract_pdf_with_structure("RAG_BENCHMARK.pdf")
print("Extracted {} pages".format(result['metadata']['num_pages']))

Extracted 14 pages


In [4]:
# Print only first table (quick sanity check).
# `table` is reused by the following cells, so fail with a clear message when
# the document has no tables instead of hitting a NameError (or silently
# showing a stale `table` left over from an earlier kernel state).
for page in result['pages']:
    if page['tables']:
        table = page['tables'][0]
        print(f"Table on page {page['page_number']}")
        break
else:
    raise ValueError("No tables were detected on any page of the document")

table['data']

Table on page 2


Unnamed: 0,Unit,Score,Rank
0,A1,78.4,3
1,B2,91.2,1
2,C7,66.9,5


In [5]:
# Raw Markdown source of the first table (shown as an escaped string repr).
table_markdown = table['data'].to_markdown(index=False)
table_markdown

'| Unit   |   Score |   Rank |\n|:-------|--------:|-------:|\n| A1     |    78.4 |      3 |\n| B2     |    91.2 |      1 |\n| C7     |    66.9 |      5 |'

In [6]:
# Render the table's Markdown as a formatted table in the notebook.
rendered_table = Markdown(table['data'].to_markdown(index=False))
display(rendered_table)

| Unit   |   Score |   Rank |
|:-------|--------:|-------:|
| A1     |    78.4 |      3 |
| B2     |    91.2 |      1 |
| C7     |    66.9 |      5 |