<a href="https://colab.research.google.com/github/espickle1/claude-agents/blob/main/utils/pdf_to_markdown_and_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install if needed
!pip install pypdf

## Load library
from pypdf import PdfReader
import re
from google.colab import files
import io

## Function to clean imported text
def clean_text_for_claude(text):
    """
    Normalize raw PDF text into tidy paragraphs.

    Runs of spaces are collapsed to a single space, every line is stripped,
    consecutive non-blank lines are merged into one paragraph (joined by a
    single space), and paragraphs are separated by exactly one blank line.
    Blank lines act purely as paragraph separators; an input with no
    non-blank lines yields an empty string.
    """
    # Collapse runs of spaces first so merged lines never carry double spaces
    collapsed = re.sub(r' +', ' ', text)

    paragraphs = []
    pending = []  # stripped lines belonging to the paragraph being built

    for raw_line in collapsed.split('\n'):
        stripped = raw_line.strip()

        if stripped:
            pending.append(stripped)
        elif pending:
            # A blank line closes the paragraph collected so far
            paragraphs.append(' '.join(pending))
            pending = []

    # Flush whatever is left when the text does not end with a blank line
    if pending:
        paragraphs.append(' '.join(pending))

    joined = '\n\n'.join(paragraphs)

    # Safeguard: never emit more than one blank line between paragraphs
    return re.sub(r'\n{3,}', '\n\n', joined)

## Extract text from uploaded pdf
def read_pdf(pdf_file):
    """
    Extract the text of a PDF and return it in two renderings.

    Args:
        pdf_file: Raw PDF content as ``bytes`` (wrapped in an in-memory
            stream before parsing), or any other value, which is handed to
            ``PdfReader`` unchanged (e.g. a file path).

    Returns:
        dict with three keys:
        - 'human_readable': text of each page preceded by a banner of
          '=' rules containing "PAGE <n>"
        - 'markdown': all page texts joined by blank lines and passed
          through clean_text_for_claude
        - 'num_pages': number of pages that yielded text; pages whose
          extracted text is empty are skipped and not counted
    """
    # Raw bytes need a stream wrapper; everything else goes straight through
    source = io.BytesIO(pdf_file) if isinstance(pdf_file, bytes) else pdf_file
    reader = PdfReader(source)

    # Keep only pages that produced text, remembering their 1-based position
    numbered_texts = (
        (number, page.extract_text())
        for number, page in enumerate(reader.pages, start=1)
    )
    pages = [
        {'page_num': number, 'text': content}
        for number, content in numbered_texts
        if content
    ]

    # Format 1: each page's text under a "PAGE n" banner
    rule = '=' * 60
    human_readable = ''.join(
        f"\n{rule}\nPAGE {entry['page_num']}\n{rule}\n{entry['text']}"
        for entry in pages
    )

    # Format 2: pages separated by blank lines, then cleaned into paragraphs
    combined = '\n\n'.join(entry['text'] for entry in pages)
    markdown = clean_text_for_claude(combined)

    return {
        'human_readable': human_readable,
        'markdown': markdown,
        'num_pages': len(pages)
    }

# Upload PDF file
print("Upload your PDF file:")
uploaded = files.upload()

# Nothing uploaded (e.g. the dialog was dismissed): fail with a clear message
# instead of an IndexError from indexing an empty mapping below.
if not uploaded:
    raise RuntimeError("No file was uploaded - re-run this cell and choose a PDF.")

# Get the uploaded filename (only the first file is processed)
pdf_filename = next(iter(uploaded))
print(f"\nProcessing: {pdf_filename}")

# Process the PDF
result = read_pdf(pdf_filename)

print(f"\n✓ Successfully extracted {result['num_pages']} pages")

# Save outputs
with open('article_human.txt', 'w', encoding='utf-8') as f:
    f.write(result['human_readable'])

with open('article_markdown.md', 'w', encoding='utf-8') as f:
    f.write(result['markdown'])

print("\n✓ Files created - downloading now...\n")

# Automatically download both files
files.download('article_human.txt')
files.download('article_markdown.md')

# Preview the cleaned markdown
preview_chars = 2000
markdown_text = result['markdown']

print("\n" + "="*60)
print("PREVIEW - Clean markdown (first 2000 chars):")
print("="*60)
print(markdown_text[:preview_chars])
# Only claim truncation when the preview actually cut something off
if len(markdown_text) > preview_chars:
    print("\n[...truncated...]")
print(f"\nTotal markdown length: {len(markdown_text)} characters")