Process PDF to markdown

In [None]:
import os
import base64
import requests
from pathlib import Path
from pdf2image import convert_from_path
from openai import OpenAI

In [None]:
# Look up the OpenAI key in the process environment; the result is None when
# the OPENAI_API_KEY variable is not set.
api_key = os.getenv("OPENAI_API_KEY")

In [None]:
# Repeats the import from the first cell, presumably so this cell can be run
# on its own.
from openai import OpenAI

# Module-level client used by process_images_in_directory() for its chat
# completion calls. No api_key is passed explicitly here.
# NOTE(review): the SDK is expected to fall back to the OPENAI_API_KEY
# environment variable in that case — confirm against the installed version.
client = OpenAI()

In [None]:
# Render every page of the source PDF to its own JPEG file so the pages can be
# processed one image at a time later in the notebook.
pdf_path = "/Users/gladyscallow/Library/CloudStorage/OneDrive-UniversityofCambridge/JESUS/CONGRUENCE ENGINE/RIVER POLLUTION/OCR EXTRACTION/part_ii_bradford/resized/test.pdf"
images_dir = "/Users/gladyscallow/Library/CloudStorage/OneDrive-UniversityofCambridge/JESUS/CONGRUENCE ENGINE/RIVER POLLUTION/OCR EXTRACTION/part_ii_bradford/resized/images"
# Single source of truth for the resolution, so the log line and the actual
# conversion can never disagree.
dpi = 300

# Saving a page fails if the target folder is missing, so create it up front
# (a no-op when it already exists).
os.makedirs(images_dir, exist_ok=True)

print(f"Converting PDF to images with DPI={dpi}...")
images = convert_from_path(pdf_path, dpi=dpi, fmt='jpeg')
total_pages = len(images)
# Width used to zero-pad page numbers (e.g. Page_01 ... Page_12) so that
# sorting the file names alphabetically reproduces the page order.
digits = len(str(total_pages))

for i, image in enumerate(images):
    image_path = os.path.join(images_dir, f"Page_{str(i+1).zfill(digits)}.jpeg")
    image.save(image_path, "JPEG")
    print(f"Page {i+1} saved as image: {image_path}")

In [None]:
def encode_image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode in one go, base64-encoded, and the
    encoded bytes are decoded as UTF-8 so the result is a ``str``.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# File suffixes treated as images, mapped to the MIME type announced in the
# data URL sent to the model.
# NOTE(review): confirm the vision endpoint actually accepts TIFF and BMP.
_IMAGE_MIME_TYPES = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png',
    '.tiff': 'image/tiff',
    '.bmp': 'image/bmp',
    '.gif': 'image/gif',
}

# Instruction sent alongside every page image.
_PAGE_TO_MARKDOWN_PROMPT = "Return the markdown text output from this page in a PDF using formatting to match the structure of the page as closely as you can. Only output the markdown and nothing else. Do not explain the output, just return it. Do not use a single # for a heading. All headings will start with ## or ###. Convert tables to markdown tables. DO NOT return in a code block. Just return the raw text in markdown format."


def process_images_in_directory(input_directory, output_directory, *, model="gpt-4o-mini", max_tokens=1500):
    """Transcribe every image in a folder to markdown, one text file per image.

    Files in ``input_directory`` are visited in order of their stem. Each
    image is base64-encoded with ``encode_image_to_base64``, sent to the
    module-level ``client`` with a fixed transcription prompt, and the reply
    is written to ``<stem>.txt`` in ``output_directory`` (created if needed).
    A problem with one image is reported with ``print`` and does not stop
    the rest of the batch.

    Args:
        input_directory: Folder containing the page images.
        output_directory: Folder that receives the ``.txt`` files.
        model: Chat model name passed to the API.
        max_tokens: Upper bound on the tokens generated per page.

    Returns:
        None.
    """
    input_dir = Path(input_directory)
    output_dir = Path(output_directory)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Sorting by stem keeps zero-padded page files in page order.
    candidates = sorted(input_dir.glob('*.*'), key=lambda x: x.stem)

    for image_path in candidates:
        mime_type = _IMAGE_MIME_TYPES.get(image_path.suffix.lower())
        # The is_file() check also skips directories whose names merely look
        # like image files.
        if mime_type is None or not image_path.is_file():
            print(f"Skipping non-image file: {image_path.name}")
            continue

        print(f"Processing {image_path.name}...")

        # Encode the image to base64; an unreadable file should only cost us
        # that one page, not the whole batch.
        try:
            base64_image = encode_image_to_base64(str(image_path))
        except OSError as e:
            print(f"Error reading image {image_path.name}: {e}")
            continue

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": _PAGE_TO_MARKDOWN_PROMPT,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # Announce the real media type instead of always
                            # claiming JPEG.
                            "url": f"data:{mime_type};base64,{base64_image}",
                        },
                    },
                ],
            }
        ]

        # Broad catch on purpose: this is the per-image boundary, and any API
        # failure should be reported and skipped.
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=0.1,
            )
        except Exception as e:
            print(f"Error calling OpenAI API for image {image_path.name}: {e}")
            continue

        choice = response.choices[0] if response.choices else None
        markdown_content = choice.message.content if choice is not None else None
        if markdown_content is None:
            # Check before opening the output file so a missing reply does
            # not leave an empty file behind.
            print(f"No markdown returned for image {image_path.name}; skipping")
            continue

        if choice.finish_reason == "length":
            # The reply hit the token limit, so the page text is incomplete.
            print(
                f"Warning: output for {image_path.name} was cut off at "
                f"max_tokens={max_tokens}; the saved text is incomplete"
            )

        output_file = output_dir / (image_path.stem + ".txt")
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
        except (OSError, UnicodeError) as e:
            print(f"Error saving markdown for image {image_path.name}: {e}")
        else:
            print(f"Saved markdown for {image_path.name} to {output_file.name}")


# Placeholder locations: point these at real folders before running this cell.
input_directory = 'input directory'      # folder that holds the page images
output_directory = 'output directory'    # folder that receives the generated text files

process_images_in_directory(
    input_directory=input_directory,
    output_directory=output_directory,
)