In [None]:
!pip install -U pymupdf

In [None]:
# OpenAI-compatible API settings. The values below are placeholders; fill them in before running.
API_BASE_URL = "https://"  # Base URL without the trailing /v1 ("/v1/chat/completions" is appended when the request is built)
api_key = "sk-"  # Sent as the Bearer token in the Authorization header of every request
MODEL = "gpt-4o"  # Model name sent with each request; requests carry an image, so it must accept image input

In [None]:
import os
import asyncio
import aiohttp
from pathlib import Path
import fitz
import base64
from IPython.display import display

# Size of the semaphore that limits how many page requests run at the same time
# for one PDF. DO NOT CHANGE IT (original author's instruction).
CONCURRENCY = 1
# Maximum number of attempts per page (the first attempt counts) before the
# page is reported as failed.
MAX_RETRIES = 3

def pdf_to_images(pdf_path, zoom_x=5, zoom_y=5, rotation_angle=0):
    """Render every page of a PDF file to PNG-encoded image bytes.

    Args:
        pdf_path: Path of the PDF file to open.
        zoom_x: Horizontal scale factor used when rendering a page.
        zoom_y: Vertical scale factor used when rendering a page.
        rotation_angle: Rotation, in degrees, applied to the render transform.

    Returns:
        A list of ``(page_number, image_bytes)`` tuples in page order, where
        ``page_number`` starts at 1 and ``image_bytes`` is the PNG data of
        the rendered page.
    """
    pdf = fitz.open(pdf_path)
    try:
        # The transform does not depend on the page, so build it only once.
        trans = fitz.Matrix(zoom_x, zoom_y).prerotate(rotation_angle)
        images = []
        for page_number, page in enumerate(pdf, start=1):
            pm = page.get_pixmap(matrix=trans, alpha=False)
            # Request PNG explicitly: the bytes are later embedded in an
            # "image/png" data URL, so the format must not depend on a default.
            images.append((page_number, pm.tobytes("png")))
        return images
    finally:
        # Close the document even when rendering a page raises.
        pdf.close()

async def process_image(session, image_data, semaphore, page_number, max_retries=MAX_RETRIES):
    """Send one page image to the chat-completions endpoint and return the model's text.

    The system prompt asks the model to OCR the table in the image and answer
    in CSV. A failed attempt (non-200 status, network error, unexpected
    response shape) is logged and retried with exponential backoff.

    Args:
        session: aiohttp client session used to issue the POST request.
        image_data: Raw bytes of the page image; sent base64-encoded inside an
            ``image/png`` data URL.
        semaphore: Semaphore that bounds how many requests run at once.
        page_number: Page number, used only in progress and error messages.
        max_retries: Maximum number of attempts for this page.

    Returns:
        The content of the first choice's message, or ``None`` when every
        attempt failed (or when ``max_retries`` is 0).
    """
    system_prompt = """
    OCR to extract text from the table. Return output in csv.
    Do not write code. Read the image directly.
    """

    for attempt in range(max_retries):
        try:
            # Build the request body before taking the semaphore so the slot
            # is only held for the duration of the HTTP exchange.
            encoded_image = base64.b64encode(image_data).decode('utf-8')
            payload = {
                "messages": [
                    {
                        "role": "system",
                        "content": system_prompt
                    },
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Analyze the image and provide the content in the specified format."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{encoded_image}"
                                }
                            }
                        ]
                    }
                ],
                "stream": False,
                "model": MODEL,
                "temperature": 0.5,
                "presence_penalty": 0,
                "frequency_penalty": 0,
                "top_p": 1
            }

            async with semaphore:
                # The context manager releases the connection on every exit
                # path, including the error raised just below.
                async with session.post(
                    f"{API_BASE_URL}/v1/chat/completions",
                    headers={"Authorization": f"Bearer {api_key}"},
                    json=payload,
                ) as response:
                    if response.status != 200:
                        raise Exception(f"Request failed, status code: {response.status}\n{await response.text()}")
                    result = await response.json()

            content = result['choices'][0]['message']['content']
            print(f"  Completed processing page {page_number}")
            return content
        except Exception as e:
            # Deliberately broad: any failure of this attempt should lead to a
            # retry instead of aborting the other pages.
            print(f"Error processing page {page_number} (attempt {attempt+1}/{max_retries}): {str(e)}")
            if attempt == max_retries - 1:
                print(f"Failed to process page {page_number}, reached maximum retry limit")
                return None
            # Exponential backoff: 1s, 2s, 4s, ... The sleep happens outside
            # the semaphore so a waiting page does not block the others.
            await asyncio.sleep(2 ** attempt)
    return None

async def process_pdf(pdf_file, output_dir):
    """Run OCR on every page of one PDF and write the results to a ``.md`` file.

    The output file is named after the PDF's stem and placed in
    ``output_dir``. If it already exists the PDF is skipped. The file is only
    created once all pages have been processed and at least one page produced
    content, so an interrupted or fully failed run does not leave an empty
    file behind that would make later runs skip this PDF.

    Args:
        pdf_file: Path of the PDF file to process.
        output_dir: Directory that receives ``<stem>.md``.

    Returns:
        None.
    """
    print(f"\nStarting to process file: {pdf_file}")

    # Derive the output path from the PDF's file name
    file_name = Path(pdf_file).stem
    output_file = Path(output_dir) / f"{file_name}.md"

    # An existing output file marks the PDF as already done
    if output_file.exists():
        print(f"File {output_file} already exists, skipping.")
        return

    # Convert PDF to images
    images = pdf_to_images(pdf_file)

    # Create asynchronous HTTP session
    async with aiohttp.ClientSession() as session:
        # Use semaphore to limit concurrency
        semaphore = asyncio.Semaphore(CONCURRENCY)

        # One task per page; gather() returns the results in task order,
        # i.e. in page order.
        tasks = [process_image(session, image_data, semaphore, page_number) for page_number, image_data in images]
        results = await asyncio.gather(*tasks)

    # Do not create an (empty) output file when nothing was recognized;
    # otherwise the exists() check above would skip this PDF on every rerun.
    if not any(results):
        print(f"No content recognized for {pdf_file}; no output file written.")
        return

    # Save results to output file in order, leaving out failed or empty pages
    with open(output_file, "w", encoding="utf-8") as f:
        for page_number, content in enumerate(results, 1):
            if content:
                print(f"  Saving content of page {page_number}")
                f.write(f"## Page {page_number}\n\n{content}\n\n")

    print(f"File {pdf_file} processed. Output file: {output_file}")

async def process_files(pdf_files, output_dir):
    """Run ``process_pdf`` on each given PDF, one after another.

    Args:
        pdf_files: Iterable of PDF paths, handled in iteration order.
        output_dir: Directory passed through to ``process_pdf`` for each file.
    """
    # Each file is awaited before the next one starts; files are never
    # processed in parallel.
    for current_pdf in pdf_files:
        await process_pdf(current_pdf, output_dir)

async def main(input_dir, output_dir):
    """Process every ``*.pdf`` file found directly inside ``input_dir``.

    Args:
        input_dir: Directory that is searched (non-recursively) for PDF files.
        output_dir: Directory for the results; created, including missing
            parents, before the search starts.
    """
    input_dir, output_dir = Path(input_dir), Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Collect the matches up front so the count can be reported.
    pdf_files = [*input_dir.glob("*.pdf")]
    total_files = len(pdf_files)

    # Nothing to do: report it and stop.
    if not total_files:
        print(f"Error: No .pdf files found in '{input_dir}' directory.")
        return

    print(f"Found {total_files} PDF files to process.")
    await process_files(pdf_files, output_dir)
    print("\nAll files processed.")

In [None]:
# Create ./output relative to the current working directory.
# NOTE(review): main() creates its own output_dir as well, so this call only
# matters when the working directory is not /content — confirm it is needed.
os.makedirs('output', exist_ok=True)
# Top-level await: this line needs an environment that accepts `await` outside
# a function (e.g. a notebook cell); it is not valid in a plain Python script.
await main(input_dir="/content", output_dir="/content/output")