In [None]:
# %pip install openai

In [None]:
import os
import re
import time

from openai import OpenAI
from openai import AzureOpenAI

# Azure OpenAI connection settings. The endpoint and key are read from the
# environment so secrets do not have to be saved inside the notebook; the empty
# fallbacks can still be edited by hand for a quick local experiment.
api_base = os.environ.get('AZURE_OPENAI_ENDPOINT', '') # your endpoint should look like the following https://YOUR_RESOURCE_NAME.openai.azure.com/
api_key = os.environ.get('AZURE_OPENAI_API_KEY', '')
deployment_name = 'gpt-4-vision'
api_version = '2023-12-01-preview' # this might change in the future

# The OCR / grounding enhancements sent via `extra_body` are served from the
# deployment's "extensions" route, so the client is pointed at that URL
# directly. The trailing slash is stripped so the endpoint works with or
# without one.
client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    base_url=f"{api_base.rstrip('/')}/openai/deployments/{deployment_name}/extensions",
)

In [None]:
import base64
from pathlib import Path

# Instruction text sent alongside every page image. It asks for raw markdown
# only (no code fences, no commentary) so the response can be written to a
# .md file as-is.
prompt = """
Give me the markdown text output from this page in a PDF using formatting to match the structure of the page as close as you can get.

Only output the markdown and nothing else. Do not explain the output, just return it.

Do not use a single # for a heading. All headings will start with ## or ###.

Convert tables to markdown tables.

Take great care to ensure the precision and accuracy of numbers especially longer series of digits - do not transpose or reorder the digits!

Describe charts and images as best you can and, when possible use mermaidjs format.

DO NOT return in a codeblock. Just return the raw text in markdown format.

Remove any irrelevant text from the markdown, returning the cleaned up version of the content. Examples include any images []() or 'click here' or 'Listen to this article' or page numbers or logos.
"""

def encode_image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def image_to_markdown(base64_image, retries=5, timeout=90):
    """Ask the vision deployment to transcribe one page image into markdown.

    Args:
        base64_image: Base64 text of the image; it is embedded in a
            ``data:image/jpeg;base64,...`` URL.
        retries: Maximum number of attempts before giving up. Values below 1
            are treated as a single attempt.
        timeout: Per-request timeout in seconds, forwarded to the client.

    Returns:
        The markdown text from the first choice, or ``""`` when every attempt
        failed or the response carried no content.
    """
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            response = client.chat.completions.create(
                # On Azure the `model` argument is the deployment name.
                model=deployment_name,
                messages=[
                    { "role": "system", "content": "You are a helpful assistant." },
                    { "role": "user", "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ] }
                ],
                extra_body={
                    "dataSources": [
                        {
                            "type": "AzureComputerVision",
                            "parameters": {
                                # Fill in the Computer Vision resource used for OCR / grounding.
                                "endpoint": "",
                                "key": ""
                            }
                        }],
                    "enhancements": {
                        "ocr": {
                            "enabled": True
                        },
                        "grounding": {
                            "enabled": True
                        }
                    }
                },
                max_tokens=4096,
                timeout=timeout,
            )
            # `content` may be None (e.g. a filtered response); normalise to ""
            # so callers can always treat the result as text.
            return response.choices[0].message.content or ""
        except Exception as e:
            # Deliberately broad: this is a best-effort batch step, so a failed
            # page is reported and retried instead of aborting the whole run.
            print(f"An error occurred: {e}")
            if attempt < attempts:
                # Exponential backoff, capped so a long outage does not stall for minutes per page.
                time.sleep(min(2 ** attempt, 30))
    return ""

def process_images_to_markdown(input_dir, output_dir, pages_per_folder=None):
    """Convert the page images of every sub-folder of ``input_dir`` to markdown files.

    For each sub-folder, a folder of the same name is created under
    ``output_dir`` and each image ``<stem>.<ext>`` is written as ``<stem>.md``.
    Pages whose markdown file already exists are skipped, so the function can
    be re-run to resume an interrupted batch.

    Args:
        input_dir: Directory containing one sub-folder of page images per PDF.
        output_dir: Directory that receives one sub-folder of ``.md`` files per PDF.
        pages_per_folder: If not ``None``, only the first this-many pages of
            each folder (in page order) are considered.
    """
    def page_order(path):
        # Natural sort: digit runs compare as integers so "page_2" comes before
        # "page_10". re.split with a capture group always alternates
        # text/digits, so odd positions are the numeric chunks.
        chunks = [int(chunk) if position % 2 else chunk
                  for position, chunk in enumerate(re.split(r'(\d+)', path.stem))]
        return chunks, path.stem

    for pdf_folder in Path(input_dir).iterdir():
        if not pdf_folder.is_dir():
            continue
        output_folder_path = Path(output_dir) / pdf_folder.name
        output_folder_path.mkdir(parents=True, exist_ok=True)

        # Only regular files can be encoded; nested directories are ignored.
        images = sorted((entry for entry in pdf_folder.iterdir() if entry.is_file()), key=page_order)
        for image_index, image_path in enumerate(images):
            if pages_per_folder is not None and image_index >= pages_per_folder:
                break  # Stop processing if the page limit is reached
            output_path = output_folder_path / f"{image_path.stem}.md"
            if output_path.exists():
                print(f"Skipping {output_path}, already exists.")
                continue
            print(f"Processing {image_path.name}...")
            base64_image = encode_image_to_base64(str(image_path))
            markdown_content = image_to_markdown(base64_image)
            if not markdown_content:
                # Writing an empty file would make the exists-check above skip
                # this page forever; leave it missing so a re-run retries it.
                print(f"No markdown returned for {image_path.name}, not writing {output_path}.")
                continue
            with open(output_path, 'w') as f:
                f.write(markdown_content)
            print(f"Markdown for {image_path.name} saved to {output_path}")
  
# Run the conversion: at most 25 pages are handled per PDF folder.
input_directory = './ToImages'
output_directory = './ToMarkdownGPT4VandOCRAndGrounding'
process_images_to_markdown(
    input_dir=input_directory,
    output_dir=output_directory,
    pages_per_folder=25,
)

print("All images converted to markdown.")

In [None]:
from pathlib import Path

def stitch_markdown_in_folders(output_directory):
    """Combine the per-page markdown files of each sub-folder into one document.

    For every sub-folder ``<name>`` of ``output_directory`` the ``*.md`` files
    inside it are concatenated in page order, each followed by a blank line,
    and written to ``<name>/<name>.md``. The combined file itself is excluded
    from the inputs, so running the function again produces the same result
    instead of stitching the previous output into itself.

    Args:
        output_directory: Directory whose sub-folders contain per-page ``.md`` files.
    """
    def page_order(path):
        # Natural sort: digit runs compare as integers so "page_2" comes before
        # "page_10". re.split with a capture group always alternates
        # text/digits, so odd positions are the numeric chunks.
        chunks = [int(chunk) if position % 2 else chunk
                  for position, chunk in enumerate(re.split(r'(\d+)', path.stem))]
        return chunks, path.stem

    output_dir_path = Path(output_directory)

    # Iterate over each folder in the output directory
    for folder_path in output_dir_path.iterdir():
        if not folder_path.is_dir():
            continue
        output_file_path = folder_path / f"{folder_path.name}.md"

        # Collect the page files, leaving out a combined file from an earlier run
        page_files = sorted(
            (candidate for candidate in folder_path.glob("*.md") if candidate != output_file_path),
            key=page_order,
        )

        markdown_content = []
        for markdown_file in page_files:
            with open(markdown_file, 'r') as file:
                markdown_content.append(file.read())
            markdown_content.append("\n\n")  # Add space between documents

        # Save the combined content to a new markdown file named after the folder
        with open(output_file_path, 'w') as output_file:
            output_file.write(''.join(markdown_content))
        print(f"All markdown files stitched into {output_file_path}")

# Example usage: stitch the per-page files under `output_directory`
# (the variable must already be defined in the kernel).
stitch_markdown_in_folders(output_directory=output_directory)
