# Enrich OCR markdown from Azure Document Intelligence using GPT-4o to extract information from an image of the page and inject image descriptions and Mermaid diagrams

This requires you to first run the PdfToPageImages and DocIntelligencePipeline notebooks.

In [None]:
from openai import OpenAI

In [None]:
import os

# OpenAI API key used by every request in this notebook.
# Read it from the OPENAI_API_KEY environment variable so the secret never has
# to be saved inside the notebook; falls back to an empty string when unset.
ApiKey = os.environ.get("OPENAI_API_KEY", "")

In [None]:
# Build the SDK client from the key defined in the previous cell.
client = OpenAI(api_key=ApiKey)

# Smoke test: send one trivial text-only prompt to gpt-4o and show the reply,
# which confirms the key and the connection work before processing pages.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Tell me a joke about a duck"}],
)

print(response.choices[0].message.content)

In [None]:
import time
import requests
import base64
import os
from pathlib import Path

def encode_image_to_base64(image_path):
    """Return the contents of the file at *image_path* as a base64 text string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded as UTF-8 so the value can be embedded in a JSON payload.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def image_to_markdown(base64_image, md_text, retries=5, timeout=90):
    """Ask GPT-4o to enrich one page's OCR markdown using the page image.

    Builds a prompt that embeds ``md_text``, attaches the page image as a
    base64 data URL, posts both to the OpenAI chat completions endpoint and
    returns the text of the first choice. The request is authorised with the
    module-level ``ApiKey``.

    Args:
        base64_image: Base64-encoded image data; sent as a
            ``data:image/jpeg;base64,...`` URL.
        md_text: Markdown text for the same page, embedded in the prompt.
        retries: Maximum number of request attempts.
        timeout: Per-request timeout in seconds, passed to ``requests.post``.

    Returns:
        The ``content`` string of the first choice in the API response.

    Raises:
        requests.exceptions.HTTPError: Immediately, when the server answers
            with a 4xx status other than 408 or 429 -- such requests cannot
            succeed by being repeated.
        Exception: When all attempts failed; the last underlying error is
            chained as the cause.
    """
    prompt = f"""
    The provided image is a page of a document. 

    The following markdown formatted text is a true and accurate but incomplete conversion of the provided image. 
    Information contained in figures and other illustrations is missing.

    MARKDOWN START
    {md_text}
    MARKDOWN END

    Create new markdown that contains the EXACT original text and EXACT original tables WITHOUT alteration, 
    while ADDING full text descriptions in place of each figure or illustration.

    Following each text description, consider if it is possible to further describe the figure or illustration 
    using valid mermaidjs syntax and do so taking care to avoid syntax errors.  
    
    Following these guidelines ensures valid and correctly rendered MermaidJS diagrams.

    1. **Node IDs**: Use only alphanumeric characters and underscores (`_`). Avoid spaces and other special characters.
    2. **Labels with Special Characters**: Avoid special characters in labels. Enclose labels with spaces or special characters in double quotes within brackets.

    ### Example

    ```mermaid
    graph TD;
        OwnerTenantC["Owner tenant (C)"] --> MemberTenantA["Member tenant (A)"]
        OwnerTenantC["Owner tenant (C)"] --> MemberTenantB["Member tenant (B)"]
        MemberTenantA["Member tenant (A)"] --> MemberTenantB["Member tenant (B)"]
    ```

    ### Correct Usage

    - **Node ID**: `OwnerTenantC`
    - **Label**: `"Owner tenant (C)"`

    ### Avoid
    - Opening or closing params and other special characters in labels
    - Node IDs with spaces or special characters: `Owner tenant (C)`
    - Labels without quotes: `[Owner tenant (C)]`

    Only output the markdown and nothing else. Do not explain the output, just return it. 

    Remove any irrelevant text from the markdown, returning the cleaned up version of the content. 
    Examples include "<!-- PageHeader="Tell us about your PDF experience." -->", any images []() or 'click here' or 'Listen to this article' or page numbers or logos.
        
    DO NOT RETURN CODE BLOCKS WITH "```markdown" only return raw markdown text.
    """

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {ApiKey}"
    }
    payload = {
        "model": "gpt-4o",
        "messages": [{
            "role": "user",
            "content": [{
                "type": "text",
                "text": prompt
            }, {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}"
                }
            }]
        }],
        "max_tokens": 4096
    }

    last_error = None
    for attempt in range(retries):
        try:
            response = requests.post("https://api.openai.com/v1/chat/completions",
                                     headers=headers, json=payload, timeout=timeout)
            response.raise_for_status()
            return response.json()['choices'][0]['message']['content']
        except requests.exceptions.HTTPError as e:
            status = getattr(e.response, "status_code", None)
            # Client errors such as 400/401/403/404 will fail identically on
            # every attempt, so surface them at once instead of burning the
            # whole retry budget. 408 (timeout) and 429 (rate limit) are
            # transient and stay retryable, as do all 5xx responses.
            if status is not None and 400 <= status < 500 and status not in (408, 429):
                raise
            last_error = e
            delay = 10
            print(f"HTTP Error: {e}, retrying...")
        except requests.exceptions.ConnectionError as e:
            last_error = e
            delay = 30  # Matches the message; no extra 10 seconds on top
            print(f"Connection Error: {e}, retrying in 30 seconds...")
        except requests.exceptions.Timeout as e:
            last_error = e
            delay = 10
            print(f"Timeout Error: {e}, retrying...")
        # Pause between attempts to avoid hammering the server, but do not
        # wait after the final attempt -- there is nothing left to retry.
        if attempt < retries - 1:
            time.sleep(delay)
    raise Exception("Failed to get response from server after multiple retries") from last_error

def apply_replacements(markdown_content):
    """Swap the ``:selected:`` / ``:unselected:`` markers for emoji.

    Every occurrence of ``:selected:`` becomes a check mark and every
    occurrence of ``:unselected:`` becomes a cross; all other text is
    returned untouched.
    """
    substitutions = (
        (":selected:", "✅"),
        (":unselected:", "❌"),
    )
    result = markdown_content
    for marker, emoji in substitutions:
        result = result.replace(marker, emoji)
    return result

def process_images_to_markdown(input_dir, output_dir, md_input_dir, pages_per_folder=None):
    """Convert every page image under *input_dir* into enriched markdown.

    For each sub-folder of ``input_dir`` a folder with the same name is
    created under ``output_dir``. Each file in the sub-folder is paired with
    the markdown file ``md_input_dir/<folder>/<stem>.md``; both are sent to
    ``image_to_markdown`` and the result, after ``apply_replacements``, is
    written to ``output_dir/<folder>/<stem>.md``.

    Pages whose output file already exists are skipped, so an interrupted run
    can be resumed. Pages without a matching markdown file are skipped with a
    message.

    Args:
        input_dir: Directory containing one sub-folder of page images per document.
        output_dir: Directory that receives the generated markdown files.
        md_input_dir: Directory containing the per-page input markdown, laid
            out as ``<folder>/<stem>.md``.
        pages_per_folder: If given, only the first N pages of each folder
            (in natural page order) are considered.
    """
    import re  # Local import: only needed for the natural-sort key below

    def _natural_key(path):
        # re.split with a capturing group alternates text, digits, text, ...;
        # odd positions are always the digit runs. Comparing them as ints
        # makes "page_2" sort before "page_10".
        tokens = re.split(r"(\d+)", path.stem)
        return [int(tok) if i % 2 else tok for i, tok in enumerate(tokens)]

    for pdf_folder in Path(input_dir).iterdir():
        if not pdf_folder.is_dir():
            continue

        output_folder_path = Path(output_dir) / pdf_folder.name
        output_folder_path.mkdir(parents=True, exist_ok=True)

        # Only regular files can be images; a nested directory would make
        # open() fail further down.
        images = sorted((p for p in pdf_folder.iterdir() if p.is_file()), key=_natural_key)
        for image_index, image_path in enumerate(images):
            if pages_per_folder is not None and image_index >= pages_per_folder:
                break
            output_path = output_folder_path / f"{image_path.stem}.md"
            if output_path.exists():
                print(f"Skipping {output_path}, already exists.")
                continue
            print(f"Processing {image_path.name}...")

            # Check for the markdown first so the image is not read and
            # encoded for a page that is going to be skipped anyway.
            md_file_path = Path(md_input_dir) / pdf_folder.name / f"{image_path.stem}.md"
            if not md_file_path.exists():
                print(f"Markdown file {md_file_path} not found. Skipping {image_path.name}.")
                continue

            # Explicit UTF-8: the output contains emoji, which the platform
            # default encoding (e.g. cp1252 on Windows) cannot represent.
            with open(md_file_path, 'r', encoding='utf-8') as md_file:
                md_text = md_file.read()

            base64_image = encode_image_to_base64(str(image_path))
            markdown_content = image_to_markdown(base64_image, md_text)
            markdown_content = apply_replacements(markdown_content)

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
            print(f"Markdown for {image_path.name} saved to {output_path}")

# Usage example.
# The three folders below are relative to the notebook's working directory:
#   input_directory    - where the page images are stored
#   output_directory   - where the generated markdown should be saved
#   md_input_directory - where the input markdown files are stored
input_directory = './ToImages'
output_directory = './ToMarkdownGpt4oPlusDocIntelOCR'
md_input_directory = './ToMarkdownDocIntelligencePages'

process_images_to_markdown(
    input_dir=input_directory,
    output_dir=output_directory,
    md_input_dir=md_input_directory,
)

print("All images converted to markdown.")


In [None]:
from pathlib import Path

def stitch_markdown_in_folders(output_directory):
    """Combine the per-page markdown files of each folder into one file.

    For every sub-folder of ``output_directory`` the ``*.md`` files inside it
    are concatenated in natural page order (``page_2`` before ``page_10``),
    each followed by a blank line, and written to ``<folder>/<folder>.md``.

    The combined file itself is excluded from the inputs, so running the
    function again rebuilds it instead of appending it to itself.

    Args:
        output_directory: Directory whose sub-folders hold per-page markdown.
    """
    import re  # Local import: only needed for the natural-sort key below

    def _natural_key(path):
        # re.split with a capturing group alternates text, digits, text, ...;
        # odd positions are always the digit runs, compared here as ints.
        tokens = re.split(r"(\d+)", path.stem)
        return [int(tok) if i % 2 else tok for i, tok in enumerate(tokens)]

    # Iterate over each folder in the output directory
    for folder_path in Path(output_directory).iterdir():
        if not folder_path.is_dir():
            continue

        output_file_path = folder_path / f"{folder_path.name}.md"

        # The stitched file lives in the same folder and matches "*.md";
        # leave it out or a second run would duplicate every page.
        page_files = sorted(
            (p for p in folder_path.glob("*.md") if p.name != output_file_path.name),
            key=_natural_key,
        )

        markdown_content = []
        for markdown_file in page_files:
            # Explicit UTF-8: the pages may contain emoji that the platform
            # default encoding cannot represent.
            markdown_content.append(markdown_file.read_text(encoding="utf-8"))
            markdown_content.append("\n\n")  # Add space between documents

        # Save the combined content to a new markdown file named after the folder
        with open(output_file_path, 'w', encoding="utf-8") as output_file:
            output_file.write(''.join(markdown_content))
        print(f"All markdown files stitched into {output_file_path}")

# Combine the per-page markdown files in each sub-folder of output_directory
# (set in the previous cell) into one markdown file per folder.
stitch_markdown_in_folders(output_directory)
