In [1]:
from openai import OpenAI
from openai import AzureOpenAI

In [3]:
import os

# Credentials are read from the environment so that they never end up in the
# saved notebook. A missing variable fails immediately with a KeyError instead
# of producing a confusing authentication error on the first request.
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version="2024-02-15-preview",
)

# Smoke test: send a single user message to the "gpt-4" deployment and show
# the text of the first returned choice.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "user",
            "content": "Tell me a joke about a duck"
        }
    ]
)

print(response.choices[0].message.content)

Why did the duck go to therapy? Because he was feeling quacked up!


In [2]:
import time
import requests
import base64
import os
from pathlib import Path

# Instruction text sent alongside every page image. It is read as a module-level
# name by image_to_markdown, so the variable name must stay `prompt`.
prompt = """
Give me the markdown text output from this page in a PDF using formatting to match the structure of the page as close as you can get. 

Only output the markdown and nothing else. Do not explain the output, just return it. 

Do not use a single # for a heading. All headings will start with ## or ###. 

Convert tables to markdown tables. 

Take great care to ensure the precision and accuracy of numbers especially longer series of digits - do not transpose or reorder the digits! 

Describe charts and images as best you can and, when possible use mermaidjs format.

DO NOT return in a codeblock. Just return the raw text in markdown format.

Remove any irrelevant text from the markdown, returning the cleaned up version of the content. Examples include any images []() or 'click here' or 'Listen to this article' or page numbers or logos.
"""

def encode_image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded as UTF-8 so the value can be embedded in a JSON payload.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def image_to_markdown(base64_image, retries=5, timeout=90, api_key=None):
    """Send one base64-encoded page image to the chat-completions endpoint and
    return the markdown text of the first choice.

    Args:
        base64_image: Base64 text of the image; it is embedded in a
            ``data:image/jpeg;base64,...`` URL regardless of the real format.
        retries: Maximum number of POST attempts.
        timeout: Per-request timeout in seconds, passed to ``requests.post``.
        api_key: Optional API key. When omitted, the key is read from the
            ``AZURE_OPENAI_API_KEY`` environment variable so that no secret
            is stored in the notebook.

    Returns:
        The ``choices[0].message.content`` value from the JSON response.

    Raises:
        ValueError: If no API key is supplied and the environment variable is unset.
        requests.exceptions.HTTPError: Immediately, for 4xx responses other
            than 408/429, since repeating the same request cannot succeed.
        Exception: When every attempt failed with a retryable error; the last
            underlying error is attached as ``__cause__``.
    """
    api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY")
    if not api_key:
        raise ValueError(
            "No API key available: pass api_key or set AZURE_OPENAI_API_KEY"
        )

    url = "https://ai-dougwareai685749536435.openai.azure.com/openai/deployments/gpt-4/chat/completions?api-version=2024-02-15-preview"
    headers = {
        "Content-Type": "application/json",
        "api-key": api_key
    }
    payload = {
        "model": "gpt-4",
        "messages": [{
            "role": "user",
            "content": [{
                "type": "text",
                "text": prompt
            }, {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}"
                }
            }]
        }],
        "max_tokens": 4096
    }

    last_error = None
    for attempt in range(retries):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=timeout)
            response.raise_for_status()
            return response.json()['choices'][0]['message']['content']
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code if e.response is not None else None
            # Client errors (bad request, bad credentials, ...) will fail the
            # same way on every attempt; only 408/429 are worth retrying.
            if status is not None and 400 <= status < 500 and status not in (408, 429):
                raise
            last_error = e
            delay = 10
            print(f"HTTP Error: {e}, retrying...")
        except requests.exceptions.ConnectionError as e:
            last_error = e
            delay = 30  # matches the wait announced in the message below
            print(f"Connection Error: {e}, retrying in 30 seconds...")
        except requests.exceptions.Timeout as e:
            last_error = e
            delay = 10
            print(f"Timeout Error: {e}, retrying...")
        # Back off between attempts, but do not wait after the final failure.
        if attempt < retries - 1:
            time.sleep(delay)
    raise Exception("Failed to get response from server after multiple retries") from last_error

def process_images_to_markdown(input_dir, output_dir):
    """Convert every page image under ``input_dir`` into a markdown file.

    Each sub-directory of ``input_dir`` is treated as one document. A directory
    with the same name is created under ``output_dir`` and every file in the
    document directory is converted to ``<stem>.md`` there. Pages whose output
    file already exists are skipped, so an interrupted run can be resumed.

    Args:
        input_dir: Directory containing one sub-directory of images per document.
        output_dir: Directory that receives the per-document markdown folders.
    """
    for pdf_folder in Path(input_dir).iterdir():
        if not pdf_folder.is_dir():
            continue

        output_folder_path = Path(output_dir) / pdf_folder.name
        output_folder_path.mkdir(parents=True, exist_ok=True)

        # Only regular files can be images; nested directories (for example
        # tool-created checkpoint folders) would crash the encoder.
        images = sorted(
            (entry for entry in pdf_folder.iterdir() if entry.is_file()),
            key=lambda x: x.stem,
        )
        for image_path in images:
            output_path = output_folder_path / f"{image_path.stem}.md"
            if output_path.exists():
                print(f"Skipping {output_path}, already exists.")
                continue
            print(f"Processing {image_path.name}...")
            base64_image = encode_image_to_base64(str(image_path))
            markdown_content = image_to_markdown(base64_image)
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent characters in the model output. write_text validates
            # the value before opening the file, so a non-string response does
            # not leave behind an empty file that later runs would skip.
            output_path.write_text(markdown_content, encoding="utf-8")
            print(f"Markdown for {image_path.name} saved to {output_path}")

# Run the conversion for every document folder.
input_directory = './ToImages'  # source: one sub-folder of page images per PDF
output_directory = './ToMarkdownNoOcr'  # destination for the generated markdown
process_images_to_markdown(input_dir=input_directory, output_dir=output_directory)

print("All images converted to markdown.")


Skipping ToMarkdownNoOcr\entra-identity-multi-tenant-organizations\Page_001.md, already exists.
Skipping ToMarkdownNoOcr\entra-identity-multi-tenant-organizations\Page_002.md, already exists.
Skipping ToMarkdownNoOcr\entra-identity-multi-tenant-organizations\Page_003.md, already exists.
Skipping ToMarkdownNoOcr\entra-identity-multi-tenant-organizations\Page_004.md, already exists.
Skipping ToMarkdownNoOcr\entra-identity-multi-tenant-organizations\Page_005.md, already exists.
Skipping ToMarkdownNoOcr\entra-identity-multi-tenant-organizations\Page_006.md, already exists.
Skipping ToMarkdownNoOcr\entra-identity-multi-tenant-organizations\Page_007.md, already exists.
Skipping ToMarkdownNoOcr\entra-identity-multi-tenant-organizations\Page_008.md, already exists.
Skipping ToMarkdownNoOcr\entra-identity-multi-tenant-organizations\Page_009.md, already exists.
Skipping ToMarkdownNoOcr\entra-identity-multi-tenant-organizations\Page_010.md, already exists.
Skipping ToMarkdownNoOcr\entra-identity-

In [1]:
from pathlib import Path

def _read_markdown(path):
    """Read a page as UTF-8, falling back to the platform default encoding
    for files written before the encoding was made explicit."""
    try:
        return path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        return path.read_text()


def stitch_markdown_in_folders(output_directory):
    """Combine the per-page markdown files of each document into one file.

    For every sub-directory of ``output_directory`` the ``*.md`` files are read
    in order of their stem, each followed by a blank line, and written to
    ``<folder>/<folder>.md``.

    The combined file lives in the same folder and also matches ``*.md``, so it
    is excluded from the inputs; otherwise every re-run would append the
    previous combined output to the new one.

    Args:
        output_directory: Directory containing one markdown folder per document.
    """
    output_dir_path = Path(output_directory)

    # Iterate over each folder in the output directory
    for folder_path in output_dir_path.iterdir():
        if not folder_path.is_dir():
            continue

        output_file_path = folder_path / f"{folder_path.name}.md"

        # Collect the page files, leaving out a combined file from an earlier run
        page_files = sorted(
            (p for p in folder_path.glob("*.md") if p.name != output_file_path.name),
            key=lambda x: x.stem,
        )

        markdown_content = []
        for markdown_file in page_files:
            markdown_content.append(_read_markdown(markdown_file))
            markdown_content.append("\n\n")  # Add space between documents

        # Save the combined content to a new markdown file named after the folder
        output_file_path.write_text(''.join(markdown_content), encoding="utf-8")
        print(f"All markdown files stitched into {output_file_path}")

# Build one combined markdown file per document folder.
output_directory = './ToMarkdownNoOcr'
stitch_markdown_in_folders(output_directory=output_directory)


All markdown files stitched into ToMarkdownNoOcr\entra-identity-multi-tenant-organizations\entra-identity-multi-tenant-organizations.md
All markdown files stitched into ToMarkdownNoOcr\EP New Hire QDIA 2024 11.15.23 FINAL\EP New Hire QDIA 2024 11.15.23 FINAL.md
