In [None]:
from openai import OpenAI
from pdf2image import convert_from_path
from PIL import Image

import os
import base64
import requests
import io

def get_api_key(api_path):
    """
    Load an API key stored in a plain-text file.

    Args:
        api_path (str): Location of the file that holds the key.

    Returns:
        str: The file's contents with leading and trailing whitespace removed.
    """
    with open(api_path, 'r') as key_file:
        raw_key = key_file.read()
    return raw_key.strip()

def concatenate_images_vertically(images):
    """
    Stack images on top of each other, first image at the top.

    Args:
        images (list of PIL.Image.Image): Images to stack, in top-to-bottom order.

    Returns:
        PIL.Image.Image: A new RGB image as wide as the inputs and as tall as
            the sum of their heights.

    Raises:
        ValueError: If no images are given or the images differ in width.
    """
    if not images:
        raise ValueError("The list of images is empty")

    # Each entry is a (width, height) pair
    sizes = [img.size for img in images]

    # Stacking only makes sense when every page shares a single width
    distinct_widths = {width for width, _ in sizes}
    if len(distinct_widths) > 1:
        raise ValueError("All images must have the same width")

    canvas_width = sizes[0][0]
    canvas_height = sum(height for _, height in sizes)
    canvas = Image.new('RGB', (canvas_width, canvas_height))

    # Paste each image directly below the previous one
    top = 0
    for img in images:
        canvas.paste(img, (0, top))
        top += img.height

    return canvas

def encode_image(image_path):
    """
    Read an image file from disk and return its contents as base64 text.

    Args:
        image_path (str): Location of the image file.

    Returns:
        str: The raw file bytes, base64-encoded and decoded as UTF-8.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
    
def pil_image_to_base64(pil_image):
    """
    Serialize an in-memory image as PNG and return it as base64 text.

    Args:
        pil_image (PIL.Image.Image): The image to encode.

    Returns:
        str: The PNG bytes of the image, base64-encoded and decoded as UTF-8.
    """
    # Write the PNG into memory so nothing touches the disk
    with io.BytesIO() as png_buffer:
        pil_image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()

    return base64.b64encode(png_bytes).decode('utf-8')


def pdf2txt(pdf_path, txt_dir, dpi, api_path, max_tokens=1000, timeout=None):
    """
    OCR a PDF file with the OpenAI chat-completions API and save the text.

    Every page of the PDF is rendered to an image, the pages are stacked into
    one tall image, and that image is sent as a base64 PNG together with a
    Swedish transcription prompt. The model's reply is written to
    ``<txt_dir>/<PDF base name>.txt`` as UTF-8.

    Args:
        pdf_path (str): Path to the PDF file to OCR.
        txt_dir (str): Directory in which the resulting .txt file is saved.
        dpi (int): Resolution used when rendering the PDF pages to images.
        api_path (str): Path to a text file containing the OpenAI API key.
        max_tokens (int): Upper bound on the length of the model's reply.
            Long documents are cut off at this limit, so raise it for
            multi-page PDFs. Defaults to 1000.
        timeout (float or tuple, optional): Timeout in seconds passed to
            ``requests.post``. ``None`` (the default) waits indefinitely.

    Raises:
        ValueError: If the PDF yields no pages or its pages differ in width.
        requests.HTTPError: If the API answers with an error status code.
    """
    # splitext handles '.PDF' and extension-less names; a plain str.replace
    # would leave such names unchanged and the output could then overwrite
    # the source PDF when txt_dir is its own directory.
    pdf_stem, _ = os.path.splitext(os.path.basename(pdf_path))
    save_path = os.path.join(txt_dir, pdf_stem + '.txt')

    images = convert_from_path(pdf_path, dpi=dpi)
    concatenated_image = concatenate_images_vertically(images)
    base64_image = pil_image_to_base64(concatenated_image)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {get_api_key(api_path)}",
    }

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Du kommer bli presenterad av en bild av ett inscannat dokument. Din uppgift är att ordagrant transkribera texten i detta dokument. Det är av yttersta vikt att du inte inför fel eller hittar på saker som inte står i dokumentet. Om det är tecken i dokumentet som är otydliga, sätt då in tecknet '*' istället för att gissa. Här är bilden:"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # pil_image_to_base64 produces PNG data, so the
                            # declared MIME type must be image/png.
                            "url": f"data:image/png;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": max_tokens,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    # Surface API failures (bad key, rate limit, oversized image) as an HTTP
    # error instead of an opaque KeyError on the missing 'choices' field.
    response.raise_for_status()

    txt_content = response.json()['choices'][0]['message']['content']

    with open(save_path, 'w', encoding='utf-8') as file:
        file.write(txt_content)

In [None]:
# PDF file to OCR with GPT-4
pdf_path = '/your_path_here.pdf'
# Directory where the resulting .txt file will be saved
txt_dir = '/your_path_here'
# Resolution for the PDF-to-image conversion
dpi = 200
# Path to the file holding the API key
api_path = '/your_path_here'

pdf2txt(pdf_path=pdf_path, txt_dir=txt_dir, dpi=dpi, api_path=api_path)