In [1]:
import fitz  # pip install PyMuPDF
import io
from PIL import Image
import os

In [2]:
# Current working directory of the kernel at the time this cell runs.
path = os.getcwd()

# Function to convert a PDF to images, with three pages stacked vertically per image
def convert_pdf_to_images(pdf_path, output_path, pages_per_image=3):
    """Render a PDF into JPEG images, stacking consecutive pages vertically.

    Pages are taken in groups of ``pages_per_image``. Each group is pasted
    top-to-bottom onto a white canvas, encoded as JPEG, written to
    ``output_path`` as ``0.jpg``, ``1.jpg``, ... and collected in a list.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        output_path: Directory that receives the numbered ``.jpg`` files.
            It is created if it does not exist.
        pages_per_image: Number of pages stacked on one image (default 3).

    Returns:
        A list with the JPEG-encoded bytes of every combined image, in order.

    Raises:
        ValueError: If ``pages_per_image`` is smaller than 1.
    """
    if pages_per_image < 1:
        raise ValueError("pages_per_image must be at least 1")

    # An empty output_path means "current directory"; makedirs('') would fail.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # List to hold the byte data of images
    images_bytes = []

    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as pdf_document:
        page_count = pdf_document.page_count
        if page_count == 0:
            return images_bytes

        # Every slot on the canvas is sized after the first page of the PDF.
        first_rect = pdf_document[0].rect
        canvas_size = (int(first_rect.width),
                       int(first_rect.height * pages_per_image))
        slot_height = int(first_rect.height)

        # Go through the pages one group at a time
        for first_page in range(0, page_count, pages_per_image):
            # Blank white canvas tall enough for a full group of pages
            combined_image = Image.new('RGB', canvas_size, 'white')

            # The last group may hold fewer pages; the rest of the canvas stays white.
            last_page = min(first_page + pages_per_image, page_count)
            for slot, page_index in enumerate(range(first_page, last_page)):
                pix = pdf_document[page_index].get_pixmap()
                img = Image.open(io.BytesIO(pix.tobytes()))
                combined_image.paste(img, (0, slot_height * slot))

            # Encode the combined image as JPEG in memory
            img_byte_arr = io.BytesIO()
            combined_image.save(img_byte_arr, format='JPEG')
            jpeg_bytes = img_byte_arr.getvalue()

            # Write each image exactly once, named after its position in the list.
            image_output_path = os.path.join(output_path, str(len(images_bytes)) + '.jpg')
            with open(image_output_path, "wb") as img_file:
                img_file.write(jpeg_bytes)

            images_bytes.append(jpeg_bytes)

    return images_bytes

In [3]:
import base64
import requests

# OpenAI API key, read from the OPENAI_API_KEY environment variable so the
# secret is never stored in the notebook. Set the variable before starting the
# kernel; an empty string is used when it is not set.
api_key = os.environ.get("OPENAI_API_KEY", "")

# Function to encode the image
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [4]:
def request(image_path, timeout=120):
    """Send one image to the OpenAI chat completions endpoint for LaTeX transcription.

    The image file is base64-encoded with ``encode_image`` and embedded in the
    request as a ``data:image/jpeg`` URL next to a fixed transcription prompt.

    Args:
        image_path: Path of the image file to transcribe.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled connection cannot block the notebook forever.

    Returns:
        The decoded JSON body of the HTTP response.
    """
    base64_image = encode_image(image_path)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    # The backslash before "includegraphics" is escaped explicitly; a bare "\i"
    # is an invalid escape sequence. The resulting text is unchanged.
    prompt = (
        "Transcribe the image content exclusively into LaTeX format, suitable for immediate compilation without any modifications. "
        "The transcription should include all textual content, accurately reflecting the lecture material. "
        "Replace images or non-textual elements that cannot be represented in LaTeX with the token [TBD]. "
        "Avoid adding document creation commands or \\includegraphics; focus on replicating the lecture content cleanly and directly. "
        "Any necessary explanations or notes should be included as comments within the LaTeX code, using the '%' symbol. "
        "The goal is to provide a LaTeX transcription that consists solely of the lecture's text content, formatted and ready for direct use in a LaTeX editor. "
        "Don't use beamers of frames."
    )

    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        },
                    },
                ],
            }
        ],
        "max_tokens": 2000,
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    return response.json()

In [5]:
def extract_latex_code(response_text):
    """Extract the LaTeX body from a model reply, dropping document boilerplate.

    The first fenced block opened with ```latex is preferred. If there is
    none, the first fence of any kind (for example a bare ``` or ```tex) is
    used. If the reply contains no fence at all, the whole text is treated as
    LaTeX. Lines containing document-setup commands are removed.

    Args:
        response_text: The text content of the model reply.

    Returns:
        The remaining LaTeX lines joined with newlines.
    """
    # Splitting the text into lines
    lines = response_text.split("\n")

    # Prefer an explicit ```latex fence ...
    start_index = next((i for i, line in enumerate(lines) if "```latex" in line), None)
    if start_index is None:
        # ... otherwise accept any opening fence. With no fence at all,
        # -1 makes the slice below start at the first line.
        start_index = next(
            (i for i, line in enumerate(lines) if line.strip().startswith("```")),
            -1,
        )

    # The block ends at the first bare closing fence after the opening line.
    end_index = next(
        (i for i in range(start_index + 1, len(lines)) if lines[i].strip() == "```"),
        len(lines),
    )

    # Extracting the LaTeX block
    latex_block = lines[start_index + 1:end_index]

    # Filtering out document-setup commands. "\\documentclass" has no trailing
    # brace so that "\documentclass[11pt]{article}" is removed as well.
    unwanted_commands = (
        "\\includepackage",
        "\\begin{document}",
        "\\end{document}",
        "\\documentclass",
        "\\usepackage",
    )
    latex_code = [
        line for line in latex_block
        if not any(cmd in line for cmd in unwanted_commands)
    ]

    return "\n".join(latex_code)

In [15]:
import time

# Expected directory layout (a bit messy, but it works): p01_output, p02_output, ..., p22_output (one folder per lecture).
# Each folder contains a variable number of JPG images, each holding 3 consecutive lecture slides (which are then fed to GPT-4, one image at a time).

def main():
    """Transcribe the slide images of each selected lecture into ``text.txt``.

    For every lecture number in the configured range, the numbered JPEG files
    in ``pNN_output`` are sent to ``request`` in numeric order. The LaTeX
    extracted from each reply is appended to the running text, and the text
    is rewritten to ``pNN_output/text.txt`` after every image so that partial
    progress survives an interruption. Processing stops entirely once the
    request cap is reached.
    """
    max_requests = 100  # hard cap on API calls per run
    counter = 0
    # change range if needed. This selects the range of lectures (between 1 and 22)
    for lecture_number in range(20, 23):
        # Folder names use a zero-padded, two-digit lecture number.
        lectures = f"{lecture_number:02d}"
        dir_path = "p" + lectures + "_output"

        # Only numbered JPEG files are slide images. This skips the text.txt
        # written below, which would otherwise break the numeric sort on a
        # second run.
        slide_files = [
            name for name in os.listdir(dir_path)
            if name.lower().endswith((".jpg", ".jpeg"))
            and name.split('.')[0].isdecimal()
        ]
        sorted_files = sorted(slide_files, key=lambda x: int(x.split('.')[0]))

        text = ""
        for file in sorted_files:
            file_path = os.path.join(dir_path, file)
            if counter == max_requests:
                # Stop the whole run, not just this lecture, so later lectures
                # are not reported as done without being processed.
                print("Request cap of", max_requests, "reached; stopping.")
                return
            response = request(file_path)
            counter += 1

            cleaned_latex = extract_latex_code(response["choices"][0]["message"]["content"])
            text += "\n\n\n" + cleaned_latex

            # Rewrite the accumulated text after every image (checkpoint).
            # UTF-8 is explicit because the transcription may contain
            # non-ASCII characters.
            write_path = os.path.join(dir_path, "text.txt")
            with open(write_path, "w", encoding="utf-8") as f:
                f.write(text)

            # up to 3 requests per minute are allowed
            time.sleep(21)

        print("Done with lecture", lectures)

In [None]:
# Run the pipeline when this cell/script is executed directly.
# The module name in that case is "__main__" (with double underscores).
if __name__ == "__main__":
    main()