In [9]:
%pip install pdf2image Pillow anthropic litellm python-dotenv


Note: you may need to restart the kernel to use updated packages.


In [10]:
import os
import base64
import anthropic
from pdf2image import convert_from_path
from PIL import Image
from litellm import completion

from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Now you can access the environment variables using os.getenv()


True

In [50]:
# Cell 6: Configuration - input PDF, folder for rendered pages, and the public base URL
# under which the rendered page images are served.
pdf_path = os.path.join(os.getcwd(), 'pdf', 'Sharp73-Ch1.pdf')
output_folder = os.path.join(os.getcwd(), 'pdf', 'pages')
# Not referenced by the other cells shown here; presumably litellm picks the key up
# from the environment itself - TODO confirm before removing.
api_key = os.environ.get('ANTHROPIC_API_KEY')
# The tunnel hostname is environment-specific, so allow overriding it with NGROK_URL
# instead of editing the notebook. The value is normalised to end with exactly one "/"
# because page file names are appended directly to it.
ngrok_url = os.environ.get(
    'NGROK_URL',
    "https://a9c7-2a01-799-eef-a00-69a8-1da0-151d-4568.ngrok-free.app/",
).rstrip('/') + '/'



In [55]:
# Instruction text sent to the model together with each page image. It asks for a
# line-by-line transcription in which every line is prefixed with "<page>:<line>" and a tab.
# NOTE: this is a regular (non-raw) string, so the "\t" sequences below end up as real
# tab characters in the prompt text.
custom_prompt = """
<instructions>
Format the given text according to these rules:

1. Follow the instructions below.

2. Line Numbering:
   - Prefix every line (including blank lines) with "PageNumber:LineNumber\t".
   - Example: "10:1\t" for page 10, line 1.
   - Reset line numbers to 1 at the start of each new page.

3. Content Formatting:
   - After the tab following the line number, reproduce the line exactly as it appears in the original text.
   - For blank lines, include the line number and tab, then leave the rest of the line empty.
   - For paragraphs, include the line number and tab, then reproduce the paragraph exactly as it appears in the original text. Make sure to include the indentation of the first line of each paragraph.

4. Preserve Original Structure:
   - Maintain all original line breaks, even mid-sentence.
   - Keep all blank lines, indentations, and spacing as they appear in the source.
   - Retain all section headers, paragraph breaks, and special formatting (e.g., lists, bullet points) exactly as they appear. 
   - Do not include footer content, the last line of each page, which usually contains section- and chapter names. Skip this line.
   - Inline numbered references in superscript should be formatted as [1] [2] [3] etc.

5. Continuous Numbering:
   - Number all lines sequentially within a page, including blank lines and headers.

6. Multi-Page Handling:
   - For documents spanning multiple pages, repeat steps 1-5 for each page.

7. Special Characters:
   - Preserve all special characters, symbols, and non-standard formatting exactly as they appear in the original text.

Apply this format consistently to the entire document, regardless of content type or structure. The goal is to create a representation that allows for precise reconstruction of the original document layout while adding systematic page and line references.
</instructions>

"""

In [41]:
# Cell 2: Function to convert PDF to images
def pdf_to_images(pdf_path, output_folder, quality=80):
    """Render every page of a PDF to a grayscale JPEG file.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Folder that receives the files; created (with parents) if missing.
        quality: JPEG quality used both when rendering and when writing the file.

    Returns:
        List of written file paths, in page order, named ``page_<N>.jpg`` with N
        starting at 1.
    """
    pages = convert_from_path(
        pdf_path=pdf_path,
        fmt='jpeg',
        jpegopt={'quality': quality},
        grayscale=True,
    )
    # exist_ok=True replaces the separate exists() check, which could race with
    # another process creating the folder in between.
    os.makedirs(output_folder, exist_ok=True)

    image_paths = []
    for page_number, page in enumerate(pages, start=1):
        image_path = os.path.join(output_folder, f'page_{page_number}.jpg')
        # Pass the quality explicitly: without it the file is re-encoded at the
        # imaging library's default instead of the quality requested above.
        page.save(image_path, 'JPEG', quality=quality)
        image_paths.append(image_path)
    return image_paths

# Render the configured PDF; the returned list of page image paths is the cell output.
pdf_to_images(pdf_path=pdf_path, output_folder=output_folder)

['/Users/knutole/graphlabai/blog/pdf/pages/page_1.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_2.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_3.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_4.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_5.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_6.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_7.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_8.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_9.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_10.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_11.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_12.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_13.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_14.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_15.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_16.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pages/page_17.jpg',
 '/Users/knutole/graphlabai/blog/pdf/pag

In [58]:
# Cell 4: Function to send an image URL plus prompt to the model and return the extracted text
def extract_text_from_image(image_url, custom_prompt, model="claude-3-5-sonnet-20240620"):
    """Ask the model to transcribe the image found at ``image_url``.

    A single user message is sent containing two parts: the prompt text and an
    ``image_url`` part carrying the URL unchanged.

    Args:
        image_url: URL of the page image.
        custom_prompt: Instruction text sent alongside the image.
        model: Model identifier handed to ``completion``; defaults to the value
            that was previously hard-coded.

    Returns:
        Tuple ``(text, usage)``: the first choice's message content (``""`` when the
        response carries no content) and the response's ``usage`` object.
    """
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": custom_prompt},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    }

    # llm call
    response = completion(model=model, messages=[user_message])

    # content may be None when no text comes back; normalise to "" so callers can
    # write the result to a file without a TypeError.
    text = response.choices[0].message.content or ""

    return text, response.usage


# debug: only the first MAX_PAGES images are processed; set to None to process every page
MAX_PAGES = 5
text_folder = os.path.join(os.getcwd(), 'pdf', 'text')


def _page_number(filename):
    """Return N for a name shaped like '<prefix>_<N>.<ext>', or None if it has no such number."""
    stem = os.path.splitext(filename)[0]
    number = stem.partition('_')[2]
    return int(number) if number.isdecimal() else None


# get list of image paths
image_paths = os.listdir(output_folder)
# Keep only .jpg files that carry a page number, so an unrelated file in the folder
# is skipped instead of crashing the sort. Sorting on the integer keeps page_10
# after page_2, which a plain string sort would not.
jpg_image_paths = sorted(
    (
        path for path in image_paths
        if path.lower().endswith('.jpg') and _page_number(path) is not None
    ),
    key=_page_number,
)
jpg_image_paths = jpg_image_paths[:MAX_PAGES]

# Create the output folder once, not on every iteration.
os.makedirs(text_folder, exist_ok=True)

for image_path in jpg_image_paths:
    # Join with exactly one "/" whether or not ngrok_url ends with a slash.
    image_url = f"{ngrok_url.rstrip('/')}/{image_path}"
    print(image_url)

    text, usage = extract_text_from_image(image_url, custom_prompt)

    print(text)
    print(usage)

    # save text to file (page_<N>.jpg -> page_<N>.txt)
    filename_txt = os.path.splitext(image_path)[0] + '.txt'
    # Explicit UTF-8: the transcription may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    with open(os.path.join(text_folder, filename_txt), "w", encoding="utf-8") as file:
        file.write(text or "")




https://a9c7-2a01-799-eef-a00-69a8-1da0-151d-4568.ngrok-free.app/page_1.jpg
3:1	Sharp, Gene. 1973. The Politics of Nonviolent Action, Part
3:2	One: Power & Struggle. Boson: Porter Sargent Publishers.
3:3	
3:4	Introduction to Part One
3:5	&
3:6	Chapter 1
3:7	
3:8	
3:9	
3:10	
3:11	
3:12	INTRODUCTION
3:13	TO PART ONE
3:14	
3:15	Some conflicts do not yield to compromise and can be resolved only
3:16	through struggle. Conflicts which, in one way or another, involve the
3:17	fundamental principles of a society, of independence, of self-respect, or of
3:18	people's capacity to determine their own future are such conflicts. For their
3:19	resolution, regular institutional procedures are rarely available; it is even
3:20	doubtful that they could be completely adequate. Instead, in the belief
3:21	that the choice in these types of conflicts is between abject passive surren-
3:22	der and violence, and also that victory requires violence, people turn to the
3:23	threat and use of violence. The spe