Prepare the 'env.txt' file by uploading a copy of your .env file, or make a new one with the below contents:
```yaml

# Anthropic
ANTHROPIC_API_KEY=<YOUR ANTHROPIC API KEY>

# OpenAI
OPENAI_API_KEY=<YOUR OPENAI API KEY>

```

In [5]:
%pip install python-dotenv -q

Note: you may need to restart the kernel to use updated packages.


In [6]:
from dotenv import load_dotenv
import os
# Load API keys into the process environment. Called with no arguments,
# load_dotenv() searches for a file named `.env`.
# NOTE(review): the setup instructions mention 'env.txt' -- confirm the file
# name, or pass the path explicitly (e.g. load_dotenv('env.txt')).
load_dotenv()

True

In [7]:
# System prompt for the page-parsing step: it is the default `system_prompt`
# of process_page and is sent together with each scanned page image.
# The text is model input, so any edit to it changes the parsing behaviour.
# NOTE(review): rule 6 refers to "the triple backticks" without an explicit
# instruction to wrap the reply in them; process_page strips a surrounding
# fence when one is present -- confirm the intended reply format.
markdown_parser_system_prompt = '''

You're a helpful assistant for blind students helping to read the texts with formulas from images.

You read the text with formulas and mathematic symbols and convert it to markdown using LaTeX for them using syntax from examples below.

User will send you the scanned pages of a book one by one, and you would respond with the markdown code. 

1. Don't include the header with the chapter title and footnotes in the content.
2. Use markdown for titles, lists, tables, etc.
3. Don't add anything, just respond with the parsed text. 
4. For illustrations, insert [illustration, page N], N is the page number (top right for even or left for odd). Don't try to parse the text on illustrations.
5. Always use "$" notation for inline formulas, and "$$" for displayed formulas.
6. Don't add anything outside of the triple backticks.

Example of a LaTeX formula in Markdown:
```
Here is an inline formula: $E = mc^2$.

And here is a displayed formula:
$$
E = mc^2
$$

```

Example of markdown titles:
```
# Chapter 2
# Data Preparation

## 7.1 Introduction

### 7.2.1 Representative-Based Algorithms

#### 7.2.1.1 k-Modes Clustering
```
'''

In [8]:
%pip install langchain_core langchain_openai -q

Note: you may need to restart the kernel to use updated packages.


In [10]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
import os
import time
import concurrent.futures

# SETTINGS
# NOTE: `model`, `max_tokens`, `first_page`, `last_page` and `markdown_folder`
# are plain module-level names that the later translation cell assigns again.
# process_page reads `model` / `max_tokens` at call time, so re-run this cell
# before parsing pages again.
model = "gpt-4o"  # model name passed to ChatOpenAI
max_tokens = 4096  # passed to ChatOpenAI as max_tokens
first_page = 429  # first page number (inclusive) used to build the image URLs
last_page = 473  # last page number (inclusive) used to build the image URLs
markdown_folder = './markdown'  # folder that receives the parsed .md files
multi_stream = True  # True: pages run concurrently in a thread pool; False: one at a time
# Placeholder value; not referenced by the visible code (process_page defaults
# to markdown_parser_system_prompt instead).
system_prompt = "Your system prompt here"


def process_page(image_url, markdown_folder, system_prompt=markdown_parser_system_prompt):
    """Transcribe one scanned page image to Markdown and save it to disk.

    The image URL is sent together with ``system_prompt`` to a ``ChatOpenAI``
    model configured from the module-level ``model`` and ``max_tokens``
    settings. The reply is unwrapped from its surrounding code fence and
    written as UTF-8 to ``<markdown_folder>/<image base name>.md``.

    Args:
        image_url: URL of the page image; its base name (without extension)
            becomes the Markdown file name.
        markdown_folder: Directory for the output file. Created if missing.
        system_prompt: System message sent to the model.

    Returns:
        The status string ``'Processed page <image_url>'``.
    """
    base_name = os.path.basename(image_url)
    md_filename = os.path.splitext(base_name)[0] + '.md'
    file_path = os.path.join(markdown_folder, md_filename)

    def unwrap_code_fence(reply):
        """Return ``reply`` without the code fence wrapped around it, if any."""
        stripped = reply.strip()
        # Check the tagged fence first so the "markdown" language tag does not
        # end up as the first line of the saved file.
        for opening in ('```markdown', '```'):
            if stripped.startswith(opening):
                body = stripped[len(opening):]
                # The closing fence can be missing when the reply was cut off
                # at max_tokens; the opening fence is still removed then.
                if body.endswith('```'):
                    body = body[:-3]
                return body.strip()
        return reply

    def save_markdown(input_str, file_path):
        """Write the unwrapped reply to ``file_path``."""
        # Create the target folder on first use; an empty folder name means
        # the current directory, which os.makedirs would reject.
        if markdown_folder:
            os.makedirs(markdown_folder, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(unwrap_code_fence(input_str))

    chat = ChatOpenAI(model=model, max_tokens=max_tokens)

    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(
            content=[
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                },
                {"type": "text", "text": "Please read the text on the image."},
            ]
        )
    ]

    output = chat.invoke(messages)

    save_markdown(output.content, file_path)
    print(f'Saved Markdown to {file_path}')
    return f'Processed page {image_url}'


# List of image URLs to process
image_urls = [
    f'https://translation-demo.s3.eu-central-1.amazonaws.com/images/page_{str(page).zfill(3)}.jpeg'
    for page in range(first_page, last_page + 1)
]

if multi_stream:
    # Multi-streaming with ThreadPoolExecutor
    failures = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Map each future back to its URL so a failure can name the page.
        future_to_url = {
            executor.submit(process_page, image_url, markdown_folder): image_url
            for image_url in image_urls
        }

        # Report every page as it finishes; one failed page must not hide the
        # results of the others.
        for future in concurrent.futures.as_completed(future_to_url):
            try:
                print(future.result())
            except Exception as exc:
                failed_url = future_to_url[future]
                failures.append((failed_url, exc))
                print(f'Failed to process {failed_url}: {exc!r}')

    # Still fail the cell (with the original exception type) once every page
    # has been reported.
    if failures:
        raise failures[0][1]
else:
    # Single stream processing
    for image_url in image_urls:
        result = process_page(image_url, markdown_folder)
        print(result)
        time.sleep(0.5)  # Wait 0.5 seconds

NotFoundError: Error code: 404 - {'error': {'message': 'The model `gpt-4-o` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}

In [7]:
!pip install -U --quiet langchain langchain_anthropic anthropic

In [17]:
# System prompt template for the translation step (English -> Ukrainian).
# It is the default `system_prompt` of translate_file, where the
# `{dictionary}` placeholder is filled with the `dictionary` string below.
# The text is model input, so any edit to it changes the translation behaviour.
# NOTE(review): the prompt contains typos ("pssible", "You allowed"); they are
# left untouched here because changing them alters what the model receives.
translate_system_prompt = '''

You are a top-notch technical translator, specializing in translating scientific markdown files on COMPUTER SCIENCE. 

User will send you a file contents in English, and you will translate it to Ukrainian. For specific terms, refer to the dictionary below.

1. Keep the meaning as close as pssible to original, remember that it is a technical document for data science specialists.
2. You are NOT allowed to change the structure or formatting of the document.
3. You are NOT allowed to change the Latex formulas.
4. You respond in pure markdown format, without any additional content.
5. Don't add anything, just translate the text.
6. You allowed to use only common names for terms. If you don't find the term in the dictionary, keep it in English.
7. If there is a piece of code, leave it as it is, don't translate it.

<dictionary>
{dictionary}
</dictionary>

'''

# Glossary inserted into the prompt: one "english term - ukrainian term" entry
# per line.
dictionary = "\n".join([
    "bloom filter - фільтр Блума",
    "count–min sketch - скетч count–min"
])

In [26]:
import os
import time

from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate

# Settings for the translation step.
# NOTE: `model`, `max_tokens`, `markdown_folder`, `first_page` and `last_page`
# reuse the names assigned in the page-parsing cell, so running this cell
# changes what process_page sees until that cell is run again.
model = 'claude-3-sonnet-20240229'  # model name passed to ChatAnthropic
max_tokens = 4096  # passed to ChatAnthropic as max_tokens
markdown_folder = './markdown'  # folder holding the parsed English .md files
output_folder = './markdown_ua'  # folder that receives the translated files
first_page = 424  # first page number (inclusive) to translate
last_page = 424  # last page number (inclusive) to translate

def save_markdown(input_str, file_path):
    """Write a model reply to ``file_path`` without its wrapping code fence.

    A reply that starts with ```` ```markdown ```` or ```` ``` ```` is treated
    as wrapped: the opening fence is removed and, if the reply also ends with
    a fence (ignoring trailing whitespace), the closing fence is removed too.
    A reply with no opening fence is written unchanged, so a document that
    merely ends with a code block keeps its closing fence.

    Args:
        input_str: Text returned by the model.
        file_path: Destination file; overwritten, written as UTF-8.
    """
    content = input_str
    # Check the tagged fence first so the "markdown" language tag is removed
    # along with the backticks.
    for opening in ('```markdown', '```'):
        if input_str.startswith(opening):
            content = input_str[len(opening):].lstrip()
            # Only a reply that opened with a wrapper fence can have a wrapper
            # fence to close; tolerate whitespace after the closing fence.
            trimmed = content.rstrip()
            if trimmed.endswith('```'):
                content = trimmed[:-3].rstrip()
            break

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

def translate_file(file_name, output_folder, system_prompt=translate_system_prompt):
    """Translate one Markdown file and save the result next to the others.

    The file is read as UTF-8 and sent as the human message to a
    ``ChatAnthropic`` model configured from the module-level ``model`` and
    ``max_tokens`` settings. ``system_prompt`` is used as the system message
    template and its ``{dictionary}`` placeholder is filled from the
    module-level ``dictionary``. The reply is written via ``save_markdown``
    to ``<output_folder>/<source base name>_ua.md``.

    Args:
        file_name: Path of the Markdown file to translate.
        output_folder: Directory for the translated file. Created if missing.
        system_prompt: System message template containing ``{dictionary}``.

    Returns:
        The source text that was read from ``file_name`` (not the translation).
    """
    os.makedirs(output_folder, exist_ok=True)
    llm = ChatAnthropic(temperature=0, model_name=model, max_tokens=max_tokens)

    # The parsed pages are written as UTF-8, so read them back the same way
    # instead of relying on the platform's default encoding.
    with open(file_name, 'r', encoding='utf-8') as f:
        text = f.read()

    # The page text is passed as a template variable, so braces inside it
    # (e.g. in LaTeX) are not interpreted as placeholders.
    human_message = "{text}"
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", human_message)
    ])
    chain = prompt | llm

    response = chain.invoke({"dictionary": dictionary, "text": text})

    base_name = os.path.splitext(os.path.basename(file_name))[0] + '_ua.md'
    file_path = os.path.join(output_folder, base_name)
    save_markdown(response.content, file_path)
    print(f'Saved translation to {file_path}')
    return text

# Translate every page in the configured range, one request at a time.
for page in range(first_page, last_page + 1):
    page_string = str(page).zfill(3)
    # Build the path with os.path.join, as the rest of the notebook does.
    file_name = os.path.join(markdown_folder, f'page_{page_string}.md')
    translate_file(file_name, output_folder)
    print(f'Processed page {page}')
    # Pause between requests; there is nothing to wait for after the last page.
    if page != last_page:
        time.sleep(0.5)  # Wait 0.5 seconds

Saved translation to ./markdown_ua/page_424_ua.md
Processed page 424


In [None]:
# Scratch cell for trying out fence stripping on a pasted reply. With the
# empty `input_str` below the condition is false and `content` is not assigned.
input_str=''''''

if input_str.startswith('```markdown'):
    # NOTE: str.rstrip('```') treats its argument as a set of characters, so it
    # removes every trailing backtick rather than exactly one closing fence.
    content = input_str[len('```markdown'):].rstrip(' ').rstrip('\n').rstrip('```').strip()