# Translating a Jupyter book about Bio-image Analysis
In this notebook we will translate an existing Jupyter book about Bio-image Analysis from English to another language using a large language model. We use Claude 3.5 Sonnet.

In [1]:
import anthropic
import openai
import datetime
import os
from pathlib import Path
from functools import partial
from IPython.display import Markdown, display
import json
import shutil

openai.__version__, anthropic.__version__

('1.30.1', '0.29.0')

## Defining the content of the book
First we specify the source content of the book and the target language.

In [2]:
# Root folder of the English source book; it is walked recursively when the book is generated.
source_dir = "../BioImageAnalysisNotebooks/"

In [3]:
# Language the book is translated into; it is inserted into the instructions sent to the language model.
target_language = "Spanish"

We will also specify the location where to store the book:

In [4]:
# Folder the translated book is written to; an empty string yields target paths
# relative to the current working directory.
base_dir = ""
# NOTE(review): not referenced in the visible part of this notebook. The repository name
# looks German ("bio-bildanalyse") while target_language is "Spanish" - confirm it is the intended one.
repository_url = "https://github.com/generated-books/bio-bildanalyse-notebooks"

We will use this language model to generate the book:

In [5]:
# Model identifier; names containing "gpt" are routed to OpenAI, all others to Anthropic.
model = "claude-3-5-sonnet-20240620"

## Helper functions
Here we create some helper functions for prompting and for file format handling.

In [6]:
def prompt_chatGPT(message: "str | list[dict]", model="gpt-4o-2024-05-13"):
    """
    A prompt helper function that sends a message to openAI
    and returns only the text response.

    Parameters
    ----------
    message : str or list of dict
        Either a single user message, or a complete chat history given as a
        list of {"role": ..., "content": ...} dictionaries.
    model : str
        Name of the OpenAI chat model to use.

    Returns
    -------
    str
        The text content of the first choice in the response.

    Warns
    -----
    UserWarning
        If the model stopped because it reached its output token limit,
        in which case the returned text is incomplete.
    """
    import warnings
    import openai

    # convert message in the right format if necessary
    if isinstance(message, str):
        message = [{"role": "user", "content": message}]

    # setup connection to the LLM
    client = openai.OpenAI()

    # submit prompt
    response = client.chat.completions.create(
        model=model,
        messages=message
    )
    choice = response.choices[0]

    # A truncated answer would otherwise be indistinguishable from a complete
    # one and only fail later, e.g. when parsing a translated notebook as JSON.
    if choice.finish_reason == "length":
        warnings.warn(
            f"The response of {model} was cut off at the output token limit; "
            "the returned text is incomplete."
        )

    # extract answer
    return choice.message.content

In [7]:
def prompt_claude(message: "str | list[dict]", model="claude-3-5-sonnet-20240620", max_tokens=4096):
    """
    A prompt helper function that sends a message to anthropic
    and returns only the text response.

    Example models: claude-3-5-sonnet-20240620 or claude-3-opus-20240229

    Parameters
    ----------
    message : str or list of dict
        Either a single user message, or a complete chat history given as a
        list of {"role": ..., "content": ...} dictionaries.
    model : str
        Name of the Anthropic model to use.
    max_tokens : int
        Upper limit for the number of tokens the model may generate.

    Returns
    -------
    str
        The text of the first content block in the response.

    Warns
    -----
    UserWarning
        If the model stopped because it reached `max_tokens`, in which case
        the returned text is incomplete.
    """
    import warnings
    from anthropic import Anthropic

    # convert message in the right format if necessary
    if isinstance(message, str):
        message = [{"role": "user", "content": message}]

    # setup connection to the LLM
    client = Anthropic()

    # submit prompt (kept in its own variable instead of overwriting `message`)
    response = client.messages.create(
        max_tokens=max_tokens,
        messages=message,
        model=model,
    )

    # A truncated answer would otherwise be indistinguishable from a complete
    # one and only fail later, e.g. when parsing a translated notebook as JSON.
    if response.stop_reason == "max_tokens":
        warnings.warn(
            f"The response of {model} was cut off after {max_tokens} tokens; "
            "the returned text is incomplete."
        )

    # extract answer
    return response.content[0].text

In [8]:
# Bind the configured model to the matching backend so that the rest of the
# notebook can simply call `prompt(...)`. Model names containing "gpt" go to
# OpenAI, everything else goes to Anthropic.
if "gpt" in model:
    prompt = partial(prompt_chatGPT, model=model)
else:
    prompt = partial(prompt_claude, model=model)

In [9]:
def prompt_with_memory(message:str):
    """
    This function allows using an LLM in chat mode.
    The LLM is equipped with some memory,
    so that we can refer back to former conversation steps.

    Parameters
    ----------
    message : str
        The next user message of the conversation.

    Returns
    -------
    str
        The answer of the language model.

    The global `chat_history` is extended by the question and the answer.
    If the request fails, the exception propagates and `chat_history` is
    left untouched, so the history never ends with an unanswered question.
    """

    # convert message in the right format
    question = {"role": "user", "content": message}

    # receive answer; the question is only handed over in a temporary list so
    # that a failing request cannot leave a dangling user turn in the memory
    response = prompt(chat_history + [question])

    # convert answer in the right format and store both turns in memory
    answer = {"role": "assistant", "content": response}
    chat_history.extend([question, answer])

    return response

In [10]:
def is_valid_json(test_string):
    """
    This function returns if a string is formatted json.

    Returns True if `json.loads` can parse `test_string`, and False if it
    is malformed, not a string/bytes object, or nested too deeply to parse.
    """
    import json
    try:
        json.loads(test_string)
        return True
    # Only parsing failures mean "not JSON". A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated problems.
    except (TypeError, ValueError, RecursionError):
        return False

def ensure_json(notebook):
    """
    This function makes sure that the passed notebook is indeed a json-formatted ipynb file.

    If `notebook` already parses as JSON it is returned unchanged. Otherwise
    the language model is asked to extract the notebook, and the outermost
    JSON object ({...}) of its answer is returned. The result is not
    validated again; callers should still be prepared for a parsing error.
    """
    if is_valid_json(notebook):
        return notebook

    response = prompt(f"""
Take the following text and extract the Jupyter 
notebook ipynb/json from it:

{notebook}

Make sure the output is in ipynb/json format. 
Respond only the JSON content.
""")

    # A notebook is a single JSON object, so everything outside the outermost
    # pair of braces (markdown code fences, introductory or closing remarks)
    # is not part of it. str.strip("```json") is not suitable here: it removes
    # a *set of characters* from both ends rather than a prefix, and it cannot
    # remove surrounding prose. Code fences inside the notebook's own strings
    # are unaffected because they lie between the braces.
    text = response.strip()
    first_brace = text.find("{")
    last_brace = text.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        return text[first_brace:last_brace + 1]
    return text

## Context
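Here we provide some context to the language model. As the OpenAI and Anthropic APIs handle system messages differently, we do not use a system message at all. Instead, we start the conversation with a user message containing the instructions, followed by the assistant's confirmation.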

In [11]:
# Instructions for the translation; sent as the first user message of every conversation.
system_message = f"""
You are going to translate text from English to {target_language}. 
Do not modify any Python code. Do not translate it. Also do not translate strings in Python code.
In case you receive markdown files or Jupyter notebooks in JSON format, leave the structure of the document as it is and just translate the English text to {target_language}.

Confirm this with "ok".
"""

# Initial chat memory: the instructions plus a pre-written "ok" from the assistant,
# so that the first real request already continues an established conversation.
chat_history = [{"role": "user", "content": system_message}, {"role": "assistant", "content": "ok"}]

## Generating the book
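Here we walk through the source directory and generate the translated book: markdown files and notebooks are translated by the language model, all other files are copied unchanged. Files that already exist in the target location are skipped, so the cell can be re-run to retry files that failed.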

In [12]:
def translate_file(source_filename, target_filename):
    """
    Translate a markdown file or Jupyter notebook and save the result.

    Notebooks (.ipynb) are sent to the language model without their code
    outputs, because they might be too big otherwise. After translation the
    original outputs and execution counts are copied back into the
    translated notebook. All other files are sent as plain text.

    Parameters
    ----------
    source_filename : str
        Path of the file to translate.
    target_filename : str
        Path the translated file is written to (utf-8).

    Raises
    ------
    ValueError
        If the translated notebook does not have the same sequence of cells
        as the source notebook, as outputs could then not be assigned to the
        right cells. Also raised (as json.JSONDecodeError) if the translated
        notebook is not valid JSON. In both cases nothing is written.
    """
    is_notebook = source_filename.endswith(".ipynb")

    # read the source once; the parsed notebook is kept for restoring outputs
    with open(source_filename, 'r', encoding='utf-8') as f:
        if is_notebook:
            original_notebook = json.load(f)
        else:
            file_content = f.read()

    if is_notebook:
        # build a copy without outputs for the prompt, leaving the original intact
        stripped_cells = []
        for cell in original_notebook['cells']:
            if cell['cell_type'] == 'code':
                cell = {**cell, 'outputs': [], 'execution_count': None}
            stripped_cells.append(cell)
        file_content = json.dumps({**original_notebook, 'cells': stripped_cells}, indent=1)

    # run LLM translation
    translated_content = prompt_with_memory(file_content).strip("'").strip('"')

    if is_notebook:
        # the model sometimes wraps the notebook in code fences or remarks
        translated_notebook = json.loads(ensure_json(translated_content))

        old_cells = original_notebook['cells']
        new_cells = translated_notebook['cells']

        # zip() would silently shift outputs to the wrong cells if the model
        # added, dropped or re-typed a cell, so we verify the structure first
        if len(old_cells) != len(new_cells):
            raise ValueError(
                f"Translated notebook has {len(new_cells)} cells, "
                f"expected {len(old_cells)}: {source_filename}"
            )

        # copy code outputs from source to target notebook
        for index, (old_cell, new_cell) in enumerate(zip(old_cells, new_cells)):
            if old_cell['cell_type'] != new_cell.get('cell_type'):
                raise ValueError(
                    f"Cell {index} changed its type from {old_cell['cell_type']!r} "
                    f"to {new_cell.get('cell_type')!r}: {source_filename}"
                )
            if "outputs" in old_cell:
                new_cell['outputs'] = old_cell['outputs']
                new_cell['execution_count'] = old_cell.get('execution_count')

        translated_str = json.dumps(translated_notebook, indent=1)
    else:
        translated_str = translated_content

    # save result to target file
    with open(target_filename, 'w', encoding='utf-8') as file:
        file.write(translated_str)

In [13]:
# Length of the initial conversation (instructions + confirmation). The chat
# memory is cut back to this length before every file, so that each file is
# translated independently of the previous ones.
memory_len = len(chat_history)

for root, dirs, files in os.walk(source_dir):
    # Prune folders in place so that os.walk does not descend into them: git
    # metadata and build output are not part of the book. Substring tests on
    # `root` miss the files located directly inside ".git".
    dirs[:] = [d for d in dirs if d != ".git" and "_build" not in d]

    for file in files:
        # skip autosave copies, scratch notebooks and the generator notebook
        if "-checkpoint" in file or "Untitled" in file or "generator.ipynb" in file:
            continue

        source_file = os.path.join(root, file).replace("\\", "/")
        # map the path relative to source_dir into base_dir; unlike a string
        # replacement this also works if source_dir has no trailing slash
        relative_file = os.path.relpath(source_file, source_dir)
        target_file = os.path.join(base_dir, relative_file).replace("\\", "/")

        # make sure target folder exists
        directory = Path(target_file).parent
        os.makedirs(directory, exist_ok=True)

        if os.path.exists(target_file): # avoid re-generating the same files
            continue

        try:
            if file.endswith(".md") or file.endswith(".ipynb"):
                # forget the previously translated file
                chat_history = chat_history[:memory_len]

                print(f"Translating {source_file} to {target_file}")
                translate_file(source_file, target_file)
            else:
                print(f"Copying {source_file} to {target_file}")
                shutil.copy(source_file, target_file)
        except Exception as error:
            # Keep going with the next file, but report what went wrong.
            # KeyboardInterrupt is deliberately not caught, so that the
            # long-running loop can still be stopped by the user.
            print(f"Error: {type(error).__name__}: {error}")

Translating ../BioImageAnalysisNotebooks/docs/18_image_filtering/image_filtering.ipynb to docs/18_image_filtering/image_filtering.ipynb
Error
Translating ../BioImageAnalysisNotebooks/docs/19_spatial_transforms/downsampling_with_denoising.ipynb to docs/19_spatial_transforms/downsampling_with_denoising.ipynb
Error
Translating ../BioImageAnalysisNotebooks/docs/21_surface_processing/quality_measurements.ipynb to docs/21_surface_processing/quality_measurements.ipynb
Translating ../BioImageAnalysisNotebooks/docs/21_surface_processing/readme.md to docs/21_surface_processing/readme.md
Translating ../BioImageAnalysisNotebooks/docs/21_surface_processing/saving_and_loading_surfaces.ipynb to docs/21_surface_processing/saving_and_loading_surfaces.ipynb
Translating ../BioImageAnalysisNotebooks/docs/21_surface_processing/smoothing_and_simplifying_surfaces.ipynb to docs/21_surface_processing/smoothing_and_simplifying_surfaces.ipynb
Translating ../BioImageAnalysisNotebooks/docs/21_surface_processing/su