In [None]:
# Remember to install the 'mammoth' and 'google-generativeai' libraries if they are not yet installed.
# Also, the command-line tool 'pandoc' must be installed on your system in order to convert Markdown documents into Word documents.

import os
import re
import mammoth
import subprocess
import google.generativeai as genai

In [None]:
def split_markdown_into_parts(markdown_text, max_words_per_part=4000):
  """Splits Markdown text into parts, preserving paragraphs, with a maximum word count per part.

  Paragraphs are the runs of text separated by one or more blank lines. A paragraph
  is never cut: a single paragraph longer than ``max_words_per_part`` is emitted as a
  part of its own, so that one part may exceed the limit.

  Args:
    markdown_text: The Markdown text to split.
    max_words_per_part: Maximum number of words allowed in each part.

  Returns:
    A list of non-empty strings, each containing a part of the original text with its
    paragraphs joined by a blank line. The list is empty when the text has no words.
  """

  # Split into paragraphs (blank-line separated; surrounding whitespace is consumed)
  paragraphs = re.split(r'\s*\n\s*\n\s*', markdown_text)

  # Initialize list to store parts
  parts = []
  current_part = []
  current_word_count = 0

  # Iterate over paragraphs
  for paragraph in paragraphs:
    # Count words in the paragraph
    words_in_paragraph = len(paragraph.split())

    # Leading/trailing blank lines (or empty input) yield word-less fragments;
    # skip them so no empty part is ever produced.
    if words_in_paragraph == 0:
      continue

    # Close the current part when adding this paragraph would exceed the limit.
    # The `current_part` guard avoids flushing an empty part when the paragraph
    # alone is over the limit and nothing has been accumulated yet.
    if current_part and current_word_count + words_in_paragraph > max_words_per_part:
      parts.append("\n\n".join(current_part))
      current_part = []
      current_word_count = 0

    current_part.append(paragraph)
    current_word_count += words_in_paragraph

  # Add the last part (if any)
  if current_part:
    parts.append("\n\n".join(current_part))

  return parts

In [None]:
# This cell converts the content of a Microsoft Word document into Markdown-formatted text that will be passed to the model

def convert_docx_to_markdown(file_path):
    """Convert a Word (.docx) file to Markdown text with mammoth.

    Args:
        file_path: Path of the .docx file; it is opened in binary read mode.

    Returns:
        The `value` of mammoth's conversion result, i.e. the generated Markdown.
    """
    with open(file_path, "rb") as docx_stream:
        conversion = mammoth.convert_to_markdown(docx_stream)
    # Only the generated Markdown is returned; the rest of the result is not used.
    return conversion.value

# Location and name of the source Word document
folder = './texts/book_olympics'
docx_file = 'book_olympics.docx'

docx_file_path = os.path.join(folder, docx_file)
# File name without its extension. The splitext() result is indexed rather than
# unpacked into `_`, because assigning `_` at notebook top level shadows IPython's
# "last output" variable.
file_wo_ext = os.path.splitext(os.path.basename(docx_file_path))[0]

# Markdown rendering of the whole document
markdown_text = convert_docx_to_markdown(docx_file_path)

In [None]:
# Instantiate the Gemini Model
# The API key is read from the GEMINI_API_KEY environment variable when it is set, so
# the real key does not have to be saved in the notebook; otherwise the placeholder
# below is used and must be replaced by hand.
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', 'YOUR_GEMINI_API_KEY')
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('models/gemini-1.5-flash-latest')


# Split the text to be translated into parts small enough to be passed to the gemini model
parts = split_markdown_into_parts(markdown_text, max_words_per_part=4000)

# Target languages for translation
languages = ['french', 'german']

for lang in languages:

    # Translation of each part, in document order
    translated_parts = []

    for part in parts:
        prompt = (
            "Please translate ENTIRELY the following text to " + lang
            + ". Please beware that this text is formated using markdown. Just translate the actual text, and restitute the markdown as is. Here is the text:"
            + "'" + part + "'."
            + "Then do some copy editing of the translated text as the expert you are. Please, don't worry, the text is safe and has been vetted for all audiences"
        )
        translated_md = model.generate_content(prompt)
        translated_parts.append(translated_md.text)

    # Parts were split on paragraph boundaries, so rejoin them with a blank line;
    # plain concatenation would fuse the last paragraph of one part with the first
    # paragraph of the next.
    translated_md_text = "\n\n".join(translated_parts)

    translated_md_file_path = os.path.join(folder, file_wo_ext + "_" + lang + ".md")
    translated_docx_file_path = os.path.join(folder, file_wo_ext + "_" + lang + ".docx")

    with open(translated_md_file_path, 'w', encoding="utf-8") as f:
        f.write(translated_md_text)

    # check=True raises CalledProcessError if pandoc exits with an error, instead of
    # silently leaving the Word document missing. The Markdown file is already on disk.
    subprocess.run(['pandoc', translated_md_file_path, '-o', translated_docx_file_path], check=True)