In [1]:
class ChatMessage:
    """A single chat turn: a role paired with its text content."""

    def __init__(self, r, c):
        """Store the role ``r`` and the content ``c`` on the instance."""
        self.role, self.content = r, c

    def to_message(self):
        """Return the message as the two-item list ``[role, content]``.

        Raises:
            ValueError: if the role or the content is falsy
                (for example an empty string or ``None``).
        """
        # Guard first: reject incomplete messages before building the result.
        if not (self.role and self.content):
            raise ValueError("role and content cannot be empty or None")
        return [self.role, self.content]


class UserMessage(ChatMessage):
    """Chat message whose role is fixed to ``"user"``."""

    def __init__(self, c):
        """Build a user message carrying the content ``c``."""
        super().__init__("user", c)


class SystemMessage(ChatMessage):
    """Chat message whose role is fixed to ``"system"``."""

    def __init__(self, c):
        """Build a system message carrying the content ``c``."""
        super().__init__("system", c)


class BehaviorMessage(ChatMessage):
    """Chat message whose role is fixed to ``"assistant"``."""

    def __init__(self, c):
        """Build an assistant message carrying the content ``c``."""
        super().__init__("assistant", c)


In [2]:
from chat_message import *


class OpenAiTranslator():
    """Builds the chat prompt that asks a chat model to translate a text.

    Instance attributes (stored exactly as passed to the constructor):
      * ``text``     - the text to translate.
      * ``model``    - the model identifier; stored but not used by this class.
      * ``language`` - the target language; ``['English']`` when omitted.
    """

    def __init__(self, text, model, lang=None):
        """Remember the text, the model and the target language.

        Args:
            text: the text to translate.
            model: the model identifier.
            lang: target language, as a string or a list/tuple of strings.
                Defaults to ``['English']``.
        """
        if lang is None:
            lang = ['English']
        self.text = text
        self.model = model
        self.language = lang

    def _language_label(self):
        """Return the target language(s) as plain text for use in prompts.

        A list or tuple is joined with ", " so the prompt reads
        "English, Spanish" instead of the Python repr "['English', 'Spanish']"
        that direct f-string interpolation would produce. Any other value is
        rendered with ``str()``, which leaves string languages untouched.
        """
        if isinstance(self.language, (list, tuple)):
            return ', '.join(str(item) for item in self.language)
        return str(self.language)

    def generate_system_message(self):
        """Return the ``SystemMessage`` that sets up the translator persona."""
        return SystemMessage(
            f'You are a super skilled translator who can translate any text to {self._language_label()}.',
        )

    def generate_behavior_message(self):
        """Return the ``BehaviorMessage`` acknowledging the translator persona."""
        return BehaviorMessage(
            f'Sure, I can translate anything to {self._language_label()}.')

    def generate_translation_message(self):
        """Return the ``UserMessage`` that carries the text to translate."""
        return UserMessage(
            f'Translate this "{self.text}" to {self._language_label()}'
        )

    def translate(self):
        """Return the prompt as a list of ``{"role", "content"}`` dicts.

        The order is: system message, behavior message, translation request.
        Each message goes through ``to_message()`` before being converted.
        """
        prompts = (
            self.generate_system_message(),
            self.generate_behavior_message(),
            self.generate_translation_message(),
        )

        messages = []
        for prompt in prompts:
            role, content = prompt.to_message()
            messages.append({"role": role, "content": content})

        return messages


In [3]:
def openai_translate(text, model, translation_language):
    """Ask an OpenAI chat model to translate ``text``.

    Args:
        text: the text to translate.
        model: name of the chat model; interpolated into a string before use.
        translation_language: target language handed to ``OpenAiTranslator``.

    Returns:
        Whatever ``openai.ChatCompletion.create`` returns for the request.
    """
    # Use this function's own argument. The previous code referenced a name
    # `language` that is not defined here, so it only worked when a global of
    # that name happened to exist, and it ignored `translation_language`.
    translator = OpenAiTranslator(text, model, translation_language)

    return openai.ChatCompletion.create(
        model=f"{model}",
        messages=translator.translate()
    )

In [4]:
import docx2txt


def split_docx_at_word(docx_path, output_dir, splitting_w):
    """Split the text of a .docx file in two at the first occurrence of a marker.

    The text returned by ``docx2txt.process`` is cut at the first occurrence
    of ``splitting_w``. Everything before the marker is written to
    ``<basename>_1.txt`` and the rest (marker included) to ``<basename>_2.txt``,
    both inside ``output_dir``. ``<basename>`` is the file name of
    ``docx_path`` without its extension.

    Args:
        docx_path: path of the .docx file to read.
        output_dir: folder receiving the two .txt files; created if missing.
        splitting_w: marker text at which the document is cut.

    Raises:
        ValueError: if ``splitting_w`` does not occur in the extracted text.

    NOTE(review): this function uses ``os``, so ``os`` must already be
    imported in the notebook before the function is called.
    """
    text = docx2txt.process(docx_path)

    # str.find returns -1 when the marker is absent.
    split_index = text.find(splitting_w)
    if split_index == -1:
        raise ValueError(f"Could not find the split word '{splitting_w}' in the document")

    # Create the destination folder up front; previously a missing folder made
    # open() fail with FileNotFoundError. An empty output_dir means the current
    # directory, which needs no creation (and os.makedirs("") would raise).
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Base file name without its extension, reused for both output files.
    filename = os.path.splitext(os.path.basename(docx_path))[0]

    halves = (text[:split_index], text[split_index:])
    for part_number, part in enumerate(halves, start=1):
        with open(os.path.join(output_dir, f"{filename}_{part_number}.txt"), "w") as f:
            f.write(part)


In [5]:
import os

# Folders used by the split step, plus the marker text at which documents are cut
input_folder = "inputs"
output_folder = "inputs_split"
split_word = "Tavola 11"

# Walk the input folder; entries that are not .docx files are skipped
for name_file in os.listdir(input_folder):
    if not name_file.endswith(".docx"):
        continue
    # Build the document path, then split it into the output folder
    filepath = os.path.join(input_folder, name_file)
    split_docx_at_word(filepath, output_folder, split_word)


In [6]:
import os


def read_file(folder_path, n):
    """Return the full text of the file named ``n`` inside ``folder_path``."""
    target = os.path.join(folder_path, n)
    with open(target, "r") as handle:
        return handle.read()



In [11]:
import openai

# The API key is read from the environment so it never lives in the notebook
openai.api_key = os.getenv('OPENAI_API_KEY')

# Target languages; one output file is written per (input file, language) pair
languages = [
    'English',
    'Spanish',
    'French',
    'German',
    'Dutch',
    'Portuguese',
    'Russian',
    'Polish',
    'Japanese',
    'Korean',
    'Chinese',
    'Arabic',
    'Turkish',
    'Hindi',
    'Vietnamese']

input_folder = "inputs_split"
output_folder = "translations"

# Make sure the destination exists before the first translation is written
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    print(f'Starting {filename}')
    text_to_translate = read_file(input_folder, filename)
    # Short lowercase stem for the output name, e.g.
    # "CDN_capitolo15_da tradurre_2.txt" -> "cdn_15_2"
    name = (
        filename
        .replace("_da tradurre", "")
        .replace("capitolo", "")
        .replace(".txt", "")
        .lower()
    )

    for language in languages:
        lang = language.lower()
        print(f'Translating {name} to {lang}')
        # Request the translation BEFORE opening the output file, so a failed
        # API call no longer leaves an empty .md file behind.
        response = openai_translate(text_to_translate, "gpt-3.5-turbo", language)
        translation = response.choices[0].message.content
        # Explicit UTF-8: the targets include non-Latin scripts that the
        # platform default encoding may be unable to represent.
        with open(f'{output_folder}/{name}_{lang}.md', "w", encoding="utf-8") as f:
            f.write(f'\n#corvi/traduzione/{lang}\n')
            f.write(translation)
    print(f'Done {filename}')
print("Done")


Starting CDN_prologoS2_da tradurre_1.txt
Translating cdn_prologos2_1 to english
Translating cdn_prologos2_1 to spanish
Translating cdn_prologos2_1 to french
Translating cdn_prologos2_1 to german
Translating cdn_prologos2_1 to dutch
Translating cdn_prologos2_1 to portuguese
Translating cdn_prologos2_1 to russian
Translating cdn_prologos2_1 to polish
Translating cdn_prologos2_1 to japanese
Translating cdn_prologos2_1 to korean
Translating cdn_prologos2_1 to chinese
Translating cdn_prologos2_1 to arabic
Translating cdn_prologos2_1 to turkish
Translating cdn_prologos2_1 to hindi
Translating cdn_prologos2_1 to vietnamese
Done CDN_prologoS2_da tradurre_1.txt
Starting CDN_capitolo15_da tradurre_2.txt
Translating cdn_15_2 to english
Translating cdn_15_2 to spanish
Translating cdn_15_2 to french
Translating cdn_15_2 to german
Translating cdn_15_2 to dutch
Translating cdn_15_2 to portuguese
Translating cdn_15_2 to russian
Translating cdn_15_2 to polish
Translating cdn_15_2 to japanese
Translatin