In [None]:
!pip install -U pip transformers
!pip install sentencepiece
!pip install python-docx
!pip install nltk

In [None]:
import docx
import glob
from pathlib import Path, PurePath
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, TRANSFORMERS_CACHE
import shutil
import fasttext
from collections import Counter
from tqdm import tqdm
import torch
from zipfile import BadZipFile
import gc
import os
# from dotenv import load_dotenv

# load_dotenv()
# sent_tokenize needs the Punkt sentence models. Newer NLTK releases look them
# up under 'punkt_tab' while older ones use 'punkt', so request both; a name the
# installed NLTK index does not know is reported by the downloader, not raised.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

In [None]:
pretrained_lang_model = "./lid218e.bin"  # path of the pretrained model file
lid_model_url = "https://dl.fbaipublicfiles.com/nllb/lid/lid218e.bin"

if not os.path.isfile(pretrained_lang_model):
    # If the file doesn't exist, download it.
    # torch.hub.download_url_to_file writes to a temporary file and only moves
    # it to the destination once the transfer succeeded, so an interrupted
    # download cannot leave a truncated file that the isfile() check above
    # would accept on the next run. It also avoids depending on a wget binary.
    torch.hub.download_url_to_file(lid_model_url, pretrained_lang_model, progress=True)

In [None]:
# Translation models to run, as {short name: checkpoint identifier}.
# The short name is printed while loading and used as the output sub-folder;
# the identifier is what load_model() hands to from_pretrained().
checkpoints = {
    "NLLB": "facebook/nllb-200-3.3B",
}

def load_model(model_name):
    """Return ``(model, tokenizer)`` for a seq2seq checkpoint.

    A copy of the checkpoint is kept under ``TRANSFORMERS_CACHE/<model_name>``.
    When that directory already exists, both objects are loaded from it;
    otherwise they are loaded by ``model_name`` and then saved there so the
    next call takes the local path.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    local_copy = f"{TRANSFORMERS_CACHE}/{model_name}"

    if os.path.exists(local_copy):
        print(f"{model_name} found")
        tokenizer = AutoTokenizer.from_pretrained(local_copy)
        model = AutoModelForSeq2SeqLM.from_pretrained(local_copy)
        return model, tokenizer

    # No local copy yet: load by name, then persist model first, tokenizer second.
    print(f"{model_name} not found")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.save_pretrained(local_copy)
    tokenizer.save_pretrained(local_copy)
    return model, tokenizer


def unload_model(model, tokenizer):
    """Drop this function's references to a model/tokenizer and reclaim memory.

    Note that ``del`` only removes the names local to this function; objects
    that the caller still references stay alive. After dropping the local
    names, a garbage-collection pass is forced and PyTorch's cached CUDA
    memory is released.

    Args:
        model: Model object to release.
        tokenizer: Tokenizer object to release.
    """
    del model, tokenizer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:

# Folder that is searched (recursively, by the driver cell) for source documents.
input_folder = Path("./Input")
# Root folder for translated documents; one sub-folder per model is created below it.
output_folder = Path("./Translated/")
# File extensions (without the dot) of the documents read and written.
ext_in = 'docx'
ext_out = 'docx'

# Create the output folder if it doesn't exist
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
def translate_docx(translation_pipeline, input_file, output_file):
    """Translate one DOCX file and write the result to ``output_file``.

    Only the paragraphs and tables exposed directly by the document object
    (``.paragraphs`` and ``.tables``) are visited.

    Args:
        translation_pipeline: Callable translation pipeline handed on to the
            paragraph/table helpers.
        input_file: Path of the DOCX file to read.
        output_file: Path the translated DOCX is saved to.
    """
    document = docx.Document(input_file)

    # Body paragraphs first ...
    for body_paragraph in document.paragraphs:
        translate_paragraph(body_paragraph, translation_pipeline)

    # ... then every table of the document.
    for body_table in document.tables:
        translate_table(body_table, translation_pipeline)

    # Persist the modified document under the new name.
    document.save(output_file)


def translate_paragraph(paragraph, translation_pipeline):
    """Translate a paragraph in place; blank paragraphs are left untouched.

    Args:
        paragraph: Paragraph object exposing ``.text`` and ``.runs``.
        translation_pipeline: Callable translation pipeline.
    """
    source_text = paragraph.text
    if not source_text.strip():
        # Nothing but whitespace: skip the model call entirely.
        return

    replace_text_in_runs(paragraph, translate_text(source_text, translation_pipeline))


def translate_text(text, translation_pipeline):
    """Run ``text`` through the pipeline and return the translated string.

    The pipeline is expected to return a sequence whose first element is a
    mapping with a ``'translation_text'`` entry.
    """
    results = translation_pipeline(text)
    first_result = results[0]
    return first_result['translation_text']


def replace_text_in_runs(paragraph, translated_text):
    """Spread ``translated_text`` over the paragraph's runs, keeping formatting.

    Each text-bearing run receives a slice of the translation as long as its
    original text, in order. Whatever is left over goes into the last
    text-bearing run, so the tail keeps that run's character formatting
    instead of landing in a new, unformatted run.

    Runs whose text is empty are never written to: assigning ``run.text``
    replaces the run's whole content, which would wipe non-text content
    (e.g. an inline picture) held by such a run.

    Args:
        paragraph: Paragraph object exposing ``.runs`` and ``.add_run(text)``.
        translated_text: Replacement text for the whole paragraph.
    """
    text_runs = [run for run in paragraph.runs if run.text]

    if not text_runs:
        # No run carries text to overwrite; only add a run if there is text to hold.
        if translated_text:
            paragraph.add_run(translated_text)
        return

    cursor = 0
    for run in text_runs[:-1]:
        width = len(run.text)
        run.text = translated_text[cursor:cursor + width]
        cursor += width

    # The last text run takes everything that remains (possibly nothing when
    # the translation is shorter than the original).
    text_runs[-1].text = translated_text[cursor:]


def translate_table(table, translation_pipeline):
    """Translate every distinct cell of a table, including nested tables.

    ``row.cells`` reports a merged cell once for every grid position it
    covers, so the same underlying cell can show up several times (within a
    row and across rows). Each underlying cell is translated exactly once;
    otherwise already translated text would be fed to the model again.

    Args:
        table: Table object exposing ``.rows``.
        translation_pipeline: Callable translation pipeline.
    """
    # Holding the cell elements themselves (not just their id()) keeps them
    # alive, so identity comparison stays valid for the whole traversal.
    seen_cells = set()

    for row in table.rows:
        for cell in row.cells:
            # `_tc` is the underlying table-cell element shared by all wrappers
            # of one merged cell; fall back to the wrapper if it is absent.
            cell_key = getattr(cell, "_tc", cell)
            if cell_key in seen_cells:
                continue
            seen_cells.add(cell_key)

            for paragraph in cell.paragraphs:
                translate_paragraph(paragraph, translation_pipeline)

            # Tables placed inside this cell are not part of the parent
            # table's rows, so descend into them explicitly.
            for nested_table in cell.tables:
                translate_table(nested_table, translation_pipeline)


In [None]:
def get_languages(file):
    """Count the detected language of every sentence in a DOCX file.

    The text of each document paragraph is split into sentences and every
    sentence is classified with the fastText language-ID model stored at
    ``pretrained_lang_model``.

    Args:
        file: ``pathlib.Path`` of the DOCX file.

    Returns:
        ``collections.Counter`` mapping language label (``__label__`` prefix
        removed) to the number of sentences with that label. The counter is
        empty when the file cannot be opened (``BadZipFile``) or has no text.
    """
    file = file.resolve()

    # Open the document first: a broken file is reported and skipped before
    # the language-ID model is loaded, instead of failing later on an
    # unbound `document`.
    try:
        document = docx.Document(file)
    except BadZipFile:
        print(f"BadZipFile Error on opening {file}")
        return Counter()

    fasttext_model = fasttext.load_model(pretrained_lang_model)

    languages = Counter()
    for para in document.paragraphs:
        for sentence in sent_tokenize(para.text):
            # fastText predicts one line at a time and raises on '\n', which
            # paragraph text can contain (manual line breaks).
            single_line = sentence.replace('\n', ' ')
            predictions = fasttext_model.predict(single_line, k=1)
            output_lang = predictions[0][0].replace('__label__', '')
            languages.update([output_lang])

    del fasttext_model
    gc.collect()
    torch.cuda.empty_cache()

    return languages

In [None]:
# List of target languages and corresponding file names
languages = [
    ("spa_Latn", "LASpanish"),
]

files = [file for file in input_folder.rglob("*." + ext_in)]
print(f"Found {len(files)} {ext_in} files in {input_folder.resolve()}")


def translate_files(files, languages):
    """Translate every file into every target language with every checkpoint.

    Each model is loaded once and reused for all files, and one pipeline is
    built per target language, rather than reloading the model for every
    single file. Output goes to
    ``output_folder/<model name>/<file stem>_<language file name>.<ext_out>``.

    Language detection via get_languages() is currently not applied: every
    input file is assumed to be English (``src_lang='eng_Latn'``).

    Args:
        files: Paths of the documents to translate.
        languages: Iterable of ``(target language code, file-name suffix)``.
    """
    if not files:
        # Nothing to translate: do not load a multi-gigabyte model for nothing.
        return

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for model_name, checkpoint in checkpoints.items():
        print(f"Loading model: {model_name}")
        model, tokenizer = load_model(checkpoint)

        output_dir_for_model = output_folder / f"{model_name}"
        output_dir_for_model.mkdir(parents=True, exist_ok=True)

        for target_lang, file_name in languages:
            translation_pipeline = pipeline('translation',
                                            model=model,
                                            tokenizer=tokenizer,
                                            src_lang='eng_Latn',
                                            tgt_lang=target_lang,
                                            max_length=400,
                                            device=device)

            for i, file in enumerate(files, 1):
                file = file.resolve()
                output_path = output_dir_for_model / f"{file.stem}_{file_name}.{ext_out}"

                print(f"{i:>4} : Translating file {file} from English to multiple languages.")
                try:
                    translate_docx(translation_pipeline, file, output_path)
                except BadZipFile:
                    # Not a readable DOCX archive: report and move on.
                    print(f"BadZipFile Error on opening {file}")
                    continue

                print(f"{i:>4} : Translated file {file} to {file_name}.")

            del translation_pipeline
            gc.collect()
            torch.cuda.empty_cache()

        unload_model(model, tokenizer)
        # unload_model() can only drop its own references; release ours too so
        # the memory is actually reclaimed before the next model is loaded.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()


translate_files(files, languages)
