In [None]:
!pip install -U pip transformers
!pip install sentencepiece
!pip install python-docx
!pip install nltk

In [None]:
import docx
import glob
from pathlib import Path, PurePath
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, TRANSFORMERS_CACHE
import shutil
import fasttext
from collections import Counter
from tqdm import tqdm
import torch
from zipfile import BadZipFile
import gc
import os
# from dotenv import load_dotenv

# load_dotenv()
# Sentence-splitting data used by sent_tokenize(). Recent NLTK releases read
# the 'punkt_tab' resource while older ones read 'punkt'; nltk is installed
# unpinned, so fetch both to work with either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Location of the fastText language-identification model that get_languages()
# loads; it is looked up relative to the current working directory.
pretrained_lang_model = "./lid218e.bin"  # path of the pretrained model file
if not os.path.isfile(pretrained_lang_model):
    # If the file doesn't exist, download it
    # NOTE(review): only the file's existence is checked, so a partially
    # downloaded file would presumably be accepted on the next run — confirm.
    !wget https://dl.fbaipublicfiles.com/nllb/lid/lid218e.bin

In [None]:
# Translation checkpoints to run, keyed by the short name that is used in log
# messages and as the per-model output sub-folder.
checkpoints = {
    "NLLB": "facebook/nllb-200-3.3B",
    "MADLAD": "google/madlad400-3b-mt",
    # NOTE(review): load_model() loads every entry with AutoModelForSeq2SeqLM.
    # Llama 3.1 is presumably a decoder-only (causal) model, so this entry
    # likely fails to load — confirm before running.
    "Llama-3.1-405B": "meta-llama/Meta-Llama-3.1-405B",
}

def load_model(model_name):
    """Return ``(model, tokenizer)`` for a checkpoint, keeping a local copy of it.

    The local copy lives at ``{TRANSFORMERS_CACHE}/{model_name}``. If that path
    already exists, both objects are loaded from it. Otherwise they are loaded
    by name with ``from_pretrained`` and then written to that path with
    ``save_pretrained``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        Tuple ``(model, tokenizer)`` built with ``AutoModelForSeq2SeqLM`` and
        ``AutoTokenizer``.
    """
    model_dir = f"{TRANSFORMERS_CACHE}/{model_name}"

    # Fast path: a copy was saved by an earlier call.
    if os.path.exists(model_dir):
        print(f"{model_name} found")
        saved_copy = f"{model_dir}"
        tokenizer = AutoTokenizer.from_pretrained(saved_copy)
        model = AutoModelForSeq2SeqLM.from_pretrained(saved_copy)
        return model, tokenizer

    print(f"{model_name} not found")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Persist both parts so the next call takes the fast path above.
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)
    return model, tokenizer


def unload_model(model, tokenizer):
    """Release this function's references to a model/tokenizer and reclaim memory.

    Only the two parameter names are deleted here; any reference the caller
    still holds keeps the objects alive. Afterwards a garbage-collection pass
    is run and ``torch.cuda.empty_cache()`` is called.

    Args:
        model: Model object to let go of.
        tokenizer: Tokenizer object to let go of.
    """
    del model, tokenizer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:

# Folder that is searched recursively for the documents to translate.
input_folder = Path("./Input")
# Root folder for everything the notebook writes (one sub-folder per model).
output_folder = Path("./Translated/")
# File extensions, without the leading dot, of the inputs to pick up and of
# the translated outputs.
ext_in = 'docx'
ext_out = 'docx'

# Create the output folder if it doesn't exist
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
def translate_docx(file, translation_pipeline):
    """Translate a Word document sentence by sentence.

    The input is first copied to ``output_folder / "test"``; that copy is the
    document that gets opened, rewritten paragraph by paragraph with the
    translated text, and saved. The original file is not modified.

    Args:
        file: Path of the .docx file to translate.
        translation_pipeline: Callable that takes one sentence and returns a
            list whose first item is a dict with a ``'translation_text'`` key.

    Returns:
        A list with one dict per paragraph, holding the paragraph's ``'style'``
        name, its source ``'sentences'`` and their ``'translations'``.
    """
    # TODO NOTE: the working copy always lands in the output folder as "test".
    working_copy = shutil.copy(file, output_folder / "test")
    doc = docx.Document(working_copy)

    def _translate(sentence):
        return translation_pipeline(sentence)[0]['translation_text']

    paras = []
    for para in doc.paragraphs:
        style_name = para.style.name
        sentences = list(sent_tokenize(para.text))
        translations = list(map(_translate, sentences))

        paras.append({
            'style': style_name,
            'sentences': sentences,
            'translations': translations,
        })

        # Replacing the text wholesale stands in for find-and-replace logic.
        para.text = " ".join(translations)
        # Re-apply the style by name (probably redundant, kept as a safeguard).
        para.style = style_name

    doc.save(working_copy)
    return paras


def save_translated_document(file_path, translated_paragraphs):
    """Write translated paragraphs into a new Word document.

    A blank document is created and one paragraph is added per entry of
    ``translated_paragraphs``, with the entry's translations joined by spaces.
    The document is saved next to ``file_path`` with the final suffix replaced
    by ``.translated.docx``.

    Args:
        file_path: ``pathlib.Path`` used to derive the output file name.
        translated_paragraphs: Iterable of dicts with a ``'translations'`` list
            of strings and a ``'style'`` name, as returned by translate_docx().

    Returns:
        The path the document was saved to.
    """
    # Create a new Document
    doc = docx.Document()
    missing_styles = set()

    for para in translated_paragraphs:
        new_para = doc.add_paragraph(" ".join(para['translations']))

        # The style names come from the source document. A blank document only
        # knows the styles of the default template, and assigning an unknown
        # name raises KeyError; keep the default style in that case instead of
        # aborting the whole save.
        try:
            new_para.style = para['style']
        except KeyError:
            missing_styles.add(para['style'])

    if missing_styles:
        print(f"Styles not available in the new document, default style used instead: {sorted(missing_styles)}")

    translated_file_path = file_path.with_suffix('.translated.docx')
    # Save the document to the derived file path
    doc.save(translated_file_path)
    return translated_file_path

In [None]:
def get_languages(file):
    """Count the detected language of every sentence in a Word document.

    Each paragraph is split into sentences with ``sent_tokenize`` and every
    sentence is classified with the fastText model at ``pretrained_lang_model``.
    The model is loaded for the duration of the call only and released
    afterwards.

    Args:
        file: ``pathlib.Path`` of the .docx file to inspect.

    Returns:
        ``collections.Counter`` mapping a language label (``__label__`` prefix
        removed) to the number of sentences classified as that language. The
        counter is empty when the file cannot be opened or has no sentences.
    """
    file = file.resolve()

    # Open the input file as a Word document
    try:
        document = docx.Document(file)
    except BadZipFile:
        print(f"BadZipFile Error on opening {file}")
        # Nothing can be detected in an unreadable file.
        return Counter()

    sentences = [sentence for para in document.paragraphs for sentence in sent_tokenize(para.text)]

    languages = Counter()
    if not sentences:
        # Skip loading the model when there is nothing to classify.
        return languages

    fasttext_model = fasttext.load_model(pretrained_lang_model)
    try:
        for sentence in sentences:
            # fastText's predict() handles one line at a time and raises
            # ValueError on a newline, which a line break inside a paragraph
            # can introduce.
            single_line = sentence.replace("\n", " ")
            if not single_line.strip():
                continue
            predictions = fasttext_model.predict(single_line, k=1)
            output_lang = predictions[0][0].replace('__label__', '')
            languages.update([output_lang])
    finally:
        # Release the model even if prediction fails.
        del fasttext_model
        gc.collect()
        torch.cuda.empty_cache()

    return languages

In [None]:
# Target languages as (language code, label) pairs. The code is passed to the
# translation pipeline as tgt_lang; the label becomes part of the output file
# name.
languages = [
    ("amh_Ethi", "Ethiopian"),
    ("arb_Arab", "Arabic"),
    ("asm_Beng", "Assamese"),
    ("ben_Beng", "Bangal"),
    ("por_Latn", "BPortugese"),
    ("mya_Mymr", "Burmese"),
    ("ceb_Latn", "Cebuano"),
    # zsm_Latn is Standard Malay; Chinese (Simplified script) is zho_Hans.
    # Use zho_Hant here instead if Traditional script is wanted.
    ("zho_Hans", "Chinese"),
    ("fra_Latn", "French"),
    ("guj_Gujr", "Gujarati"),
    ("hau_Latn", "Hausa"),
    ("hin_Deva", "Hindi"),
    ("ilo_Latn", "Illocano"),
    ("ind_Latn", "Indonesian"),
    ("kan_Knda", "Kannada"),
    ("khm_Khmr", "Khmer"),
    ("lao_Laoo", "Laotian"),
    ("spa_Latn", "LASpanish"),
    ("mal_Mlym", "Malayalam"),
    ("npi_Deva", "Nepali"),
    ("ory_Orya", "Oriya"),
    ("plt_Latn", "PlatMalagasy"),
    ("pan_Guru", "EPunjabi"),
    ("rus_Cyrl", "Russian"),
    ("swh_Latn", "Swahili"),
    ("tgl_Latn", "Tagalog"),
    ("tam_Taml", "Tamil"),
    ("tel_Telu", "Telugu"),
    ("tha_Thai", "Thai"),
    ("tpi_Latn", "TokPisin"),
    ("urd_Arab", "Urdu"),
    ("vie_Latn", "Vietnamese")
]

# Gather every input document, descending into sub-folders of input_folder.
files = list(input_folder.rglob("*." + ext_in))
print(f"Found {len(files)} {ext_in} files in {input_folder.resolve()}")

# Translate every English input document into each target language with each
# checkpoint. Results are written to output_folder/<model name>/.

# The device does not change between files, models or languages.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for i, file in enumerate(files, 1):
    file = file.resolve()
    languages_in_file = get_languages(file)

    # An empty Counter means no sentence could be classified;
    # most_common(1) would be an empty list and indexing it would fail.
    if not languages_in_file:
        print(f"{i:>4} : Not translating file {file}. No text was found to detect its language.")
        continue

    top_language_in_file = languages_in_file.most_common(1)[0][0]
    file_is_english = top_language_in_file == "eng_Latn"

    if not file_is_english:
        print(f"{i:>4} : Not translating file {file}. It seems to be in :{top_language_in_file}.")
        continue

    print(f"{i:>4} : Translating file {file} from English to multiple languages.")
    try:
        # Opened only to confirm the file is a readable Word document.
        document = docx.Document(file)
    except BadZipFile:
        print(f"BadZipFile Error on opening {file}")
        continue

    for model_name, checkpoint in checkpoints.items():
        print(f"Loading model: {model_name}")
        try:
            model, tokenizer = load_model(checkpoint)
        except (OSError, ValueError) as error:
            # A checkpoint that cannot be fetched or is not usable as a
            # seq2seq model must not abort the remaining models and files.
            print(f"Skipping model {model_name}: could not load {checkpoint} ({error})")
            continue

        output_dir_for_model = output_folder / f"{model_name}"
        output_dir_for_model.mkdir(parents=True, exist_ok=True)

        for target_lang, file_name in languages:
            translation_pipeline = pipeline('translation',
                                            model=model,
                                            tokenizer=tokenizer,
                                            src_lang='eng_Latn',
                                            tgt_lang=target_lang,
                                            max_length=400,
                                            device=device)

            paragraphs = translate_docx(file, translation_pipeline)

            output_path = output_dir_for_model / f"{file.stem}_{file_name}.{ext_out}"
            save_translated_document(output_path, paragraphs)

            print(f"{i:>4} : Translated file {file} to {file_name}.")

            del translation_pipeline
            gc.collect()
            torch.cuda.empty_cache()

        unload_model(model, tokenizer)
        # unload_model() can only drop its own parameter references. The names
        # bound in this cell must be released too, otherwise the weights stay
        # in memory while the next checkpoint is being loaded.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()
