In [None]:
# download the language model pretrained file

#!wget https://dl.fbaipublicfiles.com/nllb/lid/lid218e.bin
!pip install fasttext

import fasttext

pretrained_lang_model = "./lid218e.bin" # path of pretrained model file
model = fasttext.load_model(pretrained_lang_model)

text = "صباح الخير، الجو جميل اليوم والسماء صافية."
predictions = model.predict(text, k=1)
print(predictions)
input_lang = predictions[0][0].replace('__label__', '')
print(input_lang)

In [None]:
!pip install -U pip transformers
!pip install sentencepiece
!pip install python-docx
!pip install nltk
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

In [None]:
import glob
from pathlib import Path, PurePath
from zipfile import BadZipFile

import docx
import torch

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Hugging Face checkpoint that supplies both the tokenizer and the
# sequence-to-sequence model used for translation.
checkpoint = 'facebook/nllb-200-3.3B'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Sanity check for the NLTK sentence tokenizer: split a short English
# paragraph and display the resulting list of sentences as the cell output.
from nltk import sent_tokenize
para = "Hello World. It's good to see you. Thanks for buying this book."
sent_tokenize(para)

In [None]:
# Target languages for batch translation. Each entry is
# (NLLB-200 / FLORES-200 language code, label used as the output-file suffix).
languages = [
    ("amh_Ethi", "Ethiopian"),
    ("arb_Arab", "Arabic"),
    ("asm_Beng", "Assamese"),
    ("ben_Beng", "Bangal"),
    ("por_Latn", "BPortugese"),
    ("mya_Mymr", "Burmese"),
    ("ceb_Latn", "Cebuano"),
    # Was "zsm_Latn", which is Standard Malay. Simplified Chinese is
    # "zho_Hans" (use "zho_Hant" for Traditional).
    ("zho_Hans", "Chinese"),
    ("fra_Latn", "French"),
    ("guj_Gujr", "Gujarati"),
    ("hau_Latn", "Hausa"),
    ("hin_Deva", "Hindi"),
    ("ilo_Latn", "Illocano"),
    ("ind_Latn", "Indonesian"),
    ("kan_Knda", "Kannada"),
    ("khm_Khmr", "Khmer"),
    ("lao_Laoo", "Laotian"),
    ("spa_Latn", "LASpanish"),
    ("mal_Mlym", "Malayalam"),
    ("npi_Deva", "Nepali"),
    ("ory_Orya", "Oriya"),
    ("plt_Latn", "PlatMalagasy"),
    ("pan_Guru", "EPunjabi"),
    ("rus_Cyrl", "Russian"),
    ("swh_Latn", "Swahili"),
    ("tgl_Latn", "Tagalog"),
    ("tam_Taml", "Tamil"),
    ("tel_Telu", "Telugu"),
    ("tha_Thai", "Thai"),
    ("tpi_Latn", "TokPisin"),
    ("urd_Arab", "Urdu"),
    ("vie_Latn", "Vietnamese")
]

# NOTE: this cell uses input_folder, output_folder, ext_in, ext_out,
# get_languages() and translate_docx(), which are defined in cells further
# down the notebook. Run those cells before this one.

# output_folder is nested inside input_folder, so rglob would also pick up
# files produced by earlier runs; leave those out.
output_root = output_folder.resolve()
files = [
    file for file in input_folder.rglob("*." + ext_in)
    if output_root not in file.resolve().parents
]
print(f"Found {len(files)} {ext_in} files in {input_folder.resolve()}")

# The device does not depend on the file or the language, so choose it once.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for i, file in enumerate(files, 1):
    file = file.resolve()
    languages_in_file = get_languages(file)

    # An empty counter (no sentences found) has no "most common" entry.
    if not languages_in_file:
        print(f"{i:>4} : Not translating file {file}. No text was detected in it.")
        continue

    top_language_in_file = languages_in_file.most_common(1)[0][0]
    file_is_english = top_language_in_file == "eng_Latn"

    if file_is_english:
        print(f"{i:>4} : Translating file {file} from English to multiple languages.")
        # Opening the document here only verifies that it is readable.
        try:
            document = docx.Document(file)
        except BadZipFile:
            print(f"BadZipFile Error on opening {file}")
            continue

        for target_lang, file_name in languages:
            # translate_docx() reads this module-level name, so it has to be
            # rebuilt for every target language.
            translation_pipeline = pipeline('translation',
                                            model=model,
                                            tokenizer=tokenizer,
                                            src_lang='eng_Latn',
                                            tgt_lang=target_lang,
                                            max_length=400,
                                            device=device)

            paragraphs = translate_docx(file)

            output_path = output_folder / f"{file.stem}_{file_name}.{ext_out}"
            # NOTE(review): save_translated_document is not defined anywhere in
            # the visible notebook -- confirm it exists before running.
            # translate_docx() itself saves its translated copy in
            # output_folder under the source file's own name, so without a
            # per-language save each language overwrites the previous one.
            save_translated_document(paragraphs, output_path)

            print(f"{i:>4} : Translated file {file} to {file_name}.")

    else:
        print(f"{i:>4} : Not translating file {file}. It seems to be in :{top_language_in_file}.")


In [None]:
# File extensions to look for when reading and to use when writing.
ext_in, ext_out = 'docx', 'docx'

# Folder scanned for source documents, and the folder that receives the
# translated copies. Both paths are relative, so they resolve against the
# current working directory.
# NOTE(review): possibly meant to be absolute ("/home/...") -- confirm.
input_folder = Path("home/curleyd/GitHub/MTFiles")
output_folder = Path("home/curleyd/GitHub/MTFiles/Translated")

In [None]:
def get_paragraphs_from_docx(file):
    """Read a Word document and describe each of its paragraphs.

    Args:
        file: Path (or file-like object) accepted by ``docx.Document``.

    Returns:
        A list with one dict per paragraph, in document order. Each dict has
        the keys ``'style'`` (the paragraph's style name) and ``'sentences'``
        (the paragraph text split into sentences by ``sent_tokenize``).
    """
    # Open connection to Word Document
    doc = docx.Document(file)

    # The original loop built each paragraph dict but never stored it, so the
    # function always returned an empty list; collect every paragraph here.
    return [
        {'style': para.style.name, 'sentences': list(sent_tokenize(para.text))}
        for para in doc.paragraphs
    ]

import shutil
def translate_docx(file):
    """Translate a copy of a Word document in place, paragraph by paragraph.

    The source file is copied into the module-level ``output_folder`` and the
    copy is edited, so the source itself is not modified. Each sentence is
    translated with the module-level ``translation_pipeline``, which must be
    set before calling this function.

    Args:
        file: Path of the source ``.docx`` file.

    Returns:
        A list with one dict per paragraph, in document order, with the keys
        ``'style'`` (style name), ``'sentences'`` (source sentences) and
        ``'translations'`` (translated sentences, same order).
    """
    # shutil.copy only copies *into* the destination when it is an existing
    # directory; otherwise it would create a plain file with that name.
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    paras = []
    # Copy file to output folder and then work off of the copy
    file = shutil.copy(file, output_folder)

    # Open connection to Word Document
    doc = docx.Document(file)

    for para in doc.paragraphs:
        style_name = para.style.name
        sentences = list(sent_tokenize(para.text))
        translations = [translation_pipeline(sentence)[0]['translation_text'] for sentence in sentences]

        paras.append({
            'style': style_name,
            'sentences': sentences,
            'translations': translations,
        })

        # Paragraphs without text (blank lines, picture-only paragraphs) are
        # left untouched: assigning para.text replaces the paragraph's whole
        # content with a single text run, which would discard a picture.
        if not translations:
            continue

        para.text = " ".join(translations)

        # Re-apply the style name captured above; replacing the text should
        # not change it, so this is only a safeguard.
        para.style = style_name

    doc.save(file)
    return paras

In [None]:
import fasttext
from collections import Counter
from tqdm import tqdm
# Load the fastText language-identification model that get_languages() uses.
# The path is relative, so the model file is looked up in the current
# working directory.
pretrained_lang_model = "./lid218e.bin" # path of pretrained model file
fasttext_model = fasttext.load_model(pretrained_lang_model)

def get_languages(file):
    """Count the detected language of every sentence in a Word document.

    Each paragraph is split into sentences with ``sent_tokenize`` and each
    sentence is classified with the module-level ``fasttext_model``.

    Args:
        file: ``pathlib.Path`` of the ``.docx`` file.

    Returns:
        A ``collections.Counter`` mapping language code (the fastText label
        without its ``__label__`` prefix) to the number of sentences
        classified as that language. The counter is empty when the file
        cannot be opened or contains no text.
    """
    # Imported here so the function does not depend on another cell for it.
    from zipfile import BadZipFile

    file = file.resolve()
    languages = Counter()

    # Open the input file as a Word document
    try:
        document = docx.Document(file)
    except BadZipFile:
        print(f"BadZipFile Error on opening {file}")
        # Nothing to analyse; previously execution fell through to an
        # unbound `document`.
        return languages

    for para in document.paragraphs:
        for sentence in sent_tokenize(para.text):
            # fastText's predict() rejects text containing a newline, so
            # collapse all whitespace (including line breaks) to single spaces.
            line = " ".join(sentence.split())
            if not line:
                continue
            predictions = fasttext_model.predict(line, k=1)
            output_lang = predictions[0][0].replace('__label__', '')
            languages[output_lang] += 1

    return languages


def show_languages(files):
    """Print the sentence count and detected languages of each file.

    Args:
        files: Iterable of ``pathlib.Path`` objects pointing at ``.docx`` files.

    Returns:
        None. One summary line per file is printed after all files have been
        analysed.
    """
    results = {}

    for file in tqdm(files):
        file = file.resolve()
        languages = get_languages(file)

        # get_languages() adds one count per sentence it classifies, so the
        # sum of the counts is the sentence count. (The original referenced
        # an undefined `sentences` name here.)
        results[file] = {"languages": languages, "sentence_count": sum(languages.values())}

    for file, info in results.items():
        # The [23:] slice drops a fixed-length leading part of the path.
        # NOTE(review): the offset is tied to one machine's folder layout.
        print(f"There are {info['sentence_count']} sentences in {str(file)[23:]}    {info['languages'].most_common()}")
        #print(f"{info['languages']}")

In [None]:
# Translate every English document under input_folder into Spanish.

# output_folder is nested inside input_folder, so rglob would also pick up
# files produced by earlier runs; leave those out.
output_root = output_folder.resolve()
files = [
    file for file in input_folder.rglob("*." + ext_in)
    if output_root not in file.resolve().parents
]
print(f"Found {len(files)} {ext_in} files in {input_folder.resolve()}")

# translate_docx() reads the module-level `translation_pipeline`. Build it
# here with Spanish as the target so this cell does what its messages say,
# rather than reusing whatever pipeline an earlier cell left behind.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
translation_pipeline = pipeline('translation',
                                model=model,
                                tokenizer=tokenizer,
                                src_lang='eng_Latn',
                                tgt_lang='spa_Latn',
                                max_length=400,
                                device=device)

for i, file in enumerate(files, 1):
    file = file.resolve()
    languages_in_file = get_languages(file)

    # An empty counter (no sentences found) has no "most common" entry.
    if not languages_in_file:
        print(f"{i:>4} : Not translating file {file}. No text was detected in it.")
        continue

    top_language_in_file = languages_in_file.most_common(1)[0][0]
    file_is_english = top_language_in_file == "eng_Latn"

    if file_is_english:
        print(f"{i:>4} : Translating file {file} from English to Spanish.")
        # Opening the document here only verifies that it is readable. It is
        # no longer saved back, which used to rewrite the source file.
        try:
            document = docx.Document(file)
        except BadZipFile:
            print(f"BadZipFile Error on opening {file}")
            continue

        # Translate the content; translate_docx() works on a copy placed in
        # output_folder.
        paragraphs = translate_docx(file)

        print(f"{i:>4} : Translated file {file} from English to Spanish")
    else:
        print(f"{i:>4} : Not translating file {file}. It seems to be in :{top_language_in_file}.")