In [None]:
!pip install fasttext
!pip install -U pip transformers
!pip install sentencepiece
!pip install python-docx
!pip install nltk

In [None]:
import glob
import os
import shutil
from collections import Counter
from pathlib import Path, PurePath
from zipfile import BadZipFile

import docx
import fasttext
import nltk
import torch
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
from nltk import sent_tokenize
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load variables from a .env file into the process environment; the folder
# paths are read later via os.getenv.
load_dotenv()
# Download the 'punkt' resource from NLTK (presumably the data sent_tokenize relies on — TODO confirm).
nltk.download('punkt')

In [None]:
checkpoint = 'facebook/nllb-200-3.3B'
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

pretrained_lang_model = "./lid218e.bin"  # path of the pretrained model file
if not os.path.isfile(pretrained_lang_model):
    # If the file doesn't exist, download it
    !wget https://dl.fbaipublicfiles.com/nllb/lid/lid218e.bin

In [None]:
# Source and target language codes handed to the translation pipeline.
input_lang = 'eng_Latn'
target_lang = 'swh_Latn'

# Use the GPU when CUDA is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Single pipeline object reused for every sentence that gets translated.
translation_pipeline = pipeline(
    'translation',
    model=model,
    tokenizer=tokenizer,
    src_lang=input_lang,
    tgt_lang=target_lang,
    max_length=400,
    device=device,
)

In [None]:
def _folder_from_env(var_name):
    """Return the folder named by environment variable ``var_name`` as a Path.

    Args:
        var_name: Name of the environment variable holding the folder path.

    Returns:
        A ``pathlib.Path`` built from the variable's value.

    Raises:
        RuntimeError: If the variable is not set. Without this check
            ``Path(None)`` fails with a TypeError that does not name the
            missing variable.
    """
    value = os.getenv(var_name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {var_name} is not set. "
            "Define it in the environment or in the .env file."
        )
    return Path(value)


input_folder = _folder_from_env('INPUT_FOLDER_PATH')
output_folder = _folder_from_env('OUTPUT_FOLDER_PATH')
ext_in = 'docx'
ext_out = 'docx'

# Create the output folder if it doesn't exist
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
def get_paragraphs_from_docx(file):
    """Read a Word document and return its paragraphs split into sentences.

    Args:
        file: Path (or file-like object) passed straight to ``docx.Document``.

    Returns:
        A list with one dict per paragraph, in document order. Each dict has
        ``'style'`` (the paragraph's style name) and ``'sentences'`` (the list
        produced by ``sent_tokenize`` on the paragraph text; empty for a
        paragraph without text).
    """
    # Open connection to Word Document
    doc = docx.Document(file)

    # Every paragraph is recorded. Previously the per-paragraph dict was built
    # but never added to the result, so the function always returned [].
    return [
        {'style': para.style.name, 'sentences': list(sent_tokenize(para.text))}
        for para in doc.paragraphs
    ]

def translate_docx(file):
    """Translate a Word document sentence by sentence into the output folder.

    The file is first copied into ``output_folder`` and only the copy is
    modified. Each paragraph's text is split with ``sent_tokenize``, every
    sentence is run through ``translation_pipeline``, and the paragraph text
    is replaced with the translations joined by single spaces.

    Args:
        file: Path of the source .docx file.

    Returns:
        A list with one dict per paragraph, in document order, with keys
        ``'style'`` (style name), ``'sentences'`` (source sentences) and
        ``'translations'`` (translated sentences, same order).
    """
    paras = []
    # Copy file to output folder and then work off of the copy.
    # NOTE(review): the copy keeps only the file name, so two inputs with the
    # same name would end up at the same output path — confirm this is acceptable.
    file = shutil.copy(file, output_folder)

    # Open connection to Word Document
    doc = docx.Document(file)

    # read in each paragraph in file and store the style name with it.
    for para in doc.paragraphs:
        sentences = list(sent_tokenize(para.text))
        translations = [
            translation_pipeline(sentence)[0]['translation_text']
            for sentence in sentences
        ]

        paras.append({
            'style': para.style.name,
            'sentences': sentences,
            'translations': translations,
        })

        # Paragraphs without any text are left untouched: assigning
        # para.text replaces all of the paragraph's content, which would also
        # discard non-text content such as inline pictures or breaks.
        if not sentences:
            continue

        # Replacing the text keeps paragraph-level formatting (including the
        # style), so the style does not need to be reassigned afterwards.
        para.text = " ".join(translations)

    doc.save(file)
    return paras

In [None]:

# Load the fastText model from the local file once; get_languages uses it
# to predict a language label for each sentence.
fasttext_model = fasttext.load_model(pretrained_lang_model)

def get_languages(file):
    """Count the predicted language of every sentence in a Word document.

    Args:
        file: Path of the .docx file (a ``pathlib.Path`` or a string).

    Returns:
        A ``collections.Counter`` mapping language label (with the
        ``__label__`` prefix removed) to the number of sentences predicted as
        that language. The Counter is empty when the file cannot be opened
        (``BadZipFile``) or contains no sentences.
    """
    file = Path(file).resolve()

    languages = Counter()

    try:
        document = docx.Document(file)
    except BadZipFile:
        print(f"BadZipFile Error on opening {file}")
        # Nothing can be read from this file; report "no languages" instead of
        # falling through to code that needs the document.
        return languages

    for para in document.paragraphs:
        for sentence in sent_tokenize(para.text):
            # fastText predicts one line at a time and rejects text that
            # contains a newline, so flatten line breaks inside a sentence.
            single_line = sentence.replace('\n', ' ').strip()
            if not single_line:
                continue
            predictions = fasttext_model.predict(single_line, k=1)
            output_lang = predictions[0][0].replace('__label__', '')
            languages[output_lang] += 1

    return languages

In [None]:
# Collect every matching file below the input folder (searched recursively).
files = list(input_folder.rglob("*." + ext_in))
print(f"Found {len(files)} {ext_in} files in {input_folder.resolve()}")

for i, file in enumerate(files, 1):
    file = file.resolve()
    languages_in_file = get_languages(file)

    # No sentence was classified, so there is no "most common" language to
    # read; indexing most_common(1)[0] would raise IndexError.
    if not languages_in_file:
        print(f"{i:>4} : Not translating file {file}. No language could be detected.")
        continue

    top_language_in_file = languages_in_file.most_common(1)[0][0]
    # Compare against the configured source language rather than a literal.
    file_is_english = top_language_in_file == input_lang

    if file_is_english:
        try:
            document = docx.Document(file)
        except BadZipFile:
            print(f"BadZipFile Error on opening {file}")
            continue

        # Save the file.
        # NOTE(review): this writes the document back over the input file
        # before it is copied for translation — confirm that is intended.
        document.save(file)

        # Translate the content
        paragraphs = translate_docx(file)

        # Report the languages actually configured for the pipeline.
        print(f"{i:>4} : Translated file {file} from {input_lang} to {target_lang}")
    else:
        print(f"{i:>4} : Not translating file {file}. It seems to be in :{top_language_in_file}.")