In [4]:
# run off of GPU in PyTorch
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Report the name of GPU index 0.
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available, using CPU instead.")

Using GPU: NVIDIA RTX A6000


In [5]:
# download the language model pretrained file

!pip install fasttext

import fasttext
import os

pretrained_lang_model = "./lid218e.bin" # path of pretrained model file
if not os.path.exists(pretrained_lang_model):
    !wget https://dl.fbaipublicfiles.com/nllb/lid/lid218e.bin
else:
    print("Model already exists. Skipping download.")
    
model = fasttext.load_model(pretrained_lang_model)

text = "صباح الخير، الجو جميل اليوم والسماء صافية."
predictions = model.predict(text, k=1)
print(predictions)
test_lang = predictions[0][0].replace('__label__', '')
print(test_lang)

Defaulting to user installation because normal site-packages is not writeable
Model already exists. Skipping download.
(('__label__arb_Arab',), array([0.99960977]))
arb_Arab




In [6]:
!pip install -U pip transformers
!pip install sentencepiece
!pip install python-docx
!pip install nltk
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package punkt to /home/curleyd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import docx
import glob
from pathlib import Path, PurePath

## Define functions to initialize models

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

def use_nllb(checkpoint='facebook/nllb-200-3.3B'):
    """Load an NLLB-200 translation model together with its tokenizer.

    Args:
        checkpoint: Identifier passed to ``from_pretrained`` for both the model
            and the tokenizer. Defaults to the large 3.3B-parameter model.
            Smaller alternatives:
              * 'facebook/nllb-200-distilled-600M' (smallest, distilled)
              * 'facebook/nllb-200-distilled-1.3B' (medium, distilled)
              * 'facebook/nllb-200-1.3B'           (medium)

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    return (model, tokenizer)

def use_madlad(checkpoint="google/madlad400-10b-mt"):
    """Load a MADLAD-400 translation model together with its tokenizer.

    The model is loaded exactly once. No ``pipeline`` is built here: a pipeline
    created from the checkpoint name would load the weights a second time, and
    the caller constructs its own pipeline from the returned objects anyway.

    Args:
        checkpoint: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model. Defaults to the 10B MT checkpoint.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    return (model, tokenizer)

## Select the model to use (comment out the one you do __not__ want to use)

In [9]:

# Bind the active translation model and tokenizer. Exactly one of the two lines
# below should be uncommented; swap the comment marker to switch models.
# model, tokenizer = use_nllb()
model, tokenizer = use_madlad()

Loading checkpoint shards: 100%|██████████| 9/9 [00:09<00:00,  1.06s/it]
Loading checkpoint shards:  44%|████▍     | 4/9 [00:05<00:06,  1.27s/it]

: 

In [None]:
# Smoke test for the NLTK sentence tokenizer: the cell's output is the list of
# sentences found in `para`. `para` stays defined at notebook level because the
# next cell passes it to the translation pipeline.
from nltk import sent_tokenize
para = "Hello World. It's good to see you. Thanks for buying this book."
sent_tokenize(para)

In [None]:
# Language pair for the whole notebook; the batch helpers defined in later cells
# read these two globals.
source_language = 'eng_Latn'
target_language = 'npi_Deva'

# NOTE(review): these codes follow the NLLB naming scheme. If the MADLAD model is
# the one selected above, its tokenizer presumably ignores src_lang/tgt_lang and
# expects a target-language prefix in the input text instead — confirm.
translation_pipeline = pipeline('translation',
                                model=model,
                                tokenizer=tokenizer,
                                src_lang=source_language,
                                tgt_lang=target_language,
                                max_length = 400,
                                # Use the device selected in the first cell; without
                                # this argument the pipeline runs on the CPU.
                                device=device)

# Smoke test: translate the sample paragraph from the previous cell.
output = translation_pipeline(para)
print(output[0]['translation_text'])

In [None]:
def get_paragraphs_from_docx(file):
    """Read a Word document and return its paragraphs split into sentences.

    Args:
        file: Anything ``docx.Document`` accepts as a document location.

    Returns:
        A list with one dict per paragraph, in document order. Each dict has
        'style' (the paragraph's style name) and 'sentences' (the list produced
        by ``sent_tokenize`` for the paragraph text; empty for blank paragraphs).
    """
    # Open connection to Word Document
    doc = docx.Document(file)

    paras = []
    for para in doc.paragraphs:
        paras.append({
            'style': para.style.name,
            'sentences': list(sent_tokenize(para.text)),
        })

    return paras


def translate_docx(file):
    """Translate a Word document in place, one sentence at a time.

    For every paragraph, the text is split with ``sent_tokenize``, each sentence
    is sent through the notebook-level ``translation_pipeline``, and the
    paragraph text is replaced by the translations joined with single spaces.
    The document is then saved back to ``file``, overwriting it.

    Paragraphs that yield no sentences are left untouched. Assigning to
    ``Paragraph.text`` replaces all of a paragraph's content with a single run,
    so writing an empty string would discard whatever non-text content the
    paragraph holds. For paragraphs that are translated, that replacement still
    happens: content other than the translated text is not carried over.

    Args:
        file: Document location accepted by ``docx.Document`` and ``save``.

    Returns:
        A list with one dict per paragraph, in document order, with keys
        'style' (style name), 'sentences' (source sentences) and
        'translations' (translated sentences, same order).
    """
    paras = []

    # Open connection to Word Document
    doc = docx.Document(file)

    for para in doc.paragraphs:
        style_name = para.style.name
        sentences = list(sent_tokenize(para.text))
        translations = [translation_pipeline(sentence)[0]['translation_text'] for sentence in sentences]

        paras.append({
            'style': style_name,
            'sentences': sentences,
            'translations': translations,
        })

        # Nothing to translate: leave the paragraph exactly as it is.
        if not translations:
            continue

        para.text = " ".join(translations)

        # Re-apply the original style name after the text replacement.
        para.style = style_name

    doc.save(file)
    return paras

In [None]:
# Second translation pipeline, configured with the same model, tokenizer,
# language pair and length limit as `translation_pipeline`.
# NOTE(review): no cell visible in this notebook refers to translation_pipeline2 —
# confirm whether it is still needed.
translation_pipeline2 = pipeline(
    task='translation',
    model=model,
    tokenizer=tokenizer,
    src_lang=source_language,
    tgt_lang=target_language,
    max_length=400,
)

In [None]:
import shutil

# Source documents, and the working copy that the batch helpers translate in place.
input_folder = Path("/home/curleyd/pytorch_stuff/Downloads/Choosingaspouse")
output_folder = Path("/home/curleyd/pytorch_stuff/Downloads/Choosingaspousenepali")
ext_in = 'docx'
ext_out = 'docx'

# Make sure the destination exists (no error if it is already there).
output_folder.mkdir(parents=True, exist_ok=True)

# Copy every .docx directly inside the input folder; other entries are skipped.
for file_path in input_folder.glob("*"):
    if file_path.suffix != ".docx":
        continue
    shutil.copy(file_path, output_folder / file_path.name)

In [None]:
from collections import Counter
from tqdm import tqdm
# Load the fastText language-ID model under the name `fasttext_model`, which is the
# global that get_languages() reads. Relies on `fasttext` having been imported and
# the model file having been downloaded by an earlier cell.
pretrained_lang_model = "./lid218e.bin" # path of pretrained model file
fasttext_model = fasttext.load_model(pretrained_lang_model)

def get_languages(file):
    """Count the predicted language of every sentence in a Word document.

    Each paragraph is split with ``sent_tokenize`` and every sentence is
    classified with the notebook-level ``fasttext_model`` (top-1 prediction).

    Args:
        file: A path object exposing ``resolve()`` (e.g. ``pathlib.Path``).

    Returns:
        A ``Counter`` mapping language code (label with the '__label__' prefix
        removed) to the number of sentences predicted as that language. An
        empty ``Counter`` is returned when the file cannot be opened as a zip
        archive or contains no sentences.
    """
    # Imported locally so the function does not depend on another cell for it.
    from zipfile import BadZipFile

    file = file.resolve()

    # Open the input file as a Word document
    try:
        document = docx.Document(file)
    except BadZipFile:
        print(f"BadZipFile Error on opening {file}")
        return Counter()

    languages = Counter()
    for para in document.paragraphs:
        for sentence in sent_tokenize(para.text):
            # fastText's predict() raises ValueError on text containing a
            # newline, so flatten line breaks before classifying.
            single_line = sentence.replace("\n", " ")
            predictions = fasttext_model.predict(single_line, k=1)
            output_lang = predictions[0][0].replace('__label__', '')
            languages[output_lang] += 1

    return languages


def show_languages(files):
    """Print, for each file, its sentence count and per-language sentence tally.

    Args:
        files: Iterable of path objects exposing ``resolve()``; each one is
            analysed with ``get_languages``.

    Returns:
        None. One summary line per file is printed after all files have been
        processed.
    """
    results = {}

    for file in tqdm(files):
        file = file.resolve()
        languages = get_languages(file)

        # get_languages() counts exactly one language per sentence, so the sum
        # of the counts is the number of sentences in the file.
        results[file] = {"languages": languages, "sentence_count": sum(languages.values())}

    for file, info in results.items():
        # str(file)[23:] drops the first 23 characters of the resolved path
        # (presumably a machine-specific prefix — confirm).
        print(f"There are {info['sentence_count']} sentences in {str(file)[23:]}    {info['languages'].most_common()}")

In [None]:
# from English
def translate_from_english(output_folder, ext_in):
    """Translate, in place, every English document found under a folder.

    Files are discovered recursively. A file counts as English when its most
    common sentence language (per ``get_languages``) is 'eng_Latn'; a file for
    which no language could be determined is also treated as English. Matching
    files are handed to ``translate_docx``; all others are reported and skipped.

    Args:
        output_folder: Folder (``pathlib.Path``) searched recursively.
        ext_in: File extension to look for, without the leading dot.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Imported locally so the function does not depend on another cell for it.
    from zipfile import BadZipFile

    folder = output_folder  # Path("/home/curleyd/pytorch_stuff/Downloads/")
    files = [file for file in folder.rglob("*." + ext_in)]
    print(f"Found {len(files)} {ext_in} files in {folder.resolve(True)}")

    for i, file in enumerate(files, 1):
        file = file.resolve()

        # Dominant language of the file; default to English when nothing was counted.
        top_languages = get_languages(file).most_common(1)
        top_language_in_file = top_languages[0][0] if top_languages else "eng_Latn"

        if top_language_in_file != "eng_Latn":
            print(f"{i:>4} : Not translating file {file}. It seems to be in :{top_language_in_file}.")
            continue

        print(f"{i:>4} : Translating file {file} from English to Nepali.")
        try:
            document = docx.Document(file)
        except BadZipFile:
            print(f"BadZipFile Error on opening {file}")
            continue

        # Save the file.
        document.save(file)

        # Translate the content
        translate_docx(file)

        print(f"{i:>4} : Translated file {file} from English to Nepali")


### Into English
def translate_to_english(output_folder, ext_in):
    """Translate, in place, every document written in the source language.

    Files are discovered recursively. A file is translated when its most common
    sentence language (per ``get_languages``) equals the notebook-level
    ``source_language``. Despite the function's name, the language pair is
    whatever the ``source_language`` / ``target_language`` globals hold.
    Files with no detectable language are skipped rather than raising.

    Args:
        output_folder: Folder (``pathlib.Path``) searched recursively.
        ext_in: File extension to look for, without the leading dot.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Imported locally so the function does not depend on another cell for it.
    from zipfile import BadZipFile

    folder = output_folder  # Path("/home/curleyd/pytorch_stuff/Downloads/darrell/darrell/darrell/")
    files = [file for file in folder.rglob("*." + ext_in)]
    print(f"Found {len(files)} {ext_in} files in {folder.resolve()}")

    for i, file in enumerate(files, 1):
        file = file.resolve()

        # most_common(1) is empty for a document without sentences; use None so
        # such a file is reported and skipped instead of raising IndexError.
        top_languages = get_languages(file).most_common(1)
        top_language_in_file = top_languages[0][0] if top_languages else None
        file_is_source_language = top_language_in_file == source_language

        if file_is_source_language:
            print(f"{i:>4} : Translating file {file} from {source_language} to {target_language}.")
            # Opening the file first lets an unreadable archive be skipped cleanly.
            try:
                docx.Document(file)
            except BadZipFile:
                print(f"BadZipFile Error on opening {file}")
                continue

            # Translate the content
            translate_docx(file)

            print(f"{i:>4} : Translated file {file} from {source_language} to {target_language}")
        else:
            print(f"{i:>4} : Not translating file {file}. It seems to be in :{top_language_in_file}.")


In [None]:
# Run the batch job matching the configured direction. An English source takes
# precedence; if neither language is English, nothing is run.
if source_language != "eng_Latn":
    if target_language == "eng_Latn":
        translate_to_english(output_folder=output_folder, ext_in=ext_in)
else:
    translate_from_english(output_folder=output_folder, ext_in=ext_in)