In [1]:
# run off of GPU in PyTorch
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# check if cuda is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available, using CPU instead.")

Using GPU: NVIDIA RTX A6000


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# download the language model pretrained file

!wget https://dl.fbaipublicfiles.com/nllb/lid/lid218e.bin
!pip install fasttext

import fasttext

pretrained_lang_model = "./lid218e.bin" # path of pretrained model file
model = fasttext.load_model(pretrained_lang_model)

text = "صباح الخير، الجو جميل اليوم والسماء صافية."
predictions = model.predict(text, k=1)
print(predictions)
input_lang = predictions[0][0].replace('__label__', '')
print(input_lang)

--2024-04-16 11:08:09--  https://dl.fbaipublicfiles.com/nllb/lid/lid218e.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.173.166.74, 18.173.166.48, 18.173.166.51, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.173.166.74|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1176355829 (1.1G) [application/octet-stream]
Saving to: ‘lid218e.bin.1’


2024-04-16 11:08:22 (84.9 MB/s) - ‘lid218e.bin.1’ saved [1176355829/1176355829]

Defaulting to user installation because normal site-packages is not writeable
(('__label__arb_Arab',), array([0.99960977]))
arb_Arab




In [3]:
!pip install -U pip transformers
!pip install sentencepiece
!pip install python-docx
!pip install nltk
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package punkt to /home/curleyd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import docx
import glob
from pathlib import Path, PurePath

In [5]:
# Smallest 600M parameter model - distilled
# checkpoint = 'facebook/nllb-200-distilled-600M'

# Medium 1.3B parameter model - distilled
# checkpoint = 'facebook/nllb-200-distilled-1.3B'

# Medium 1.3B parameter model
# checkpoint = 'facebook/nllb-200-1.3B'

# Large 3.3B parameter model
checkpoint = 'facebook/nllb-200-3.3B'

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.31s/it]


In [7]:
#Test sentence tokenizer:
from nltk import sent_tokenize
para = "Hello World. It's good to see you. Thanks for buying this book."
sent_tokenize(para)

['Hello World.', "It's good to see you.", 'Thanks for buying this book.']

In [8]:
input_lang = 'eng_Latn'
target_lang = 'npi_Deva'
translation_pipeline = pipeline('translation',
                                model=model,
                                tokenizer=tokenizer,
                                src_lang=input_lang,
                                tgt_lang=target_lang,
                                max_length = 400)
output = translation_pipeline(para)
print(output[0]['translation_text'])

हेलो वर्ल्ड, तपाईंलाई भेटेर खुसी लाग्यो, यो पुस्तक किनेकोमा धन्यवाद।


In [9]:
def get_paragraphs_from_docx(file):

    paras = []
    # Open connection to Word Document
    doc = docx.Document(file)

    # read in each paragraph in file and store the style name with it.
    for para in doc.paragraphs:
        this_para = {'style': para.style.name}
        sentences = []
        for sentence in sent_tokenize(para.text):
            sentences.append(sentence)
        this_para['sentences'] = sentences

    #print(f'Found {len(styles_in_doc)} styles {styles_in_doc} in this document.')
    return paras


def translate_docx(file):

    paras = []

    # Open connection to Word Document
    doc = docx.Document(file)

    # read in each paragraph in file and store the style name with it.
    for para in doc.paragraphs:
        this_para = {'style': para.style.name}
        sentences = [sentence for sentence in sent_tokenize(para.text)]
        translations = [translation_pipeline(sentence)[0]['translation_text'] for sentence in sentences]

        this_para['sentences'] = sentences
        this_para['translations'] = translations
        paras.append(this_para)

        # This line was a great simplification of the find and replace code.
        para.text = " ".join(translations)

        # I'm not sure this is required, since the style shouldn't have changed.
        para.style = this_para['style']


        #print(this_para)

    doc.save(file)
    #print(f'Found {len(styles_in_doc)} styles {styles_in_doc} in this document.')
    return paras

In [10]:
input_lang = 'eng_Latn'
output_lang = 'npi_Deva'

translation_pipeline2 = pipeline('translation',
                        model=model,
                        tokenizer=tokenizer,
                        src_lang=input_lang,
                        tgt_lang=output_lang,
                        max_length = 400)

In [25]:
input_folder = Path("/home/curleyd/pytorch_stuff/Downloads/Choosingaspouse")
output_folder = Path("/home/curleyd/pytorch_stuff/Downloads/Choosingaspousenepali")
ext_in = 'docx'
ext_out = 'docx'

In [26]:
from collections import Counter
from tqdm import tqdm
pretrained_lang_model = "./lid218e.bin" # path of pretrained model file
fasttext_model = fasttext.load_model(pretrained_lang_model)

def get_languages(file):

    file = file.resolve()

    #print(f"Opening {file}")
    # Open the input file as a Word document
    try :
        document = docx.Document(file)
    except BadZipFile:
        print(f"BadZipFile Error on opening {file}")

    paragraphs = [para for para in document.paragraphs]
    sentences = [sentence for para in document.paragraphs for sentence in sent_tokenize(para.text)]

    languages = Counter()
    for sentence in sentences:
        predictions = fasttext_model.predict(sentence, k=1)
        #print(predictions)
        output_lang = predictions[0][0].replace('__label__', '')
        #print(output_lang)
        languages.update([output_lang])

    return languages


def show_languages(files):
    results = {}

    for file in tqdm(files):
        file = file.resolve()
        languages = get_languages(file)

        #print(languages)
        #print(f"There are {len(sentences)} sentences in {output_file}.")
        results[file] = {"languages": languages, "sentence_count": len(sentences)}

    for file, info in results.items():
        print(f"There are {info['sentence_count']} sentences in {str(file)[23:]}    {info['languages'].most_common()}")
        #print(f"{info['languages']}")



In [35]:
folder = Path("/home/curleyd/pytorch_stuff/Downloads/")
files = [file for file in folder.rglob("*." + ext_in)]
print(f"Found {len(files)} {ext_in} files in {folder.resolve(True)}")

for i, file in enumerate(files,1):
    file = file.resolve()
    languages_in_file = get_languages(file)
    top_language_in_file = languages_in_file.most_common(1)[0][0]
    file_is_english = top_language_in_file == "eng_Latn"

    #print(top_language_in_file, file_is_english)

    if file_is_english:
        print(f"{i:>4} : Translating file {file} from English to Spanish.")
        try :
            document = docx.Document(file)
        except BadZipFile:
            print(f"BadZipFile Error on opening {file}")
            continue

        # Save the file.
        document.save(file)

        # Translate the content
        paragraphs = translate_docx(file)

        print(f"{i:>4} : Translated file {file} from English to Spanish")
    else:
        print(f"{i:>4} : Not translating file {file}. It seems to be in :{top_language_in_file}.")




Found 22 docx files in /home/curleyd/pytorch_stuff/Downloads
   1 : Not translating file /home/curleyd/pytorch_stuff/Downloads/Honorandrepsect.docx. It seems to be in :npi_Deva.
   2 : Not translating file /home/curleyd/pytorch_stuff/Downloads/Biblestudyquestions.docx. It seems to be in :npi_Deva.
   3 : Not translating file /home/curleyd/pytorch_stuff/Downloads/Hope.docx. It seems to be in :npi_Deva.
   4 : Not translating file /home/curleyd/pytorch_stuff/Downloads/Stepstowardsfreedom.docx. It seems to be in :npi_Deva.
   5 : Not translating file /home/curleyd/pytorch_stuff/Downloads/CommunicatingwithJesus.docx. It seems to be in :npi_Deva.


IndexError: list index out of range