**Cell 1: Language Detection**

In [3]:
# Download the fastText language model and install the required package
!pip install fasttext

import torch
import os
import fasttext

# check if cuda is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available, using CPU instead.")

# Load the pre-trained language model
pretrained_lang_model = "./lid218e.bin"  # path of the pretrained model file
# Check if the file exists
if not os.path.isfile(pretrained_lang_model):
    # If the file doesn't exist, download it
    !wget https://dl.fbaipublicfiles.com/nllb/lid/lid218e.bin
fasttext_model = fasttext.load_model(pretrained_lang_model)

# Sample text for language detection
text = "صباح الخير، الجو جميل اليوم والسماء صافية."

# Predict the language of the text
predictions = fasttext_model.predict(text, k=1)
print(predictions)

# Extract and print the language code
input_lang = predictions[0][0].replace('__label__', '')
print(input_lang)


Using GPU: NVIDIA GeForce RTX 3090


ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.

**Cell 2: Installing Necessary Packages**

In [None]:
# Install necessary packages for tokenization and translation
!pip install -U pip transformers
!pip install sentencepiece
!pip install python-docx
!pip install nltk

# Import NLTK and download required data
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')


**Cell 3: Importing Additional Libraries**

In [11]:
# Import necessary libraries for document processing
import docx
import glob
from pathlib import Path, PurePath


**Cell 4: Setting Up Translation Model**

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Hugging Face checkpoint that provides both the tokenizer and the
# sequence-to-sequence translation model
checkpoint = 'facebook/nllb-200-3.3B'

# Load the tokenizer and the model weights for the checkpoint.
# NOTE(review): the model is left on whatever device `from_pretrained` places
# it on; the `device` chosen in the language-detection cell is not applied here.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


**Cell 5: Testing Sentence Tokenizer**

**Cell 6: Defining Target Language and File Names**

In [13]:
# List of target languages and corresponding file names.
# Each entry is a (language code, name) tuple; the codes follow the
# "<lang>_<Script>" form also used for 'eng_Latn' / 'spa_Latn' elsewhere in
# this notebook. Entries are enabled by uncommenting them; only Spanish is
# currently active.
# NOTE(review): none of the other cells shown in this notebook read this list
# (the `languages` inside `get_languages` is a separate local variable) —
# confirm where it is meant to be consumed.
languages = [
    # ("amh_Ethi", "Ethiopian"),
    # ("arb_Arab", "Arabic"),
    # ("asm_Beng", "Assamese"),
    # ("ben_Beng", "Bangal"),
    # ("por_Latn", "BPortugese"),
    # ("mya_Mymr", "Burmese"),
    # ("ceb_Latn", "Cebuano"),
    # NOTE(review): "zsm_Latn" looks like the code for Standard Malay rather
    # than Chinese — verify the code before enabling the next entry.
    # ("zsm_Latn", "Chinese"),
    # ("fra_Latn", "French"),
    # ("guj_Gujr", "Gujarati"),
    # ("hau_Latn", "Hausa"),
    # ("hin_Deva", "Hindi"),
    # ("ilo_Latn", "Illocano"),
    # ("ind_Latn", "Indonesian"),
    # ("kan_Knda", "Kannada"),
    # ("khm_Khmr", "Khmer"),
    # ("lao_Laoo", "Laotian"),
    ("spa_Latn", "LASpanish"),
    # ("mal_Mlym", "Malayalam"),
    # ("npi_Deva", "Nepali"),
    # ("ory_Orya", "Oriya"),
    # ("plt_Latn", "PlatMalagasy"),
    # ("pan_Guru", "EPunjabi"),
    # ("rus_Cyrl", "Russian"),
    # ("swh_Latn", "Swahili"),
    # ("tgl_Latn", "Tagalog"),
    # ("tam_Taml", "Tamil"),
    # ("tel_Telu", "Telugu"),
    # ("tha_Thai", "Thai"),
    # ("tpi_Latn", "TokPisin"),
    # ("urd_Arab", "Urdu"),
    # ("vie_Latn", "Vietnamese")
]


**Cell 7: Helper Function to Get Languages from DOCX Files**

In [14]:
from collections import Counter

def get_languages(file_path):
    """Count the detected language of every sentence in a DOCX file.

    The paragraph texts are joined with spaces, split into sentences with
    ``sent_tokenize`` and each sentence is classified with ``fasttext_model``.

    Args:
        file_path: Path of the DOCX file; passed straight to ``docx.Document``.

    Returns:
        A ``Counter`` mapping each predicted label (with the ``__label__``
        prefix removed) to the number of sentences for which it was the top
        prediction. The counter is empty when the document yields no
        non-blank sentence.
    """
    doc = docx.Document(file_path)
    text = ' '.join(para.text for para in doc.paragraphs)

    language_counts = Counter()

    for sentence in sent_tokenize(text):
        # fastText's `predict` refuses input that contains "\n", and paragraph
        # text can carry line breaks, so collapse every whitespace run
        # (newlines, tabs, repeated spaces) into a single space first.
        single_line = ' '.join(sentence.split())
        if not single_line:
            # Nothing left to classify once whitespace is removed.
            continue
        prediction = fasttext_model.predict(single_line, k=1)
        language_counts[prediction[0][0].replace('__label__', '')] += 1

    return language_counts


**Cell 8: Translate DOCX File Content**

In [15]:
def translate_docx(file_path, source_lang='eng_Latn', target_lang='spa_Latn', output_path=None):
    """Translate the paragraphs of a DOCX file and save the result.

    Every paragraph that contains text is passed to ``translate_text`` and its
    text is replaced with the translation.

    Args:
        file_path: Path of the DOCX file to read.
        source_lang: Language code of the document text (default English).
        target_lang: Language code to translate into (default Spanish).
        output_path: Where to save the translated document. When ``None``
            (the default) the input file is overwritten in place.

    Returns:
        None. The translated document is written to disk.
    """
    doc = docx.Document(file_path)

    for para in doc.paragraphs:
        original_text = para.text
        # Leave paragraphs without text untouched: assigning ``para.text``
        # replaces the paragraph's existing content, which would discard
        # non-text content such as an inline picture, and an empty string
        # gives the model nothing meaningful to translate.
        if not original_text.strip():
            continue
        para.text = translate_text(
            original_text, source_lang=source_lang, target_lang=target_lang
        )

    doc.save(file_path if output_path is None else output_path)

def translate_text(text, source_lang='eng_Latn', target_lang='spa_Latn'):
    """Translate ``text`` with the notebook's loaded ``model`` and ``tokenizer``.

    Args:
        text: The text to translate.
        source_lang: Language code of ``text`` (default ``'eng_Latn'``).
        target_lang: Language code to translate into (default ``'spa_Latn'``).

    Returns:
        The translated text. Empty or whitespace-only input is returned
        unchanged without calling the model.
    """
    # Nothing to translate; avoid running generation on special tokens alone.
    if not text or not text.strip():
        return text

    # Tell the tokenizer which language code to attach to the source text.
    tokenizer.src_lang = source_lang
    # Keep the input tensors on the same device as the model weights.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    # Force the first generated token to be the target language code;
    # without it the output language is not controlled by `target_lang`.
    outputs = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_lang),
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]


**Cell 9: Processing Files in the Input Folder**

In [None]:
from zipfile import BadZipFile

from tqdm import tqdm

# Define the input folder and file extension
input_folder = Path("/home/drew/Documents/GitHub/MTFiles/Input")
ext_in = "docx"

# Get the list of files to process (searched recursively)
files = list(input_folder.rglob("*." + ext_in))

print(f"Found {len(files)} {ext_in} files in {input_folder.resolve()}")

# Process each file
results = {}  # not populated by this cell

for i, file in enumerate(files, 1):
    file = file.resolve()

    # Language detection is the first place the file is opened, so this is
    # where an unreadable archive has to be caught.
    try:
        languages_in_file = get_languages(file)
    except BadZipFile:
        print(f"BadZipFile Error on opening {file}")
        continue

    # A document without any detectable sentence yields an empty counter;
    # there is no "top" language to read from it.
    if not languages_in_file:
        print(f"{i:>4} : Not translating file {file}. No text found to detect a language.")
        continue

    top_language_in_file = languages_in_file.most_common(1)[0][0]
    file_is_english = top_language_in_file == "eng_Latn"

    if file_is_english:
        print(f"{i:>4} : Translating file {file} from English to Spanish.")

        # translate_docx opens the file, translates it and saves it in place
        translate_docx(file)

        print(f"{i:>4} : Translated file {file} from English to Spanish")
    else:
        print(f"{i:>4} : Not translating file {file}. It seems to be in: {top_language_in_file}.")
