We will install the necessary packages:

In [1]:
# Dataset Manipulation

%pip install pandas
%pip install pyarrow

# Grammatical Check Tool
# - Requires Java installed at the machine

%pip install language-tool-python

# Translation - MarianMT

%pip install transformers
%pip install sentencepiece
%pip install sacremoses

# Translation - LibreTranslate

%pip install libretranslatepy

# Semantic Accuracy Tool

%pip install sentence_transformers
%pip install scipy

# Widget

%pip install --upgrade jupyter ipywidgets
%jupyter nbextension enable --py widgetsnbextension


Note: you may need to restart the kernel to use updated packages.

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


UsageError: Line magic function `%jupyter` not found.


We will try to load an existing dataset, or download it if it is not yet available. The dataset is stored in the Parquet file format.

In [2]:
import pandas as pd

dataset_name = "dolly-15k"
!wget https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/refs%2Fconvert%2Fparquet/databricks--databricks-dolly-15k/json-train.parquet
!mv json-train.parquet dolly-15k.parquet

original_language_suffix = "_en"
target_language_suffix = "_pt"

file_path_original = dataset_name + '.parquet'
file_path_target = dataset_name + '_hippo' + target_language_suffix + '.parquet'

try:
  parquet_data_target = pd.read_parquet(file_path_target)
  print("Continuing existing file.")
except:
  parquet_data_target = pd.read_parquet(file_path_original)
  print("Starting new file.")

# Define the fields to be translated
keys_to_translate = ["instruction","context","response"]

# Test
print(parquet_data_target.iloc[0])

Connecting to huggingface.co (108.158.172.25:443)
Connecting to cdn-lfs.huggingface.co (18.244.202.105:443)
saving to 'json-train.parquet'
'json-train.parquet' saved


Continuing existing file.
instruction                     When did Virgin Australia start operating?
context                  Virgin Australia, the trading name of Virgin A...
response                 Virgin Australia commenced services on 31 Augu...
category                                                         closed_qa
instruction_pt_marian    Quando é que a Virgem Austrália começou a operar?
hippo_check                                                            0.0
instruction_pt_libre                                                  None
context_pt_marian                                                     None
response_pt_marian                                                    None
context_pt_libre                                                      None
response_pt_libre                                                     None
Name: 0, dtype: object


We will load and test the Grammatical Check Tool, defining the target language code.

In [3]:
# Prepare the Grammatical Check Tool

import language_tool_python

# Language code handed to LanguageTool for checking the translated text
language_tool_language_code = 'pt-PT'

# Shared checker instance used by apply_and_check_languagetool_corrections
tool = language_tool_python.LanguageTool(language_tool_language_code)

# Upper bound on the number of check-and-correct passes applied to one text
max_loops = 2

def apply_and_check_languagetool_corrections(text):
    """Apply LanguageTool's first suggestion to every reported issue and re-check.

    Up to ``max_loops`` correction passes are made. Passes stop early as soon as
    the re-check reports nothing that still carries a suggested replacement.

    Args:
        text: The text to check and correct.

    Returns:
        A tuple ``(corrected_text, all_issues_resolved)`` where
        ``all_issues_resolved`` is True only if the last check of the corrected
        text reported no issues at all.
    """
    all_issues_resolved = False

    # Get the matches (errors) in the text
    matches = tool.check(text)

    for _ in range(max_loops):
        # Apply the corrections
        for match in reversed(matches):  # Reversed so earlier offsets stay valid while later spans are replaced
            if match.replacements:  # If there are suggestions
                start = match.offset
                end = match.offset + match.errorLength
                text = text[:start] + match.replacements[0] + text[end:]

        # Check if there are remaining issues after correction
        remaining_matches = tool.check(text)
        all_issues_resolved = len(remaining_matches) == 0

        # Decide on the issues that are still present (not on the ones just fixed):
        # another pass is only useful if something left still has a suggestion.
        if not any(match.replacements for match in remaining_matches):
            break

        # Reuse the re-check result as the input of the next pass
        matches = remaining_matches

    return text, all_issues_resolved

# Preliminary test: one sentence that can be fully corrected and one that cannot

for text in ("Esse erro nao e conplexo.", "Esse error yes eh conlexo."):
    corrected_text, resolved = apply_and_check_languagetool_corrections(text)
    # The stray ")" that used to trail the message has been removed
    print(f'Text "{corrected_text}". Grammatical errors: {str(not resolved)}')

Text "Esse erro não é complexo.". Grammatical errors: False)
Text "Esse erro ienes eh! Conexo.". Grammatical errors: True)


To use the translation tools we need to define a function to adjust the translation, avoiding losing the meaning of the instruction:

In [4]:
def adjust_translation(translated_text):
  """Rewrite mentions of "Inglês"/"inglês" in a translation as "Português"/"português".

  Args:
    translated_text: The translated string to adjust.

  Returns:
    The string with both capitalisations replaced.
  """
  # Capitalised form first, then lower-case form
  for source_word, target_word in (("Inglês", "Português"), ("inglês", "português")):
    translated_text = translated_text.replace(source_word, target_word)
  return translated_text

Now we prepare and test the first automatic translation tool (MarianMT):

In [5]:
import torch
from transformers import MarianMTModel, MarianTokenizer

# Suffix appended to the names of the columns that hold MarianMT translations
marian_suffix = "_marian"

max_token_length = 475 # A little less than the MarianMT 512 tokens hardcoded limit

model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
marian_tokenizer = MarianTokenizer.from_pretrained(model_name)
# Load the translation model and move it to the GPU ('cuda')
marian_model = MarianMTModel.from_pretrained(model_name).to('cuda')

# Show the target-language codes supported by the tokenizer
print(marian_tokenizer.supported_language_codes)
# Language token prepended to every source text before tokenization
target_language_prefix = ">>pt<< "

def marian_translate(src_text):
  """Translate ``src_text`` with the MarianMT model.

  Args:
    src_text: Source text; ``None`` or an empty string is accepted.

  Returns:
    ``""`` for empty input, the sentinel ``"LEN_ERROR"`` when the tokenized
    input is longer than ``max_token_length``, otherwise the translated text
    after ``adjust_translation``.

  Raises:
    Exception: If tokenization or generation fails; the original error is
      chained as ``__cause__`` so the root cause stays visible.
  """
  if not src_text:
    return ""

  try:
    tokenized_input = marian_tokenizer(target_language_prefix + src_text, return_tensors="pt", padding=True)

    # Reject over-long inputs before any tensor is moved to the model's device
    if tokenized_input['input_ids'].shape[1] > max_token_length:
      return "LEN_ERROR"

    # Follow whatever device the model lives on instead of hard-coding it
    tokenized_input = tokenized_input.to(marian_model.device)
    translated = marian_model.generate(**tokenized_input, max_new_tokens=512)
  except Exception as exc:
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through
    raise Exception("This is a fatal error! (CUDA)") from exc

  # A single text was submitted, so only the first generated sequence is decoded
  return adjust_translation(marian_tokenizer.decode(translated[0], skip_special_tokens=True))

# Preliminary Test: translate one English sentence with MarianMT and print the result

src_text = "This is a second test, first translation tool working."
print(marian_translate(src_text))

['>>fr<<', '>>es<<', '>>it<<', '>>pt<<', '>>pt_br<<', '>>ro<<', '>>ca<<', '>>gl<<', '>>pt_BR<<', '>>la<<', '>>wa<<', '>>fur<<', '>>oc<<', '>>fr_CA<<', '>>sc<<', '>>es_ES<<', '>>es_MX<<', '>>es_AR<<', '>>es_PR<<', '>>es_UY<<', '>>es_CL<<', '>>es_CO<<', '>>es_CR<<', '>>es_GT<<', '>>es_HN<<', '>>es_NI<<', '>>es_PA<<', '>>es_PE<<', '>>es_VE<<', '>>es_DO<<', '>>es_EC<<', '>>es_SV<<', '>>an<<', '>>pt_PT<<', '>>frp<<', '>>lad<<', '>>vec<<', '>>fr_FR<<', '>>co<<', '>>it_IT<<', '>>lld<<', '>>lij<<', '>>lmo<<', '>>nap<<', '>>rm<<', '>>scn<<', '>>mwl<<']
Este é um segundo teste, primeira ferramenta de tradução de trabalho.


Now we prepare and test the second automatic translation tool (LibreTranslate). There must be a published URL to be used to query the API:

In [6]:
from libretranslatepy import LibreTranslateAPI

# Suffix appended to the names of the columns that hold LibreTranslate translations
libre_suffix = "_libre"

# API client pointing at a LibreTranslate server on the local machine
lt = LibreTranslateAPI("http://127.0.0.1:5000")
# Source and target language codes passed to every translate call
lt_source = 'en'
lt_target = 'pt'

def libre_translate(src_text):
  """Translate ``src_text`` from ``lt_source`` to ``lt_target`` via LibreTranslate.

  Missing or empty input is returned as an empty string without calling the API.
  """
  if src_text:
    return lt.translate(src_text, lt_source, lt_target)
  return ""

# Preliminary Test: translate one English sentence with LibreTranslate and print the result

src_text = "This is a third test, second translation tool working."
print(libre_translate(src_text))

Este é um terceiro teste, segunda ferramenta de tradução trabalhando.


Now we prepare the tool that checks the semantic proximity, and test it.

In [7]:
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

# Minimum similarity score (1 - cosine distance) for two sentences to pass the check
threshold = 0.80

# Sentence-embedding model used by check_semantic_proximity
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

def check_semantic_proximity(sentence1, sentence2):
    """Tell whether two sentences are semantically close.

    Both sentences are embedded with ``semantic_model``; the similarity score is
    one minus the cosine distance between the two embeddings.

    Returns:
        A truthy value when the score is at least ``threshold``.
    """
    first_vec, second_vec = (
        semantic_model.encode(sentence, convert_to_tensor=True).cpu()
        for sentence in (sentence1, sentence2)
    )
    similarity = 1 - cosine(first_vec, second_vec)
    return similarity >= threshold

# Preliminary test: two sentences worded differently, compared for semantic proximity

sentence1 = "O gato está na caixa."
sentence2 = "A caixa contém o gato."
print("Sentences have similar semantic meaning: " + str(check_semantic_proximity(sentence1, sentence2)))

Sentences have similar semantic meaning: True


With everything prepared, we enter the main loop. For each item of the dataset,
it checks whether the HIPPO validation was already done. If not, it will:

- Translate each field with MarianMT, checking if the grammar is right;
- Translate each field with LibreTranslate, checking if the grammar is right;
- Compare the embeddings of each field from one translation with the other, checking if the meaning was not lost;

If any of these checks fails, the item is marked as a failure and processing moves on to the next item. If it passes all tests, the item is marked as a success.

In [8]:
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

# Name of the column that stores the validation result of each item (1 = accepted, 0 = rejected)
hippo_check_field = 'hippo_check'

# Shared counter and lock for synchronization
lock = Lock()

# Intended number of processed items between partial checkpoints
save_interval = 100
# Number of worker threads used by the thread pool
num_threads = 18

def worker(args):
    """Validate one dataset item inside a worker thread.

    Args:
        args: Tuple ``(idx, item, keys_to_translate, marian_translate,
            libre_translate, grammatical_check, check_semantic_proximity)``.

    Returns:
        ``(None, None)`` when the item already carries a validation result,
        otherwise ``(idx, item_copy)`` where ``item_copy`` holds the
        translations and the ``hippo_check_field`` flag (0 rejected, 1 accepted).
    """
    (idx, item, fields, marian_fn, libre_fn, grammar_fn, proximity_fn) = args

    # Items validated in a previous run are skipped
    if check_item_already_processed(item, hippo_check_field):
        return None, None

    # All writes go to a copy of the item
    item_copy = item.copy()
    rejected = process_item_for_translation(item_copy, fields, marian_fn, libre_fn, grammar_fn, proximity_fn)

    # Record the verdict on the copy
    item_copy[hippo_check_field] = 0 if rejected else 1
    return idx, item_copy

def check_item_already_processed(item, hippo_check_field):
    """Report whether the item already holds a verdict (0 = rejected, 1 = validated)."""
    if hippo_check_field not in item.keys():
        return False
    verdict = item[hippo_check_field]
    return verdict == 0 or verdict == 1

def process_item_for_translation(item, keys_to_translate, marian_translate, libre_translate, grammatical_check, check_semantic_proximity):
    """Translate and validate every field of one dataset item, in place.

    For each key the text is translated with both tools, each translation is
    grammar-checked, and the two translations are compared for semantic
    proximity. Processing stops at the first failed check.

    Args:
        item: Mutable mapping (e.g. a row copy) holding the original fields;
            translated fields are written back into it.
        keys_to_translate: Names of the fields to translate.
        marian_translate: Callable ``text -> translation`` (may return "LEN_ERROR").
        libre_translate: Callable ``text -> translation``.
        grammatical_check: Callable ``text -> (corrected_text, resolved)``.
        check_semantic_proximity: Callable ``(text_a, text_b) -> bool``.

    Returns:
        True if the item was rejected, False if every field passed all checks.
    """
    for key in keys_to_translate:
        marian_target_key = key + target_language_suffix + marian_suffix
        libre_target_key = key + target_language_suffix + libre_suffix

        original_text = item[key]

        # Translate with MarianMT; the raw result is stored even if the item ends up rejected
        translated_text_marian = marian_translate(original_text)
        item[marian_target_key] = translated_text_marian

        # "LEN_ERROR" is a sentinel for over-long input, not a translation:
        # reject right away instead of grammar-checking and comparing it
        if translated_text_marian == "LEN_ERROR":
            return True

        # Check grammar (using the checker that was passed in, not a global)
        translated_text_marian, resolved = grammatical_check(translated_text_marian)
        if not resolved:
            return True
        item[marian_target_key] = translated_text_marian

        # Translate with LibreTranslate
        translated_text_libre = libre_translate(original_text)
        # Check grammar
        translated_text_libre, resolved = grammatical_check(translated_text_libre)
        if not resolved:
            return True
        item[libre_target_key] = translated_text_libre

        # Check semantic proximity between the two corrected translations
        if not check_semantic_proximity(translated_text_marian, translated_text_libre):
            return True

    return False

def process_dataset_direct_update(dataset, keys_to_translate, marian_translate, libre_translate, grammatical_check, check_semantic_proximity, hippo_check_field, save_interval=10):
    """Validate every dataset row on a thread pool and checkpoint the results.

    Rows are processed by ``worker`` on ``num_threads`` threads. Results are
    merged into a copy of ``dataset``, which is written to ``file_path_target``
    every ``save_interval`` processed rows and once more at the end. The final
    write also happens when processing is aborted by an error, so completed
    work is not lost.

    Args:
        dataset: DataFrame with the rows to process; it is not modified.
        keys_to_translate: Names of the fields to translate.
        marian_translate: Callable ``text -> translation``.
        libre_translate: Callable ``text -> translation``.
        grammatical_check: Callable ``text -> (corrected_text, resolved)``.
        check_semantic_proximity: Callable ``(text_a, text_b) -> bool``.
        hippo_check_field: Name of the column that stores the verdict.
        save_interval: Number of processed rows between partial checkpoints.

    Raises:
        Exception: Whatever a worker raised; pending rows are cancelled first.
    """
    # Create a copy of the dataset for updates
    dataset_copy = dataset.copy()
    processed_counter = 0

    # Columns a worker may have filled in; loop-invariant, so built once
    modified_keys = [key + target_language_suffix + marian_suffix for key in keys_to_translate]
    modified_keys += [key + target_language_suffix + libre_suffix for key in keys_to_translate]
    modified_keys += [hippo_check_field]

    args = [(idx, item, keys_to_translate, marian_translate, libre_translate, grammatical_check, check_semantic_proximity) for idx, item in dataset.iterrows()]

    progress_bar = tqdm(total=len(args), desc="Processing", dynamic_ncols=True)
    try:
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(worker, arg) for arg in args]
            try:
                # Results are consumed on this thread only, so the counter and
                # dataset_copy need no lock
                for future in as_completed(futures):
                    idx, item_copy = future.result()

                    processed_counter += 1
                    progress_bar.update(1)

                    # The row was already validated in a previous run
                    if idx is None:
                        continue

                    # Reflect changes in the dataset_copy for specific columns
                    for key in modified_keys:
                        if key in item_copy.keys() and not pd.isna(item_copy[key]):
                            if key not in dataset_copy.columns:
                                dataset_copy[key] = pd.Series(dtype='object')  # Explicitly creating the column
                            dataset_copy.at[idx, key] = item_copy[key]

                    progress_bar.set_postfix(current_idx=idx, total_processed=processed_counter, refresh=True)

                    # Partial checkpoint
                    if processed_counter % save_interval == 0:
                        dataset_copy.to_parquet(file_path_target)
            except BaseException:
                # Drop the rows that have not started yet so the executor does not
                # work through the whole queue before the error surfaces
                for pending in futures:
                    pending.cancel()
                raise
    finally:
        # Runs on success and on failure: close the bar and persist what was done
        progress_bar.close()
        dataset_copy.to_parquet(file_path_target)

# Process the dataset, checkpointing every `save_interval` processed items
# (the configured value is passed explicitly; otherwise the function default of 10 is used)
process_dataset_direct_update(parquet_data_target, keys_to_translate, marian_translate, libre_translate, apply_and_check_languagetool_corrections, check_semantic_proximity, hippo_check_field, save_interval=save_interval)

Processing:   0%|          | 0/15011 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (702 > 512). Running this sequence through the model will result in indexing errors
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (512). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
