In [None]:
# Install the third-party packages used by this notebook: the translation
# runtime (ctranslate2), the tokenizer (pyonmttok), the model downloader
# (huggingface_hub), and the word-alignment stack (transformers, simalign, nltk).
# NOTE(review): versions are not pinned, so a fresh run may pick up newer releases.
!pip install ctranslate2
!pip install pyonmttok
!pip install huggingface_hub
!pip install transformers
!pip install simalign
!pip install nltk



In [None]:
# Imports and environment setup: mount Google Drive, move into the project
# folder, and load the libraries used by the rest of the notebook.
from google.colab import drive
import os
import json
from tqdm import tqdm

from pprint import pprint
import ctranslate2
import pyonmttok
from huggingface_hub import snapshot_download

# Mount Drive and list the project folder.
# NOTE(review): `files` is not used by any cell visible in this notebook.
drive.mount('/content/drive')
root_path = '/content/drive/My Drive/Translator/'
files = os.listdir(root_path)

# Change the working directory so that relative paths (e.g. ./english_CF.json)
# resolve inside the project folder.
%cd "/content/drive/My Drive/Translator/"

# Imported only after the %cd above; presumably `texttokenizer` is a local
# module stored in the Drive project folder -- TODO confirm.
from texttokenizer import TextTokenizer
import pandas as pd
import unicodedata

from nltk import word_tokenize
from simalign import SentenceAligner
import nltk
import warnings
# Silence DeprecationWarning messages to keep the cell outputs readable.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Download the 'punkt' resource from NLTK (used with word_tokenize below).
nltk.download('punkt')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Translator


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# We define some custom functions to help us translate

def _normalize_input_string(result):
    result = unicodedata.normalize('NFC', result)
    return result

def _translate_batch(input_batch, spm, model, max_sentence_batch=10):

    batch_input_tokenized = []
    batch_input_markers = []

    num_sentences = len(input_batch)
    for pos in range(0, num_sentences):
        tokenized = spm.tokenize(input_batch[pos])[0]
        batch_input_tokenized.append(tokenized)

    batch_output = []
    for offset in range(0,len(batch_input_tokenized), max_sentence_batch):
      partial_result = model.translate_batch(batch_input_tokenized[offset:offset+max_sentence_batch], return_scores=False, replace_unknowns=True)
      for pos in range(0,len(partial_result)):
        tokenized = partial_result[pos][0]['tokens']
        translated = spm.detokenize(tokenized)
        batch_output.append(translated)

    return batch_output

def find_translated_object(text_english, text_catalan, object_english, aligner):
    """Locate the Catalan counterpart of an English phrase through word alignment.

    Both sentences are tokenized with `word_tokenize`, aligned with
    `aligner.get_word_aligns(...)["mwmf"]`, and the Catalan tokens aligned to
    the tokens of `object_english` are joined into the translated phrase.

    Args:
        text_english: the full English sentence.
        text_catalan: the full Catalan translation of `text_english`.
        object_english: the English phrase (subject or target) to look for;
            each of its tokens must appear in `text_english`.
        aligner: object exposing `get_word_aligns(src_tokens, tgt_tokens)` and
            returning a mapping with an "mwmf" list of (src_idx, tgt_idx) pairs.

    Returns:
        A `(object_catalan, index_catalan)` tuple: the Catalan phrase and its
        character offset inside `text_catalan`.

    Raises:
        ValueError: if a token of `object_english` is missing from the English
            sentence, or if no Catalan token is aligned with the phrase.
    """
    tokenized_english = word_tokenize(text_english)
    tokenized_catalan = word_tokenize(text_catalan)
    alignment = aligner.get_word_aligns(tokenized_english, tokenized_catalan)["mwmf"]
    object_tokenized = word_tokenize(object_english)

    # Glue a trailing "." back onto the previous token (e.g. "Inc" + ".").
    # Presumably the full sentence keeps such a period attached -- TODO confirm.
    # The length guard avoids an IndexError when the phrase is only ".".
    if len(object_tokenized) > 1 and object_tokenized[-1] == ".":
        period = object_tokenized.pop()
        object_tokenized[-1] += period

    # English token positions of the phrase (first occurrence of each token).
    indices_object = {tokenized_english.index(word) for word in object_tokenized}

    # Catalan token positions aligned with any token of the phrase.
    index_catalan_tok = sorted(el[1] for el in alignment if el[0] in indices_object)
    if not index_catalan_tok:
        raise ValueError(
            f"No Catalan token is aligned with {object_english!r} in {text_catalan!r}"
        )

    ini = index_catalan_tok[0]
    fin = index_catalan_tok[-1]
    # Take the whole contiguous span so words between the aligned tokens
    # (articles, prepositions, ...) are kept.
    object_catalan = " ".join(tokenized_catalan[ini:fin + 1])

    try:
        index_catalan = text_catalan.rindex(object_catalan)

    except ValueError:
        # Joining tokens with spaces did not reproduce the original text
        # (e.g. punctuation spacing); fall back to the character span between
        # the first and the last aligned token.
        print("-> Something happened")
        print(text_catalan)
        print(index_catalan_tok)
        index_catalan = text_catalan.index(tokenized_catalan[ini])
        fin_catalan = text_catalan.index(tokenized_catalan[fin]) + len(tokenized_catalan[fin])
        object_catalan = text_catalan[index_catalan:fin_catalan]
        print("Final Out:", object_catalan, "\nIndex:", index_catalan)

    return object_catalan, index_catalan

In [None]:
# Load the CounterFact examples to translate from the working directory
# (some elements have been removed from the dataset).
file = "./english_CF.json"

# JSON text is UTF-8; state the encoding explicitly instead of relying on the
# platform default.
with open(file, "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Source language handed to the sentence tokenizer below.
language='English'

# Translation model: download the "projecte-aina/mt-aina-en-ca" snapshot from
# the Hugging Face Hub, then build the SentencePiece-based tokenizer, the
# CTranslate2 translator (device="cuda", so a GPU runtime is required) and the
# sentence tokenizer used to split texts before translation.
model_dir = snapshot_download(repo_id="projecte-aina/mt-aina-en-ca", revision="main")
spm = pyonmttok.Tokenizer(mode="none",sp_model_path=model_dir + '/spm.model')
translator = ctranslate2.Translator(model_dir, device="cuda")
tokenizer = TextTokenizer(language)

# Word aligner used to map English tokens to Catalan tokens. The rest of the
# notebook reads the "mwmf" entry of its output.
# NOTE(review): "mai" presumably enables the mwmf/inter/itermax methods -- confirm in the simalign docs.
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

2023-11-21 10:48:49,811 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased
INFO:simalign.simalign:Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


In [None]:
# Prompt lists of every example that are translated in addition to the main
# "requested_rewrite" prompt.
fields = ["paraphrase_prompts",
          "neighborhood_prompts",
          "attribute_prompts",
          "generation_prompts"]

max_sentence_batch = 10

errors = []     # one error report (dict) per successfully processed example
success = 0     # number of examples processed without raising
failure = 0     # number of examples skipped because of an exception
new_data = []   # translated examples


def _translate_text(text):
    """Translate one English text into Catalan using the notebook's global objects.

    The text is split by `tokenizer`, the pieces flagged for translation are
    sent to `_translate_batch`, and the result is reassembled with
    `tokenizer.sentence_from_tokens`.
    """
    sentences, translate = tokenizer.tokenize(text)
    # Positions of the pieces that must be translated (flag is not False).
    indexes = [i for i in range(len(sentences)) if translate[i] is not False]
    sentences_batch = [sentences[i] for i in indexes]
    results = ["" for _ in sentences]

    translated_batch = _translate_batch(sentences_batch, spm, translator, max_sentence_batch)
    for pos, translated_sentence in enumerate(translated_batch):
        results[indexes[pos]] = translated_sentence

    return tokenizer.sentence_from_tokens(sentences, translate, results)


num_to_translate = 3
for ex in tqdm(data[0:num_to_translate]):

    try:
        rewrite = ex["requested_rewrite"]
        relation_id = rewrite["relation_id"]

        dict_ex = {"case_id": ex["case_id"],
                   "pararel_idx": ex["pararel_idx"],
                   "relation_id": relation_id}

        errors_dict = {"case_id": ex["case_id"],
                       "pararel_idx": ex["pararel_idx"],
                       "relation_id": relation_id}

        # One variant of the example is produced per target: the target word is
        # appended to every prompt before translating, to avoid problems with
        # masculine and feminine words in the translated prompt.
        for target in ["target_new", "target_true"]:
            errors_dict[target] = False
            dict_ex[f"eval_{target}"] = {}
            target_str = rewrite[target]["str"]

            text = rewrite["prompt"].format(rewrite["subject"])
            text = _normalize_input_string(text)
            text += " " + target_str

            translated = _translate_text(text)

            # Find the subject and the target in the translation so they can be
            # cut out of the text.
            subject_catalan, index_subj_catalan = find_translated_object(text, translated, rewrite["subject"], myaligner)
            target_catalan, index_tar_catalan = find_translated_object(text, translated, target_str, myaligner)

            # Drop the target and the character before it (the separating
            # space). max(..., 0) prevents a negative slice bound, which would
            # wrap around to the end of the string when the target is at offset 0.
            translated_without_target = translated[:max(index_tar_catalan - 1, 0)]
            prompt = (translated_without_target[:index_subj_catalan]
                      + "{}"
                      + translated_without_target[index_subj_catalan + len(subject_catalan):])

            dict_ex[f"eval_{target}"]["requested_rewrite"] = {"prompt": prompt,
                                                              target: {"str": target_catalan, "id": rewrite[target]["id"]},
                                                              "subject": subject_catalan}
            if index_subj_catalan > index_tar_catalan:
                # The target was located before the subject: the prompt built
                # above is not trustworthy, so report it. Entries are appended
                # so that both targets can be reported for the same example.
                errors_dict["Horrible error"] = True
                errors_dict.setdefault("exact_horrible_errors", []).append(
                    {"all_translated": translated, "english_text": text, "target": target_catalan})
                print("---------------------")
                print("Another error")
                print("English: ", text)
                print("All translated: ", translated)
                print("Target catalan: ", target_catalan)
                print("Index detected in catalan: ", index_tar_catalan)
                print("Translation without subject", translated_without_target)
                print("Possible misalignment between object at the end of the sentence")
                print("---------------------")

            for field in fields:

                dict_ex[f"eval_{target}"][field] = []

                for field_text in ex[field]:

                    # Append the target to the prompt before translating.
                    field_text = _normalize_input_string(field_text)
                    field_text += " " + target_str

                    # Position of the last English token equal to the target.
                    # NOTE: this requires the target to be a single token; a
                    # multi-word target raises ValueError and the example is
                    # reported as failed by the outer handler.
                    tokenized_english = word_tokenize(field_text)
                    index_english = len(tokenized_english) - 1 - tokenized_english[::-1].index(target_str)

                    field_translated = _translate_text(field_text)
                    tokenized_catalan = word_tokenize(field_translated)

                    # Look up which Catalan token is aligned with the target.
                    alignment = myaligner.get_word_aligns(tokenized_english, tokenized_catalan)["mwmf"]
                    aligned = [el[1] for el in alignment if el[0] == index_english]

                    alignment_failed = not aligned
                    if alignment_failed:
                        # No alignment for the target: assume it is the last token.
                        print("Failure with the alignment")
                        index_catalan = len(tokenized_catalan) - 1
                    else:
                        index_catalan = aligned[0]

                    translated_object = tokenized_catalan[index_catalan]

                    # Character offset of the translated target inside the sentence.
                    index_object = field_translated.rindex(translated_object)

                    # Report sentences whose alignment failed or whose target
                    # is not the last token. The list is shared by both targets
                    # and only ever appended to, so no entry is lost.
                    if alignment_failed or index_catalan + 1 != len(tokenized_catalan):
                        errors_dict[target] = True
                        errors_dict.setdefault("exact_errors", []).append(
                            {"all_translated": field_translated, "english_text": field_text, "target": translated_object})

                    # Remove the translated target, everything after it and the
                    # preceding space (guarding against a negative slice bound).
                    field_translated = field_translated[:max(index_object - 1, 0)]

                    dict_ex[f"eval_{target}"][field].append(field_translated)

        new_data.append(dict_ex)
        errors.append(errors_dict)
        success += 1
    except Exception as exc:
        # Best effort: skip the example, but say why it failed.
        failure += 1
        print(f"Error in example {ex['case_id']}: {exc!r}")



100%|██████████| 3/3 [00:25<00:00,  8.53s/it]


In [None]:
# Display the translated examples collected by the loop above.
new_data

[{'case_id': 2,
  'pararel_idx': 13704,
  'relation_id': 'P1303',
  'eval_target_new': {'requested_rewrite': {'prompt': '{}, el',
    'target_new': {'str': 'piano', 'id': 'Q5994'},
    'subject': 'Toko Yasuda'},
   'paraphrase_prompts': ['Inicialment i són zero i és fals. Toko Yasuda, tocant al',
    'La densitat de població era . Toko Yasuda toca el'],
   'neighborhood_prompts': ['Paul McCartney toca el',
    'John Lennon, tocant el',
    'Elvis Presley, el',
    'Douglas Adams, tocant el',
    'John Lennon toca el',
    'Jimi Hendrix, tocant el',
    'Ringo Starr, tocant el',
    'Leonard Cohen toca el',
    'Bruce Springsteen, tocant el',
    'John Lennon toca el'],
   'attribute_prompts': ['Robert Radecke toca el',
    'Peter Igelhoff, el',
    'Grete von Zieritz, tocant al',
    'Anton Rubinstein, tocant el',
    'Hauschka, tocant el',
    'Joseph Fischhof, el',
    'Paul Badura-Skoda toca el',
    'Richard Fall actua al',
    'Justus Frantz, el',
    'Laci Boldemann, tocant al'],

In [None]:
# Save the translated examples and the error report as indented JSON files.
for out_path, payload in (("./new_data_example.json", new_data),
                          ("./error_example.json", errors)):
    with open(out_path, "w") as f:
        f.write(json.dumps(payload, indent=4))