In [1]:
!pip install evaluate
!pip install datasets

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [3]:
import pandas as pd
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from evaluate import load
from seq2seq import create_transformers_train_data, decode_with_transformer

In [4]:
data = pd.read_csv('en_es_corpus.txt',
                   sep='\t', header=None)
data

Unnamed: 0,0,1,2
0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
...,...,...,...
100332,Are you going to sell him your house?,¿Le vas a vender tu casa a él?,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
100333,Are you going to stay in bed all day?,¿Te vas a quedar en la cama todo el día?,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
100334,Are you going to the concert tonight?,¿Va usted al concierto esta noche?,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
100335,Are you going to the theater tonight?,¿Vas a ir al teatro esta noche?,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [5]:
data = data[[0, 1]]
data.columns = ['EN', 'ES']
data

Unnamed: 0,EN,ES
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
100332,Are you going to sell him your house?,¿Le vas a vender tu casa a él?
100333,Are you going to stay in bed all day?,¿Te vas a quedar en la cama todo el día?
100334,Are you going to the concert tonight?,¿Va usted al concierto esta noche?
100335,Are you going to the theater tonight?,¿Vas a ir al teatro esta noche?


In [6]:
data = data[-500:]
data

Unnamed: 0,EN,ES
99837,We didn't want to begin without you.,No queríamos empezar sin vos.
99838,"We didn't want to go, but we had to.","No queríamos ir, pero tuvimos que hacerlo."
99839,We discovered that it was all a lie.,Descubrimos que todo era mentira.
99840,We discussed a wide range of topics.,Discutimos una amplia gama de temas.
99841,We discussed the new plan yesterday.,Ayer discutimos el nuevo plan.
...,...,...
100332,Are you going to sell him your house?,¿Le vas a vender tu casa a él?
100333,Are you going to stay in bed all day?,¿Te vas a quedar en la cama todo el día?
100334,Are you going to the concert tonight?,¿Va usted al concierto esta noche?
100335,Are you going to the theater tonight?,¿Vas a ir al teatro esta noche?


In [7]:
sentences_en = data['EN'].values.tolist()
sentences_en

["We didn't want to begin without you.",
 "We didn't want to go, but we had to.",
 'We discovered that it was all a lie.',
 'We discussed a wide range of topics.',
 'We discussed the new plan yesterday.',
 "We don't have anything else to lose.",
 "We don't know what we're up against.",
 "We don't need to ask for permission.",
 "We don't talk to each other anymore.",
 "We don't use force unless necessary.",
 "We don't want to scare the children.",
 "We don't want you to feel pressured.",
 'We drank cappuccinos and reminisced.',
 'We enjoyed ourselves at the seaside.',
 'We enjoyed playing chess last night.',
 'We followed the course of the river.',
 'We generally drink tea after a meal.',
 'We got rid of the mice in our house.',
 'We had hoped you could do it for us.',
 'We had our photo taken on the beach.',
 'We have a boxing club in our school.',
 'We have a break from 10:40 to 11:00.',
 'We have a lot of snow in the winter.',
 'We have a meeting with the director.',
 'We have been w

In [17]:
instruction_en = 'Translate from English to Spanish: '
instruction_es = 'Translate from Spanish to English: '
instruction_classification = 'Classify the following text into positive or negative: '

In [15]:
sentences_en_in = [f'{instruction_en}{s}' for s in sentences_en]
sentences_en_in

["Translate from English to Spanish: Translate from English to Spanish: We didn't want to begin without you.",
 "Translate from English to Spanish: Translate from English to Spanish: We didn't want to go, but we had to.",
 'Translate from English to Spanish: Translate from English to Spanish: We discovered that it was all a lie.',
 'Translate from English to Spanish: Translate from English to Spanish: We discussed a wide range of topics.',
 'Translate from English to Spanish: Translate from English to Spanish: We discussed the new plan yesterday.',
 "Translate from English to Spanish: Translate from English to Spanish: We don't have anything else to lose.",
 "Translate from English to Spanish: Translate from English to Spanish: We don't know what we're up against.",
 "Translate from English to Spanish: Translate from English to Spanish: We don't need to ask for permission.",
 "Translate from English to Spanish: Translate from English to Spanish: We don't talk to each other anymore.",
 

In [10]:
sentences_es = data['ES'].values.tolist()
sentences_es

['No queríamos empezar sin vos.',
 'No queríamos ir, pero tuvimos que hacerlo.',
 'Descubrimos que todo era mentira.',
 'Discutimos una amplia gama de temas.',
 'Ayer discutimos el nuevo plan.',
 'No tenemos nada más que perder.',
 'No sabemos a lo que nos enfrentamos.',
 'No tenemos que pedir permiso.',
 'No nos hablamos más.',
 'No utilizamos la fuerza a menos que sea necesario.',
 'No queremos asustar a los niños.',
 'No queremos que te sientas presionado.',
 'Bebimos capuccinos y rememoramos.',
 'Nos lo pasamos muy bien en la playa.',
 'Anoche lo pasamos bien jugando al ajedrez.',
 'Seguimos el curso del río.',
 'Generalmente tomamos té luego de una comida.',
 'Nos deshicimos de los ratones en nuestra casa.',
 'Esperábamos que pudieras hacerlo por nosotros.',
 'Nos tomamos la foto en la playa.',
 'Tenemos un club de boxeo en la escuela.',
 'El descanso es desde las 10:40h hasta las 11h.',
 'Tenemos mucha nieve en invierno.',
 'Tenemos una cita con el director.',
 'Llevamos horas es

In [18]:
sentences_es_in = [f'{instruction_es}{s}' for s in sentences_es]
sentences_es_in

['Translate from Spanish to English: No queríamos empezar sin vos.',
 'Translate from Spanish to English: No queríamos ir, pero tuvimos que hacerlo.',
 'Translate from Spanish to English: Descubrimos que todo era mentira.',
 'Translate from Spanish to English: Discutimos una amplia gama de temas.',
 'Translate from Spanish to English: Ayer discutimos el nuevo plan.',
 'Translate from Spanish to English: No tenemos nada más que perder.',
 'Translate from Spanish to English: No sabemos a lo que nos enfrentamos.',
 'Translate from Spanish to English: No tenemos que pedir permiso.',
 'Translate from Spanish to English: No nos hablamos más.',
 'Translate from Spanish to English: No utilizamos la fuerza a menos que sea necesario.',
 'Translate from Spanish to English: No queremos asustar a los niños.',
 'Translate from Spanish to English: No queremos que te sientas presionado.',
 'Translate from Spanish to English: Bebimos capuccinos y rememoramos.',
 'Translate from Spanish to English: Nos 

In [20]:
input_sentences = sentences_en_in + sentences_es_in

In [21]:
output_sentences = sentences_es + sentences_en

In [22]:
len(input_sentences)

1000

In [23]:
len(output_sentences)

1000

In [24]:
train_en, test_en, train_es, test_es = train_test_split(input_sentences, output_sentences,
                 test_size=0.2, random_state=0)

In [25]:
model_name = 't5-small'

In [26]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [34]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [28]:
len(train_en)

800

In [29]:
len(train_es)

800

In [35]:
train_set = create_transformers_train_data(train_en, train_es, tokenizer)

Input: This is input sentence
Output: This is output sentence

Input: This is input sentence
Input previous words: #start#
Output next word: This

Input: This is input sentence
Input previous words: #start# This
Output next word: is

Input: This is input sentence
Input previous words: #start# This is
Output next word: output

Input: This is input sentence
Input previous words: #start# This is output
Output next word: sentence

Input: This is input sentence
Input previous words: #start# This is output sentence
Output next word: #end#

In [36]:
train_set

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 800
})

In [37]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                       model=model_name,
                                       return_tensors='tf')

train_set = model.prepare_tf_dataset(train_set,
                                     collate_fn=data_collator)

In [38]:
train_set

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(8, 10), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(8, 10), dtype=tf.int64, name=None)}, TensorSpec(shape=(8, 10), dtype=tf.int64, name=None))>

In [39]:
model.compile('adam')

In [40]:
model.fit(train_set, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7ddc0defecb0>

In [41]:
input_example = test_en[0]
gt_example = test_es[0]

In [42]:
predicted_example = decode_with_transformer(test_en[0], tokenizer, model)

In [43]:
print(f'Input: {input_example}\nGT: {gt_example}\nPred: {predicted_example}')

Input: Translate from Spanish to English: ¿Vas a ir a la fiesta de despedida de Tom?
GT: Translate from English to Spanish: Are you going to Tom's goodbye party?
Pred: Translate from English to Spanish: We'


In [44]:
inputs = test_en
gts = test_es
preds = []
for inp in inputs:
  pred = decode_with_transformer(inp, tokenizer, model)
  preds.append(pred)

In [45]:
for inp, gt, pred in zip(inputs, gts, preds):
  print(f'Input: {inp}\nGT: {gt}\nPred: {pred}\n')

Input: Translate from Spanish to English: ¿Vas a ir a la fiesta de despedida de Tom?
GT: Translate from English to Spanish: Are you going to Tom's goodbye party?
Pred: Translate from English to Spanish: We'

Input: Translate from Spanish to English: Deberías haber sido más cuidadoso.
GT: Translate from English to Spanish: You ought to have been more careful.
Pred: Translate from English to Spanish: You should

Input: Translate from English to Spanish: Translate from English to Spanish: You can use my computer if you want.
GT: Puedes usar mi ordenador si quieres.
Pred: Por qué te

Input: Translate from Spanish to English: Nos preparamos para la colisión.
GT: Translate from English to Spanish: We prepared ourselves for the crash.
Pred: Translate from English to Spanish: We'

Input: Translate from Spanish to English: ¿Cuándo empezasteis a estudiar inglés?
GT: Translate from English to Spanish: When did you begin studying English?
Pred: Translate from English to Spanish: We'

Input: Transl

In [46]:
metric = load('bleu')

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [47]:
metric.compute(predictions=preds, references=gts)

{'bleu': 0.22233338133854938,
 'precisions': [0.6228571428571429,
  0.6258823529411764,
  0.6615384615384615,
  0.7288888888888889],
 'brevity_penalty': 0.33766051365513194,
 'length_ratio': 0.4794520547945205,
 'translation_length': 1050,
 'reference_length': 2190}