In [1]:
!pip install evaluate
!pip install datasets

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [3]:
import pandas as pd
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from evaluate import load
from seq2seq import create_transformers_train_data, decode_with_transformer

In [16]:
data = pd.read_csv('en_es_corpus.txt',
                   sep='\t', header=None)
data

Unnamed: 0,0,1,2
0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
...,...,...,...
139008,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
139009,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
139010,"If you want to sound like a native speaker, yo...","Si quieres sonar como un hablante nativo, debe...",CC-BY 2.0 (France) Attribution: tatoeba.org #9...
139011,It may be impossible to get a completely error...,Puede que sea imposible obtener un corpus comp...,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [17]:
data = data[[0, 1]]
data.columns = ['EN', 'ES']
data

Unnamed: 0,EN,ES
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
139008,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
139009,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...
139010,"If you want to sound like a native speaker, yo...","Si quieres sonar como un hablante nativo, debe..."
139011,It may be impossible to get a completely error...,Puede que sea imposible obtener un corpus comp...


In [18]:
data = data[-500:]
data

Unnamed: 0,EN,ES
138513,"He's a really straight-laced guy, so he doesn'...","Él es realmente un tipo muy pundonoroso, por l..."
138514,I imagine that Tom will eventually find out th...,Me imagino que Tom eventualmente descubrirá qu...
138515,I never for a moment imagined that I would sti...,Nunca me imaginé ni por un momento que yo segu...
138516,I think it's highly unlikely that we'll ever g...,Creo que es altamente improbable que alguna ve...
138517,I would like to drastically decrease the amoun...,Me gustaría reducir drásticamente el tiempo qu...
...,...,...
139008,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
139009,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...
139010,"If you want to sound like a native speaker, yo...","Si quieres sonar como un hablante nativo, debe..."
139011,It may be impossible to get a completely error...,Puede que sea imposible obtener un corpus comp...


In [19]:
sentences_en = data['EN'].values.tolist()
sentences_en

["He's a really straight-laced guy, so he doesn't like the idea of his son changing jobs.",
 'I imagine that Tom will eventually find out that Mary has been talking behind his back.',
 'I never for a moment imagined that I would still be doing this kind of thing at my age.',
 "I think it's highly unlikely that we'll ever get any help from the national government.",
 'I would like to drastically decrease the amount of time it takes me to clean the house.',
 "If mankind doesn't take care of the environment, the environment may eliminate mankind.",
 "People who are constantly copying others do it because they can't think for themselves.",
 'Suddenly, there was a period of terrible violence and hatred between blacks and whites.',
 'That report was important because it pointed out all the errors the committee had made.',
 'The amount of paper produced by a country is closely related to its cultural standards.',
 'The first point that requires clarification is that the design was purely expe

In [23]:
prefix = 'Translate from English to Spanish: '

In [25]:
sentences_en = [f'{prefix}{s}' for s in sentences_en]
sentences_en

["Translate from English to Spanish: He's a really straight-laced guy, so he doesn't like the idea of his son changing jobs.",
 'Translate from English to Spanish: I imagine that Tom will eventually find out that Mary has been talking behind his back.',
 'Translate from English to Spanish: I never for a moment imagined that I would still be doing this kind of thing at my age.',
 "Translate from English to Spanish: I think it's highly unlikely that we'll ever get any help from the national government.",
 'Translate from English to Spanish: I would like to drastically decrease the amount of time it takes me to clean the house.',
 "Translate from English to Spanish: If mankind doesn't take care of the environment, the environment may eliminate mankind.",
 "Translate from English to Spanish: People who are constantly copying others do it because they can't think for themselves.",
 'Translate from English to Spanish: Suddenly, there was a period of terrible violence and hatred between black

In [20]:
sentences_es = data['ES'].values.tolist()
sentences_es

['Él es realmente un tipo muy pundonoroso, por lo que no le agrada la idea de su hijo de cambiar de trabajos.',
 'Me imagino que Tom eventualmente descubrirá que Mary ha estado hablando a sus espaldas.',
 'Nunca me imaginé ni por un momento que yo seguiría haciendo esta clase de cosas a mi edad.',
 'Creo que es altamente improbable que alguna vez consigamos cualquier ayuda del gobierno nacional.',
 'Me gustaría reducir drásticamente el tiempo que tardo en limpiar la casa.',
 'Si la humanidad no cuida el medio ambiente, el medio ambiente podría eliminar a la humanidad.',
 'Las personas que constantemente están copiando a otros lo hacen porque no son capaces de pensar por sí mismas.',
 'De repente, hubo un periodo de terrible violencia y odio entre blancos y negros.',
 'Ese informe fue importante porque señaló todos los errores cometidos por el comité.',
 'La cantidad de papel producido por un país está cercanamente relacionado a sus estándares culturales.',
 'El primer punto que requier

In [30]:
len(sentences_en)

500

In [31]:
len(sentences_es)

500

In [34]:
train_en, test_en, train_es, test_es = train_test_split(sentences_en, sentences_es,
                 test_size=0.2, random_state=0)

In [22]:
model_name = 't5-small'

In [27]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [28]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [35]:
len(train_en)

400

In [36]:
len(train_es)

400

In [37]:
train_set = create_transformers_train_data(train_en, train_es, tokenizer)



Input: This is input sentence
Output: This is output sentence

Input: This is input sentence
Input previous words: #start#
Output next word: This

Input: This is input sentence
Input previous words: #start# This
Output next word: is

Input: This is input sentence
Input previous words: #start# This is
Output next word: output

Input: This is input sentence
Input previous words: #start# This is output
Output next word: sentence

Input: This is input sentence
Input previous words: #start# This is output sentence
Output next word: #end#

In [38]:
train_set

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 400
})

In [39]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                       model=model_name,
                                       return_tensors='tf')

train_set = model.prepare_tf_dataset(train_set,
                                     collate_fn=data_collator)

In [40]:
train_set

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(8, 10), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(8, 10), dtype=tf.int64, name=None)}, TensorSpec(shape=(8, 10), dtype=tf.int64, name=None))>

In [41]:
model.compile('adam')

In [42]:
model.fit(train_set, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7ab2fe04ae60>

In [44]:
input_example = test_en[0]
gt_example = test_es[0]

In [45]:
predicted_example = decode_with_transformer(test_en[0], tokenizer, model)



In [47]:
print(f'Input: {input_example}\nGT: {gt_example}\nPred: {predicted_example}')

Input: Translate from English to Spanish: I don't know what went on last night, but they're not speaking to each other this morning.
GT: No sé qué pasó anoche, pero ellos no se hablaban esta mañana.
Pred: No saba que 


In [49]:
inputs = test_en
gts = test_es
preds = []
for inp in inputs:
  pred = decode_with_transformer(inp, tokenizer, model)
  preds.append(pred)



In [51]:
for inp, gt, pred in zip(inputs, gts, preds):
  print(f'Input: {inp}\nGT: {gt}\nPred: {pred}\n')

Input: Translate from English to Spanish: I don't know what went on last night, but they're not speaking to each other this morning.
GT: No sé qué pasó anoche, pero ellos no se hablaban esta mañana.
Pred: No saba que 

Input: Translate from English to Spanish: I've kept my weight down even though many of my friends have gained weight as they've grown older.
GT: He mantenido mi peso bajo, a pesar de que muchos de mis amigos han aumentado de peso mientras se hacían mas viejos.
Pred: An es a

Input: Translate from English to Spanish: Leeches do not transmit diseases to humans, but in rare cases they can provoke an allergic reaction.
GT: Las sanguijuelas no transmiten enfermedades a los humanos pero en algunos raros casos pueden provocar alergia.
Pred: En rare cas, elles peuvent provoqu

Input: Translate from English to Spanish: The world's first Ferris wheel was built in Chicago. It was named after its constructor, George Washington Gale Ferris, Jr.
GT: La primera rueda de la fortuna del 

In [53]:
metric = load('bleu')

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [55]:
metric.compute(predictions=preds, references=gts)

{'bleu': 0.0017225805353988167,
 'precisions': [0.37376237623762376,
  0.14473684210526316,
  0.08823529411764706,
  0.06363636363636363],
 'brevity_penalty': 0.013048150089036648,
 'length_ratio': 0.18729717199814558,
 'translation_length': 404,
 'reference_length': 2157}