In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
! pip install transformers datasets evaluate

In [7]:
from datasets import load_dataset, DatasetDict, Dataset

dataset = load_dataset("aslg_pc12")
dataset = dataset["train"].train_test_split(train_size=0.8)



  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [11]:
source_lang = "gloss"
target_lang = "text"


def preprocess_function(examples):
    inputs = examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [12]:
tokenized_data = dataset.map(preprocess_function, batched=True)

  0%|          | 0/71 [00:00<?, ?ba/s]

  0%|          | 0/18 [00:00<?, ?ba/s]

In [13]:
! pip install sacrebleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
import evaluate

sacrebleu = evaluate.load("sacrebleu")

In [15]:
import numpy as np

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [16]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [17]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [18]:
training_args = Seq2SeqTrainingArguments(
    output_dir="transformer_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.1574,0.738817,49.1743,15.6537
2,0.938,0.618889,54.3456,15.6047
3,0.8909,0.591974,55.6738,15.5762


Saving model checkpoint to transformer_model/checkpoint-13000
Configuration saved in transformer_model/checkpoint-13000/config.json
Model weights saved in transformer_model/checkpoint-13000/pytorch_model.bin
tokenizer config file saved in transformer_model/checkpoint-13000/tokenizer_config.json
Special tokens file saved in transformer_model/checkpoint-13000/special_tokens_map.json
Deleting older checkpoint [transformer_model/checkpoint-11500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, gloss. If text, gloss are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17542
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=13158, training_loss=1.1222635127976592, metrics={'train_runtime': 2235.7705, 'train_samples_per_second': 94.153, 'train_steps_per_second': 5.885, 'total_flos': 3206598319669248.0, 'train_loss': 1.1222635127976592, 'epoch': 3.0})

In [19]:
dataset["train"][5]

{'gloss': 'BE X-IT TO BE ARTICLE 215 OR ARTICLE 75 TREATY ON FUNCTION EUROPEAN UNION ?\n',
 'text': 'is it to be article 215 or article 75 of the treaty on the functioning of the european union ?\n'}

In [20]:
text = dataset["train"][5]["gloss"]

In [27]:
from transformers import pipeline

translator = pipeline("translation", model="my_awesome_opus_books_model")

prediction = []
answer = []

def testing_data():
  for i in range(5):
    text = dataset["train"][i]["gloss"]
    answer.append(dataset["train"][i]["text"])
    prediction.append(translator(text))

loading configuration file my_awesome_opus_books_model/config.json
Model config T5Config {
  "_name_or_path": "my_awesome_opus_books_model",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_

In [28]:
testing_data()
print(prediction)
print(answer)

[[{'translation_text': 'IF REFERENDUM FAIL , SIR , you will not be thankful by the long-term prosperity of Europe .'}], [{'translation_text': 'ich beabsichtige, daß es in südlich von Lanka zu einer eklatanten Trennung kommt .'}], [{'translation_text': 'es ist nicht mehr ungewöhnlich, als eine ungewöhnliche GRUPPE in gleicher Weise .'}], [{'translation_text': 'X-I notice very carefully how it is done .'}], [{'translation_text': 'ITS implementation will have a large consensus for budgetary power institution and financial implementation .'}]]
['if the referendum fails , sir , you will not be thanked by the vast majority of europeans .\n', 'i therefore wish merely to say that the situation in sri lanka is extremely tragic .\n', 'there is nothing more unfair than treating unequal groups in the same way .\n', 'i noticed very carefully how it was done .\n', 'its implementation will have major consequences for the budgetary powers of the institutions and financial implications .\n']
