In [None]:
# ! pip install huggingface_hub

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

In [None]:
! pip install transformers datasets evaluate

In [None]:
from datasets import load_dataset, DatasetDict, Dataset

# Load the "aslg_pc12" dataset and carve an 80/20 train/test split out of its "train" split.
dataset = load_dataset("aslg_pc12")
# A fixed seed keeps the split reproducible, so test rows selected by index later in the
# notebook refer to the same examples on every run.
dataset = dataset["train"].train_test_split(train_size=0.8, seed=42)

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the "t5-small" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")

In [5]:
# Column names of the dataset used as model input and as target text.
source_lang, target_lang = "gloss", "text"


def preprocess_function(examples):
    """Tokenize the source and target columns of ``examples``.

    The ``source_lang`` column is passed as the tokenizer input and the
    ``target_lang`` column as ``text_target``, with truncation enabled and
    ``max_length=512``. The tokenizer's result is returned unchanged.
    """
    return tokenizer(
        examples[source_lang],
        text_target=examples[target_lang],
        truncation=True,
        max_length=512,
    )

In [None]:
# Apply the tokenization to every split, processing examples in batches.
tokenized_data = dataset.map(function=preprocess_function, batched=True)

In [None]:
! pip install sacrebleu

In [None]:
import evaluate

# Metric object used by compute_metrics to score decoded predictions against references.
sacrebleu = evaluate.load(path="sacrebleu")

In [9]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and labels.

    Returns a ``(preds, labels)`` tuple in which every prediction is a stripped
    string and every label is wrapped in its own single-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # One-element list per label: a single reference for each prediction.
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute the SacreBLEU score and mean generated length for an evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is itself a tuple, only its first element is used.

    Returns:
        dict: ``{"bleu": ..., "gen_len": ...}`` with both values rounded to
        4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is an ignore/padding marker rather than a vocabulary id, so it cannot be
    # decoded. Predictions may carry it too (e.g. when batches of different generated
    # length are gathered), so replace it there just as is done for the labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-pad tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Sequence-to-sequence model initialised from the "t5-small" checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path="t5-small")

In [12]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles batches for the seq2seq model using the tokenizer.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [13]:
training_args = Seq2SeqTrainingArguments(
    # Output location and Hub upload.
    output_dir="transformer_model",
    push_to_hub=True,
    save_total_limit=3,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    # Evaluation: once per epoch, scoring generated sequences.
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

# Wire the model, tokenized splits, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
)

# Last expression of the cell, so the returned training summary is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.1509,0.743268,49.4587,15.7111
2,0.9407,0.623432,54.9088,15.6437
3,0.8687,0.594972,56.2173,15.625




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=13158, training_loss=1.1230986131725205, metrics={'train_runtime': 2046.9226, 'train_samples_per_second': 102.839, 'train_steps_per_second': 6.428, 'total_flos': 3208336615931904.0, 'train_loss': 1.1230986131725205, 'epoch': 3.0})

In [None]:
! pip install googletrans==3.1.0a0

In [18]:
from googletrans import Translator
# googletrans client; testing_data() passes the model's output through its translate().
# Note the plural name: `translator` (singular) is the transformers pipeline defined later.
translators = Translator()


In [None]:
from transformers import pipeline

# Translation pipeline loaded from the local "transformer_model" directory.
translator = pipeline(task="translation", model="transformer_model")

# Module-level accumulators filled by testing_data().
prediction, answer = [], []

def testing_data(start=20, stop=25):
    """Run the model on a slice of the test split and record the results.

    For each selected row, the "text" field is appended to the module-level
    ``answer`` list, and the pipeline output for the "gloss" field, passed
    through ``translators.translate``, is appended to the module-level
    ``prediction`` list. Both lists are extended in place, so calling the
    function again adds further entries.

    Args:
        start: Index of the first test row to use (inclusive). Defaults to 20.
        stop: Index one past the last test row to use (exclusive). Defaults to 25.
    """
    for i in range(start, stop):
        # Fetch the row once rather than once per field.
        row = dataset["test"][i]
        answer.append(row["text"])
        generated = translator(row["gloss"])[0]["translation_text"]
        prediction.append(translators.translate(generated).text)

In [29]:
# Fill the accumulators, then show predictions and reference answers on separate lines.
testing_data()
print(prediction, answer, sep="\n")

['Nicht ALIGNMENT WITH RESPECT to NATO is modern ALTERNATIVE for the mature state .', 'This is before the Huge TRAGEDY.', 'it is desired to intensify in the search and in nature.', 'i have many reviews relating to the UNHCR .', 'he does not express his concern on the committee .']
['non alignment with respect to nato is the modern alternative for a mature state .\n', 'this is therefore a huge tragedy .\n', 'it is the first to suffer the increase in droughts and natural disasters .\n', "i have many reservations about the unhcr's overall credibility .\n", 'he did express his condolences on behalf of the commission .\n']
