In [5]:
from huggingface_hub import notebook_login, login

In [6]:
# Log in to the Hugging Face Hub from inside the notebook. Credentials are
# needed further down, where training is configured with push_to_hub=True and
# trainer.push_to_hub() is called.
notebook_login()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/geshi/.cache/huggingface/token
Login successful


In [7]:
from datasets import load_dataset

In [8]:
# Load the "en-fr" configuration of the "opus_books" dataset.
books = load_dataset("opus_books", "en-fr")

In [9]:
# Hold out 20% of the examples for evaluation. The fixed seed pins the shuffle,
# so the train/test membership (and therefore the reported metrics) is the same
# on every Restart & Run All.
# Note: this rebinds `books`; re-running this cell on its own would split the
# already-reduced train portion again, so re-run the load cell first.
books = books["train"].train_test_split(test_size=0.2, seed=42)

In [10]:
# Peek at one training record to confirm its layout: an id plus a "translation"
# dict keyed by language code.
books["train"][0]

{'id': '123044',
 'translation': {'en': 'Fortunately a door opened here and they passed into a passage.',
  'fr': "Par bonheur, une porte s'ouvrait la, et ils déboucherent dans une voie."}}

In [11]:
from transformers import AutoTokenizer

In [12]:
# Pretrained checkpoint name; reused below for the tokenizer, the data collator
# and the model.
checkpoint = "google-t5/t5-small"

In [13]:
# Tokenizer belonging to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [14]:
# Language codes used as keys into each record's "translation" dict, and the
# task prefix that preprocess_function prepends to every source sentence.
source_lang, target_lang = "en", "fr"
prefix = "translate English to French: "

In [15]:
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch of dataset records; ``examples["translation"]`` is a
            list of dicts keyed by language code.
        max_length: Token limit passed to the tokenizer together with
            ``truncation=True``. Defaults to 128, the value this notebook was
            run with.

    Returns:
        The tokenizer output for the batch: encodings of the prefixed source
        sentences plus the encoded targets supplied via ``text_target``.
    """
    pairs = examples["translation"]
    # Every source sentence is prefixed with the task instruction string.
    inputs = [prefix + pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    return tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True)

In [16]:
# Tokenize both splits. batched=True hands preprocess_function a batch of
# examples per call rather than one record at a time.
tokenized_books = books.map(preprocess_function, batched=True)

Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

In [17]:
from transformers import DataCollatorForSeq2Seq

In [18]:
# Collator that assembles tokenized examples into batches for the trainer.
# NOTE(review): `model` receives the checkpoint *name* (a str) here, not a
# loaded model object — the model is only instantiated further down in the
# notebook. Confirm against the DataCollatorForSeq2Seq docs whether a model
# instance is expected for this argument.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [19]:
import evaluate

In [20]:
# SacreBLEU metric; compute_metrics reports its "score" field under "bleu".
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [21]:
import numpy as np

In [22]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape references for the metric.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings, one per prediction.

    Returns:
        ``(preds, labels)`` where ``preds`` is a list of stripped strings and
        ``labels`` is a list of single-element lists, each holding one
        stripped reference.
    """
    cleaned_preds = []
    for hypothesis in preds:
        cleaned_preds.append(hypothesis.strip())

    # Each reference is wrapped in its own list: one reference per prediction.
    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

In [23]:
def compute_metrics(eval_preds):
    """Compute SacreBLEU and the mean generated length for an evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. The
            predictions may arrive as a tuple, in which case its first
            element is used.

    Returns:
        dict with ``"bleu"`` (the SacreBLEU ``score``) and ``"gen_len"`` (mean
        count of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so swap
    # in the pad token before decoding. This is done for the predictions as
    # well as the labels: generated batches can be padded with -100 when they
    # are gathered, and decoding those ids unreplaced fails.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Count only real tokens; padding (including replaced -100s) is excluded.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]

    result = {"bleu": bleu["score"], "gen_len": np.mean(prediction_lens)}
    return {key: round(value, 4) for key, value in result.items()}

# Train

In [25]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [26]:
# Load the pretrained seq2seq model for the checkpoint; this is the model that
# gets fine-tuned below.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [30]:
# Training configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Where results go: local output directory, Hub upload, checkpoint retention.
    output_dir="t5_opus_books_model",
    push_to_hub=True,
    save_total_limit=3,
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    fp16=True,
    # Evaluation: once per epoch, scoring generated sequences.
    eval_strategy="epoch",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
)

In [31]:
# Rebuild the collator with the loaded model object. The collator created
# earlier in the notebook was handed the checkpoint name (a str), but
# DataCollatorForSeq2Seq's `model` argument is meant to be the model being
# trained, which the collator can use to prepare decoder_input_ids from the
# labels when the model supports it.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Wire the model, arguments, tokenized splits, collator and metric function
# into the seq2seq trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [32]:
# Run fine-tuning. With eval_strategy="epoch" the metrics table in the output
# gets one row per epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.8778,1.648568,5.3409,17.6391
2,1.8405,1.627918,5.4985,17.627




TrainOutput(global_step=6356, training_loss=1.8872595679617439, metrics={'train_runtime': 1498.7964, 'train_samples_per_second': 135.666, 'train_steps_per_second': 4.241, 'total_flos': 5763730456313856.0, 'train_loss': 1.8872595679617439, 'epoch': 2.0})

In [33]:
# Upload the training results to the Hub; the inference cells below load the
# model back from that repo.
trainer.push_to_hub()

events.out.tfevents.1725058630.cajal.1812718.1:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/geshijoker/t5_opus_books_model/commit/9be88bf07ce1b0359db83fd3df4409d7b9874120', commit_message='End of training', commit_description='', oid='9be88bf07ce1b0359db83fd3df4409d7b9874120', pr_url=None, pr_revision=None, pr_num=None)

# Inference

In [34]:
# Prompt for the inference demos. The task prefix is written out in full here
# and matches the `prefix` string used during preprocessing.
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

In [35]:
from transformers import pipeline

In [36]:
# Load the fine-tuned model from the Hub into a translation pipeline and run it
# on the prompt.
# NOTE(review): no `device` is passed, so (per the warning in the cell output)
# the model runs on CPU even when a GPU is available.
translator = pipeline("translation_xx_to_yy", model="geshijoker/t5_opus_books_model")
translator(text)

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'translation_text': 'Les légumes partagent les ressources avec les bactéries fixatrice'}]

# Inference with PyTorch

In [39]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [38]:
# Reload the tokenizer from the fine-tuned Hub repo (this rebinds the
# notebook-level `tokenizer`), then encode the prompt as PyTorch tensors and
# keep only the input ids.
tokenizer = AutoTokenizer.from_pretrained("geshijoker/t5_opus_books_model")
inputs = tokenizer(text, return_tensors="pt").input_ids

In [40]:
from transformers import set_seed

# Load the fine-tuned model from the Hub (this rebinds the notebook-level `model`).
model = AutoModelForSeq2SeqLM.from_pretrained("geshijoker/t5_opus_books_model")

# Generation below samples (do_sample=True), so fix the RNG seed first to make
# the produced translation reproducible across runs.
set_seed(42)
# Sample at most 40 new tokens using top-k (30) and nucleus (0.95) filtering.
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

In [41]:
# Turn the first generated sequence back into text, dropping special tokens.
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Legumes partagent des ressources avec les bactéries qui fixent l’azote.'