# Hugging Face Translation Guide

This notebook follows the Hugging Face translation task guide: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
# Authenticate with the Hugging Face Hub; the training cell below sets
# push_to_hub=True, which requires being logged in.
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/gen/.huggingface/token
Login successful


In [None]:
!pip install transformers datasets evaluate sacrebleu

In [2]:
from datasets import load_dataset

# English-French parallel corpus used for fine-tuning.
# Dataset card: https://huggingface.co/datasets/opus_books
books = load_dataset(path="opus_books", name="en-fr")

Found cached dataset opus_books (/Users/gen/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf)


  0%|          | 0/1 [00:00<?, ?it/s]

In [33]:
# Inspect which splits and columns were loaded.
books

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 127085
    })
})

In [31]:
# Hold out 20% of the sentence pairs for evaluation.
# The fixed seed makes the shuffle deterministic, so the same rows land in
# train/test after every kernel restart and metrics stay comparable.
books_train = books["train"].train_test_split(test_size=0.2, seed=42)

In [34]:
# Confirm the row counts of the new train/test splits.
books_train

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 101668
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 25417
    })
})

In [4]:
# Example entry: show the first record of the training split.
# Each record carries an 'id' and a 'translation' dict keyed by language code.
books_train["train"][0]

{'id': '118667',
 'translation': {'en': 'Since the small stock of coal at the surface of the pits was exhausted, customers talked of going to Belgium, so that in future they would be threatened from that quarter.',
  'fr': "Depuis que le faible stock de houille s'épuisait sur le carreau des fosses, la clientele parlait de s'adresser en Belgique; et il y avait la, pour l'avenir, une menace."}}

# Data Preprocessing

In [5]:
# Load the tokenizer that matches the t5-small checkpoint.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [36]:
# Tokenization settings

# Language codes; these are the keys inside each record's "translation" dict.
source_lang, target_lang = "en", "fr"

# Task prompt prepended to every source sentence before tokenizing.
prefix = "translate English to French: "

def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: Batch mapping whose "translation" entry is a sequence of
            dicts keyed by language code.

    Returns:
        The tokenizer output for the prefixed source sentences, with the
        target sentences tokenized via ``text_target``. Both sides are
        truncated to at most 128 tokens.
    """
    inputs = []
    targets = []
    for pair in examples["translation"]:
        # The task prefix goes on the source side only.
        inputs.append(prefix + pair[source_lang])
        targets.append(pair[target_lang])

    # Sources and targets are passed separately because they are different languages.
    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

# batched=True passes a batch of examples to preprocess_function per call
# instead of a single example, which is what the function expects.
tokenized_books = books_train.map(function=preprocess_function, batched=True)

  0%|          | 0/102 [00:00<?, ?ba/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

In [37]:
# Check which columns are present after tokenization.
tokenized_books

DatasetDict({
    train: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 101668
    })
    test: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 25417
    })
})

In [9]:
# Framework switch: the cells below branch on these two flags.
use_pytorch, use_tensorflow = True, False

In [11]:
from transformers import DataCollatorForSeq2Seq

# Checkpoint identifier. It has its own name so that re-running this cell
# cannot overwrite the `model` object loaded in the training cells with a string.
checkpoint = "t5-small"

# Dynamically pad sentences during collation.
# The two branches differ only in the tensor type the collator returns.
if use_pytorch:
    # With PyTorch (the collator's default tensor type is used)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
elif use_tensorflow:
    # With TensorFlow
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

# Evaluation

In [15]:
# The metric is prepared before training so the trainer can report it
# during training-time evaluation.

import evaluate
import numpy as np

# Metric card: https://huggingface.co/spaces/evaluate-metric/sacrebleu
sacrebleu = evaluate.load(path="sacrebleu")

def postprocess_text(preds, labels):
    """Normalize decoded strings into the shape passed to the metric.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A tuple ``(preds, labels)`` where ``preds`` is a list of stripped
        strings and ``labels`` is a list of single-element lists, each
        holding one stripped reference.
    """
    stripped_preds = []
    for pred in preds:
        stripped_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        # One reference per prediction, wrapped in its own list.
        wrapped_labels.append([label.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Score generated translations with SacreBLEU.

    Args:
        eval_preds: A pair ``(preds, labels)`` of token-id arrays. ``preds``
            may be a tuple, in which case its first element is used.

    Returns:
        A dict with ``"bleu"`` (the SacreBLEU score) and ``"gen_len"`` (the
        mean number of non-padding tokens per prediction), both rounded to
        4 decimal places.
    """
    preds, labels = eval_preds

    # Unpack predictions if necessary
    if isinstance(preds, tuple):
        preds = preds[0]

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Label positions equal to -100 are not real token ids; swap them for the
    # pad token so the tokenizer can decode them (and then skip them).
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace and wrap each reference in its own list
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Calculate score. The metric object is bound to `sacrebleu` above;
    # the previous `metric.compute(...)` referenced an undefined name.
    result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Average generated length, counting only non-padding tokens
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Training

In [38]:
if use_pytorch:
    from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments

    # Pretrained t5-small weights to fine-tune.
    model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path="t5-small")


Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [None]:
if use_pytorch:
    # Fine-tuning configuration, grouped by concern.
    training_args = Seq2SeqTrainingArguments(
        output_dir="my_awesome_opus_books_model",  # required
        # Optimization
        num_train_epochs=2,
        learning_rate=2e-5,
        weight_decay=0.01,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # Evaluation
        evaluation_strategy="epoch",
        predict_with_generate=True,
        # Checkpointing and publishing
        save_total_limit=3,
        push_to_hub=True,  # must be logged in to the Hugging Face Hub
        # fp16=True is intentionally left out: mixed precision was not
        # available because the author's machine is not a CUDA device.
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
        train_dataset=tokenized_books["train"],
        eval_dataset=tokenized_books["test"],
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # trainer.push_to_hub() is not called here: push_to_hub=True above
    # already uploads to the Hub during training.

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/coding-gen/my_awesome_opus_books_model into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: id, translation. If id, translation are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 101668
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 12710
  Number of trainable parameters = 60506624
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Saving model checkpoint to my_awesome_opus_books_model/checkpoint-500
Configuration saved in my_awesome_opus_books_model/checkpoint-500/config.json
Model weights saved in my_awesome_opus_books_model/checkpoint-500/pytorch_model.bin
tokenizer config file saved in my_awesome_opus_books_model/checkpoint-500/tokenizer_config.json
Special tokens file saved in my_awesome_opus_books_model/checkpoint-500/special_tokens_map.json
Copy vocab file to my_awesome_opus_books_model/checkpoint-500/spiece.model
tokenizer config file saved in my_awesome_opus_books_model/tokenizer_config.json
Special tokens file saved in my_awesome_opus_books_model/special_tokens_map.json
Copy vocab file to my_awesome_opus_books_model/spiece.model


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Saving model checkpoint to my_awesome_opus_books_model/checkpoint-1000
Configuration saved in my_awesome_opus_books_model/checkpoint-1000/config.json
Model weights saved in my_awesome_opus_books_model/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in my_awesome_opus_books_model/checkpoint-1000/tokenizer_config.json
Special tokens file saved in my_awesome_opus_books_model/checkpoint-1000/special_tokens_map.json
Copy vocab file to my_awesome_opus_books_model/checkpoint-1000/spiece.model


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer config file saved in my_awesome_opus_books_model/tokenizer_config.json
Special tokens file saved in my_awesome_opus_books_model/special_tokens_map.json
Copy vocab file to my_awesome_opus_books_model/spiece.model


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Saving model checkpoint to my_awesome_opus_books_model/checkpoint-1500
Configuration saved in my_awesome_opus_books_model/checkpoint-1500/config.json
Model weights saved in my_awesome_opus_books_model/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in my_awesome_opus_books_model/checkpoint-1500/tokenizer_config.json
Special tokens file saved in my_awesome_opus_books_model/checkpoint-1500/special_tokens_map.json
Copy vocab file to my_awesome_opus_books_model/checkpoint-1500/spiece.model


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer config file saved in my_awesome_opus_books_model/tokenizer_config.json
Special tokens file saved in my_awesome_opus_books_model/special_tokens_map.json
Copy vocab file to my_awesome_opus_books_model/spiece.model


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Saving model checkpoint to my_awesome_opus_books_model/checkpoint-2000
Configuration saved in my_awesome_opus_books_model/checkpoint-2000/config.json
Model weights saved in my_awesome_opus_books_model/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in my_awesome_opus_books_model/checkpoint-2000/tokenizer_config.json
Special tokens file saved in my_awesome_opus_books_model/checkpoint-2000/special_tokens_map.json
Copy vocab file to my_awesome_opus_books_model/checkpoint-2000/spiece.model
Deleting older checkpoint [my_awesome_opus_books_model/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to my_awesome_opus_books_model/checkpoint-2500
Configuration saved in my_awesome_opus_books_model/checkpoint-2500/config.json
Model weights saved in my_awesome_opus_books_model/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in my_awesome_opus_books_model/checkpoint-2500/tokenizer_config.json
Special tokens file saved in my_awesome_opus_books_model/checkp

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer config file saved in my_awesome_opus_books_model/tokenizer_config.json
Special tokens file saved in my_awesome_opus_books_model/special_tokens_map.json
Copy vocab file to my_awesome_opus_books_model/spiece.model


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Several commits (2) will be pushed upstream.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Deleting older checkpoint [my_awesome_opus_books_model/checkpoint-1000] due to args.save_total_limit


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
if use_tensorflow:
    import tensorflow as tf
    from transformers import AdamWeightDecay, TFAutoModelForSeq2SeqLM
    from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback

    # Same learning rate and weight decay as the PyTorch branch.
    optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

    model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")

    # Wrap the tokenized splits as batched datasets; only the training
    # split is shuffled.
    tf_train_set = model.prepare_tf_dataset(
        tokenized_books["train"],
        shuffle=True,
        batch_size=16,
        collate_fn=data_collator,
    )

    tf_test_set = model.prepare_tf_dataset(
        tokenized_books["test"],
        shuffle=False,
        batch_size=16,
        collate_fn=data_collator,
    )

    model.compile(optimizer=optimizer)

    # Compute the metric on the held-out set built above. This previously
    # referenced `tf_validation_set`, which is never defined in this notebook.
    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)

    push_to_hub_callback = PushToHubCallback(
        output_dir="my_awesome_opus_books_model",
        tokenizer=tokenizer,
    )

    callbacks = [metric_callback, push_to_hub_callback]

    model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)

# Inference

In [None]:
from transformers import pipeline

# The input carries the same task prefix that was used during preprocessing.
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

# Load the fine-tuned model from the training output directory and translate.
translator = pipeline(task="translation", model="my_awesome_opus_books_model")
translator(text)

In [None]:
# Or run the pipeline steps manually: tokenize -> generate -> decode.
# `text` comes from the pipeline cell above.
# Jupyter only auto-displays a trailing top-level expression, so a bare
# `tokenizer.decode(...)` inside an `if` body is silently discarded.
# The decoded translation is therefore printed explicitly.

if use_pytorch:
    # Tokenize and return ids as tensors
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model")
    inputs = tokenizer(text, return_tensors="pt").input_ids

    # Generate a translation with the API.
    # do_sample=True samples tokens, so the output can differ between runs.
    from transformers import AutoModelForSeq2SeqLM
    model = AutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model")
    outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

    # Decode the ids back to text
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

if use_tensorflow:
    # Tokenize and return ids as tensors
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model")
    inputs = tokenizer(text, return_tensors="tf").input_ids

    # Generate a translation with the API.
    # do_sample=True samples tokens, so the output can differ between runs.
    from transformers import TFAutoModelForSeq2SeqLM
    model = TFAutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model")
    outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

    # Decode the ids back to text
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))