In [1]:
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration

In [2]:
tokenizer = AutoTokenizer.from_pretrained("t5-3b", padding_side='left')
model = T5ForConditionalGeneration.from_pretrained("t5-3b")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-3b automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
tokenizer.eos_token

'</s>'

In [4]:
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(["Hello, my dog is cute", "Hello"], return_tensors="pt", padding=True)
outputs = model(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
logits = outputs.logits

In [5]:
inputs

{'input_ids': tensor([[8774,    6,   82, 1782,   19, 5295,    1],
        [   1,    1,    1,    1,    1, 8774,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1]])}

In [6]:
logits.shape

torch.Size([2, 7, 32128])

In [7]:
outputs.keys()

odict_keys(['loss', 'logits', 'past_key_values', 'encoder_last_hidden_state'])

In [8]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [9]:
infer = model.generate(**inputs, pad_token_id=tokenizer.eos_token_id, max_new_tokens=256, num_beams=5, early_stopping=True, no_repeat_ngram_size=2)

In [10]:
infer

tensor([[    0,     3,     6,    82,  1782,     6, 32094,     6, 32093,     6,
         32092,     6, 32091,     6, 32090,     6,     8,  1782,    19,     3,
             9,  1782,     5,    27,   183,  1782,     3,    18,    82,  3947,
          1782,    55,     6,    27,  1782,    18, 10169,     6,     3,    88,
             3,   233,  8774,     6, 32089,     6, 21820,     6, 32088,    55,
             3,     5,     3,     2,     3,     7,  1782, 32090,     5, 32090,
            55,    27,     3,    17,     3,    15,  1782,    11,    82, 17351,
             5, 32086,     5,     6,     6,    11,  1782,  1782,    82,     6,
          1782, 17351,     6,    69,  1782, 32086,     6, 32085,     6, 32084,
             6, 32083,     6,     5,     5,    82,  3887,    19,     6, 32079,
             6, 32096,     6, 32095,     6,     1],
        [    0, 32099,     5, 32098,  8774,     5,  8774, 21820, 21820,     5,
         32097,     5, 32096,     5, 32095,     5, 32094,     5, 32093,     5,


In [11]:
print(tokenizer.decode(infer[0], skip_special_tokens=True))

, my dog,,,,,, the dog is a dog. I am dog - my pet dog!, I dog-dog, he... Hello,, hello,!.  s dog.! I t e dog and my puppy..,, and dog dog my, dog puppy, our dog,,,,.. my dogs is,,,,


In [12]:
print(tokenizer.decode(infer[1], skip_special_tokens=True))

. Hello. Hello hello hello.........       Hello! Hello Hello hey hello huawei xbox 360 ps4 – ipad??hello hello hi hello!  ahhhhh!


In [13]:
infer2 = model.generate(**inputs, pad_token_id=tokenizer.eos_token_id, max_new_tokens=256, num_beams=5, early_stopping=True, no_repeat_ngram_size=2, num_return_sequences=5)

In [14]:
infer2.size()

torch.Size([10, 106])

In [15]:
for i in range(10):
    print(tokenizer.decode(infer2[i], skip_special_tokens=True))

, my dog,,,,,, the dog is a dog. I am dog - my pet dog!, I dog-dog, he... Hello,, hello,!.  s dog.! I t e dog and my puppy..,, and dog dog my, dog puppy, our dog,,,,.. my dogs is,,,,
, my dog,,,,,, the dog is a dog. I am dog - my pet dog!, I dog-dog, he... Hello,, hello,!.  s dog.! I t e dog and my puppy..,, and dog dog my, dog puppy, our dog,,,,.. my dogs is..,,
, my dog,,,,,, the dog is a dog. I am dog - my pet dog!, I dog-dog, he... Hello,, hello,!.  s dog.! I t e dog and my puppy..,, and dog dog my, dog puppy, our dog,,,,.. my dogs is.,,,
, my dog,,,,,, the dog is a dog. I am dog - my pet dog!, I dog-dog, he... Hello,, hello,!.  s dog.! I t e dog and my puppy..,, and dog dog my, dog puppy, our dog,,,,.. my dogs is.., pet,
, my dog,,,,,, the dog is a dog. I am dog - my pet dog!, I dog-dog, he... Hello,, hello,!.  s dog.! I t e dog and my puppy..,, and dog dog my, dog puppy, our dog,,,,.. my dogs is..,.
. Hello. Hello hello hello.........       Hello! Hello Hello hey hello huawei xbo

In [16]:
import torch
torch.manual_seed(0)

<torch._C.Generator at 0x7f56994291b0>

In [17]:
sample_output = model.generate(
    **inputs, 
    do_sample=True, 
    max_length=50, 
    top_k=0,
    pad_token_id=tokenizer.eos_token_id, 
)

In [18]:
print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
, my dog from the dog, my dog, my dog, my dog is,,,,, my dog is my dog, my dog a dog, my dog, my dog, our dog, my


In [None]:
sample_output = model.generate(
    **inputs, 
    do_sample=True, 
    max_length=50, 
    top_k=0,
    pad_token_id=tokenizer.eos_token_id,
    temperature=0.7
)

In [None]:
print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
sample_output = model.generate(
    **inputs, 
    do_sample=True, 
    max_length=50, 
    top_k=6,
    pad_token_id=tokenizer.eos_token_id, 
)

In [None]:
for i in range(2):
    print(tokenizer.decode(sample_output[i], skip_special_tokens=True))

In [None]:
sample_output = model.generate(
    **inputs, 
    do_sample=True, 
    max_length=50, 
    top_p=0.92, 
    top_k=0,
    pad_token_id=tokenizer.eos_token_id, 
)

In [None]:
for i in range(2):
    print(tokenizer.decode(sample_output[i], skip_special_tokens=True))

In [None]:
sample_output = model.generate(
    **inputs, 
    do_sample=True, 
    max_length=50, 
    top_p=0.95, 
    top_k=50,
    pad_token_id=tokenizer.eos_token_id,
    early_stopping=True,
    num_return_sequences=3
)

In [None]:
for i in range(6):
    print(tokenizer.decode(sample_output[i], skip_special_tokens=True))