In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [2]:
base_model = T5ForConditionalGeneration.from_pretrained('t5-base')
base_tokenizer = T5Tokenizer.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
text_to_summarize = '''Sinan Ozdemir is a data scientist, startup founder, and educator living in the San Francisco Bay Area with his dog, 
Charlie; cat, Euclid; and bearded dragon, Fiero. He spent his academic career studying pure mathematics 
at Johns Hopkins University before transitioning to education. He spent several years conducting lectures 
on data science at Johns Hopkins University and at the General Assembly before founding his own start-up,
Legion Analytics, which uses artificial intelligence and data science to power enterprise sales teams. 
After completing the Fellowship at the Y Combinator accelerator, Sinan has spent most of his days working on 
his fast-growing company, while creating educational material for data science.
'''
preprocess_text = text_to_summarize.strip().replace('\n', '')
print('original text preprocessed:\n', preprocess_text)

original text preprocessed:
 Sinan Ozdemir is a data scientist, startup founder, and educator living in the San Francisco Bay Area with his dog, Charlie; cat, Euclid; and bearded dragon, Fiero. He spent his academic career studying pure mathematics at Johns Hopkins University before transitioning to education. He spent several years conducting lectures on data science at Johns Hopkins University and at the General Assembly before founding his own start-up,Legion Analytics, which uses artificial intelligence and data science to power enterprise sales teams. After completing the Fellowship at the Y Combinator accelerator, Sinan has spent most of his days working on his fast-growing company, while creating educational material for data science.


In [4]:
t5_prepared_text = 'summarize: ' + preprocess_text
input_ids = base_tokenizer.encode(t5_prepared_text, return_tensors='pt')
summary_ids = base_model.generate(
    input_ids,
    num_beams=4,
    no_repeat_ngram_size=3,
    min_length=30,
    max_length=50,
    early_stopping=True
)
output = base_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(f'summarized text: \n{output}')

summarized text: 
Sinan Ozdemir is a data scientist, startup founder, and educator. he founded his own start-up, which uses artificial intelligence and data science to power sales teams.


In [5]:
input_ids = base_tokenizer.encode('translate English to German: Where is the chocolate?', return_tensors='pt')
translate_ids = base_model.generate(
    input_ids,
    num_beams=4,
    no_repeat_ngram_size=3,
    max_length=20,
    early_stopping=True
)
output = base_tokenizer.decode(translate_ids[0], skip_special_tokens=True)
print(f'Tranlated text:\n{output}')

Tranlated text:
Wo ist die Schokolade?


In [6]:
input_ids = base_tokenizer('translate English to German: Where is the chocolate?', return_tensors='pt').input_ids
labels = base_tokenizer('Wo ist die Schokolade?', return_tensors='pt').input_ids
loss = base_model(input_ids=input_ids, labels=labels).loss
labels, loss

(tensor([[ 3488,   229,    67, 31267,    58,     1]]),
 tensor(0.1136, grad_fn=<NllLossBackward0>))

In [11]:
input_ids = base_tokenizer.encode('cola sentence: Where is the chocolate?', return_tensors='pt')
translate_ids = base_model.generate(
    input_ids,
    max_length=20,
    early_stopping=True
)
output = base_tokenizer.decode(translate_ids[0], skip_special_tokens=True)
print(f'is grammatically correct?: \n{output}')

is grammatically correct?: 
acceptable


In [10]:
input_ids = base_tokenizer.encode('cola sentence: Where be a chocolate?', return_tensors='pt')
translate_ids = base_model.generate(
    input_ids,
    max_length=20,
    early_stopping=True
)
output = base_tokenizer.decode(translate_ids[0], skip_special_tokens=True)
print(f'is grammatically correct?: \n{output}')

is grammatically correct?: 
unacceptable


In [20]:
sentence_one = 'How to fish'
sentence_two = 'Fishing Manual for beginners'
input_ids = base_tokenizer.encode(f'stsb sentence1: {sentence_one} sentence2: {sentence_two}', return_tensors='pt')
translate_ids = base_model.generate(
    input_ids,
    max_length=3,
    early_stopping=True
)
output = base_tokenizer.decode(translate_ids[0], skip_special_tokens=True)
print(f'is semantically similar? (0-5): \n{output}')

is semantically similar? (0-5): 
3.2


In [21]:
sentence_one = 'How to fish'
sentence_two = 'Hiking Manual for beginners'
input_ids = base_tokenizer.encode(f'stsb sentence1: {sentence_one} sentence2: {sentence_two}', return_tensors='pt')
translate_ids = base_model.generate(
    input_ids,
    max_length=3,
    early_stopping=True
)
output = base_tokenizer.decode(translate_ids[0], skip_special_tokens=True)
print(f'is semantically similar? (0-5): \n{output}')

is semantically similar? (0-5): 
0.0


In [25]:
# entailment, contradiction, netural
input_ids = base_tokenizer.encode(
    'mnli premise: I am active in politics. hypothesis: I am running for mayor', return_tensors='pt'
)
translate_ids = base_model.generate(
    input_ids,
    max_length=20,
    early_stopping=True
)
output = base_tokenizer.decode(translate_ids[0], skip_special_tokens=True)
print(f'Response: \n{output}')

Response: 
entailment


In [26]:
# entailment, contradiction, netural
input_ids = base_tokenizer.encode(
    'mnli premise: I am active in politics. hypothesis: I do not really vote', return_tensors='pt'
)
translate_ids = base_model.generate(
    input_ids,
    max_length=20,
    early_stopping=True
)
output = base_tokenizer.decode(translate_ids[0], skip_special_tokens=True)
print(f'Response: \n{output}')

Response: 
contradiction


In [27]:
# entailment, contradiction, netural
input_ids = base_tokenizer.encode(
    'mnli premise: I am active in politics. hypothesis: I code for a living', return_tensors='pt'
)
translate_ids = base_model.generate(
    input_ids,
    max_length=20,
    early_stopping=True
)
output = base_tokenizer.decode(translate_ids[0], skip_special_tokens=True)
print(f'Response: \n{output}')

Response: 
neutral


In [28]:
input_ids = base_tokenizer.encode(
    'question: Where does Sunggon live? context: Sunggon lives in Saitama, Japan but Sigyo lives in Seoul, Korea', 
    return_tensors='pt'
)
translate_ids = base_model.generate(
    input_ids,
    max_length=20,
    early_stopping=True
)
output = base_tokenizer.decode(translate_ids[0], skip_special_tokens=True)
print(f'Response: \n{output}')

Response: 
Saitama, Japan


In [29]:
input_ids = base_tokenizer.encode(
    'question: Where does Sigyo live? context: Sunggon lives in Saitama, Japan but Sigyo lives in Seoul, Korea',
    return_tensors='pt'
)
translate_ids = base_model.generate(
    input_ids,
    max_length=20,
    early_stopping=True
)
output = base_tokenizer.decode(translate_ids[0], skip_special_tokens=True)
print(f'Response: \n{output}')

Response: 
Seoul, Korea


In [30]:
input_ids = base_tokenizer.encode(
    'prompt1: Where does Sunggon live? prompt2: Sunggon lives in Saitama, Japan but Sigyo lives in Seoul, Korea', 
    return_tensors='pt'
)
translate_ids = base_model.generate(
    input_ids,
    max_length=20,
    early_stopping=True
)
output = base_tokenizer.decode(translate_ids[0], skip_special_tokens=True)
print(f'Response: \n{output}')

Response: 
not_duplicate
