In [4]:
from transformers import pipeline, T5ForConditionalGeneration, TrainingArguments, Trainer, \
                          T5Tokenizer, DataCollatorForSeq2Seq
import pandas as pd
from datasets import Dataset
import random

In [5]:
base_model = T5ForConditionalGeneration.from_pretrained('t5-small')
base_tokenizer = T5Tokenizer.from_pretrained('t5-small')

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [3]:
reviews = pd.read_csv('data/reviews.csv')
reviews.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [6]:
def add_punc(s):
    if s[-1] not in ['.', '!', '?']:
        s = s + '.'
    return s

In [9]:
reviews.dropna(inplace=True)
reviews = reviews[['Text', 'Summary', 'Score']]
reviews['Summary'] = reviews['Summary'].map(add_punc)
print(reviews.shape)
reviews.head()

(568411, 3)


Unnamed: 0,Text,Summary,Score
0,I have bought several of the Vitality canned d...,Good Quality Dog Food.,5
1,Product arrived labeled as Jumbo Salted Peanut...,Not as Advertised.,1
2,This is a confection that has been around a fe...,"""Delight"" says it all.",4
3,If you are looking for the secret ingredient i...,Cough Medicine.,2
4,Great taffy at a great price. There was a wid...,Great taffy.,5


In [10]:
reviews = reviews[(reviews['Summary'].str.len() < 100) & (reviews['Summary'].str.len() >= 30)]
reviews.shape

(157786, 3)

In [11]:
random.seed(0)
reviews_dataset = Dataset.from_pandas(reviews.astype(str).sample(5000))

In [12]:
prefix = 'summarize: '
def preprocess_function(examples):
    inputs = [prefix + text for text in examples['Text']]
    model_inputs = base_tokenizer(inputs, max_length=1024, truncation=True)
    labels = base_tokenizer(examples['Summary'], max_length=128, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [13]:
tokenized_reviews_dataset = reviews_dataset.map(preprocess_function, batched=True)
tokenized_reviews_dataset

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset({
    features: ['Text', 'Summary', 'Score', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5000
})

In [15]:
len(tokenized_reviews_dataset['input_ids']), tokenized_reviews_dataset['input_ids']

(5000,
 [[21603,
   10,
   27,
   31,
   162,
   2944,
   633,
   11104,
   13,
   8,
   3,
   115,
   5,
   89,
   5,
   89,
   5,
   689,
   21,
   82,
   1712,
   6,
   11,
   255,
   3,
   75,
   4067,
   7,
   323,
   30,
   334,
   80,
   13,
   135,
   55,
   1441,
   82,
   562,
   5058,
   31,
   7,
   1712,
   6,
   113,
   19,
   2033,
   1432,
   63,
   28,
   62,
   17,
   542,
   41,
   235,
   8,
   500,
   24,
   132,
   19,
   6672,
   163,
   80,
   1056,
   3,
   88,
   133,
   3,
   1544,
   44,
   66,
   61,
   65,
   118,
   4682,
   14801,
   53,
   13434,
   91,
   13,
   160,
   3047,
   230,
   55,
   100,
   2005,
   19,
   182,
   306,
   2769,
   3,
   18,
   3,
   99,
   25,
   691,
   91,
   8,
   3,
   115,
   5,
   89,
   5,
   89,
   5,
   353,
   6,
   79,
   143,
   66,
   13,
   70,
   1173,
   16,
   3,
   9,
   936,
   542,
   3026,
   3064,
   6,
   11,
   132,
   19,
   150,
   8273,
   14,
   49,
   16,
   8,
   542,
   5,
   366,
   2101,
   8

In [16]:
tokenized_reviews_dataset = tokenized_reviews_dataset.train_test_split(test_size=.1)

In [17]:
data_collator = DataCollatorForSeq2Seq(tokenizer=base_tokenizer, model=base_model)

In [18]:
training_args = TrainingArguments(
    output_dir='t5_summary_results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    load_best_model_at_end=True,
    logging_steps=50,
    save_strategy='epoch'
)
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_reviews_dataset['train'],
    eval_dataset=tokenized_reviews_dataset['test'],
    data_collator=data_collator
)

In [19]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: Text, Score, __index_level_0__, Summary. If Text, Score, __index_level_0__, Summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


{'eval_loss': 4.595187664031982,
 'eval_runtime': 3.4443,
 'eval_samples_per_second': 145.169,
 'eval_steps_per_second': 18.291}

In [20]:
import torch
import gc

gc.collect()
torch.cuda.empty_cache()

In [21]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: Text, Score, __index_level_0__, Summary. If Text, Score, __index_level_0__, Summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4500
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 11260
  Number of trainable parameters = 60506624


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: Text, Score, __index_level_0__, Summary. If Text, Score, __index_level_0__, Summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
Saving model checkpoint to t5_summary_results/checkpoint-563
Configuration saved in t5_summary_results/checkpoint-563/config.json
Configuration saved in t5_summary_results/checkpoint-563/generation_config.json
Model weights saved in t5_summary_results/checkpoint-563/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: Text, Score, __index_level_0__, Summary. If Text, Score, __index_level_0__, Summary are not expected by `T5ForCo

TrainOutput(global_step=11260, training_loss=2.9288137446076576, metrics={'train_runtime': 1712.6332, 'train_samples_per_second': 52.551, 'train_steps_per_second': 6.575, 'total_flos': 1.0042346866212864e+16, 'train_loss': 2.9288137446076576, 'epoch': 20.0})

In [22]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: Text, Score, __index_level_0__, Summary. If Text, Score, __index_level_0__, Summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


{'eval_loss': 3.1256039142608643,
 'eval_runtime': 2.6982,
 'eval_samples_per_second': 185.308,
 'eval_steps_per_second': 23.349,
 'epoch': 20.0}

In [23]:
trainer.save_model()

Saving model checkpoint to t5_summary_results
Configuration saved in t5_summary_results/config.json
Configuration saved in t5_summary_results/generation_config.json
Model weights saved in t5_summary_results/pytorch_model.bin


In [24]:
loaded_model = T5ForConditionalGeneration.from_pretrained('t5_summary_results')
generator = pipeline('summarization', loaded_model, tokenizer=base_tokenizer)

loading configuration file t5_summary_results/config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopp

In [25]:
sample = reviews.sample(1)
print(sample['Summary'])
text = sample['Text'].tolist()[0]
text

377010    The Cocoa Taste Overpowers the Cinnamon.
Name: Summary, dtype: object


'Being on a health kick for the last couple years, I am always on the lookout for tasty snacking alternatives.  The Eat Think Smile brand boasts an all natural snack with the natural antioxidant of cocoa.  The product is small bit size and fun to eat except for a flavor that is listed as Sweet Cinnamon you can barely taste the cinnamon at all as the taste is vastly overpowered by the taste of cocoa from the first bite to long after you finish thanks to a strong aftertaste.  It is alright but the type of food I would only reach for if the rest of my cupboard is bare.  It is also nice that the bag is re-sealable to keep the food fresh.'

In [26]:
generator(text, min_length=3, max_length=15, early_stopping=True, num_beams=2)

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 1,
  "length_penalty": 2.0,
  "max_length": 200,
  "min_length": 30,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



[{'summary_text': 'Delicious snack with the natural antioxidant of cocoa.'}]

In [30]:
base_generator = pipeline(
    'summarization', 
    model='t5-base',
    tokenizer='t5-base',
)
base_generator(text, min_length=3, max_length=15, early_stopping=True, num_beams=2)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
  

[{'summary_text': 'the Eat Think Smile brand boasts an all natural snack with the natural'}]

In [31]:
base_generator = pipeline(
    'summarization', 
    model='t5-small',
    tokenizer='t5-small',
)
base_generator(text, min_length=3, max_length=15, early_stopping=True, num_beams=2)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/ad26363d1dadacd02b8d1b627db00a2db488fcf7/config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
   

[{'summary_text': 'the Eat Think Smile brand boasts an all natural snack with the natural'}]

In [32]:
inputs = base_tokenizer('not my prompt: ' + text, return_tensors='pt')
outputs = loaded_model.generate(
    inputs['input_ids'], min_length=3, max_length=15
)
print(base_tokenizer.decode(outputs[0], skip_special_tokens=True))

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 1,
  "length_penalty": 2.0,
  "max_length": 200,
  "min_length": 30,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Not my prompt: Being on a health kick for the last couple
