Referenced: 
https://turbolab.in/abstractive-summarization-using-googles-t5/
https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForConditionalGeneration

In [1]:
import transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer_t5_large = T5Tokenizer.from_pretrained("t5-large")
model_t5_large = T5ForConditionalGeneration.from_pretrained("t5-large")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
from datasets import load_dataset

dataset = load_dataset("kmfoda/booksum")

Found cached dataset csv (/home/kims90/.cache/huggingface/datasets/kmfoda___csv/kmfoda--booksum-025141c210e07407/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 3/3 [00:00<00:00, 635.85it/s]


In [3]:
model_t5_large.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [4]:
import torch

In [5]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [6]:
def text_summarization_t5(text_input):
    text = text_input
    text = text.replace('\n','')
    input_ids = tokenizer_t5_large.encode(
        "summarize: " +  text, return_tensors="pt", max_length = 512, truncation = True,  padding=True
    ) # Batch size 1
    input_ids = input_ids.to('cuda')
    outputs = model_t5_large.generate(input_ids, num_beams=2, no_repeat_ngram_size=3, length_penalty=2.0, min_length=50, max_length=5000, early_stopping=True)
    summary_txt = tokenizer_t5_large.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return summary_txt

In [17]:
import sys

sys.path.append('../rouge_calc')

import model_rouge

In [20]:
from rouge import Rouge
import pickle
from tqdm import tqdm


rouge = Rouge()
all_summ = []

for i in tqdm(range(len(dataset["test"]))):
    summarized_output = text_summarization_t5(dataset["test"]["chapter"][i])
    all_summ.append(summarized_output)

t5_large_rouge = model_rouge.calc_rouge_pred(all_summ, dataset["test"]["summary_text"], rouge)
model_rouge.print_txt(t5_large_rouge, '../rouge_calc/t5_large_rouge.txt')
pickle.dump(t5_large_rouge, open('../rouge_calc/dict_pickle/t5_large_rouge.dict', 'wb'))


100%|██████████| 1431/1431 [39:22<00:00,  1.65s/it]
100%|██████████| 1431/1431 [00:11<00:00, 125.39it/s]


In [6]:
def text_summarization_t5_param(text_input, num_beams_v, no_repeat_ngram_size_v, length_penalty_v, min_length_v, max_length_v, early_stopping_v):
    text = text_input
    text = text.replace('\n','')
    input_ids = tokenizer_t5_large.encode(
        "summarize: " +  text, return_tensors="pt", max_length = 512, truncation = True,  padding=True
    ) # Batch size 1
    input_ids = input_ids.to('cuda')
    outputs = model_t5_large.generate(input_ids, num_beams=num_beams_v, no_repeat_ngram_size=no_repeat_ngram_size_v, length_penalty=length_penalty_v, min_length=min_length_v, max_length=max_length_v, early_stopping=early_stopping_v)
    summary_txt = tokenizer_t5_large.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return summary_txt

In [22]:
all_summ_v = []

for i in tqdm(range(len(dataset["test"]))):
    summarized_output = text_summarization_t5_param(dataset["test"]["chapter"][i], 4, 3, 2.0, 56, 142, True )
    all_summ_v.append(summarized_output)

t5_large_rouge_v = model_rouge.calc_rouge_pred(all_summ_v, dataset["test"]["summary_text"], rouge)
model_rouge.print_txt(t5_large_rouge_v, '../rouge_calc/t5_large_rouge_v.txt')
pickle.dump(t5_large_rouge_v, open('../rouge_calc/dict_pickle/t5_large_rouge_v.dict', 'wb'))


100%|██████████| 1431/1431 [43:31<00:00,  1.83s/it]
100%|██████████| 1431/1431 [00:12<00:00, 118.22it/s]


In [None]:
all_summ_v = []

for i in tqdm(range(len(dataset["test"]))):
    half_length = len(dataset["test"]["chapter"][i])//2
    summarized_output = text_summarization_t5_param(dataset["test"]["chapter"][i], 4, 3, 2.0, 56, half_length, True )
    all_summ_v.append(summarized_output)

t5_large_rouge_v = model_rouge.calc_rouge_pred(all_summ_v, dataset["test"]["summary_text"], rouge)
model_rouge.print_txt(t5_large_rouge_v, '../rouge_calc/t5_large_rouge_v.txt')
pickle.dump(t5_large_rouge_v, open('../rouge_calc/dict_pickle/t5_large_rouge_v.dict', 'wb'))


In [29]:
f = open("../source_text/pride_and_prejudice_ch1_part.txt", "r")
q_text = f.read()

result = text_summarization_t5_param(q_text, 4, 3, 2.0, 56, 156, True)



In [30]:
result

'a single man in possession of a good fortune, must be in want ofa wife. this truth is so well fixed in the minds of the surrounding families, that he is considered the rightfulproperty of some one orother of their daughters. "my dear, my dear, do you not want to know who has taken it?” cried his wife impatiently.'

In [7]:
f = open("../source_text/bbc_news_part.txt", "r")
q_text = f.read()

result = text_summarization_t5_param(q_text, 4, 3, 2.0, 56, 156, True)

result

'around £200m will be spent on food and drink this weekend, according to the centre for retail research. supermarket chain Lidl said it had sold enough bunting to line the Coronation procession route 75 times over. Tesco said it was on track to sell 675,000 pork pies and 300,000 pots of clotted cream.'