https://colab.research.google.com/github/BritneyMuller/colab-notebooks/blob/master/Easy_Text_Summarization_with_BART.ipynb#scrollTo=-DcmOk-0UPvv

Code taken and/or modified from colab file released by google

In [1]:
import torch
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

In [3]:
torch_device = 'cuda'

In [4]:
model.to(torch_device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [5]:
def bart_summarize(text, num_beams, length_penalty, max_length, min_length, no_repeat_ngram_size):

  text = text.replace('\n','')
  text_input_ids = tokenizer.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)
  summary_ids = model.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))           
  summary_txt = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
  return summary_txt

In [6]:
import numpy as np
import pickle 
from rouge import Rouge
from tqdm import tqdm
from datasets import load_dataset

dataset = load_dataset("kmfoda/booksum")
rouge = Rouge()

Found cached dataset csv (/home/kims90/.cache/huggingface/datasets/kmfoda___csv/kmfoda--booksum-025141c210e07407/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 3/3 [00:00<00:00, 317.05it/s]


In [7]:
scores_dict = {'rouge-1': {'r': np.zeros((len(dataset["test"], ))), 'p': np.zeros((len(dataset["test"], ))), 'f': np.zeros((len(dataset["test"], )))}, 
               'rouge-2': {'r': np.zeros((len(dataset["test"], ))), 'p': np.zeros((len(dataset["test"], ))), 'f': np.zeros((len(dataset["test"], )))}, 
               'rouge-l': {'r': np.zeros((len(dataset["test"], ))), 'p': np.zeros((len(dataset["test"], ))), 'f': np.zeros((len(dataset["test"], )))}, }


In [8]:

bart_summarize(dataset["test"]["chapter"][0], 4, 2.0, 142, 56, 3)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'All states, all powers, that have held and hold rule over men have beenand are either republics or principalities.Principalities are either hereditary, in which the family has been longestablished; or they are new. I will leave out all discussion on republics, inasmuch as in anotherplace I have written of them at length.'

In [None]:
def test(model):
    print(model())

In [9]:
bart_summarize(dataset["test"]["chapter"][0], 4, 2.0, 142, 56, 3)

'All states, all powers, that have held and hold rule over men have beenand are either republics or principalities.Principalities are either hereditary, in which the family has been longestablished; or they are new. I will leave out all discussion on republics, inasmuch as in anotherplace I have written of them at length.'

In [9]:
for i in tqdm(range(len(dataset["test"]))):
    hypothesis = bart_summarize(dataset["test"]["chapter"][i], 4, 2.0, 142, 56, 3)
    reference = dataset["test"]["summary_text"][i]
    scores = rouge.get_scores(hypothesis, reference)

    for j in scores_dict.keys():
        for k in scores_dict[j].keys():
            scores_dict[j][k][i] = scores[0][j][k]

  2%|▏         | 34/1431 [00:25<17:43,  1.31it/s]


KeyboardInterrupt: 

In [15]:
np.mean(scores_dict['rouge-1']['r'][:34])

0.108416854423227

In [5]:
def text_summarize_bart_param(tokenizer, model, text, device, num_beams, length_penalty, max_length, min_length, no_repeat_ngram_size, early_stopping):

  text = text.replace('\n','')
  text_input_ids = tokenizer.batch_encode_plus([text], return_tensors='pt', max_length=512, truncation=True, padding=True)['input_ids'].to(device)
  summary_ids = model.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size), early_stopping=early_stopping)           
  summary_txt = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
  return summary_txt

In [8]:
f = open("../source_text/pride_and_prejudice_ch1_part.txt", "r")
q_text = f.read()

result = text_summarize_bart_param(tokenizer, model, q_text, 'cuda', 4, 2.0, 142, 56, 3, True)

result

'“Do you not want to know who has taken it?” cried his wife impatiently. “You want to tell me, and I have no objection to hearing it.”“Why, my dear, you must know, Mrs. Long says that Netherfield istaken by a young man of large fortune from the north of England”'

In [7]:
f = open("../source_text/bbc_news_part.txt", "r")
b_text = f.read()

result = text_summarize_bart_param(tokenizer, model, b_text, 'cuda', 4, 2.0, 142, 56, 3, True)

result

'Supermarket chain Lidl said it had sold enough bunting to line the Coronation procession route 75 times over. Tesco said it was on track to sell 675,000 pork pies and 300,000 pots of clotted cream. Around £200m will be spent on food and drink this weekend, according to Centre for Retail Research.'