In [9]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
from transformers import pipeline, AutoTokenizer, AutoModel

device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"

model_id = "philschmid/bart-large-cnn-samsum"
tokenizer = AutoTokenizer.from_pretrained(model_id, max_length=1024, truncation=True)
summarizer = pipeline("summarization", model=model_id, device=device, tokenizer=tokenizer)

def summarize_text(text: str, max_len: int) -> str:
    try:
        text = text[:max_len] if len(text) > max_len else text
        summary = summarizer(text, max_length=max_len, min_length=10, do_sample=False)
        return summary[0]["summary_text"]
    except IndexError as ex:
        logging.warning("Sequence length too large for model, cutting text in half and calling again")
        return summarize_text(text=text[:(len(text) // 2)], max_len=max_len//2) + summarize_text(text=text[(len(text) // 2):], max_len=max_len//2)
  

In [7]:
from datasets import load_dataset

dataset = load_dataset("argilla/news-summary")
print(f"Train size: {len(dataset['train'])}")
print(f"Test size: {len(dataset['test'])}")

Found cached dataset parquet (/home/arun/.cache/huggingface/datasets/argilla___parquet/argilla--news-summary-53286f0044d57a8a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

Train size: 47
Test size: 12


In [12]:
import evaluate

metric = evaluate.load("rouge")

# summarize dialogue
text = [x['text'] for x in dataset['train']]
res = [summarize_text(t, 1024) for t in text]

print(res[0])

Your max_length is set to 1024, but you input_length is only 219. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=109)
Your max_length is set to 1024, but you input_length is only 136. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=68)
Your max_length is set to 1024, but you input_length is only 117. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 1024, but you input_length is only 194. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=97)
Your max_length is set to 1024, but you input_length is only 67. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Your max_length is set to 1024, but you input_length is only 53. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)
Your max_length is set to 1024, but you input_length is only 90. You mi

["Donald Trump will begin a major push next week to convince the public of the need for tax reform. Trump will start the effort next Wednesday with a speech in Missouri. Gary Cohn, director of the National Economic Council, says the president's agenda will revolve around tax reform starting next week.", 'Britain has two proposals on how to secure a frictionless border with EU member Ireland after Brexit, Northern Ireland minister James Brokenshire says.', 'U.S. Senator Bill Nelson sent letters to the chief executives of 10 major U.S.-based airlines on Monday. He urged them to cap airline fares for passengers fleeing Hurricane Maria.', 'U.S. Senate Republicans reach a tentative budget deal that could allow tax reform legislation to eliminate as much as $1.5 trillion in revenues over 10 years through tax cuts. It raises the odds that their planned tax overhaul would expand the federal deficit.', 'U.S. congressional leaders and White House officials will release a document during the week

TypeError: string indices must be integers

In [17]:
references=[t['target'] for t in dataset['train']]
metric.compute(references=references, predictions=res)

{'rouge1': 0.26106030135658265,
 'rouge2': 0.09360420472838196,
 'rougeL': 0.24002516157789028,
 'rougeLsum': 0.24018285722455998}

# Flan-T5-Base-Samsum model

In [22]:
flan_summarize = pipeline('summarization', model='philschmid/flan-t5-base-samsum', device=device)

flan_res = flan_summarize(text)

Token indices sequence length is longer than the specified maximum sequence length for this model (844 > 512). Running this sequence through the model will result in indexing errors
Your max_length is set to 200, but you input_length is only 149. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=74)
Your max_length is set to 200, but you input_length is only 126. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Your max_length is set to 200, but you input_length is only 73. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)
Your max_length is set to 200, but you input_length is only 63. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)
Your max_length is set to 200, but you input_length is only 107. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 200, but you input_lengt

In [24]:
flan_summaries = [s['summary_text'] for s in flan_res]
metric.compute(references=references, predictions=flan_summaries)

{'rouge1': 0.26671677669711646,
 'rouge2': 0.10141538977705652,
 'rougeL': 0.23888055617499332,
 'rougeLsum': 0.23772103425086222}