## Import Libraries

In [93]:
from transformers import pipeline
from transformers import AutoTokenizer
import os

## Setting GPU and Model to Use

In [102]:
## Restrict this process to the first GPU (index 0).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

## Use the bart-large-cnn checkpoint for summarization.
# summarizer = pipeline("summarization")
## The tokenizer option that caps the input length is `model_max_length`;
## the earlier `model_max_len` spelling is not a recognised tokenizer option,
## so the 1024-token limit was not being set by this argument.
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn', model_max_length=1024)

## device=0 places the pipeline on the single GPU made visible above.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer=tokenizer, device=0)

## To use the t5-base model for summarization:
# summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")

## Input the Text to Summarize

In [103]:
# Sample passage to summarize: a three-paragraph news excerpt.
# NOTE(review): none of the cells visible in this view read `text` -- confirm
# it is still needed.
text = """One month after the United States began what has become a troubled rollout of a national COVID vaccination campaign, the effort is finally gathering real steam.
Close to a million doses -- over 951,000, to be more exact -- made their way into the arms of Americans in the past 24 hours, the U.S. Centers for Disease Control and Prevention reported Wednesday. That's the largest number of shots given in one day since the rollout began and a big jump from the previous day, when just under 340,000 doses were given, CBS News reported.
That number is likely to jump quickly after the federal government on Tuesday gave states the OK to vaccinate anyone over 65 and said it would release all the doses of vaccine it has available for distribution. Meanwhile, a number of states have now opened mass vaccination sites in an effort to get larger numbers of people inoculated, CBS News reported."""

In [106]:
from datasets import load_dataset

# Train and validation splits of the "cnn_dailymail" dataset, config "3.0.0".
cnn_train, cnn_validation = (
    load_dataset("cnn_dailymail", "3.0.0", split=split_name)
    for split_name in ("train", "validation")
)

# Working copy: the first 1000 training rows converted to a dict, plus an
# initially empty 'generated_highlights' entry.
cnn_summaries = {
    **cnn_train.select(range(1000)).to_dict(),
    'generated_highlights': [],
}

Reusing dataset cnn_dailymail (C:\Users\Geoff\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
Reusing dataset cnn_dailymail (C:\Users\Geoff\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


## Summarize

In [107]:
# Start from an empty list on every run so that re-executing this cell replaces
# the previous summaries instead of appending a second copy after them (which
# would leave 'generated_highlights' longer than 'article').
# The list is attached to `cnn_summaries` before the loop, so summaries that
# finished before an interruption remain available.
generated_highlights = []
cnn_summaries['generated_highlights'] = generated_highlights

for article in cnn_summaries['article']:
    # do_sample=False disables sampling during generation; truncation=True
    # asks for over-long inputs to be truncated rather than rejected.
    result = summarizer(article, max_length=100, min_length=5, do_sample=False, truncation=True)
    # One input string yields a one-element list of result dicts.
    generated_highlights.append(result[0]['summary_text'])

Your max_length is set to 100, but you input_length is only 93. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 100, but you input_length is only 69. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


In [91]:
# 

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


KeyboardInterrupt: 

In [110]:
import pickle

# Write the whole `cnn_summaries` dict to disk using the newest pickle
# protocol this interpreter supports.
with open('generated_summaries.pickle', mode='wb') as pickle_out:
    pickle.dump(cnn_summaries, file=pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

In [111]:
# Read `cnn_summaries` back from the pickle file in the working directory.
with open('generated_summaries.pickle', mode='rb') as pickle_in:
    cnn_summaries = pickle.load(pickle_in)

In [114]:
# Show only the first few summaries; displaying the full list floods the
# notebook output.
cnn_summaries['generated_highlights'][:5]

['U.N. inspectors leave Syria, carrying evidence that will determine whether chemical weapons were used. "The issues are too big for business as usual," President Barack Obama says. Obama asks Congress to approve the use of military force in Syria. He says he believes he has the authority to take action without congressional approval.',
 "Usain Bolt wins his third gold of the world championships in Moscow. Bolt anchors Jamaica to victory in the men's 4x100m relay. The 26-year-old has now won eight gold medals at the championships. Jamaica's women also win gold in the 4x50m and 4x200m relays.",
 'The GSA employee is paid $84,440 and works from his home in Honolulu. In the past year, he has flown back to the mainland nine times for conferences and meetings. He is among several hundred GSA "virtual" workers who also travel to various conferences and their home offices. The GSA is already under investigation for lavish spending.',
 'Dr. Blaga Stancheva says she examined Harry Burkhart as p

In [69]:
from transformers import BartForConditionalGeneration, BartTokenizer
# from typing import List

# NOTE(review): everything below is a disabled draft that would summarize by
# calling BartTokenizer / BartForConditionalGeneration directly instead of
# going through `pipeline`. It is kept for reference only; as written the
# draft is internally inconsistent and would need fixes before being enabled:
#   * the loop passes one element of cnn_summaries['article'] at a time,
#     while `summarization_pipeline` is annotated to take a List[str];
#   * the function returns the list of decoded strings, yet the loop indexes
#     the result with [0]['summary_text'];
#   * `truncation=True` is given to `model.generate` rather than to the
#     tokenizer call that carries max_length=1024 -- presumably it belongs
#     on the tokenizer call; confirm before re-enabling.

# def summarization_pipeline(text: List[str]) -> List[str]:
#     tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
#     model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
#     input_ids = tokenizer.batch_encode_plus(text, return_tensors='pt', max_length=1024)['input_ids']
#     summary_ids = model.generate(input_ids, max_length=100, min_length=5, do_sample=False, truncation=True)
#     summaries = [tokenizer.decode(s) for s in summary_ids]
#     return summaries

# for (i, data) in enumerate(cnn_summaries['article']):
#     summary_text = summarization_pipeline(data)[0]['summary_text']
#     cnn_summaries['generated_highlights'].append(summary_text)

{'article': ['It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction." It\'s a step that is set to turn an international crisis into a fierce domestic political battle. There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react? In a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but be