In [2]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 27.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 57.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 56.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 37.2 MB/s

In [3]:
from transformers import (
    AutoTokenizer,
    LEDForConditionalGeneration,
)
from datasets import load_dataset, load_metric
import torch


dataset=load_dataset('multi_news')

Downloading builder script:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.24k [00:00<?, ?B/s]



Downloading and preparing dataset multi_news/default (download: 721.73 MiB, generated: 664.42 MiB, post-processed: Unknown size, total: 1.35 GiB) to /root/.cache/huggingface/datasets/multi_news/default/1.0.0/2f1f69a2bedc8ad1c5d8ae5148e4755ee7095f465c1c01ae8f85454342065a72...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/66.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.30M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.31M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/44972 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Dataset multi_news downloaded and prepared to /root/.cache/huggingface/datasets/multi_news/default/1.0.0/2f1f69a2bedc8ad1c5d8ae5148e4755ee7095f465c1c01ae8f85454342065a72. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
TOKENIZER = AutoTokenizer.from_pretrained('allenai/PRIMERA')
MODEL = LEDForConditionalGeneration.from_pretrained('allenai/PRIMERA')
MODEL.gradient_checkpointing_enable()
PAD_TOKEN_ID = TOKENIZER.pad_token_id
DOCSEP_TOKEN_ID = TOKENIZER.convert_tokens_to_ids("<doc-sep>")

Downloading tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/20.0 [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/283 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.67G [00:00<?, ?B/s]

In [4]:
def process_document(documents):
    input_ids_all=[]
    for data in documents:
        all_docs = data.split("|||||")[:-1]
        for i, doc in enumerate(all_docs):
            doc = doc.replace("\n", " ")
            doc = " ".join(doc.split())
            all_docs[i] = doc

        #### concat with global attention on doc-sep
        input_ids = []
        for doc in all_docs:
            input_ids.extend(
                TOKENIZER.encode(
                    doc,
                    truncation=True,
                    max_length=4096 // len(all_docs),
                )[1:-1]
            )
            input_ids.append(DOCSEP_TOKEN_ID)
        input_ids = (
            [TOKENIZER.bos_token_id]
            + input_ids
            + [TOKENIZER.eos_token_id]
        )
        input_ids_all.append(torch.tensor(input_ids))
    input_ids = torch.nn.utils.rnn.pad_sequence(
        input_ids_all, batch_first=True, padding_value=PAD_TOKEN_ID
    )
    return input_ids


def batch_process(batch):
    input_ids=process_document(batch['document'])
    # get the input ids and attention masks together
    global_attention_mask = torch.zeros_like(input_ids).to(input_ids.device)
    # put global attention on <s> token

    global_attention_mask[:, 0] = 1
    global_attention_mask[input_ids == DOCSEP_TOKEN_ID] = 1
    generated_ids = MODEL.generate(
        input_ids=input_ids,
        global_attention_mask=global_attention_mask,
        use_cache=True,
        max_length=1024,
        num_beams=5,
    )
    generated_str = TOKENIZER.batch_decode(
            generated_ids.tolist(), skip_special_tokens=True
        )
    result={}
    result['generated_summaries'] = generated_str
    result['gt_summaries']=batch['summary']
    return result

In [5]:
#import random
#data_idx = random.choices(range(len(dataset['test'])),k=10)
#dataset_small = dataset['test'].select(data_idx)
dataset_small = dataset['test'].select([1,2,3,4,5,6,7,8,9,10])
dataset_small.to_csv("/content/drive/MyDrive/Colab_Notebooks/MultiNews")
from datetime import datetime
start_time = datetime.now()
result_small = dataset_small.map(batch_process, batched=True, batch_size=2)
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

  0%|          | 0/5 [00:00<?, ?ba/s]

Duration: 0:32:04.233469


In [8]:
dataset_small = dataset['test'].select([1,2,3,4,5,6,7,8,9,10])
dataset_small2=dataset_small.rename_column('document', 'ext_article')
dataset_small2.to_csv("/content/drive/MyDrive/Colab_Notebooks/MultiNews/test10document.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

94317

In [9]:
dataset_small2


Dataset({
    features: ['ext_article', 'summary'],
    num_rows: 10
})

In [None]:
out1=''.join(map(str,result_small['generated_summaries']))
with open('/content/drive/MyDrive/Colab_Notebooks/Primera/summary1.txt', 'w') as f:
    f.write(out1)

print("Summary of Long Article 1:\n" + result1[0]["summary_text"])

Summary of Long Article 1:
additive models @xcite provide an important family of models for semiparametric regression or classification. many examples of such estimators belong to the large class of regularized kernel based methods over a reproducing kernel hilbert space @xmath0. in the last years many interesting results on learning rates have been published when the focus is on sparsity and when the classical least squares loss function is used, see e.g. @xcita. we present a new method that can provide a substantially better learning rate in high dimensions than an svm with a general kernel, say a classical gaussian rbf kernel, if the assumption of an additive model is satisfied. our leading example covers the learning rates for quantile regression based on the lipschitz continuous but non-differentiable pinball loss function, which is also called check function in the literature.


In [6]:
!pip install evaluate
!pip install rouge_score
import evaluate 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.2.2-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 7.0 MB/s 
Installing collected packages: evaluate
Successfully installed evaluate-0.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24955 sha256=3b9c9a4252b7b0d57c69ed07e1f93b335de9c3f814056fd8240245b27b991d9e
  Stored in directory: /root/.cache/pip/wheels/84/ac/6b/38096e3c5bf1dc87911e3585875e21a3ac610348e740409c76
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [7]:

result_small['generated_summaries']

['Secondly, the removal of the Facebook John Snow Kiss-In event: It turns out that the Facebook event for the John Snow Kiss-In was not blocked by Facebook, but made private by the creator of the event itself. Paul Shetler, the organizer, left this comment on the previous thread: “Hey I just saw this. Before it goes too far, I just want people to know that FB have NOT removed the kiss-in event page; it’s still there, but _I made the event private after the event_ was over and only visible to those who had been invited as there were starting to be trolls posting abusive nonsense on it.” Thanks for clearing that up, Paul. Now if Facebook will only reply to Richard’s query about why they removed my original post and photo when he put it up on his wall… It has been erroneously reported in the media that our own Richard Metzger (who lives in Los Angeles) organized the London “Kiss-In” event, which is untrue, and also unfair to Paul Shetler and the actual organizers. Also, Richard did not st

In [8]:
rouge = load_metric("rouge")
score=rouge.compute(predictions=result_small["generated_summaries"], references=result_small["gt_summaries"])
print(score['rouge1'].mid)
print(score['rouge2'].mid)
print(score['rougeL'].mid)

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Score(precision=0.39302704902171287, recall=0.43575513409878447, fmeasure=0.3682056652148361)
Score(precision=0.16639400382153685, recall=0.14557859964896402, fmeasure=0.13479385537883504)
Score(precision=0.2274521515987894, recall=0.2117288480624458, fmeasure=0.18939500363737544)


In [9]:
rouge = evaluate.load("rouge")
score = rouge.compute(predictions=result_small["generated_summaries"], references=result_small["gt_summaries"],rouge_types=['rouge1', 'rouge2','rouge3','rougeL'],use_aggregator=False,use_stemmer=True)
#print(score['rouge1'].mid)
#print(score['rouge2'].mid)
#print(score['rouge3'].mid)
#print(score['rougeL'].mid)
score  

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': [0.5200945626477542,
  0.3926096997690531,
  0.37962962962962965,
  0.3463917525773196,
  0.42743221690590105,
  0.3846153846153846,
  0.35185185185185186,
  0.07346938775510203,
  0.36977491961414793,
  0.5752212389380531],
 'rouge2': [0.16152019002375295,
  0.12064965197215777,
  0.06728538283062645,
  0.045454545454545456,
  0.1376,
  0.09392265193370167,
  0.09937888198757763,
  0.00823045267489712,
  0.14838709677419354,
  0.5357142857142857],
 'rouge3': [0.06682577565632457,
  0.06060606060606061,
  0.0046511627906976735,
  0.006211180124223602,
  0.05778491171749599,
  0.05,
  0.024999999999999998,
  0.0,
  0.06472491909385114,
  0.5225225225225224],
 'rougeL': [0.1796690307328605,
  0.19399538106235567,
  0.13194444444444445,
  0.12783505154639174,
  0.15948963317384368,
  0.18681318681318682,
  0.16049382716049382,
  0.04081632653061224,
  0.19935691318327978,
  0.5575221238938054]}

In [10]:
chrf = evaluate.load("chrf")
resultsc1 = chrf.compute(predictions=result_small["generated_summaries"], references=result_small["gt_summaries"],beta=1,word_order=0)
print(resultsc1)

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

ImportError: ignored

In [15]:
result_small['generated_summaries']

'"I grabbed a hold of her arm, and I wrapped it around my neck and grabbed her body and bear hugged her and dragged her over," Perez said. "2-0-2. It\'s code four. I got her off the bridge," Perez could be heard saying afterwards over the police radio. Perez said the girl didn\'t seem happy after the rescue. "She was sad and just started breaking down even more," he said. By the time paramedics arrived, Perez\'s job was done -- but not forgotten. Now when he looks at the bridge, fear goes through his mind, he said. She had told him she only had an aunt -- no other family -- and wanted to live with her. For now, the 12-year-old is waiting in protective police custody..'

In [11]:
dataset_small['document']

['\n \n \n \n UPDATE: 4/19/2001 Read Richard Metzger: How I, a married, middle-aged man, became an accidental spokesperson for gay rights overnight on Boing Boing \n \n It’s time to clarify a few details about the controversial “Hey Facebook what’s SO wrong with a pic of two men kissing?” story, as it now beginning to be reported in the mainstream media, and not always correctly. \n \n First of all, with regards to the picture: \n \n The photo which was used to illustrate my first post about the John Snow Kiss-In is a promotional still from the British soap opera “Eastenders.” It features one of the main characters from the show (Christian Clarke, played by the actor John Partridge- left) and someone else who I don’t know. I am not a regular viewer so I can’t say if the man on the right is an extra or an actual character. \n \n This picture has itself caused scandal in the UK, as it was a gay kiss that was broadcast before the watershed, and as such led to a number of complaints to the

In [16]:
len(dataset_small['document'])

10