# Summarization

In [1]:
!pip install rouge_score
!pip install py7zr

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24936 sha256=f345993608d464528831a2cca5832e34af9f228f39ba2da3f3bc9b4776c7205d
  Stored in directory: /root/.cache/pip/wheels/9b/3d/39/09558097d3119ca0a4d462df68f22c6f3c1b345ac63a09b86e
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
[0mCollecting py7zr
  Downloading py7zr-0.20.0-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting brotli>=1.0.9
  Downloading Brotli-1.0.9-cp39-cp39-manylinux1_x86_64.whl (357 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.2/357.2 kB[0m [31m46.2 

In [16]:
from datasets import load_dataset, load_metric
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, set_seed, DataCollatorForSeq2Seq, TrainingArguments, Trainer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
SEED = 42

set_seed(SEED)

In [3]:
dataset = load_dataset('cnn_dailymail', version='3.0.0')
dataset

Downloading builder script:   0%|          | 0.00/3.23k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset cnn_dailymail/default to /root/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [4]:
dataset['train'][1]

{'article': 'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most severe mental illnesses are incarcerated until they\'re ready to appear in court. Most often, they face drug charges or charges of assaulting an officer --charges that Judge Steven Leifman says are usually "avoidable felonies." He says the arrests often result from confrontations with police. Mentally ill people often won\'t do what they\'re told when police arrive on the scene -- confrontation seems to exacerbate their illness and they become more paranoid, delusional, and less li

In [4]:
def three_sent_summary(text):
    return '\n'.join(sent_tokenize(text)[:3])

In [6]:
three_sent_summary(dataset['train'][1]['article'])

'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events.\nHere, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial.\nMIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor."'

## Evaluating PEGASUS

In [4]:
def eval_summaries_baseline(dataset, metric, column_text='article', column_summary='highlights'):
    summaries = [three_sent_summary(txt) for txt in dataset[column_text]]
    metric.add_batch(predictions=summaries, references=dataset[column_summary])
    score = metric.compute()
    return score

In [9]:
rouge_metric = load_metric('rouge')
rouge_names = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

### Baseline

In [10]:
test_sample = dataset['test'].shuffle(seed=SEED).select(range(1000))

score = eval_summaries_baseline(test_sample, rouge_metric)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

pd.DataFrame.from_dict(rouge_dict, orient='index', columns=['baseline']).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.389086,0.17148,0.244956,0.354055


### PEGASUS

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def chunks(elements, batch_size):
    for i in range(0, len(elements), batch_size):
        yield elements[i: i+batch_size]
        

def eval_summaries_pegasus(dataset, metric, model, tokenizer, batch_size=16, device=device, column_text='article', column_summary='highlights'):
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))
    
    for ab, tb in tqdm.tqdm(zip(article_batches, target_batches), total=len(article_batches)):
        inputs = tokenizer(ab, max_length=1024, truncation=True, padding='max_length', return_tensors='pt')
        summaries = model.generate(
            input_ids=inputs['input_ids'].to(device), attention_mask=inputs['attention_mask'].to(device),
            length_penalty=0.8, num_beams=8, max_length=128
        )
        decoded = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in summaries]
        decoded = [d.replace('<n>', ' ') for d in decoded]
        metric.add_batch(predictions=decoded, references=tb)
        
    score = metric.compute()
    return score

In [12]:
torch.cuda.empty_cache()

ckpt = 'google/pegasus-cnn_dailymail'
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt).to(device)

score = eval_summaries_pegasus(test_sample, rouge_metric, model, tokenizer, batch_size=4)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

100%|██████████| 250/250 [28:04<00:00,  6.74s/it]


In [13]:
pd.DataFrame.from_dict(rouge_dict, orient='index', columns=['pegasus']).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.434987,0.216451,0.312244,0.374894


<hr>

## PEGASUS on Samsung Data

In [6]:
samsum = load_dataset('samsum')
split_lengths = [len(samsum[split]) for split in samsum]

print(f'Split lengths: {split_lengths}')
print(f'Features: {samsum["train"].column_names}')
print('\nDialogue:')
print(samsum['train'][0]['dialogue'])
print('\nSummary:')
print(samsum['train'][0]['summary'])

Downloading builder script:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/770 [00:00<?, ?B/s]

Downloading and preparing dataset samsum/samsum (download: 2.81 MiB, generated: 10.04 MiB, post-processed: Unknown size, total: 12.85 MiB) to /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e...


Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Dataset samsum downloaded and prepared to /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)

Summary:
Amanda baked cookies and will bring Jerry some tomorrow.


In [7]:
ckpt = 'google/pegasus-cnn_dailymail'
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt).to(device)

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [11]:
pipe = pipeline('summarization', model=ckpt)
pipe_out = pipe(samsum['test'][0]['dialogue'])
print(pipe_out[0]['summary_text'].replace(' .<n>', '\n'))

Your max_length is set to 128, but you input_length is only 122. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


Amanda: Ask Larry Amanda: He called her last time we were at the park together
Hannah: I'd rather you texted him
Amanda: Just text him .


In [10]:
torch.cuda.empty_cache()

score = eval_summaries_pegasus(
    samsum['test'], rouge_metric, model, tokenizer, batch_size=4,
    column_text='dialogue', column_summary='summary'
)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

100%|██████████| 205/205 [13:16<00:00,  3.89s/it]


In [11]:
pd.DataFrame.from_dict(rouge_dict, orient='index', columns=['pegasus']).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.296091,0.087493,0.229237,0.229642


## Fine-tuning PEGASUS

In [12]:
def examples_to_features(batch):
    input_enc = tokenizer(batch['dialogue'], max_length=1024, truncation=True)
    with tokenizer.as_target_tokenizer():
        target_enc = tokenizer(batch['summary'], max_length=128, truncation=True)
        
    return {
        'input_ids': input_enc['input_ids'],
        'attention_mask': input_enc['attention_mask'],
        'labels': target_enc['input_ids']
    }

In [13]:
samsum_pt = samsum.map(examples_to_features, batched=True)
columns = ['input_ids', 'labels', 'attention_mask']
samsum_pt.set_format(type='torch', columns=columns)



  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [18]:
training_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    push_to_hub=False,
    # evaluation_stategy='steps',
    eval_steps=500,
    save_steps=1e6,
    gradient_accumulation_steps=16
)

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_collator,
    train_dataset=samsum_pt['train'],
    eval_dataset=samsum_pt['validation']
)

In [20]:
trainer.train()

score = eval_summaries_pegasus(
    samsum['test'], rouge_metric, model, tokenizer, batch_size=4,
    column_text='dialogue', column_summary='summary'
)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

The following columns in the training set don't have a corresponding argument in `PegasusForConditionalGeneration.forward` and have been ignored: id, summary, dialogue. If id, summary, dialogue are not expected by `PegasusForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14732
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 920


Step,Training Loss
10,3.2116
20,3.0678
30,3.0725
40,2.9855
50,2.8516
60,2.9469
70,2.6274
80,2.4411
90,2.4045
100,2.255




Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 205/205 [07:36<00:00,  2.23s/it]


In [21]:
pd.DataFrame.from_dict(rouge_dict, orient='index', columns=['pegasus']).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.427882,0.196912,0.341494,0.341597


In [24]:
gen_kwargs = {'length_penalty': 0.8, 'num_beams': 8, 'max_length': 128}
sample = samsum['test'][0]['dialogue']
reference = samsum['test'][0]['summary']

pipe = pipeline('summarization', model=model.cpu(), tokenizer=tokenizer)

print('Dialogue')
print(sample)

print('\nReference Summary')
print(reference)

print('\nModel Summary')
print(pipe(sample, **gen_kwargs)[0]['summary_text'])

Your max_length is set to 128, but you input_length is only 122. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


Dialogue
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Reference Summary
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

Model Summary
Amanda can't find Hannah's Betty's number. Larry called Betty last time they were at the park together. Hannah wants Amanda to text him.


In [None]:
gen_kwargs