<a href="https://colab.research.google.com/github/cicattzo/nlp_project/blob/main/6_684_project_bertsum_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
%%capture
"""
Install working versions of packages
"""
!pip install torch==1.4.1
!pip install datasets==1.0.2
# !pip install transformers==4.0.1
!pip install transformers==4.4.2
!pip install bert-extractive-summarizer
!pip install sacrebleu
!pip install rouge_score



In [90]:
import datasets
from transformers import BertTokenizerFast
from summarizer import Summarizer
import sacrebleu
from rouge_score import rouge_scorer
from tqdm import tqdm
import pandas as pd
import numpy as np

# Download data

In [44]:
train_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train")

Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602)


In [45]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [46]:
train_data[0]['highlights']

'Syrian official: Obama climbed to the top of the tree, "doesn\'t know how to get down"\nObama sends a letter to the heads of the House and Senate .\nObama to seek congressional approval on military action against Syria .\nAim is to determine whether CW were used, not by whom, says U.N. spokesman .'

In [47]:
train_data[0]['article']

'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction." It\'s a step that is set to turn an international crisis into a fierce domestic political battle. There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react? In a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but because he want

In [48]:
# map article and summary len to dict as well as if sample is longer than 512 tokens
def map_to_length(x):
  x["article_len"] = len(tokenizer(x["article"]).input_ids)
  x["article_longer_512"] = int(x["article_len"] > tokenizer.model_max_length)
  x["summary_len"] = len(tokenizer(x["highlights"]).input_ids)
  x["summary_longer_64"] = int(x["summary_len"] > 64)
  x["summary_longer_128"] = int(x["summary_len"] > 128)
  return x

In [49]:
sample_size = 10000
data_stats = train_data.select(range(sample_size)).map(map_to_length, num_proc=4)

Token indices sequence length is longer than the specified maximum sequence length for this model (1959 > 512). Running this sequence through the model will result in indexing errors


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602/cache-f1e4858d1ee1d31d.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602/cache-bdcfc4057a49b76c.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602/cache-7d8202ed347bd6d2.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602/cache-e718008ba06b451f.arrow


# Bert Extractive Summarization - bert-base-uncased tokenizer

In [50]:
model = Summarizer(custom_tokenizer=tokenizer)

In [79]:
rscorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)

def evaluate_extractive_summarizer_bert(summarizer, corpus, full_text_key, label_key):
  rouge_score_list = []
  bleu_score_list = []

  print('Len of our corpus:', len(corpus))
  i = 1
  for email in tqdm(corpus):
    print("Current corpus number: ", i)
    txt = email[full_text_key]
    summary = email[label_key]
    # parser = PlaintextParser.from_string(txt, tokenizer)
    pred = summarizer(txt, num_sentences = 1)
    rouge_score_list.append(rscorer.score(summary, pred))
    bleu_score_list.append(sacrebleu.raw_corpus_bleu([pred], [[summary]], .01).score)

  return {
      'rouge2-fscore': np.mean([sc['rouge2'].fmeasure for sc in rouge_score_list]),
      'rouge1-fscore': np.mean([sc['rouge1'].fmeasure for sc in rouge_score_list]),
      'bleu-score': np.mean(bleu_score_list)
  }

In [None]:
rouge_score_list = []
bleu_score_list = []

print('Len of our corpus:', len(train_data))
i = 0
while i<1000:
  print('Current email: ', i)
  email = train_data[i]
  txt = email['article']
  summary = email['highlights']
  # parser = PlaintextParser.from_string(txt, tokenizer)
  pred = model(txt, num_sentences = 1)
  rouge_score_list.append(rscorer.score(summary, pred))
  bleu_score_list.append(sacrebleu.raw_corpus_bleu([pred], [[summary]], .01).score)
  i+=1


print('rouge2-fscore',np.mean([sc['rouge2'].fmeasure for sc in rouge_score_list]))
print('rouge1-fscore', np.mean([sc['rouge1'].fmeasure for sc in rouge_score_list]))
print('bleu-score',np.mean(bleu_score_list))

Len of our corpus: 287113
Current email:  0
Current email:  1
Current email:  2
Current email:  3
Current email:  4
Current email:  5
Current email:  6
Current email:  7
Current email:  8
Current email:  9
Current email:  10
Current email:  11
Current email:  12
Current email:  13
Current email:  14
Current email:  15


In [65]:
pred = model(train_data[0]['article'], num_sentences = 1)

In [67]:
true_sum = train_data[0]['highlights']

In [68]:
rscorer.score(true_sum, pred)

{'rouge1': Score(precision=0.27419354838709675, recall=0.3148148148148148, fmeasure=0.29310344827586204),
 'rouge2': Score(precision=0.03278688524590164, recall=0.03773584905660377, fmeasure=0.03508771929824562)}

In [69]:
sacrebleu.raw_corpus_bleu([pred], [[true_sum]], .01).score

0.3551260741812951

In [75]:
true_sum

'Syrian official: Obama climbed to the top of the tree, "doesn\'t know how to get down"\nObama sends a letter to the heads of the House and Senate .\nObama to seek congressional approval on military action against Syria .\nAim is to determine whether CW were used, not by whom, says U.N. spokesman .'