<h1 align = "left"> Finalized Summarizer Models and Evaluation Metrics </h1>

<h2 align = "left"><em> Abstractive Summarizer and Evaluation  </em></h2> 

In [16]:
############
# INSTALLS #
############

%%capture
!pip install datasets==1.0.2
!pip install transformers==4.2.1
!pip install rouge_score
!pip install sacrebleu

UsageError: Line magic function `%%capture` not found.


In [7]:
###########
# IMPORTS #
###########

import datasets
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM  
from transformers import BertTokenizer, EncoderDecoderModel

from functools import reduce
from operator import add

In [8]:
###############
# GLOBAL VARS #
###############

# Checkpoint identifier shared by the tokenizer and the model. Keeping it in a
# single constant guarantees both are always loaded from the same checkpoint.
MODEL_CHECKPOINT = "patrickvonplaten/bert2bert_cnn_daily_mail"

# Module-level objects used by generate_summary().
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [9]:
########
# DATA #
########

# Load the "test" split of the "cnn_dailymail" dataset, configuration "3.0.0".
# The helpers in this notebook read its "article" and "highlights" columns.
test_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="test")


Reusing dataset cnn_dailymail (/home/jupyter/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602)


In [14]:
####################
# HELPER FUNCTIONS #
####################

def divString(size, char="#"):
    """Return `char` repeated `size` times (used to print divider lines).

    Params:
    size: number of repetitions; a size of 0 (or less) yields an empty result
    instead of raising, which the previous reduce-based version did.
    char: the string to repeat, "#" by default."""
    return char * size
def flatten(lst):
    """Flatten one level of nesting: return a new list holding every item of
    every sub-iterable in lst, in order.
    Params:
    lst: an iterable of iterables."""
    flat = []
    for sublst in lst:
        flat.extend(sublst)
    return flat

def generate_summary(batch):
    """Generate summaries for the text under batch["article"] and store them
    under batch["pred"].
    Params:
    batch: a mapping with an "article" entry, e.g. the batch handed over by
    Dataset.map(..., batched=True).
    Returns:
    The same batch object, with the decoded summaries added as batch["pred"]."""
    # Pad/truncate every input to 512 tokens and get PyTorch tensors back.
    encoded = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    # The attention mask is passed along so padding positions are ignored.
    generated_ids = model.generate(encoded.input_ids, attention_mask=encoded.attention_mask)
    # Special tokens are stripped while decoding back to text.
    batch["pred"] = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return batch

def compute_metrics(batch, batch_size=16, metric_name="rouge"):
    """Summarize the given articles and score the summaries against the
    reference highlights.
    Params:
    batch: A Dataset object holding the articles to evaluate. Use the select
    method to build it, e.g. Dataset.select([list of indices from the original dataset]).
    batch_size: Number of articles passed to generate_summary per map call.
    metric_name: Name of the metric to load. "rouge" takes the ROUGE branch;
    any other value takes the per-example branch written for "sacrebleu".
    Returns:
    For "rouge": the `mid` entry of the "rouge2" result over all examples.
    Otherwise: a list with one metric result per example, in dataset order."""

    scorer = datasets.load_metric(metric_name)
    summarized = batch.map(
        generate_summary,
        batched=True,
        batch_size=batch_size,
        remove_columns=["article"],
    )
    predictions = summarized["pred"]
    targets = summarized["highlights"]

    if metric_name != "rouge":
        # Per-example scoring (intended for "sacrebleu"): every prediction is
        # scored against its single reference and a small report is printed.
        bleu_per_example = []
        for idx in range(len(batch)):
            score = scorer.compute(predictions=[predictions[idx]], references=[[targets[idx]]])
            bleu_per_example.append(score)
            print("\n\n")
            print(divString(100))
            print("\n\n" + "Summary prediction: " + "\n\n", predictions[idx])
            print("\n\n" + "Reference Label: " + "\n\n", targets[idx])
            print("\n\n" + "BLEU SCORE:" + "\n\n", score)
            print("\n")
        return bleu_per_example

    # ROUGE-2 over the whole selection in a single call.
    rouge2_mid = scorer.compute(
        predictions=predictions,
        references=targets,
        rouge_types=["rouge2"],
    )["rouge2"].mid
    print("\n" + "ROUGE SCORE:")
    return rouge2_mid
    

In [15]:
# Score the abstractive summarizer on the test examples at indices 1 and 2
# using the "sacrebleu" metric; the call returns one result per example.
# generate_summary(test_data[0])
# compute_metrics(test_data.select([1,2]), metric_name = "rouge")
compute_metrics(test_data.select([1, 2]), metric_name = "sacrebleu")

Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602/cache-309f3785e634a4cc.arrow





####################################################################################################


Summary prediction: 

 dr. anthony moschetto, 54, is charged with criminal solicitation, conspiracy, arson, arson. two other men - - james chmela, 43, and james kalamaras, 41 - - were named as accomplices. they pleaded not guilty in nassau county district court.


Reference Label: 

 A lawyer for Dr. Anthony Moschetto says the charges against him are baseless .
Moschetto, 54, was arrested for selling drugs and weapons, prosecutors say .
Authorities allege Moschetto hired accomplices to burn down the practice of former associate .


BLEU SCORE:

 {'score': 3.5934005135957903, 'counts': [10, 2, 1, 0], 'totals': [51, 50, 49, 48], 'precisions': [19.607843137254903, 4.0, 2.0408163265306123, 1.0416666666666667], 'bp': 1.0, 'sys_len': 51, 'ref_len': 44}





####################################################################################################


Summary prediction: 

 obama 

[{'score': 3.5934005135957903,
  'counts': [10, 2, 1, 0],
  'totals': [51, 50, 49, 48],
  'precisions': [19.607843137254903,
   4.0,
   2.0408163265306123,
   1.0416666666666667],
  'bp': 1.0,
  'sys_len': 51,
  'ref_len': 44},
 {'score': 2.113706726725864,
  'counts': [10, 2, 0, 0],
  'totals': [61, 60, 59, 58],
  'precisions': [16.39344262295082,
   3.3333333333333335,
   0.847457627118644,
   0.43103448275862066],
  'bp': 1.0,
  'sys_len': 61,
  'ref_len': 33}]

<h2 align = "left"><em> Extractive Summarizer and Evaluation  </em></h2> 