In [1]:
%%capture
!pip install datasets
!pip install transformers

In [None]:
import datasets
import transformers
import pandas as pd
import numpy as np
import re

import nltk

nltk.download("punkt")

from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Loading Data & Save Data (Do not run again!)

In [None]:
def get_cnn_data(train_split, val_split, test_split):
    train_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:{}%]".format(train_split))
    val_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:{}%]".format(val_split))
    test_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="test[:{}%]".format(test_split))

    print("Training Data Count: {}".format(len(train_data)))
    print("Validation Data Count: {}".format(len(val_data)))
    print("Test Data Count: {}".format(len(test_data)))

    return train_data, val_data, test_data

In [None]:
CNN_data_train_2perc, CNN_data_val_5perc, CNN_data_test_5perc = get_cnn_data(2, 5, 5)

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.




Training Data Count: 5742
Validation Data Count: 668
Test Data Count: 574


In [None]:
# Training Data
CNN_data_train_2perc

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 5742
})

In [None]:
def save_data(data, name):
    lst_dics = [dic for dic in data]

    df = pd.DataFrame(lst_dics).rename(columns={"article":"text", "highlights":"reference_summary"})[["id", "text","reference_summary"]]

    print("DataFrame Shape: {}".format(df.shape))
    df.to_csv(name)
    print("Complete!")

In [None]:
save_data(CNN_data_train_2perc, "training_data.csv")

DataFrame Shape: (5742, 3)
Complete!


In [None]:
save_data(CNN_data_val_5perc, "validation_data.csv")

DataFrame Shape: (668, 3)
Complete!


In [None]:
save_data(CNN_data_test_5perc, "test_data.csv")

DataFrame Shape: (574, 3)
Complete!


# Data Preprocessing

In [None]:
df = pd.read_csv("./Data/data_ver1/test_data.csv", index_col=0)
df.drop(['id'], axis=1, inplace=True)
df

Unnamed: 0,text,reference_summary
0,(CNN)The Palestinian Authority officially beca...,Membership gives the ICC jurisdiction over all...
1,(CNN)Never mind cats having nine lives. A stra...,"Theia, a bully breed mix, was apparently hit b..."
2,"(CNN)If you've been following the news lately,...",Mohammad Javad Zarif has spent more time with ...
3,(CNN)Five Americans who were monitored for thr...,17 Americans were exposed to the Ebola virus w...
4,(CNN)A Duke student has admitted to hanging a ...,Student is no longer on Duke University campus...
...,...,...
569,(CNN)It came a day early. Baltimore investigat...,Prosecutors get investigative report a day ear...
570,(CNN)Robert Downey Jr. is making headlines for...,"Peggy Drexler: In interview to promote movie, ..."
571,(CNN)German police overnight thwarted a terror...,"German police say they think they ""have thwart..."
572,(CNN)Japanese Prime Minister Shinzo Abe is sch...,Japanese Prime Minister Shinzo Abe will addres...


In [None]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}

In [None]:
def clean_data(text):
  text = text.lower()
  text = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in text.split(" ")])
  
  text = re.sub("cnn", "", text)
  text=re.sub(r'\n',' ', text)
  text=re.sub(r'>',' ', text)
  text=re.sub(r'<',' ', text)
  text=re.sub(r'LRB',' ', text)
  text=re.sub(r'RRB',' ', text)
  text = re.sub(r'[" "]+', " ", text)
  text=re.sub(r'-- ',' ', text)
  text=re.sub(r"([?!¿])", r" \1 ", text)
  text=re.sub(r'-',' ', text)
  text=text.replace('/',' ')
  text=re.sub(r'\s+', ' ', text)
  text = re.sub('[^A-Za-z0-9.,]+', ' ', text)
  text = re.sub(r'([.?!]+) *', r' \1 ', text)
  return text

In [None]:
df['text'] = df['text'].map(lambda x: clean_data(x))
df['reference_summary'] = df['reference_summary'].map(lambda x: clean_data(x))
df

Unnamed: 0,text,reference_summary
0,the palestinian authority officially became t...,membership gives the icc jurisdiction over all...
1,never mind cats having nine lives . a stray p...,"theia, a bully breed mix, was apparently hit b..."
2,"if you have been following the news lately, t...",mohammad javad zarif has spent more time with ...
3,five americans who were monitored for three w...,17 americans were exposed to the ebola virus w...
4,a duke student has admitted to hanging a noos...,student is no longer on duke university campus...
...,...,...
569,it came a day early . baltimore investigators...,prosecutors get investigative report a day ear...
570,robert downey jr . is making headlines for wa...,"peggy drexler in interview to promote movie, r..."
571,german police overnight thwarted a terrorist ...,german police say they think they have thwarte...
572,japanese prime minister shinzo abe is schedul...,japanese prime minister shinzo abe will addres...


# Summarizer

In [None]:
def first_three_sent_summary(text):
    return "\n".join(sent_tokenize(text)[:3])

In [None]:
df['generated_summary'] = df['text'].map(lambda x: first_three_sent_summary(x))

df

Unnamed: 0,text,reference_summary,generated_summary
0,the palestinian authority officially became t...,membership gives the icc jurisdiction over all...,the palestinian authority officially became t...
1,never mind cats having nine lives . a stray p...,"theia, a bully breed mix, was apparently hit b...",never mind cats having nine lives .\na stray ...
2,"if you have been following the news lately, t...",mohammad javad zarif has spent more time with ...,"if you have been following the news lately, t..."
3,five americans who were monitored for three w...,17 americans were exposed to the ebola virus w...,five americans who were monitored for three w...
4,a duke student has admitted to hanging a noos...,student is no longer on duke university campus...,a duke student has admitted to hanging a noos...
...,...,...,...
569,it came a day early . baltimore investigators...,prosecutors get investigative report a day ear...,it came a day early .\nbaltimore investigator...
570,robert downey jr . is making headlines for wa...,"peggy drexler in interview to promote movie, r...",robert downey jr . is making headlines for wa...
571,german police overnight thwarted a terrorist ...,german police say they think they have thwarte...,german police overnight thwarted a terrorist ...
572,japanese prime minister shinzo abe is schedul...,japanese prime minister shinzo abe will addres...,japanese prime minister shinzo abe is schedul...


In [None]:
df['reference_summary'][1]

'theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field  . she s a true miracle dog and she deserves a good life, says sara mellado, who is looking for a home for theia  . '

In [None]:
df['generated_summary'][1]

' never mind cats having nine lives .\na stray pooch in washington state has used up at least three of her own after being hit by a car, apparently whacked on the head with a hammer in a misguided mercy killing and then buried in a field only to survive .\nthat is according to washington state university, where the dog a friendly white and black bully breed mix now named theia has been receiving care at the veterinary teaching hospital .'

# BERTScore Evaluation on the Test Dataset

In [None]:
%%capture
!pip install evaluate
!pip install bert_score

In [None]:
from evaluate import load
bertscore = load("bertscore")

# Testing
predictions = ["hello there", "general kenobi"]
references = ["hello there", "general kenobi"]
results = bertscore.compute(predictions=predictions, references=references, lang="en")

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

In [None]:
# Testing
results

{'precision': [1.0, 1.0],
 'recall': [1.0, 1.0],
 'f1': [1.0, 1.0],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.25.1)'}

In [None]:
# Performance on the test dataset
results = bertscore.compute(predictions=df['generated_summary'].to_numpy(), references=df['reference_summary'].to_numpy(), lang="en")

In [None]:
print("Mean Bertscore F1: {}".format(np.mean(results['f1'])))
print("Mean Bertscore Precision: {}".format(np.mean(results['precision'])))
print("Mean Bertscore Recall: {}".format(np.mean(results['recall'])))
print("*" * 25)
print("Median Bertscore F1: {}".format(np.median(results['f1'])))
print("Median Bertscore Precision: {}".format(np.median(results['precision'])))
print("Median Bertscore Recall: {}".format(np.median(results['recall'])))

Mean Bertscore F1: 0.8549875924396183
Mean Bertscore Precision: 0.8406311157273083
Mean Bertscore Recall: 0.8701216879622031
*************************
Median Bertscore F1: 0.8537062704563141
Median Bertscore Precision: 0.8392352759838104
Median Bertscore Recall: 0.8708000481128693
