In [None]:
%%capture
!pip install datasets transformers evaluate bert_score

In [None]:
import datasets
import transformers
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel,TrainingArguments, Trainer
import torch.optim as optim
import re
import random
from tqdm import tqdm, trange
import os

from nltk.tokenize import sent_tokenize
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login; the Trainer configured below uses push_to_hub=True.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


# Load Data

We use the CNN/DailyMail dataset (version 3.0.0) as the main training dataset.

In [None]:
def get_cnn_data(train_split, val_split, test_split):
    """Load the leading slice of each CNN/DailyMail (3.0.0) split.

    Each argument is a percentage that is formatted into a ``"<split>[:N%]"``
    slice expression for the matching split.  The number of examples in every
    loaded split is printed, and the splits are returned as
    ``(train, validation, test)``.
    """
    requested = (("train", train_split), ("validation", val_split), ("test", test_split))
    # The generator is consumed left to right, so the splits load in the listed order.
    train_data, val_data, test_data = (
        datasets.load_dataset("cnn_dailymail", "3.0.0", split="{}[:{}%]".format(name, percent))
        for name, percent in requested
    )

    for title, split in (("Training", train_data), ("Validation", val_data), ("Test", test_data)):
        print("{} Data Count: {}".format(title, len(split)))

    return train_data, val_data, test_data

In [None]:
# Take the first 2% of the train split and the first 5% of the validation and test splits.
CNN_data_train_2perc, CNN_data_val_5perc, CNN_data_test_5perc = get_cnn_data(2, 5, 5)

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.




Training Data Count: 5742
Validation Data Count: 668
Test Data Count: 574


In [None]:
# Tokenizer first: register the markers used by the prompt format
# (<|startoftext|> article <|summarize|> summary <|endoftext|>) plus a pad token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<|summarize|>'],
}
tokenizer.add_special_tokens(special_tokens)

# Then the pretrained model, with its embedding matrix resized to the enlarged vocabulary.
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [None]:
# Token length each full training sequence (markers + article + summary) is padded/truncated to.
max_input_length = 1024
# Token length of the standalone summary encodings stored in CNN_Dataset.labels.
max_target_length = 128

In [None]:
# Copy the article / highlights columns of every split into plain Python lists.
train_article = list(CNN_data_train_2perc["article"])
train_summary = list(CNN_data_train_2perc["highlights"])
val_article = list(CNN_data_val_5perc["article"])
val_summary = list(CNN_data_val_5perc["highlights"])
test_article = list(CNN_data_test_5perc["article"])
test_summary = list(CNN_data_test_5perc["highlights"])

In [None]:
class CNN_Dataset(Dataset):
    """Summarization examples for causal-LM fine-tuning.

    Every example is encoded as
    ``<|startoftext|> article <|summarize|> summary <|endoftext|> <pad>...``
    with exactly ``max_input_length`` token ids.  A standalone
    ``<|startoftext|> summary <|endoftext|>`` encoding of ``max_target_length``
    ids is stored in ``labels``.

    Sequences are assembled from token ids.  The previous approach decoded the
    truncated article and re-tokenized the concatenated string; that round trip
    is not guaranteed to give back the same number of tokens, and any overflow
    was truncated from the end, i.e. from the summary and ``<|endoftext|>``.

    Args:
        txt_list: iterable of article strings.
        label_list: iterable of summary strings, aligned with ``txt_list``.
        tokenizer: tokenizer with bos/eos/pad tokens and ``<|summarize|>`` registered.
        max_input_length: length of every ``input_ids`` / ``attn_masks`` tensor.
        max_target_length: length of every ``labels`` tensor.

    Raises:
        ValueError: if the tokenizer lacks one of the required special tokens.
    """

    def __init__(self, txt_list, label_list, tokenizer, max_input_length, max_target_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []

        bos_id = tokenizer.bos_token_id
        eos_id = tokenizer.eos_token_id
        pad_id = tokenizer.pad_token_id
        summarize_id = tokenizer.convert_tokens_to_ids('<|summarize|>')
        if None in (bos_id, eos_id, pad_id, summarize_id):
            raise ValueError("tokenizer must define bos, eos, pad and '<|summarize|>' tokens")

        # Three slots of the input go to <|startoftext|>, <|summarize|> and <|endoftext|>;
        # two slots of the target go to <|startoftext|> and <|endoftext|>.
        content_budget = max(0, max_input_length - 3)
        target_budget = max(0, max_target_length - 2)

        for txt, label in zip(txt_list, label_list):
            token_txt = list(tokenizer(txt)['input_ids'])
            token_label = list(tokenizer(label)['input_ids'])

            # The summary has priority; the article only gets what is left (never negative).
            summary_ids = token_label[:content_budget]
            article_ids = token_txt[:content_budget - len(summary_ids)]

            sequence = [bos_id] + article_ids + [summarize_id] + summary_ids + [eos_id]
            ids, mask = self._pad(sequence, max_input_length, pad_id)
            self.input_ids.append(torch.tensor(ids))
            self.attn_masks.append(torch.tensor(mask))

            target = [bos_id] + token_label[:target_budget] + [eos_id]
            target_ids, _ = self._pad(target, max_target_length, pad_id)
            self.labels.append(torch.tensor(target_ids))

    @staticmethod
    def _pad(ids, length, pad_id):
        """Right-pad (or cut) ``ids`` to ``length``; return ``(ids, attention_mask)``."""
        ids = list(ids[:length])
        n_real = len(ids)
        n_pad = length - n_real
        return ids + [pad_id] * n_pad, [1] * n_real + [0] * n_pad

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        # (input_ids, attention_mask, standalone summary ids)
        return self.input_ids[idx], self.attn_masks[idx], self.labels[idx]

In [None]:
# Encode the three splits with the same tokenizer and length limits (train, then val, then test).
train_dataset, val_dataset, test_dataset = (
    CNN_Dataset(articles, summaries, tokenizer, max_input_length, max_target_length)
    for articles, summaries in (
        (train_article, train_summary),
        (val_article, val_summary),
        (test_article, test_summary),
    )
)

In [None]:
### Check that every encoded example has exactly max_input_length tokens.
# Sizes come from the datasets themselves, so the check stays valid when the
# split percentages or max_input_length change; the validation split is covered too.
for split_name, split_dataset in (("train", train_dataset),
                                  ("val", val_dataset),
                                  ("test", test_dataset)):
  for i in range(len(split_dataset)):
    if len(split_dataset[i][0]) != max_input_length:
      print(split_name, i)

# Train

In [None]:
# Seed PyTorch's random number generator.
torch.manual_seed(42)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
training_args = TrainingArguments(
    output_dir="gpt2-finetuned-cnn-summarization-v2",
    # Optimisation
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    # Evaluation and checkpointing
    evaluation_strategy="epoch",
    save_steps=5000,
    save_total_limit=3,
    # Logging
    logging_steps=100,
    logging_dir='logs',
    report_to='none',
    # Publishing
    push_to_hub=True,
)

In [None]:
def data_collator(data):
    """Stack dataset examples into a causal-LM training batch.

    Args:
        data: list of ``(input_ids, attention_mask, labels)`` tuples; only the
            first two entries are used, the third is ignored.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``.  ``labels``
        is a copy of ``input_ids`` in which every padded position (attention
        mask 0) is replaced by -100, the index the language-model loss ignores,
        so padding no longer contributes to the training/eval loss.
    """
    input_ids = torch.stack([f[0] for f in data])
    attention_mask = torch.stack([f[1] for f in data])
    # clone() keeps input_ids intact; only the label copy is masked.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels}

In [None]:
# Wire model, data, collator and tokenizer into the Hugging Face Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

***** Running training *****
  Num examples = 5742
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 17226
  Number of trainable parameters = 124442112


Epoch,Training Loss,Validation Loss
1,2.1919,2.159749
2,2.0192,2.16266
3,1.9587,2.168443


Saving model checkpoint to gpt2-finetuned-cnn-summarization-v2/checkpoint-5000
Configuration saved in gpt2-finetuned-cnn-summarization-v2/checkpoint-5000/config.json
Model weights saved in gpt2-finetuned-cnn-summarization-v2/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in gpt2-finetuned-cnn-summarization-v2/checkpoint-5000/tokenizer_config.json
Special tokens file saved in gpt2-finetuned-cnn-summarization-v2/checkpoint-5000/special_tokens_map.json
added tokens file saved in gpt2-finetuned-cnn-summarization-v2/checkpoint-5000/added_tokens.json
tokenizer config file saved in gpt2-finetuned-cnn-summarization-v2/tokenizer_config.json
Special tokens file saved in gpt2-finetuned-cnn-summarization-v2/special_tokens_map.json
added tokens file saved in gpt2-finetuned-cnn-summarization-v2/added_tokens.json
***** Running Evaluation *****
  Num examples = 668
  Batch size = 1
Saving model checkpoint to gpt2-finetuned-cnn-summarization-v2/checkpoint-10000
Configuration saved in gpt

TrainOutput(global_step=17226, training_loss=2.035247945348159, metrics={'train_runtime': 6980.5756, 'train_samples_per_second': 2.468, 'train_steps_per_second': 2.468, 'total_flos': 9002033086464000.0, 'train_loss': 2.035247945348159, 'epoch': 3.0})

In [None]:
# Evaluate on the eval_dataset passed to the Trainer (the validation split).
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 668
  Batch size = 1


{'eval_loss': 2.168442964553833,
 'eval_runtime': 75.1087,
 'eval_samples_per_second': 8.894,
 'eval_steps_per_second': 8.894,
 'epoch': 3.0}

In [None]:
# Push the trained model to the Hugging Face Hub with the given commit message and tag.
trainer.push_to_hub(commit_message="Training complete", tags="summarization")

Saving model checkpoint to gpt2-finetuned-cnn-summarization-v2
Configuration saved in gpt2-finetuned-cnn-summarization-v2/config.json
Model weights saved in gpt2-finetuned-cnn-summarization-v2/pytorch_model.bin
tokenizer config file saved in gpt2-finetuned-cnn-summarization-v2/tokenizer_config.json
Special tokens file saved in gpt2-finetuned-cnn-summarization-v2/special_tokens_map.json
added tokens file saved in gpt2-finetuned-cnn-summarization-v2/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.30k/487M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gavin124/gpt2-finetuned-cnn-summarization-v2
   8b870f1..94ae66e  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gavin124/gpt2-finetuned-cnn-summarization-v2
   8b870f1..94ae66e  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
To https://huggingface.co/gavin124/gpt2-finetuned-cnn-summarization-v2
   94ae66e..e5eb706  main -> main

   94ae66e..e5eb706  main -> main



'https://huggingface.co/gavin124/gpt2-finetuned-cnn-summarization-v2/commit/94ae66ee3024a82a7eb89d6fa0c5b1c912e1ea25'

# Tune Text Generation (Decoding Parameters)

In [None]:
# Number of test articles summarised for this quick decoding-parameter comparison.
num_preview_examples = 20
# Articles are cut to this many tokens before the prompt markers are added, so that
# generate(max_length=max_input_length) still has room for the summary.
max_article_tokens = 900

original_summary, predicted_summary, original_text, predicted_text = [], [], [], []
num_preview_examples = min(num_preview_examples, len(CNN_data_test_5perc))
preview_examples = zip(range(num_preview_examples),
                       CNN_data_test_5perc['article'],
                       CNN_data_test_5perc['highlights'])
for _, text, label in tqdm(preview_examples, total=num_preview_examples):
  token_txt = tokenizer(text)['input_ids']
  if len(token_txt) >= max_article_tokens:
    text = tokenizer.decode(token_txt[:max_article_tokens])

  prompt = '<|startoftext|>' + text +'<|summarize|>'
  generated = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
  # Follow the model's device instead of hard-coding .cuda(), so the cell also runs on CPU.
  generated_ids = generated.input_ids.to(model.device)
  attention_mask = generated.attention_mask.to(model.device)
  sample_outputs = model.generate(generated_ids, attention_mask= attention_mask, pad_token_id=tokenizer.pad_token_id,
                                  max_length = max_input_length,
                                  do_sample=False)

  pred_text = tokenizer.decode(sample_outputs[0], skip_special_tokens = False)

  # Everything after the first <|summarize|> marker is the generated summary.
  # If the marker is missing, store the "None" placeholder; previously that path
  # appended an undefined / stale `pred_summary_clean`.
  _, marker, pred_summary = pred_text.partition("<|summarize|>")
  pred_summary_clean = pred_summary.replace("<|endoftext|>","") if marker else "None"

  original_summary.append(label)
  predicted_summary.append(pred_summary_clean)
  original_text.append(text)
  predicted_text.append(pred_text)

df = pd.DataFrame({'original_summary':original_summary,
                   'predicted_summary':predicted_summary,
                   'original_text':original_text,
                   'predicted_text':predicted_text})
df

20it [00:22,  1.15s/it]


Unnamed: 0,original_summary,predicted_summary,original_text,predicted_text
0,Membership gives the ICC jurisdiction over all...,NEW: Palestinian Authority formally becomes 1...,(CNN)The Palestinian Authority officially beca...,<|startoftext|> (CNN)The Palestinian Authority...
1,"Theia, a bully breed mix, was apparently hit b...",Dog's brush with death did not leave her unsc...,(CNN)Never mind cats having nine lives. A stra...,<|startoftext|> (CNN)Never mind cats having ni...
2,Mohammad Javad Zarif has spent more time with ...,Zarif is the Iranian foreign minister.\nHe is...,"(CNN)If you've been following the news lately,...",<|startoftext|> (CNN)If you've been following ...
3,17 Americans were exposed to the Ebola virus w...,Five Americans who were monitored for three w...,(CNN)Five Americans who were monitored for thr...,<|startoftext|> (CNN)Five Americans who were m...
4,Student is no longer on Duke University campus...,Duke student admitted hanging noose from tree...,(CNN)A Duke student has admitted to hanging a ...,<|startoftext|> (CNN)A Duke student has admitt...
5,College-bound basketball star asks girl with D...,"Trey Moses, Ellie Meredith, are prom dates fo...",(CNN)He's a blue chip college basketball recru...,<|startoftext|> (CNN)He's a blue chip college ...
6,Amnesty's annual death penalty report catalogs...,Amnesty International: Death penalty used as ...,(CNN)Governments around the world are using th...,<|startoftext|> (CNN)Governments around the wo...
7,Andrew Getty's death appears to be from natura...,NEW: Andrew Getty's parents say he died of na...,"(CNN)Andrew Getty, one of the heirs to billion...","<|startoftext|> (CNN)Andrew Getty, one of the ..."
8,"Once a super typhoon, Maysak is now a tropical...",Maysak gained super typhoon status thanks to ...,(CNN)Filipinos are being warned to be on guard...,<|startoftext|> (CNN)Filipinos are being warne...
9,"Bob Barker returned to host ""The Price Is Righ...","Bob Barker hosts ""The Price Is Right"" for 35 ...","(CNN)For the first time in eight years, a TV l...",<|startoftext|> (CNN)For the first time in eig...


In [None]:
### do_sample=True, top_k=20, top_p=0.90, temperature=1.0
# One batched BERTScore call returns precision, recall and F1 for every row,
# instead of scoring each row three separate times. Needs `bertscore` to be loaded already.
bert_scores = bertscore.compute(predictions=df['predicted_summary'].tolist(),
                                references=df['original_summary'].tolist(), lang="en")
df['precision'] = bert_scores['precision']
df['recall'] = bert_scores['recall']
df['f1'] = bert_scores['f1']
print(df['precision'].mean())
print(df['recall'].mean())
print(df['f1'].mean())
print("original-0\n", df['original_summary'][0])
print("\npredict-0\n", df['predicted_summary'][0])
print("\noriginal-1\n", df['original_summary'][1])
print("\npredict-1", df['predicted_summary'][1])

0.8690944373607635
0.8607147514820099
0.8647843331098557
original-0
 Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

predict-0
  NEW: Rome Statute gives ICC jurisdiction over alleged crimes committed in Palestinian territory.
NEW: U.S. urges the Israeli government to "reassure itself"

original-1
 Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field .
"She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .

predict-1  Dog has been in veterinary school since 1991.
Theia is only one year old but still requires surgery to help her breathe.
Animal's owner: "She's a true miracle dog"


In [None]:
### do_sample=True, top_k=50, top_p=0.90, temperature=1.0
# One batched BERTScore call returns precision, recall and F1 for every row,
# instead of scoring each row three separate times. Needs `bertscore` to be loaded already.
bert_scores = bertscore.compute(predictions=df['predicted_summary'].tolist(),
                                references=df['original_summary'].tolist(), lang="en")
df['precision'] = bert_scores['precision']
df['recall'] = bert_scores['recall']
df['f1'] = bert_scores['f1']
print(df['precision'].mean())
print(df['recall'].mean())
print(df['f1'].mean())
print("original-0\n", df['original_summary'][0])
print("\npredict-0\n", df['predicted_summary'][0])
print("\noriginal-1\n", df['original_summary'][1])
print("\npredict-1", df['predicted_summary'][1])

0.8716547369956971
0.8593914270401001
0.8653830111026763
original-0
 Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

predict-0
  The Palestine Authority formally became the 123rd member of the International Criminal Court.
Legal powers are formally transferred to the court from the United States.
Lawyer says acceding to Rome Statute is good move.
The Palestinian Authority acquires all the rights as well as responsibilities.

original-1
 Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field .
"She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .

predict-1  Dog had been missing since January after being hit by a car in Tampa, Florida.
Animal survived despite injury, neglect, head injuries and cavity 

In [None]:
### do_sample=True, top_k=50, top_p=0.90, temperature=2.0
# One batched BERTScore call returns precision, recall and F1 for every row,
# instead of scoring each row three separate times. Needs `bertscore` to be loaded already.
bert_scores = bertscore.compute(predictions=df['predicted_summary'].tolist(),
                                references=df['original_summary'].tolist(), lang="en")
df['precision'] = bert_scores['precision']
df['recall'] = bert_scores['recall']
df['f1'] = bert_scores['f1']
print(df['precision'].mean())
print(df['recall'].mean())
print(df['f1'].mean())
print("original-0\n", df['original_summary'][0])
print("\npredict-0\n", df['predicted_summary'][0])
print("\noriginal-1\n", df['original_summary'][1])
print("\npredict-1", df['predicted_summary'][1])

0.8440869450569153
0.8444231450557709
0.8441608518362045
original-0
 Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

predict-0
  Official: International Criminal Court "is formally becoming 121st and final membership court in'realist status' era," status papers go on display.
Citizen will receive acceding Rome Statute next week on Rome, said U.N. secretary-general.
More than 100 countries worldwide joined international justice center in Rome.
Court: Court member can accept tribunal ruling that Gaza has recognized its state status.

original-1
 Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field .
"She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .

predict-1  Dog was pitied by car owner who save

In [None]:
### do_sample=True, top_k=50, top_p=0.90, temperature=0.8
# One batched BERTScore call returns precision, recall and F1 for every row,
# instead of scoring each row three separate times. Needs `bertscore` to be loaded already.
bert_scores = bertscore.compute(predictions=df['predicted_summary'].tolist(),
                                references=df['original_summary'].tolist(), lang="en")
df['precision'] = bert_scores['precision']
df['recall'] = bert_scores['recall']
df['f1'] = bert_scores['f1']
print(df['precision'].mean())
print(df['recall'].mean())
print(df['f1'].mean())
print("original-0\n", df['original_summary'][0])
print("\npredict-0\n", df['predicted_summary'][0])
print("\noriginal-1\n", df['original_summary'][1])
print("\npredict-1", df['predicted_summary'][1])

0.8724628567695618
0.8619839549064636
0.8671008497476578
original-0
 Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

predict-0
  Palestinians sign ICC's Rome Statute, formally becoming 123rd member.
Israel and United States oppose Palestinians' attempts to join the body.
Palestine acquires all rights as well as responsibilities as member.
Israel, United States, Palestinian say it is a step closer to ending long era of impunity.

original-1
 Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field .
"She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .

predict-1  Dog was found dead in Tampa, Florida, on January 4.
Dog was found dead in a neighbor's yard five days after he was buried.
Dog has been recei

In [None]:
### do_sample=False, num_beams = 5
# One batched BERTScore call returns precision, recall and F1 for every row,
# instead of scoring each row three separate times. Needs `bertscore` to be loaded already.
bert_scores = bertscore.compute(predictions=df['predicted_summary'].tolist(),
                                references=df['original_summary'].tolist(), lang="en")
df['precision'] = bert_scores['precision']
df['recall'] = bert_scores['recall']
df['f1'] = bert_scores['f1']
print(df['precision'].mean())
print(df['recall'].mean())
print(df['f1'].mean())
print("original-0\n", df['original_summary'][0])
print("\npredict-0\n", df['predicted_summary'][0])
print("\noriginal-1\n", df['original_summary'][1])
print("\npredict-1", df['predicted_summary'][1])

0.8744258135557175
0.8626664459705353
0.8683514505624771
original-0
 Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

predict-0
  The Palestinian Authority officially became the 123rd member of the International Criminal Court.
The formal accession was marked with a ceremony at The Hague.
Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body.

original-1
 Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field .
"She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .

predict-1  Dog, a friendly white-and-black bully breed mix now named Theia, has been receiving care at the Veterinary Teaching Hospital.
Four days after her apparent death, the d

In [None]:
### do_sample=False, num_beams = 10
# One batched BERTScore call returns precision, recall and F1 for every row,
# instead of scoring each row three separate times. Needs `bertscore` to be loaded already.
bert_scores = bertscore.compute(predictions=df['predicted_summary'].tolist(),
                                references=df['original_summary'].tolist(), lang="en")
df['precision'] = bert_scores['precision']
df['recall'] = bert_scores['recall']
df['f1'] = bert_scores['f1']
print(df['precision'].mean())
print(df['recall'].mean())
print(df['f1'].mean())
print("original-0\n", df['original_summary'][0])
print("\npredict-0\n", df['predicted_summary'][0])
print("\noriginal-1\n", df['original_summary'][1])
print("\npredict-1", df['predicted_summary'][1])

0.8722610384225845
0.8610029816627502
0.8664434462785721
original-0
 Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

predict-0
  The Palestinian Authority officially became the 123rd member of the International Criminal Court.
The formal accession was marked with a ceremony at The Hague.
Israel and the United States opposed the Palestinians' efforts to join the body.

original-1
 Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field .
"She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .

predict-1  Dog's brush with death did not leave her unscathed.
She suffered a dislocated jaw, leg injuries and a caved-in sinus cavity.
The dog's brush with death did not leave her unscathed.


In [None]:
### do_sample=False, Greedy Search
# One batched BERTScore call returns precision, recall and F1 for every row,
# instead of scoring each row three separate times. Needs `bertscore` to be loaded already.
bert_scores = bertscore.compute(predictions=df['predicted_summary'].tolist(),
                                references=df['original_summary'].tolist(), lang="en")
df['precision'] = bert_scores['precision']
df['recall'] = bert_scores['recall']
df['f1'] = bert_scores['f1']
print(df['precision'].mean())
print(df['recall'].mean())
print(df['f1'].mean())
print("original-0\n", df['original_summary'][0])
print("\npredict-0\n", df['predicted_summary'][0])
print("\noriginal-1\n", df['original_summary'][1])
print("\npredict-1", df['predicted_summary'][1])

0.8644719541072845
0.8560375064611435
0.8601139962673188
original-0
 Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

predict-0
  NEW: Palestinian Authority formally becomes 123rd member of ICC.
NEW: Palestinian Authority formally becomes 123rd member of ICC.
NEW: ICC president says acceding to Rome Statute is "just the first step" for Palestinians.
NEW: ICC president: "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity"

original-1
 Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field .
"She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .

predict-1  Dog's brush with death did not leave her unscathed.
Dog's brush 

# Evaluation and Metric (BERTScore)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Reload the fine-tuned tokenizer and model from the Hub, then place the model on `device`.
tokenizer = GPT2Tokenizer.from_pretrained('gavin124/gpt2-finetuned-cnn-summarization-v2')
model = GPT2LMHeadModel.from_pretrained('gavin124/gpt2-finetuned-cnn-summarization-v2')
model.to(device)

In [None]:
# Articles are cut to this many tokens before the prompt markers are added, so that
# generate(max_length=max_input_length) still has room for the summary.
max_article_tokens = 900

original_summary, predicted_summary, original_text, predicted_text, generated_list = [], [], [], [], []
test_examples = zip(CNN_data_test_5perc['article'], CNN_data_test_5perc['highlights'])
for text, label in tqdm(test_examples, total=len(CNN_data_test_5perc)):
  token_txt = tokenizer(text)['input_ids']
  if len(token_txt) >= max_article_tokens:
    text = tokenizer.decode(token_txt[:max_article_tokens])

  prompt = '<|startoftext|>' + text +'<|summarize|>'
  generated = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
  generated_list.append(generated)
  # The model was moved to `device`; the inputs must live on the same device.
  generated_ids = generated.input_ids.to(model.device)
  attention_mask = generated.attention_mask.to(model.device)
  sample_outputs = model.generate(generated_ids, attention_mask= attention_mask, pad_token_id=tokenizer.pad_token_id,
                                  do_sample=True, top_k=50, max_length = max_input_length, top_p=0.90, temperature = 0.8)

  pred_text = tokenizer.decode(sample_outputs[0], skip_special_tokens = False)

  # Everything after the first <|summarize|> marker is the generated summary;
  # fall back to the "None" placeholder only when the marker is missing.
  _, marker, pred_summary_raw = pred_text.partition("<|summarize|>")
  pred_summary = pred_summary_raw.replace("<|endoftext|>","") if marker else "None"

  original_summary.append(label)
  predicted_summary.append(pred_summary)
  original_text.append(text)
  predicted_text.append(pred_text)

df = pd.DataFrame({'original_summary':original_summary,
                   'predicted_summary':predicted_summary,
                   'original_text':original_text,
                   'predicted_text':predicted_text})

In [None]:
# Reference summary of the second test example.
df['original_summary'][1]

'Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field .\n"She\'s a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .'

In [None]:
# Generated summary for the same example.
df['predicted_summary'][1]

" Dog has been receiving care at the Veterinary Teaching Hospital.\nSara Mellado, a volunteer, has set up a fundraising page to help pay for the dog's care.\nAnimal's brush with death left her unscathed."

In [None]:
import evaluate
from evaluate import load

# Load the BERTScore metric through the `evaluate` library.
bertscore = load("bertscore")

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [None]:
# Memo of BERTScore results keyed by (prediction, reference), so that asking for
# precision, recall and F1 of the same row runs the scorer only once.
# Re-running this cell resets the memo.
_bert_score_cache = {}


def _bert_score_row(row):
  """Return the BERTScore result dict for one row (memoised).

  ``row`` must provide ``'predicted_summary'`` and ``'original_summary'``.
  """
  key = (row['predicted_summary'], row['original_summary'])
  if key not in _bert_score_cache:
    _bert_score_cache[key] = bertscore.compute(predictions=[key[0]], references=[key[1]], lang="en")
  return _bert_score_cache[key]


def bert_score_eval_precision(row):
  """BERTScore precision of ``row['predicted_summary']`` against ``row['original_summary']``."""
  return _bert_score_row(row)['precision'][0]


def bert_score_eval_recall(row):
  """BERTScore recall of ``row['predicted_summary']`` against ``row['original_summary']``."""
  return _bert_score_row(row)['recall'][0]


def bert_score_eval_f1(row):
  """BERTScore F1 of ``row['predicted_summary']`` against ``row['original_summary']``."""
  return _bert_score_row(row)['f1'][0]

In [None]:
# One batched BERTScore call returns precision, recall and F1 for every row,
# instead of scoring each row three separate times.
bert_scores = bertscore.compute(predictions=df['predicted_summary'].tolist(),
                                references=df['original_summary'].tolist(), lang="en")
df['precision'] = bert_scores['precision']
df['recall'] = bert_scores['recall']
df['f1'] = bert_scores['f1']

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

In [None]:
# Preview the first rows, now including the BERTScore columns.
df.head()

Unnamed: 0,original_summary,predicted_summary,original_text,predicted_text,precision,recall,f1
0,Membership gives the ICC jurisdiction over all...,NEW: ICC's president says acceding to treaty ...,(CNN)The Palestinian Authority officially beca...,<|startoftext|> (CNN)The Palestinian Authority...,0.854889,0.85701,0.855948
1,"Theia, a bully breed mix, was apparently hit b...",Dog has been receiving care at the Veterinary...,(CNN)Never mind cats having nine lives. A stra...,<|startoftext|> (CNN)Never mind cats having ni...,0.871666,0.851027,0.861223
2,Mohammad Javad Zarif has spent more time with ...,"Zarif, 67, a native of Iran, has been U.S. Se...","(CNN)If you've been following the news lately,...",<|startoftext|> (CNN)If you've been following ...,0.855702,0.839995,0.847776
3,17 Americans were exposed to the Ebola virus w...,Five Americans were monitored for three weeks...,(CNN)Five Americans who were monitored for thr...,<|startoftext|> (CNN)Five Americans who were m...,0.883069,0.869112,0.876035
4,Student is no longer on Duke University campus...,Duke student has admitted hanging rope from t...,(CNN)A Duke student has admitted to hanging a ...,<|startoftext|> (CNN)A Duke student has admitt...,0.911599,0.89712,0.904301


In [None]:
# Save the test-set predictions and scores to CSV.
df.to_csv("gpt2_test.csv")

# Final Test on BBC

In [None]:
# Pick the device, reload the fine-tuned tokenizer and model from the Hub, and place the model on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained('gavin124/gpt2-finetuned-cnn-summarization-v2')
model = GPT2LMHeadModel.from_pretrained('gavin124/gpt2-finetuned-cnn-summarization-v2')
model.to(device)

In [None]:
# BBC articles for the final test; the first CSV column is used as the index.
final_test_news = pd.read_csv('bbc_news_updated.csv', index_col=0)
final_test_news.head()

Unnamed: 0,Titles,Content,Original Summary,Link,Section
0,UK orders Chinese firm to sell Welsh tech plant,The takeover of Britain's largest microchip pl...,The UK government says Nexperia's takeover of ...,https://www.bbc.com/news/uk-wales-63656816,/news/business
1,"Work long hours or leave, Musk tells Twitter s...",Elon Musk has told Twitter staff that they mus...,Elon Musk says workers at the social media fir...,https://www.bbc.com/news/business-63648505,/news/business
2,UK Chancellor to unveil spending cuts and tax ...,Chancellor Jeremy Hunt will pledge to face int...,Pensions and benefits will rise with prices bu...,https://www.bbc.com/news/uk-politics-63656522,/news/business
3,What is behind the big tech companies' job cuts?,The first sign of job cuts at Amazon came from...,Thousands of redundancies have been announced ...,https://www.bbc.com/news/technology-63635821,/news/business
4,UK food delivery firm Deliveroo quits Australia,UK-based food delivery app Deliveroo says it i...,The firm struggled to compete with rivals as w...,https://www.bbc.com/news/business-63645765,/news/business


In [None]:
# Articles are cut to this many tokens before the prompt markers are added, so that
# generate(max_length=max_input_length) still has room for the summary.
max_article_tokens = 900

predicted_summary, original_text, predicted_text = [], [], []
for text in tqdm(final_test_news['Content'].values):
  token_txt = tokenizer(text)['input_ids']
  if len(token_txt) >= max_article_tokens:
    text = tokenizer.decode(token_txt[:max_article_tokens])

  prompt = '<|startoftext|>' + text +'<|summarize|>'
  generated = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
  # The model was moved to `device`; the inputs must live on the same device.
  generated_ids = generated.input_ids.to(model.device)
  attention_mask = generated.attention_mask.to(model.device)

  sample_outputs = model.generate(generated_ids, attention_mask= attention_mask, pad_token_id=tokenizer.pad_token_id,max_length = max_input_length,
                                  do_sample=True, top_k=50, top_p=0.90, temperature=0.8 )

  pred_text = tokenizer.decode(sample_outputs[0], skip_special_tokens = False)

  # Everything after the first <|summarize|> marker is the generated summary;
  # fall back to the "None" placeholder only when the marker is missing.
  _, marker, pred_summary_raw = pred_text.partition("<|summarize|>")
  pred_summary = pred_summary_raw.replace("<|endoftext|>","") if marker else "None"

  predicted_summary.append(pred_summary)
  original_text.append(text)
  predicted_text.append(pred_text)

test_df = pd.DataFrame({'predicted_summary':predicted_summary,
                        'original_text':original_text,
                        'predicted_text':predicted_text})
test_df.head()

  3%|▎         | 2/74 [00:19<11:31,  9.61s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (1150 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 74/74 [12:37<00:00, 10.24s/it]


Unnamed: 0,predicted_summary,original_text,predicted_text
0,NEW: UK government to appeal against sale of ...,The takeover of Britain's largest microchip pl...,<|startoftext|> The takeover of Britain's larg...
1,Elon Musk's Twitter employees must agree to p...,Elon Musk has told Twitter staff that they mus...,<|startoftext|> Elon Musk has told Twitter sta...
2,"Chancellor to unveil a range of tax cuts, inc...",Chancellor Jeremy Hunt will pledge to face int...,<|startoftext|> Chancellor Jeremy Hunt will pl...
3,"Apple, LinkedIn join forces to cut jobs, pay ...",The first sign of job cuts at Amazon came from...,<|startoftext|> The first sign of job cuts at ...
4,Food delivery app Deliveroo has come under pr...,UK-based food delivery app Deliveroo says it i...,<|startoftext|> UK-based food delivery app Del...


In [None]:
# Save the BBC predictions to CSV.
test_df.to_csv("gpt2_final_test.csv")