In [4]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [5]:
import torch

In [6]:
from datasets import load_dataset
from datasets import get_dataset_split_names
xsum_dataset = load_dataset("xsum")

Found cached dataset xsum (/home/vv2116/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)
100%|██████████| 3/3 [00:00<00:00, 471.18it/s]


In [7]:
xsum_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [8]:
# from transformers import 
from transformers import PegasusForConditionalGeneration,PegasusTokenizer
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

base_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

In [9]:
from dataset import *
num_samples=204045
val_dataset = Dataset("val", tokenizer,xsum_dataset["train"]["document"][:num_samples] , xsum_dataset["validation"]["summary"][:num_samples])


In [10]:
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size = 2, shuffle = False)
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
model = base_model.to(device)



In [11]:
from tqdm import tqdm
import gc

def beam_search_step(batch, tokenizer, base_model, device):
    # 1 - beam search
    if True:
        summary_ids = base_model.generate(
            batch["text_inputs"]['input_ids'],
            attention_mask = batch["text_inputs"]["attention_mask"],
            num_beams = 5,
            num_return_sequences = 5,
            max_length = 64
        )

    generated = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    del summary_ids
    gc.collect()

    return generated


def get_summaries(tokenizer, val_loader, base_model, device):
    val_texts = []
    val_summaries = []
    val_labels = []

    for idx, batch in tqdm(enumerate(val_loader)):
        for k in batch["text_inputs"].keys():
            batch["text_inputs"][k] = batch["text_inputs"][k].to(device)
            if len(batch["text_inputs"][k].shape) > 2:
                batch["text_inputs"][k] = batch["text_inputs"][k].squeeze(1)

        model.zero_grad()
        val_texts += batch["text"]

        raw_summaries = beam_search_step(batch, tokenizer, base_model, device)
        
        summaries = []
        for i in range(len(batch["text"])):
            summaries.append(raw_summaries[i*15:(i+1)*15])
        val_summaries += summaries

        labels = batch["summary"]
        val_labels += labels

    print(len(val_texts), len(val_summaries), len(val_summaries[0]), len(val_labels))

    return val_texts, val_summaries, val_labels

In [12]:
val_texts, val_summaries, val_labels = get_summaries(tokenizer, val_loader, model, device)


348it [03:23,  1.71it/s]


KeyboardInterrupt: 

In [None]:


base_results = [val_summaries[i][0] for i in range(len(val_summaries))]
print("*"*100)
print("\nTop beam:")

r1, r2, rl = rouge_eval("true labels", val_texts, base_results, val_labels)

****************************************************************************************************

Top beam:

 ********** 1 - ROUGE evaluation with true labels **********
Mean R: 36.7436, R-1: 44.6532 (var: 20.2878), R-2: 26.0262 (var: 20.7903), R-L: 39.5514 (var: 20.5009)


In [None]:
# pip install torchmetrics
from torchmetrics.text.rouge import ROUGEScore
from nltk.tokenize import sent_tokenize
import numpy as np


def pre_rouge_processing(summary):

    summary = summary.replace("<n>", " ")    
    summary = "\n".join(sent_tokenize(summary))
    
    return summary
def rouge_eval(mode, val_texts, val_summaries, val_labels):
    print("\n", "*"*10, "1 - ROUGE evaluation with {}".format(mode), "*"*10)
    scorer = ROUGEScore()
    all_r1s = []
    all_r2s = []
    all_rls = []
    for i in range(len(val_summaries)):
        summary = val_summaries[i]
        summary = pre_rouge_processing(summary)
        label = val_labels[i]
        rscores = scorer(summary, label)
        all_r1s.append(rscores["rouge1_fmeasure"])
        all_r2s.append(rscores["rouge2_fmeasure"])
        all_rls.append(rscores["rougeL_fmeasure"])
    all_r1s = 100 * np.array(all_r1s)
    all_r2s = 100 * np.array(all_r2s)
    all_rls = 100 * np.array(all_rls)
    mean_r1 = np.mean(all_r1s)
    mean_r2 = np.mean(all_r2s)
    mean_rl = np.mean(all_rls)
    mean_r = (mean_r1 + mean_r2 + mean_rl) / 3
    print("Mean R: {:.4f}, R-1: {:.4f} (var: {:.4f}), R-2: {:.4f} (var: {:.4f}), R-L: {:.4f} (var: {:.4f})".format(
        mean_r, mean_r1, np.std(all_r1s), mean_r2, np.std(all_r2s), mean_rl, np.std(all_rls)))

    return all_r1s, all_r2s, all_rls

In [None]:
import pandas as pd
df = pd.DataFrame({"text": val_texts,"summaries":val_summaries,"labels":val_labels})

df.to_csv("candidates_test.csv",index=False)

