In [76]:
import torch
from datasets import load_dataset
from rouge import Rouge

import transformers

from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import transformers
from trainer import Trainer
from torch.utils.data import DataLoader
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
import wandb
from logger import log_metrics
#import gradient checkpointing
from torch.utils.checkpoint import checkpoint_sequential
import numpy as np
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F

class CNNBERT(torch.utils.data.Dataset):
    def __init__(self, model_name = 'google/pegasus-large', max_length=256, split = 'train'):
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        self.tokenizer.max_length = max_length
        self.model = BertModel.from_pretrained('bert-base-uncased').cuda()
        self.tokenizer2 = BertTokenizer.from_pretrained('bert-base-uncased')
        self.dataset = load_dataset('cnn_dailymail', '3.0.0', split = split)
        self.max_length = max_length

    @torch.no_grad()
    def get_bert_embeddings(self, text):
        inputs = self.tokenizer2(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.cuda() for k, v in inputs.items()}
        embeddings = self.model(**inputs)['pooler_output']
        #compute cosine similarity between embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)
        return embeddings

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]['article']
        text = text.split('. ')
        text = [i.strip() for i in text]
        
        embeddings = self.get_bert_embeddings(text)
        first = np.random.choice(len(text), 1, replace=False)[0]
        chosen_embeddings = torch.empty((1, 768)).cuda()
        bag_of_sentences = [first]
        chosen_embeddings[0] = embeddings[first]
        current_size = len(text[first])

        while current_size < self.max_length and len(bag_of_sentences) < len(text):
            new_cosine_sim = torch.mm(chosen_embeddings, embeddings.T)
            vals, indices = torch.topk(-torch.sum(new_cosine_sim, dim = 0), k = len(text))
            for i in indices:
                if i not in bag_of_sentences:
                    chosen_embeddings = torch.cat((chosen_embeddings, embeddings[i].unsqueeze(0)), dim = 0)
                    bag_of_sentences.append(i.item())
                    current_size += len(text[i])
                    break
        bag_of_sentences = sorted(bag_of_sentences)
        final = [text[i] for i in bag_of_sentences]
        final = '. '.join(final)
        print(final)

        summary_text = self.dataset[idx]['highlights']
        return {'article_text':final, 'summary_text': summary_text}

dataset = CNNBERT(max_length=512, split = 'train')
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Found cached dataset cnn_dailymail (/home/da2986/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e1

In [77]:
for i, batch1 in enumerate(dataloader):
    break

Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.


In [78]:
batch1['article'] = dataset.tokenizer(batch1['article_text'], max_length=512, truncation=True, padding='longest', return_tensors="pt")
batch1['summary'] = dataset.tokenizer(batch1['summary_text'], max_length=512, truncation=True, padding='longest', return_tensors="pt")

In [79]:
print(batch1['article_text'])
dataset.tokenizer.decode(batch1['article']['input_ids'][0])

['Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.']


'Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.'

In [65]:
print(batch1['summary_text'])
dataset.tokenizer.decode(batch1['summary']['input_ids'][0])

["Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have been held in trust fund ."]


"[CLS] harry potter star daniel radcliffe gets £20m fortune as he turns 18 monday. young actor says he has no plans to fritter his cash away. radcliffe's earnings from first five potter films have been held in trust fund. [SEP]"

In [66]:
class PegasusCNNPowerLaw(torch.utils.data.Dataset):
    def __init__(self, model_name = 'google/pegasus-large', max_length=256, split = 'train', divisor = 2):
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        self.tokenizer.max_length = max_length
        self.dataset = load_dataset('cnn_dailymail', '3.0.0', split = split)
        self.max_length = max_length
        self.probability = np.ones(1000) * 1000000
        for i, val in enumerate(self.probability):
            if i == 0: continue
            self.probability[i] = self.probability[i-1] / divisor
        self.indexes = np.arange(1000)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]['article']
        text = text.split('. ')
        
        max_idx = max(1, len(text))
        choices = np.random.choice(self.indexes[:max_idx], max_idx, replace = False, p = self.probability[:max_idx] / self.probability[:max_idx].sum())

        current_size = 0
        counter = 0
        while current_size < self.max_length and counter < max_idx:
            current_size += len(text[choices[counter]])
            counter += 1

        choices = sorted(choices[:counter])
        final = list(np.array(text)[choices])
        text = '. '.join(final)

        summary_text = self.dataset[idx]['highlights']
        return {'article_text':text, 'summary_text': summary_text}

In [67]:
dataset = PegasusCNNPowerLaw(max_length=512, split = 'train', divisor = 2)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
for i, batch2 in enumerate(dataloader):
    break

Found cached dataset cnn_dailymail (/home/da2986/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


In [68]:
batch2['article'] = dataset.tokenizer(batch2['article_text'], max_length=512, truncation=True, padding='longest', return_tensors="pt")
batch2['summary'] = dataset.tokenizer(batch2['summary_text'], max_length=512, truncation=True, padding='longest', return_tensors="pt")

In [69]:
dataset.tokenizer.decode(batch2['article']['input_ids'][0])

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant'

In [70]:
dataset.tokenizer.decode(batch2['summary']['input_ids'][0])

"Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday. Young actor says he has no plans to fritter his cash away. Radcliffe's earnings from first five Potter films have been held in trust fund."