In [36]:
import torch
from datasets import load_dataset
from rouge import Rouge

import transformers

from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import transformers
from trainer import Trainer
from torch.utils.data import DataLoader
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
import wandb
from logger import log_metrics
import numpy as np


class PegasusCNNDataset(torch.utils.data.Dataset):
    """Map-style dataset of raw CNN/DailyMail (3.0.0) article/summary text pairs.

    Nothing is tokenised here: ``__getitem__`` returns plain strings, and the
    tokenizer is kept on ``self.tokenizer`` so callers can tokenise batches.

    Args:
        model_name: Checkpoint name passed to ``PegasusTokenizer.from_pretrained``.
        max_length: Stored on ``self.max_length`` and also set as a
            ``max_length`` attribute on the tokenizer object.
        split: Dataset split passed to ``load_dataset``.
    """

    def __init__(self, model_name = 'google/pegasus-large', max_length=256, split = 'train'):
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        # NOTE(review): Hugging Face tokenizers appear to take their default
        # limit from `model_max_length`; confirm that setting a plain
        # `max_length` attribute has any effect on tokenisation.
        self.tokenizer.max_length = max_length
        self.dataset = load_dataset('cnn_dailymail', '3.0.0', split = split)
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the loaded split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the raw article and its reference summary for row ``idx``.

        Returns:
            dict with keys ``'article_text'`` and ``'summary_text'``.
        """
        # Fetch the row once rather than once per field, and do not print
        # anything here: this runs for every sample drawn by a DataLoader.
        row = self.dataset[idx]
        return {'article_text': row['article'], 'summary_text': row['highlights']}

class PegasusCNNDatasetRandom(torch.utils.data.Dataset):
    """CNN/DailyMail (3.0.0) text pairs with a randomly offset article start.

    Each access splits the article on single spaces, picks a random start word
    and returns the article from that word to the end. The start is chosen so
    that at least ``max_length`` words remain whenever the article is that
    long; shorter articles are returned whole. The summary is returned as is.

    Args:
        model_name: Checkpoint name passed to ``PegasusTokenizer.from_pretrained``.
        max_length: Minimum number of words to keep after the random start;
            also set as a ``max_length`` attribute on the tokenizer object.
        split: Dataset split passed to ``load_dataset``.
    """

    def __init__(self, model_name = 'google/pegasus-large', max_length=256, split = 'train'):
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        # NOTE(review): Hugging Face tokenizers appear to take their default
        # limit from `model_max_length`; confirm that setting a plain
        # `max_length` attribute has any effect on tokenisation.
        self.tokenizer.max_length = max_length
        self.dataset = load_dataset('cnn_dailymail', '3.0.0', split = split)
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the loaded split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return row ``idx`` with the article starting at a random word.

        Uses NumPy's global random state, so results are reproducible only if
        the caller seeds ``np.random``.

        Returns:
            dict with keys ``'article_text'`` and ``'summary_text'``.
        """
        row = self.dataset[idx]  # fetch the row once rather than once per field
        words = row['article'].split(' ')
        # Valid starts are 0 .. len(words) - max_length inclusive. randint's
        # upper bound is exclusive, hence the +1; without it the last valid
        # start could never be drawn. Short articles always start at 0.
        n_starts = max(0, len(words) - self.max_length) + 1
        start = np.random.randint(n_starts)
        return {'article_text': ' '.join(words[start:]), 'summary_text': row['highlights']}

# Build the random-offset dataset and pull one collated batch for inspection.
dataset = PegasusCNNDatasetRandom()
train_loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=0)
# next(iter(...)) fetches only the first batch and raises StopIteration on an
# empty loader, instead of silently leaving `data` undefined (or stale).
data = next(iter(train_loader))


Found cached dataset cnn_dailymail (/home/da2986/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


868
317
626
150


In [37]:
# Display the first entry of the batch's 'article_text' field.
data['article_text'][0]

'a fridge packed full of Coca-Cola and chocolate for her convenience. She was offered a gastric band on the NHS if she could lose enough weight to prove she wanted the operation but she failed to do so . She was unable to walk just a few steps to the shower or the living room without getting out of breath. Mr Davies, who said he had worried about his wife\'s health, added: \'She spent most of her time in bed, she had a telly beside her but she didn\'t watch it. Sometimes she would have music on without the picture. \'She couldn\'t walk very far. She had to get assisted to get to the shower. It took two of us, myself and her carer. \'I felt sad for her. I didn\'t feel sorry for her, I felt sad for her. \'I did worry about her health but I didn\'t like to tell her in case it made her upset. I wanted her to lose the weight so she was healthy.\' She was offered a gastric band by doctors if she lost enough weight to prove she wanted the £12,000 operation and when she didn\'t the operation w

In [24]:
# Scratch cell: split the first article of the batch on single spaces.
# NOTE(review): the split result is neither assigned nor the cell's last
# expression, so it is discarded; `new` is not used in the visible cells.
# NOTE(review): the recorded TypeError below does not look reproducible from
# these two lines — presumably left over from an earlier version of this cell
# or an earlier kernel state; confirm, then re-run or delete the cell.
data['article_text'][0].split(' ')
new = []

TypeError: list indices must be integers or slices, not list

In [16]:
import numpy as np

# Draw `dataset.max_length` distinct integers from [0, 3932) by shuffling the
# full range and keeping a prefix, then list them in ascending order.
arr = np.arange(3932)
np.random.shuffle(arr)
arr = sorted(arr[:dataset.max_length])
arr

[14,
 15,
 36,
 100,
 108,
 127,
 141,
 144,
 151,
 156,
 208,
 211,
 212,
 239,
 247,
 257,
 258,
 264,
 274,
 299,
 319,
 343,
 347,
 350,
 360,
 386,
 389,
 390,
 400,
 405,
 410,
 420,
 437,
 439,
 440,
 451,
 474,
 483,
 487,
 541,
 549,
 551,
 558,
 569,
 587,
 593,
 599,
 607,
 611,
 613,
 620,
 621,
 629,
 674,
 686,
 707,
 725,
 754,
 790,
 829,
 843,
 848,
 859,
 903,
 905,
 943,
 959,
 973,
 983,
 988,
 990,
 1000,
 1001,
 1010,
 1018,
 1027,
 1039,
 1059,
 1067,
 1119,
 1123,
 1135,
 1184,
 1203,
 1212,
 1227,
 1263,
 1265,
 1281,
 1296,
 1315,
 1318,
 1328,
 1352,
 1378,
 1384,
 1398,
 1399,
 1407,
 1450,
 1457,
 1480,
 1487,
 1530,
 1557,
 1586,
 1591,
 1593,
 1616,
 1624,
 1627,
 1651,
 1662,
 1663,
 1664,
 1693,
 1703,
 1706,
 1709,
 1743,
 1745,
 1804,
 1806,
 1822,
 1826,
 1835,
 1836,
 1838,
 1855,
 1858,
 1869,
 1945,
 1977,
 1987,
 1997,
 2012,
 2015,
 2040,
 2047,
 2048,
 2059,
 2063,
 2068,
 2071,
 2087,
 2091,
 2095,
 2110,
 2131,
 2158,
 2192,
 2193,
 2230,
 22

In [4]:
# Tokenise the batch of articles: truncate to dataset.max_length tokens, pad
# to the longest sequence in the batch, and return PyTorch tensors.
out = dataset.tokenizer(
    data['article_text'],
    max_length=dataset.max_length,
    truncation=True,
    padding='longest',
    return_tensors="pt",
)

In [6]:
# Inspect the shape of the tokenised batch's input ids.
out['input_ids'].shape

torch.Size([4, 256])