In [1]:
import torch 
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import math

from datasets import load_dataset
dataset = load_dataset("gigaword")

Using custom data configuration default
Reusing dataset gigaword (/tmp/xdg-cache/huggingface/datasets/gigaword/default/1.2.0/c518c578e42a6afe842b09e979ee2907ea42a12b57ba992fae9e9d7347825245)


In [2]:
import pandas as pd

entailed = pd.read_csv("../data/gigawordfiltered.csv")

In [3]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
src_text = [text for i,text in enumerate(dataset['train']['document'][0:1000]) if entailed['index_keys'][i] == 1]
target_text = [text for i,text in enumerate(dataset['train']['summary'][0:1000]) if entailed['index_keys'][i] == 1]
print(len(src_text))
print(len(target_text))

model_name = 'google/pegasus-gigaword'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)

#model = PegasusForConditionalGeneration.from_pretrained(model_name,return_dict=True,output_attentions=True,output_hidden_states=True).to(torch_device)

train_data = tokenizer.prepare_seq2seq_batch(src_text, target_text, return_tensors="pt", truncation="only_first", padding="longest", max_length=64)

input_ids_train = train_data['input_ids']
attention_masks_train = train_data['attention_mask']
labels_train = train_data['labels']

947
947


In [4]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, PegasusConfig, modeling_outputs

In [6]:
config = PegasusConfig.from_pretrained(model_name, output_hidden_states=True, output_attentions=True)

In [7]:
pega_copyback_model = PegasusForConditionalGeneration.from_pretrained(model_name, config=config).to(torch_device)

Some weights of PegasusGuidedCopyBack were not initialized from the model checkpoint at google/pegasus-gigaword and are newly initialized: ['outdegree_score_w', 'indegree_score_w', 'p_gen_w.weight', 'p_gen_w.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

batch_size = 2
dataloader_train = DataLoader(dataset_train,sampler=RandomSampler(dataset_train), batch_size=batch_size)

#freezing the parameters
# for param in pega_copyback_model.model.parameters():
#     param.requires_grad = False
optimizer = AdamW(filter(lambda p: p.requires_grad, pega_copyback_model.parameters()),lr=5e-5)
                  
epochs = 10

In [9]:
from tqdm.notebook import tqdm
import random

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)  

for epoch in tqdm(range(1, epochs+1)):
    
    pega_copyback_model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for b in progress_bar:

        optimizer.zero_grad()
        
        b = tuple(x.to(torch_device) for x in b)
        
        inputs = {'input_ids':      b[0],
                  'attention_mask': b[1],
                  'labels':         b[2],
                 }       

        outputs = pega_copyback_model(**inputs)
#         vocab_size =  outputs['logits'].shape[2]
    
        loss = outputs[0]#criterion(outputs.view(-1,vocab_size),b[2].view(-1))

        loss_train_total += loss.item()
        loss.backward()

#         torch.nn.utils.clip_grad_norm_(pega_copyback_model.parameters(), 1.0)

        optimizer.step()
        #scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(b))})
         
        
    torch.save(pega_copyback_model.state_dict(), f'data/finetuned_pega_in_outdegree_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=474.0, style=ProgressStyle(description_widt…




Epoch 1
Training loss: 7.250252215168144


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=474.0, style=ProgressStyle(description_widt…


Epoch 2
Training loss: 1.5866059822340806


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=474.0, style=ProgressStyle(description_widt…


Epoch 3
Training loss: 0.675090633144107


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=474.0, style=ProgressStyle(description_widt…


Epoch 4
Training loss: 0.519468003467417


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=474.0, style=ProgressStyle(description_widt…


Epoch 5
Training loss: 0.398780288807419



In [10]:
pega_copyback_model.eval()

tgt_text = tokenizer.batch_decode(pega_copyback_model.generate(train_data['input_ids'][[0],:].to(torch_device)))#, skip_special_tokens=True)

In [11]:
print(tgt_text)

['australian australian australian australian australian australian australian current account current account current account deficit narrows australian current account deficit narrows australian current account deficit narrows #.unk_3']


In [12]:
from datasets import load_metric

metric = load_metric("rouge")

src_text = dataset['validation']['document'][0:100]
target_text = dataset['validation']['summary'][0:100]

val_data = tokenizer.prepare_seq2seq_batch(src_text, target_text, return_tensors="pt", truncation="only_first", padding="longest", max_length=64)

input_ids_val = val_data['input_ids']
attention_masks_val = val_data['attention_mask']
labels_val = val_data['labels']

dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

batch_size = 1
dataloader_val = DataLoader(dataset_val, batch_size=batch_size)



with torch.no_grad():
    for b in dataloader_val:        
        b = tuple(x.to(torch_device) for x in b)
        
        inputs = {'input_ids':      b[0],
                  'attention_mask': b[1],
                  'labels':         b[2],
                 }       
        gen = tokenizer.batch_decode(pega_copyback_model.generate(inputs['input_ids']))
        ref = tokenizer.batch_decode(inputs['labels'])
        metric.add_batch(predictions=gen, references=ref)
        
print(metric.compute())

{'rouge1': AggregateScore(low=Score(precision=0.07872079594325206, recall=0.24750711580086587, fmeasure=0.1179690066388292), mid=Score(precision=0.09252074497511822, recall=0.2872151875901876, fmeasure=0.1377107346346041), high=Score(precision=0.1072972561640747, recall=0.3338265422077921, fmeasure=0.15935673330602604)), 'rouge2': AggregateScore(low=Score(precision=0.026462486831683726, recall=0.08466808261183262, fmeasure=0.03994401548319826), mid=Score(precision=0.03646153870759598, recall=0.11838762626262628, fmeasure=0.05501554192773852), high=Score(precision=0.04759849511188942, recall=0.15862531565656562, fmeasure=0.07195194785219448)), 'rougeL': AggregateScore(low=Score(precision=0.07616191244615397, recall=0.2366564574314574, fmeasure=0.11371305347403333), mid=Score(precision=0.0893201354396172, recall=0.277267316017316, fmeasure=0.13299962625777167), high=Score(precision=0.1032196970907561, recall=0.3189087932900434, fmeasure=0.15322803518562703)), 'rougeLsum': AggregateScore(