The purpose of this project is to fine tune Pegasus model for summarization

\\
Set ***`summarization = 'qry'`*** [To use query from debatepedia]

or, ***`summarization = 'evd'`*** [To use evidence as query]

---

In [1]:
summarization = 'evd'

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)

Mounted at /content/gdrive


In [3]:
import spacy
import spacy.cli
spacy.cli.download("en_core_web_md")
import en_core_web_md
nlp = en_core_web_md.load()
import nltk
nltk.download('punkt')

import operator

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')

def gen_sorted_docs (q ,src):
  qry = nlp(q)
  source_sorted = {}
  hash_table = {}

  source = tokenizer.tokenize(src)

  i = 0

  for sent in source:
    d = nlp (sent)
    score = qry.similarity(d)
    source_sorted[i] = score
    hash_table[i] = sent
    i = i + 1
  
  freq = operator.itemgetter(1)
  source_sorted = sorted(source_sorted.items(), reverse=True, key=freq)
 
  source = []
 
  for k, v in source_sorted:
    source.append(hash_table[k])
  
  return source


#Training (Fine Tuning)

In [None]:
import json
from tqdm import tqdm

with open('/content/gdrive/MyDrive/EvidenceQuery/debatepediaEvidence_train.json', 'r') as f:
  train_data = json.load(f)

train_src = []
train_tgt = []

for d in tqdm(train_data):
  sortDoc = gen_sorted_docs(d[summarization],d['src'])
  sDoc = " ".join(sortDoc)
  train_src.append(sDoc)
  train_tgt.append(d["tgt"])



with open('/content/gdrive/MyDrive/EvidenceQuery/debatepediaEvidence_valid.json', 'r') as f:
  val_data = json.load(f)

val_src = []
val_tgt = []

for d in tqdm(val_data):
  sortDoc = gen_sorted_docs(d[summarization],d['src'])
  sDoc = " ".join(sortDoc)
  val_src.append(sDoc)
  val_tgt.append(d["tgt"])


In [6]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
!pip install transformers
!pip install sentencepiece==0.1.91

In [8]:
from transformers import PegasusTokenizer
import torch

class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])

tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

def tokenize_data(texts, labels):
  encodings = tokenizer(texts,  padding=True, truncation=True)
  decodings = tokenizer(labels, padding=True)
  dataset_tokenized = PegasusDataset(encodings, decodings)
  return dataset_tokenized

train_dataset = tokenize_data(train_src, train_tgt)
val_dataset = tokenize_data(val_src, val_tgt)

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

In [9]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch

model_name = 'google/pegasus-xsum'

model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

training_args = TrainingArguments(
  output_dir='/content/gdrive/MyDrive/EvidenceQuery/results',
  evaluation_strategy='steps',
  eval_steps=250,
  warmup_steps=1000,
  weight_decay=0.01,
  logging_dir='/content/gdrive/MyDrive/EvidenceQuery/logs',
  logging_steps=100,
  load_best_model_at_end=True,
  
  per_device_train_batch_size=8,
  per_device_eval_batch_size=32,

)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=val_dataset,
  tokenizer=tokenizer
)

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [10]:
import os

trainer.train()

dir = '/content/gdrive/MyDrive/EvidenceQuery/model'

model.save_pretrained(dir)

***** Running training *****
  Num examples = 12000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4500


Step,Training Loss,Validation Loss
250,8.9542,7.551456
500,7.4467,5.727921
750,1.7589,1.276566
1000,1.3164,1.206037
1250,1.2616,1.165745
1500,1.21,1.136529
1750,1.1555,1.121782
2000,1.1242,1.107132
2250,1.0958,1.092103
2500,1.0984,1.078605


***** Running Evaluation *****
  Num examples = 719
  Batch size = 32
***** Running Evaluation *****
  Num examples = 719
  Batch size = 32
Saving model checkpoint to /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500
Configuration saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/config.json
Model weights saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 719
  Batch size = 32
***** Running Evaluation *****
  Num examples = 719
  Batch size = 32
Saving model checkpoint to /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-1000
Configuration saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-1000/config.json
Model weights saved 

#Testing

In [5]:
import json
from tqdm import tqdm


f = open('/content/gdrive/MyDrive/EvidenceQuery/debatepediaEvidence_test.json')
data = json.load(f)
f.close()


sorted_doc = []
for i in tqdm(range(len(data))):
  sortDoc = gen_sorted_docs(data[i][summarization],data[i]['src'])
  sDoc = " ".join(sortDoc)
  sorted_doc.append(sDoc)


100%|██████████| 1000/1000 [00:37<00:00, 26.34it/s]


In [6]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
!pip install transformers
!pip install sentencepiece==0.1.91
!pip install rouge

In [8]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

model = PegasusForConditionalGeneration.from_pretrained('/content/gdrive/MyDrive/EvidenceQuery/model').to(device)
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

def generateSummary (text_document):
  input_ids = tokenizer([text_document], max_length=512, truncation = True, return_tensors='pt').to(device)
  output = model.generate(input_ids['input_ids'], max_length=15, min_length=10, no_repeat_ngram_size= 4)
  summary = tokenizer.batch_decode(output, skip_special_tokens=True)
  return summary[0]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

In [9]:
generated_summary = []

for i in tqdm(range(len(sorted_doc))):
  generated_summary.append(generateSummary(sorted_doc[i]))

100%|██████████| 1000/1000 [05:15<00:00,  3.17it/s]


In [10]:
reference_summary = []

for i in range(len(data)):
  reference_summary.append(data[i]['tgt'])

In [11]:
from rouge import Rouge 

rouge = Rouge()

scores = rouge.get_scores(generated_summary, reference_summary)

avg_score = {'r1r':0, 'r1p':0, 'r1f':0, 'r2r':0, 'r2p':0, 'r2f':0, 'rlr':0, 'rlp':0, 'rlf':0}
size = len(scores)

for i in tqdm(range(size)):
  avg_score['r1r'] = avg_score['r1r'] + (scores[i]['rouge-1']['r'])
  avg_score['r1p'] = avg_score['r1p'] + (scores[i]['rouge-1']['p'])
  avg_score['r1f'] = avg_score['r1f'] + (scores[i]['rouge-1']['f'])
  avg_score['r2r'] = avg_score['r2r'] + (scores[i]['rouge-2']['r'])
  avg_score['r2p'] = avg_score['r2p'] + (scores[i]['rouge-2']['p'])
  avg_score['r2f'] = avg_score['r2f'] + (scores[i]['rouge-2']['f'])
  avg_score['rlr'] = avg_score['rlr'] + (scores[i]['rouge-l']['r'])
  avg_score['rlp'] = avg_score['rlp'] + (scores[i]['rouge-l']['p'])
  avg_score['rlf'] = avg_score['rlf'] + (scores[i]['rouge-l']['f'])

print('')
print('Rouge-1 (R): ' + str(avg_score['r1r']/size*100))
print('Rouge-1 (P): ' + str(avg_score['r1p']/size*100))
print('Rouge-1 (F1-Score): ' + str(avg_score['r1f']/size*100))
print('Rouge-2 (R): ' + str(avg_score['r2r']/size*100))
print('Rouge-2 (P): ' + str(avg_score['r2p']/size*100))
print('Rouge-2 (F1-Score): ' + str(avg_score['r2f']/size*100))
print('Rouge-L (R): ' + str(avg_score['rlr']/size*100))
print('Rouge-L (P): ' + str(avg_score['rlp']/size*100))
print('Rouge-L (F1-Score): ' + str(avg_score['rlf']/size*100))

100%|██████████| 1000/1000 [00:00<00:00, 488789.65it/s]


Rouge-1 (R): 26.533544917844516
Rouge-1 (P): 27.590082972582984
Rouge-1 (F1-Score): 26.34589416001499
Rouge-2 (R): 9.776692877126331
Rouge-2 (P): 10.211121933621941
Rouge-2 (F1-Score): 9.708468787624037
Rouge-L (R): 24.357042254271395
Rouge-L (P): 25.170569985570012
Rouge-L (F1-Score): 24.102522971243207



