The purpose of this project is to fine tune BART model for summarization

\\
Set ***`summarization = 'qry'`*** [To use query from debatepedia]

or, ***`summarization = 'evd'`*** [To use evidence as query]

---

In [1]:
summarization = 'evd'

In [3]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)

Mounted at /content/gdrive


In [4]:
import spacy
import spacy.cli
spacy.cli.download("en_core_web_md")
import en_core_web_md
nlp = en_core_web_md.load()
import nltk
nltk.download('punkt')

import operator

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')

def gen_sorted_docs (q ,src):
  qry = nlp(q)
  source_sorted = {}
  hash_table = {}

  source = tokenizer.tokenize(src)

  i = 0

  for sent in source:
    d = nlp (sent)
    score = qry.similarity(d)
    source_sorted[i] = score
    hash_table[i] = sent
    i = i + 1
  
  freq = operator.itemgetter(1)
  source_sorted = sorted(source_sorted.items(), reverse=True, key=freq)
 
  source = []
 
  for k, v in source_sorted:
    source.append(hash_table[k])
  
  return source


#Training (Fine Tuning)

In [None]:
import json
from tqdm import tqdm

with open('/content/gdrive/MyDrive/EvidenceQuery/debatepediaEvidence_train.json', 'r') as f:
  train_data = json.load(f)

train_src = []
train_tgt = []

for d in tqdm(train_data):
  sortDoc = gen_sorted_docs(d[summarization],d['src'])
  sDoc = " ".join(sortDoc)
  train_src.append(sDoc)
  train_tgt.append(d["tgt"])



with open('/content/gdrive/MyDrive/EvidenceQuery/debatepediaEvidence_valid.json', 'r') as f:
  val_data = json.load(f)

val_src = []
val_tgt = []

for d in tqdm(val_data):
  sortDoc = gen_sorted_docs(d[summarization],d['src'])
  sDoc = " ".join(sortDoc)
  val_src.append(sDoc)
  val_tgt.append(d["tgt"])


In [7]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
!pip install transformers
!pip install sentencepiece==0.1.91
!pip install rouge
!pip install datasets==1.0.2
!pip install rouge_score


In [9]:
from transformers import RobertaTokenizerFast
import torch

class RoBERTaDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

def tokenize_data(texts, labels):
  encodings = tokenizer(texts,  padding=True, truncation=True)
  decodings = tokenizer(labels, padding=True)
  dataset_tokenized = RoBERTaDataset(encodings, decodings)
  return dataset_tokenized

train_dataset = tokenize_data(train_src, train_tgt)
val_dataset = tokenize_data(val_src, val_tgt)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [10]:
import datasets

rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

In [12]:
from transformers import EncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch

model = EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base", tie_encoder_decoder=True)

# set special tokens
model.config.decoder_start_token_id = tokenizer.bos_token_id                                             
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
# set decoding params                               
model.config.max_length = 40
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4
model.config.vocab_size = model.config.encoder.vocab_size


training_args = Seq2SeqTrainingArguments(
  output_dir='/content/gdrive/MyDrive/EvidenceQuery/results',
  evaluation_strategy='steps',
  eval_steps=250,
  warmup_steps=1000,
  weight_decay=0.01,
  logging_dir='/content/gdrive/MyDrive/EvidenceQuery/logs',
  logging_steps=100,
  load_best_model_at_end=True,
  
  per_device_train_batch_size=8,
  per_device_eval_batch_size=32,
  
  predict_with_generate=True,
  do_train=True,
  do_eval=True,
  overwrite_output_dir=True,
  save_total_limit=1,
  fp16=True, 
)

trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=val_dataset,
  tokenizer=tokenizer,
  
  compute_metrics=compute_metrics
)

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.layer.5.crossattention.self.key.weight', 'roberta.encoder.layer.9.crossattention.output.dense.weight', 'roberta.encoder.layer.10.crossattention.output.den

In [13]:
import os

trainer.train()

dir = '/content/gdrive/MyDrive/EvidenceQuery/model'

model.save_pretrained(dir)

***** Running training *****
  Num examples = 12000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4500


Step,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
250,10.6084,7.611978,0.0,0.0,0.0
500,3.9536,3.028337,0.0,0.0,0.0
750,2.6271,2.507697,0.0,0.0,0.0
1000,2.504,2.41977,0.0026,0.0017,0.002
1250,2.4895,2.345974,0.0049,0.0042,0.0044
1500,2.3444,2.287164,0.0058,0.0042,0.0048
1750,2.234,2.243608,0.0085,0.0059,0.0066
2000,2.2063,2.194828,0.0083,0.0047,0.0059
2250,2.1033,2.129592,0.0104,0.0084,0.0091
2500,2.0633,2.030812,0.0186,0.0157,0.0165


***** Running Evaluation *****
  Num examples = 719
  Batch size = 32
***** Running Evaluation *****
  Num examples = 719
  Batch size = 32
Saving model checkpoint to /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500
Configuration saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/config.json
Model weights saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 719
  Batch size = 32
***** Running Evaluation *****
  Num examples = 719
  Batch size = 32
Saving model checkpoint to /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-1000
Configuration saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-1000/config.json
Model weights saved 

#Testing

In [6]:
import json
from tqdm import tqdm


f = open('/content/gdrive/MyDrive/EvidenceQuery/debatepediaEvidence_test.json')
data = json.load(f)
f.close()


sorted_doc = []
for i in tqdm(range(len(data))):
  sortDoc = gen_sorted_docs(data[i][summarization],data[i]['src'])
  sDoc = " ".join(sortDoc)
  sorted_doc.append(sDoc)


100%|██████████| 1000/1000 [00:37<00:00, 27.02it/s]


In [7]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
!pip install transformers
!pip install sentencepiece==0.1.91
!pip install rouge

In [9]:
from transformers import RobertaTokenizerFast, EncoderDecoderModel

model = EncoderDecoderModel.from_pretrained('/content/gdrive/MyDrive/EvidenceQuery/model').to(device)
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

def generateSummary (text_document):
  input_ids = tokenizer([text_document], max_length=512, truncation = True, return_tensors='pt').to(device)
  output = model.generate(input_ids['input_ids'], max_length=15, min_length=10, no_repeat_ngram_size= 4)
  summary = tokenizer.batch_decode(output, skip_special_tokens=True)
  return summary[0]

The following encoder weights were not tied to the decoder ['roberta/pooler']
The following encoder weights were not tied to the decoder ['roberta/pooler']


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [10]:
generated_summary = []

for i in tqdm(range(len(sorted_doc))):
  generated_summary.append(generateSummary(sorted_doc[i]))

100%|██████████| 1000/1000 [03:05<00:00,  5.38it/s]


In [11]:
reference_summary = []

for i in range(len(data)):
  reference_summary.append(data[i]['tgt'])

In [12]:
from rouge import Rouge 

rouge = Rouge()

scores = rouge.get_scores(generated_summary, reference_summary)

avg_score = {'r1r':0, 'r1p':0, 'r1f':0, 'r2r':0, 'r2p':0, 'r2f':0, 'rlr':0, 'rlp':0, 'rlf':0}
size = len(scores)

for i in tqdm(range(size)):
  avg_score['r1r'] = avg_score['r1r'] + (scores[i]['rouge-1']['r'])
  avg_score['r1p'] = avg_score['r1p'] + (scores[i]['rouge-1']['p'])
  avg_score['r1f'] = avg_score['r1f'] + (scores[i]['rouge-1']['f'])
  avg_score['r2r'] = avg_score['r2r'] + (scores[i]['rouge-2']['r'])
  avg_score['r2p'] = avg_score['r2p'] + (scores[i]['rouge-2']['p'])
  avg_score['r2f'] = avg_score['r2f'] + (scores[i]['rouge-2']['f'])
  avg_score['rlr'] = avg_score['rlr'] + (scores[i]['rouge-l']['r'])
  avg_score['rlp'] = avg_score['rlp'] + (scores[i]['rouge-l']['p'])
  avg_score['rlf'] = avg_score['rlf'] + (scores[i]['rouge-l']['f'])

print('')
print('Rouge-1 (R): ' + str(avg_score['r1r']/size*100))
print('Rouge-1 (P): ' + str(avg_score['r1p']/size*100))
print('Rouge-1 (F1-Score): ' + str(avg_score['r1f']/size*100))
print('Rouge-2 (R): ' + str(avg_score['r2r']/size*100))
print('Rouge-2 (P): ' + str(avg_score['r2p']/size*100))
print('Rouge-2 (F1-Score): ' + str(avg_score['r2f']/size*100))
print('Rouge-L (R): ' + str(avg_score['rlr']/size*100))
print('Rouge-L (P): ' + str(avg_score['rlp']/size*100))
print('Rouge-L (F1-Score): ' + str(avg_score['rlf']/size*100))

100%|██████████| 1000/1000 [00:00<00:00, 393425.01it/s]


Rouge-1 (R): 15.672937122217325
Rouge-1 (P): 28.36309523809519
Rouge-1 (F1-Score): 19.553794870212542
Rouge-2 (R): 2.794965554741093
Rouge-2 (P): 3.750158730158732
Rouge-2 (F1-Score): 3.0880218833255184
Rouge-L (R): 14.623649779872713
Rouge-L (P): 26.426190476190435
Rouge-L (F1-Score): 18.2230254168753



