The purpose of this project is to fine-tune T5 model for generating evidence from the News Article.

---

In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)

Mounted at /content/gdrive


In [2]:
import json

with open('/content/gdrive/MyDrive/EvidenceQuery/train.json', 'r') as f:
  train_data = json.load(f)

train_src = []
train_tgt = []

i = 0
for d in train_data:
  train_src.append(d["src"]) 
  train_tgt.append(d["evd"])
  i = i + 1
  if i >= 70000:
    break


with open('/content/gdrive/MyDrive/EvidenceQuery/val.json', 'r') as f:
  val_data = json.load(f)

val_src = []
val_tgt = []

for d in val_data:
  val_src.append(d["src"]) 
  val_tgt.append(d["evd"])

#print(train_src)
#print(train_tgt)

#print(val_src)
#print(val_tgt)


In [3]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
!pip install transformers
!pip install sentencepiece==0.1.91

In [5]:
from transformers import T5Tokenizer
import torch

class T5Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])

tokenizer = T5Tokenizer.from_pretrained('t5-base')

def tokenize_data(texts, labels):
  encodings = tokenizer(texts,  padding=True, truncation=True)
  decodings = tokenizer(labels, padding=True)
  dataset_tokenized = T5Dataset(encodings, decodings)
  return dataset_tokenized

train_dataset = tokenize_data(train_src, train_tgt)
val_dataset = tokenize_data(val_src, val_tgt)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

In [6]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
import torch

model_name = 't5-base'

model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

training_args = TrainingArguments(
  output_dir='/content/gdrive/MyDrive/EvidenceQuery/results',
  evaluation_strategy='steps',
  eval_steps=500,
  warmup_steps=5000,
  weight_decay=0.01,
  logging_dir='/content/gdrive/MyDrive/EvidenceQuery/logs',
  logging_steps=100,
  load_best_model_at_end=True,
  
  per_device_train_batch_size=8,
  per_device_eval_batch_size=32,

)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=val_dataset,
  tokenizer=tokenizer
)

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [7]:
import os

trainer.train()

dir = '/content/gdrive/MyDrive/EvidenceQuery/model'

model.save_pretrained(dir)

***** Running training *****
  Num examples = 70000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 26250


Step,Training Loss,Validation Loss
500,1.931,1.179201
1000,1.117,0.842306
1500,1.0187,0.785975
2000,0.9783,0.758993
2500,0.954,0.73927
3000,0.9352,0.729296
3500,0.9302,0.72265
4000,0.9053,0.713448
4500,0.8851,0.704933
5000,0.8729,0.694572


***** Running Evaluation *****
  Num examples = 1337
  Batch size = 32
Saving model checkpoint to /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500
Configuration saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/config.json
Model weights saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/special_tokens_map.json
Copy vocab file to /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-500/spiece.model
***** Running Evaluation *****
  Num examples = 1337
  Batch size = 32
Saving model checkpoint to /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-1000
Configuration saved in /content/gdrive/MyDrive/EvidenceQuery/results/checkpoint-1000/config.json
Model weights saved in /content/gdrive/MyDrive/EvidenceQuery/resu