In [1]:
import sys
sys.path.append('../Src/')
import transformers
from transformers import Trainer, TrainingArguments
import torch
import pandas as pd
import numpy as np  
from prepare_dataset import Summary_dataset
import data_processing

In [None]:
# Read data.txt in one go and drop every newline, leaving a single-line string.
with open('data.txt', 'r') as file:
    data = ''.join(file.read().split('\n'))

In [2]:
# Directories handed to data_processing.text_summary_to_csv in the next cell
# (relative to the notebook's working directory).
text_path, summary_path = '../Data/Text/', '../Data/Summary/'

In [3]:
# Chain the two data_processing helpers (text_summary_to_csv, then text_processing)
# and split the result with train_pct=0.8 into the four train/test pieces.
full_data = data_processing.text_processing(
    data_processing.text_summary_to_csv(text_path, summary_path)
)
(
    train_texts,
    test_texts,
    train_decode,
    test_decode,
) = data_processing.train_val_test_split(full_data, train_pct=0.8)

In [4]:
# Local directories holding the pretrained Pegasus tokeniser and model weights.
tokeniser_path = '../Pretrained/tokeniser/pegasus-tokeniser'
model_path = '../Pretrained/model/pegasus-original'

# Device selection is currently switched off; re-enable the line below together with
# the trailing .to(torch_device) to move the model onto a GPU when one is available.
#torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = transformers.PegasusTokenizer.from_pretrained(tokeniser_path)
model = transformers.PegasusForConditionalGeneration.from_pretrained(
    model_path
)#.to(torch_device)

In [6]:
def _encode(texts):
    """Run the notebook's tokenizer over ``texts`` with truncation and padding enabled."""
    return tokenizer(texts, truncation=True, padding=True)


# Texts fed to the model as inputs.
train_encodings = _encode(train_texts)
test_encodings = _encode(test_texts)

# Texts used as labels.
train_labels = _encode(train_decode)
test_labels = _encode(test_decode)

In [7]:
# Wrap each (encodings, labels) pair in a Summary_dataset, train split first.
train_dataset, test_dataset = (
    Summary_dataset(inputs, targets)
    for inputs, targets in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)

In [12]:
# Training configuration.
# NOTE(review): the output recorded for this cell stops at global_step=5, which is far
# below warmup_steps=500 -- confirm the dataset length and warm-up setting are intended.
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir='../results',
    overwrite_output_dir=True,
    logging_dir='../logs',
    logging_steps=1,                 # log after every step
    # optimisation schedule
    num_train_epochs=5,
    warmup_steps=500,                # learning-rate warm-up steps
    weight_decay=0.01,               # strength of weight decay
    # per-device batch sizes
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # train on CPU even if CUDA is present
    no_cuda=True,
)

# The test split doubles as the evaluation set here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Step,Training Loss
1,10.750761
2,10.688639
3,10.812634
4,10.48642
5,10.866863


TrainOutput(global_step=5, training_loss=10.721063232421875)

In [None]:
# Save the in-memory model to its own directory, separate from the
# '../Pretrained/model/pegasus-original' checkpoint it was loaded from.
model.save_pretrained('../Pretrained/model/pegasus-finetuned')

In [None]:
# Tokenise one sample document into PyTorch tensors so the result can be unpacked
# straight into model.generate(**batch).
# The tokenizer is called directly instead of through prepare_seq2seq_batch: that helper
# is deprecated in transformers 4.x and only returns tensors when return_tensors is
# passed, whereas generate() needs tensors rather than plain Python lists.
# NOTE(review): sample_text_2 is not defined in any visible cell -- assign it before running.
batch = tokenizer(
    [sample_text_2],
    truncation=True,
    padding='longest',
    return_tensors='pt',
)#.to(torch_device)

In [None]:
# Sample one summary for the prepared batch.
gen = model.generate(
    **batch,
    # length bounds of the generated summary
    min_length=100,
    max_length=200,
    length_penalty=5,          # original note: values above 1 favour longer sequences
    # sampling settings
    do_sample=True,
    temperature=3.0,
    top_k=30,
    top_p=0.70,
    repetition_penalty=1.2,
    # how many summaries to return
    num_return_sequences=1,
)

In [None]:
# Turn the generated token ids back into text, leaving out special tokens.
summary = tokenizer.batch_decode(gen, skip_special_tokens=True)

In [None]:
# Show the decoded output of batch_decode from the previous cell.
print(summary)

In [None]:
# Tokenize
# NOTE(review): val_texts and val_decode are not assigned in any visible cell (the split
# earlier in the notebook unpacks four values only) -- confirm where they come from.
tok_opts = dict(truncation=True, padding=True)

# Model inputs for each split.
train_encodings = tokenizer(train_texts, **tok_opts)
val_encodings = tokenizer(val_texts, **tok_opts)
test_encodings = tokenizer(test_texts, **tok_opts)

# Labels for each split.
train_labels = tokenizer(train_decode, **tok_opts)
val_labels = tokenizer(val_decode, **tok_opts)
test_labels = tokenizer(test_decode, **tok_opts)

# Setup dataset objects
class Summary_dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenised inputs with tokenised labels.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to a
            per-sample sequence, such as the object returned by calling the
            tokenizer on the input texts.
        labels: Mapping with an ``'input_ids'`` entry that holds one row of
            label token ids per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict contains one tensor per field in ``encodings`` plus a
        ``'labels'`` tensor taken from ``labels['input_ids'][idx]``.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        """Return the number of samples, i.e. the number of label rows."""
        # len(self.encodings) would count the mapping's fields (input_ids,
        # attention_mask, ...) instead of the samples, so the dataset would
        # appear to hold only a couple of items regardless of its real size.
        return len(self.labels['input_ids'])

# Build one Summary_dataset per split from the encodings/labels tokenised above.
# NOTE(review): val_encodings / val_labels depend on val_texts and val_decode, which
# are not assigned in any visible cell -- confirm before running this cell.
train_dataset = Summary_dataset(train_encodings, train_labels)
val_dataset = Summary_dataset(val_encodings, val_labels)
test_dataset = Summary_dataset(test_encodings, test_labels)

In [None]:
# Training
from transformers import Trainer, TrainingArguments

# Same hyper-parameters as the earlier training cell, minus the logging and
# overwrite options; evaluation uses the validation split instead of the test split.
training_args = TrainingArguments(
    # where checkpoints go
    output_dir='../results',
    # optimisation schedule
    num_train_epochs=5,
    warmup_steps=500,                # learning-rate warm-up steps
    weight_decay=0.01,               # strength of weight decay
    # per-device batch sizes
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # train on CPU even if CUDA is present
    no_cuda=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()