In [None]:
!pip install textdistance

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM
import torch
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import warnings
import matplotlib.pyplot as plt
import textdistance as td

In [None]:
# loading synthetic datasets from load path
load_path = './datasets/'

train_dataset = pd.read_csv(load_path + 'train_dataset_60k.csv')
eval_dataset = pd.read_csv(load_path + 'validation_dataset_60k.csv')
test_dataset = pd.read_csv(load_path + 'test_dataset_60k.csv')

train_size = len(train_dataset)
eval_size = len(eval_dataset)
test_size = len(test_dataset)

# prefix prepended to every source sentence before it is fed to the model
task_prefix = "Γραμματική: "


def _clean_text(text):
    """Replace newlines and carriage returns with a space and drop tabs."""
    return text.replace("\n", " ").replace("\r", " ").replace("\t", "")


def _build_pairs(frame, prefix):
    """Return (prefixed cleaned sources, cleaned targets) for one dataset.

    Args:
        frame: DataFrame with 'sentence' and 'corrected' string columns.
        prefix: task prefix prepended to every cleaned source sentence.

    Returns:
        Two lists of equal length, in the row order of ``frame``.
    """
    originals = [prefix + _clean_text(sentence) for sentence in frame['sentence']]
    corrected = [_clean_text(sentence) for sentence in frame['corrected']]
    return originals, corrected


# basic data cleaning from special characters for each dataset split
train_original, train_corrected = _build_pairs(train_dataset, task_prefix)
eval_original, eval_corrected = _build_pairs(eval_dataset, task_prefix)
test_original, test_corrected = _build_pairs(test_dataset, task_prefix)

# printing an example from each dataset
a = 47
for originals, corrected in (
    (train_original, train_corrected),
    (eval_original, eval_corrected),
    (test_original, test_corrected),
):
    # skip splits that are too small to contain the example index
    if a < len(originals):
        print(originals[a])
        print(corrected[a])
        print()

print("Train Dataset length: {}".format(len(train_original)))
print("Validation Dataset length: {}".format(len(eval_original)))
print("Test Dataset length: {}".format(len(test_original)))


# class used for pytorch dataloader
class GreekDataset(Dataset):
    """Index-aligned pairs of source sentences and their corrections.

    Args:
        original: sequence of source sentences.
        corrected: sequence of target sentences, aligned with ``original``.

    Raises:
        ValueError: if the two sequences differ in length.
    """

    def __init__(self, original, corrected):
        # a mismatch would otherwise surface as a late IndexError (or silently
        # ignored targets) in the middle of an epoch
        if len(original) != len(corrected):
            raise ValueError(
                "original and corrected must have the same length "
                "(got {} and {})".format(len(original), len(corrected))
            )
        self.original = original
        self.corrected = corrected

    def __getitem__(self, index):
        """Return the ``(original, corrected)`` pair stored at ``index``."""
        return self.original[index], self.corrected[index]

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.original)

# wrap the cleaned training pairs and build the shuffled mini-batch loader
# (note: this rebinds `train_dataset` from the raw DataFrame to the Dataset wrapper)
train_dataset = GreekDataset(original=train_original, corrected=train_corrected)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=2,
)

In [None]:
# This cell runs before the model-loading cell, so on a fresh kernel the
# tokenizer would not exist yet; load it here. Keep the checkpoint name in
# sync with the one used in the model-loading cell.
tokenizer = AutoTokenizer.from_pretrained('IMISLab/GreekT5-umt5-base-greeksum')

# length in tokens of every original and corrected test sentence respectively
lens_or = [len(tokenizer.encode(sentence)) for sentence in test_original]
lens_corr = [len(tokenizer.encode(sentence)) for sentence in test_corrected]

# checking if length requirements are met
max_or = max(lens_or)
max_corr = max(lens_corr)
print("Max original length: "+ str(max_or))
print("Original Index: " + str(lens_or.index(max_or)))
print("Max corrected length: " + str(max_corr))
print("Corrected Index: " + str(lens_corr.index(max_corr)))

In [None]:
# checkpoint to fine-tune and the device to run it on
model_name = 'IMISLab/GreekT5-umt5-base-greeksum'
device = "cuda" if torch.cuda.is_available() else "cpu"

# pretrained seq2seq model and its tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# AdamW optimizer over all model parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)

# plateau scheduler in 'max' mode: it is stepped with the validation BLEU
# in the training cell and scales the learning rate by `factor`
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=1,
)

# trailing ';' suppresses the model repr in the cell output
model.to(device);

In [None]:
# a function that calculates Normalized Levenshtein Similarity and BLEU score
# for a given pair of sentences
def calculate_metrics(reference, predicted):
    """Score one predicted sentence against its reference.

    Args:
        reference: the gold sentence, as a string.
        predicted: the model output, as a string.

    Returns:
        Tuple ``(nld, bleu_score)``: the character-level normalized
        Levenshtein similarity and the sentence-level BLEU score computed
        on whitespace-separated tokens.
    """
    # 1 - Levenshtein(reference, predicted) / max(len(reference), len(predicted))
    nld = td.levenshtein.normalized_similarity(reference, predicted)

    # sentence_bleu expects token lists; a raw string would be iterated
    # character by character and scored on character n-grams instead of words
    bleu_score = sentence_bleu([reference.split()], predicted.split())

    return nld, bleu_score

# a function that calculates metrics in validation dataset and returns the mean values
def evaluation_model(model, tokenizer, eval_original, eval_corrected):
    """Generate a prediction for every validation sentence and average the metrics.

    The model is switched to eval mode and is left in that mode on return.
    Inputs are moved to the module-level ``device``.

    Args:
        model: seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: tokenizer exposing ``encode()`` and ``decode()``.
        eval_original: list of (prefixed) source sentences.
        eval_corrected: list of reference sentences aligned with ``eval_original``.

    Returns:
        Tuple ``(mean_nld, mean_bleu)`` over all sentence pairs.
    """
    model.eval()
    nlds = []
    bleus = []
    total = len(eval_original)

    with torch.no_grad():
        torch.cuda.empty_cache()

        for i in range(total):
            enc_seq = tokenizer.encode(eval_original[i], return_tensors="pt")
            dec_seq = model.generate(enc_seq.to(device), max_new_tokens=128)
            predicted = tokenizer.decode(dec_seq[0], skip_special_tokens=True)

            (nld, bleu) = calculate_metrics(eval_corrected[i], predicted)
            nlds.append(nld)
            bleus.append(bleu)

            # progress report every 200 sentences
            if i % 200 == 0:
                print("Completed {:.2f} %".format((i/total)*100))

    mean_nld = np.mean(nlds)
    mean_bleu = np.mean(bleus)

    return mean_nld, mean_bleu


In [None]:
# number of epochs
num_epochs = 1
# some counters for printing purposes
examples = 0
counter = 0

# number of examples in each batch
batch_size = 4

# print training progress every time this many examples have been processed
report_every = 400

training_losses = []
val_nlds = []
val_bleus = []

# for clearer output purposes
warnings.filterwarnings("ignore", category=UserWarning, module="nltk.translate.bleu_score")

# despite its name this counts processed examples, not optimizer steps
num_steps = 0

for epoch in range(num_epochs):

  # progress is reported relative to one pass over the training set,
  # so the per-epoch example count must start from zero every epoch
  examples = 0
  epoch_loss_sum = 0.0
  epoch_batches = 0

  # evaluation_model() leaves the model in eval mode; switch back once per epoch
  model.train()

  for (source,target) in train_loader:

    torch.cuda.empty_cache()

    enc_seq = list(source)
    dec_seq = list(target)

    # tokenize encoder and decoder inputs
    enc_input = tokenizer(enc_seq, padding=True, return_tensors='pt')
    dec_input = tokenizer(dec_seq, padding=True, return_tensors='pt')

    # transfer to gpu
    enc_ids = enc_input['input_ids'].to(device)
    enc_mask = enc_input['attention_mask'].to(device)

    # handle padding tokens: -100 labels are excluded from the loss
    dec_ids = dec_input['input_ids'].to(device)
    dec_ids[dec_ids == tokenizer.pad_token_id] = -100

    # forward pass and cross entropy calculation
    curr_loss = model(input_ids=enc_ids, attention_mask=enc_mask, labels=dec_ids).loss
    # backpropagation process
    curr_loss.backward()
    # update the weights
    optimizer.step()

    # clear gradients
    optimizer.zero_grad()

    # accumulate the loss so the epoch is summarised by its mean,
    # not by whichever mini-batch happened to come last
    epoch_loss_sum += curr_loss.item()
    epoch_batches += 1

    # update counters with the real batch size (the last batch may be smaller)
    seen = len(enc_seq)
    examples = examples + seen
    counter = counter + seen
    num_steps = num_steps + seen

    # printing progress whenever a multiple of `report_every` has been crossed
    if counter // report_every > (counter - seen) // report_every:
      print("Completed {:.2f} %".format((examples/train_size)*100))

  # mean training loss over the epoch (0.0 if the loader yielded no batch)
  epoch_loss = epoch_loss_sum / max(epoch_batches, 1)

  # evaluate the model's performance in validation dataset
  (val_nld,val_bleu) = evaluation_model(model,tokenizer, eval_original, eval_corrected)
  print("Validation NLS: {} BLEU: {}".format(val_nld,val_bleu))
  print("Loss: {}".format(epoch_loss))

  # update learning rate
  scheduler.step(val_bleu)

  # save results
  val_nlds.append(val_nld)
  val_bleus.append(val_bleu)
  training_losses.append(epoch_loss)

In [None]:
# destination file for the model checkpoint (written to the working directory)
modelname = 'greek-t5-after-training-epochs-10.pt'
load_path = './' + modelname

# NOTE(review): the epoch count is hard-coded here and in the file name;
# confirm it matches the training run that produced these weights
checkpoint_state = {
    'epoch': 10,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': curr_loss.item(),
    'scheduler': scheduler.state_dict(),
}

# persist model, optimizer and scheduler state together with the last loss value
torch.save(checkpoint_state, load_path)