In [None]:
!pip install textdistance

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM
import torch
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import warnings
import matplotlib.pyplot as plt
import textdistance as td

In [None]:
# Load the pretrained seq2seq checkpoint and its matching tokenizer.
model_name = 'IMISLab/GreekT5-umt5-base-greeksum'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Move the model to its device BEFORE building the optimizer, so the optimizer
# is constructed over the parameters in their final location (the conventional
# order for torch.optim).
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device);

# AdamW plus a step schedule: the learning rate is multiplied by 0.5 after
# every 5 calls to scheduler.step().
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

In [None]:
# Prefix string that is concatenated in front of every input sentence below.
task_prefix = "Γραμματική: "

# GitHub locations of the two corpora: GLC is a CSV file, GNC an Excel sheet.
glc_load_url = 'https://raw.githubusercontent.com/katkorre/elerrant/main/GLC2.csv'
gnc_load_url = 'https://github.com/katkorre/elerrant/raw/main/GNC/GNC_annotator_A.xlsx'

# Download each corpus straight into a DataFrame.
glc = pd.read_csv(glc_load_url)
gnc = pd.read_excel(gnc_load_url)

def _clean_sentence(text, strip_breaks):
    """Normalise the whitespace of a single sentence.

    Args:
        text: the sentence to clean.
        strip_breaks: when true, newlines and carriage returns are replaced by
            a space and tabs are removed before spaces are collapsed.

    Returns:
        The sentence with every run of spaces collapsed to one space.
    """
    if strip_breaks:
        text = text.replace("\n", " ").replace("\r", " ").replace("\t", "")
    return re.sub(" +", " ", text)


def _build_pairs(originals, corrections, prefix, strip_breaks):
    """Build index-aligned (model input, target) lists from two columns.

    Args:
        originals: iterable of original sentences; non-string cells (e.g. NaN
            for an empty row) are treated as missing.
        corrections: iterable of corrected sentences aligned with `originals`;
            a non-string cell means the original sentence has no errors.
        prefix: task prefix put in front of every model input.
        strip_breaks: forwarded to `_clean_sentence`.

    Returns:
        (sources, targets): two lists with one entry per input row. Rows whose
        original sentence is missing are kept as '' placeholders in both lists
        so that positions still match the input rows; they are expected to be
        filtered out later.
    """
    sources, targets = [], []
    for original, corrected in zip(originals, corrections):
        if not isinstance(original, str):
            sources.append('')
            targets.append('')
            continue

        # No correction recorded: the target is the original sentence itself.
        if not isinstance(corrected, str):
            corrected = original

        # Input and target go through the SAME normalisation, so an error-free
        # pair never differs merely by whitespace.
        sources.append(_clean_sentence(prefix + original, strip_breaks))
        targets.append(_clean_sentence(corrected, strip_breaks))
    return sources, targets


# GLC: line breaks / tabs are stripped in addition to collapsing spaces.
glc_original, glc_corrected = _build_pairs(
    glc['original_text'], glc['corrected_text'], task_prefix, strip_breaks=True)

# GNC: only runs of spaces are collapsed; empty rows stay as '' placeholders.
gnc_original, gnc_corrected = _build_pairs(
    gnc['Original Text'], gnc['Corrected Text'], task_prefix, strip_breaks=False)


# Concatenate the two corpora (GLC first, then GNC); both lists stay index-aligned.
dataset_original = glc_original + gnc_original
dataset_corrected = glc_corrected + gnc_corrected

# Maximum number of tokens allowed for an input sentence.
max_tokens = 120

# Keep a pair only when its input is non-empty and encodes to at most
# `max_tokens` tokens. The emptiness test comes first so placeholder rows are
# dropped without being tokenized. Only the input length is checked here.
kept_pairs = [
    (source, target)
    for source, target in zip(dataset_original, dataset_corrected)
    if source != '' and len(tokenizer.encode(source)) <= max_tokens
]
dataset_new_original = [source for source, target in kept_pairs]
dataset_new_corrected = [target for source, target in kept_pairs]

# The filtered lists replace the working dataset (as independent copies).
dataset_original = list(dataset_new_original)
dataset_corrected = list(dataset_new_corrected)

# Token counts of every retained input sentence and of every retained target.
lens_or = [len(tokenizer.encode(sentence)) for sentence in dataset_original]
lens_corr = [len(tokenizer.encode(sentence)) for sentence in dataset_corrected]

# Report the longest input / target and the dataset size as a sanity check.
print('Max original length: {} tokens'.format(max(lens_or)))
print('Max corrected length: {} tokens'.format(max(lens_corr)))
print('Dataset length: {}\n'.format(len(dataset_original)))

# 80% training / 20% validation split with a fixed random state.
train_original, eval_original, train_corrected, eval_corrected = train_test_split(
    dataset_original,
    dataset_corrected,
    test_size=0.2,
    random_state=20,
)
print("Train Dataset Length: {}".format(len(train_original)))
print("Evaluation Dataset Length: {}\n".format(len(eval_original)))

# Show one example pair: model input first, then its target.
a = 43
print(dataset_original[a])
print(dataset_corrected[a])

# Dataset wrapper consumed by the PyTorch DataLoader
class GreekDataset(Dataset):
    """Index-aligned pairs of original and corrected sentences."""

    def __init__(self, original, corrected):
        """Store the two parallel sequences without copying them."""
        self.original, self.corrected = original, corrected

    def __len__(self):
        """Number of pairs, taken from the length of the original sequence."""
        return len(self.original)

    def __getitem__(self, index):
        """Return the (original, corrected) pair stored at `index`."""
        return self.original[index], self.corrected[index]

# Wrap the training split and batch it: 4 pairs per batch, shuffled, loaded by
# 2 worker processes.
train_dataset = GreekDataset(original=train_original, corrected=train_corrected)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=2,
)

In [None]:
# Computes Normalized Levenshtein Similarity and BLEU for one
# (reference, predicted) sentence pair.
def calculate_metrics(reference, predicted):
    """Score a predicted sentence against its reference.

    Args:
        reference: the gold corrected sentence (str).
        predicted: the sentence produced by the model (str).

    Returns:
        (nld, bleu_score): the normalized Levenshtein similarity computed on
        the raw strings, and the sentence-level BLEU computed on
        whitespace-separated tokens.
    """
    # 1 - Levenshtein(reference, predicted) / max(len(reference), len(predicted))
    nld = td.levenshtein.normalized_similarity(reference, predicted)

    # sentence_bleu expects token lists; passing the raw strings would make it
    # count character n-grams instead of word n-grams.
    bleu_score = sentence_bleu([reference.split()], predicted.split())

    return nld, bleu_score

# Scores every validation sentence with calculate_metrics and averages the
# Normalized Levenshtein Similarity and BLEU values.
def evaluation_model(model, tokenizer, eval_original, eval_corrected):
    """Generate a correction for each validation input and average the metrics.

    Args:
        model: seq2seq model; it is switched to eval mode and left that way.
        tokenizer: tokenizer used to encode inputs and decode generations.
        eval_original: model inputs, one sentence per entry.
        eval_corrected: reference sentences aligned with `eval_original`.

    Returns:
        (mean_nld, mean_bleu) over all validation sentences.
    """
    model.eval()
    nlds, bleus = [], []

    with torch.no_grad():
        torch.cuda.empty_cache()

        for i, sentence in enumerate(eval_original):
            # encode one input, generate at most 128 new tokens, decode to text
            input_ids = tokenizer.encode(sentence, return_tensors="pt").to(device)
            generated = model.generate(input_ids, max_new_tokens=128)
            predicted = tokenizer.decode(generated[0], skip_special_tokens=True)

            # per-sentence scores against the aligned reference
            nld, bleu = calculate_metrics(eval_corrected[i], predicted)
            nlds.append(nld)
            bleus.append(bleu)

    return np.mean(nlds), np.mean(bleus)

In [None]:
# number of epochs
num_epochs = 25

# per-epoch history: mean training loss and validation metrics
training_losses = []
val_nlds = []
val_bleus = []

# used for more clear printing
warnings.filterwarnings("ignore", category=UserWarning, module="nltk.translate.bleu_score")

# Drop any gradients left behind by an interrupted earlier run of this cell.
optimizer.zero_grad()

# for each epoch
for epoch in range(num_epochs):

  # evaluation_model() switches the model to eval mode at the end of every
  # epoch, so training mode has to be re-enabled here.
  model.train()
  batch_losses = []

  # training loop
  for (source,target) in train_loader:

    torch.cuda.empty_cache()

    enc_seq = list(source)
    dec_seq = list(target)

    # tokenize original and corrected sentences
    enc_input = tokenizer(enc_seq, padding=True, return_tensors='pt')
    dec_input = tokenizer(dec_seq, padding=True, return_tensors='pt')

    # store in gpu
    enc_ids = enc_input['input_ids'].to(device)
    enc_mask = enc_input['attention_mask'].to(device)

    # padding positions in the labels are set to -100
    dec_ids = dec_input['input_ids'].to(device)
    dec_ids[dec_ids == tokenizer.pad_token_id] = -100

    # calculate batch loss
    curr_loss = model(input_ids=enc_ids, attention_mask=enc_mask, labels=dec_ids).loss
    curr_loss.backward()
    # change weights
    optimizer.step()
    # clear grad
    optimizer.zero_grad()

    batch_losses.append(curr_loss.item())

  # Report the mean loss over all batches of the epoch; the loss of the last
  # mini-batch alone is a very noisy estimate.
  epoch_loss = float(np.mean(batch_losses))
  print("Epoch {}/{} loss: {}".format(epoch+1, num_epochs, epoch_loss))

  # calculate validation set metrics
  (val_nld,val_bleu) = evaluation_model(model,tokenizer, eval_original, eval_corrected)
  print("Validation NLS: {} BLEU: {}".format(val_nld,val_bleu))
  val_nlds.append(val_nld)
  val_bleus.append(val_bleu)
  training_losses.append(epoch_loss)

  # keep track of learning rate value
  before_lr = optimizer.param_groups[0]["lr"]
  scheduler.step()
  after_lr = optimizer.param_groups[0]["lr"]
  print("lr {} -> {}\n".format(before_lr, after_lr))


In [None]:
# path of the checkpoint file to write
# NOTE(review): the file name mentions 10 epochs and (presumably) gamma 0.1,
# while this notebook trains for `num_epochs` epochs with StepLR gamma=0.5.
# The name is kept unchanged so existing references to the file still resolve
# - confirm and rename if appropriate.
load_path = './'
modelname = 'greek-t5-gec-epochs-10-stepLrSchedulerGamma01.pt'
load_path = load_path + modelname

# saving training checkpoint
torch.save({
            # 1-based number of the last epoch entered by the training loop,
            # instead of a hard-coded value that can disagree with the run
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # loss of the last processed mini-batch
            'loss': curr_loss.item(),
            'scheduler': scheduler.state_dict(),
            }, load_path)