In [None]:
!pip install transformers

In [None]:
from transformers import (T5Tokenizer, 
                          T5ForConditionalGeneration, 
                          AdamW, 
                          get_linear_schedule_with_warmup)

The training function used in this notebook is adapted from the PyTorch transformer tutorial: https://pytorch.org/tutorials/beginner/transformer_tutorial.html

In [None]:
import time
import pandas as pd
import numpy as np
import math
import os
import editdistance

import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Build the dataset

In [None]:
# The file is tab-separated. `sep` has to be passed by keyword: pandas >= 2.0 accepts
# only the path positionally and raises TypeError for a positional separator.
df = pd.read_csv("sentences.tsv", sep="\t")
df.head()

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")

In [None]:
# Histogram of the character length of the original sentences.
df["original"].str.len().hist(bins=20)
plt.show()

In [None]:
# Keep only the rows whose original sentence is shorter than 400 characters.
df = df.loc[df["original"].str.len() < 400]

In [None]:
# Renumber the rows 0..n-1 after the filtering above; drop=True discards the old index
# instead of keeping it as an extra column.
df = df.reset_index(drop=True)

In [None]:
# Inspect the frame after NaN removal, length filtering and re-indexing.
df

In [None]:
def _normalized_edit_distance(row):
    """Return the edit distance between a row's first two columns, scaled to [0, 1].

    The raw distance is divided by the length of the longer of the two strings.
    The columns are addressed by position with ``.iloc``; plain ``row[0]`` on a
    label-indexed row relies on a positional fallback that pandas has deprecated.
    NOTE(review): presumably the first two columns are the original and the
    revised sentence -- confirm against the TSV header.
    """
    first, second = row.iloc[0], row.iloc[1]
    return editdistance.eval(first, second) / max(len(first), len(second))


df["edit_distance"] = df.apply(_normalized_edit_distance, axis=1)

In [None]:
# Histogram of the normalized edit distance between the sentence pairs.
df["edit_distance"].hist(bins=30)
plt.show()

In [None]:
# Prepend the "edit: " task prefix to every source sentence. Concatenating a scalar string
# works row by row, so the result does not depend on the frame having a fresh 0..n-1 index
# (adding a separately built Series would align on the index and yield NaN on any mismatch).
# NOTE: re-running this cell adds the prefix a second time.
df["original"] = "edit: " + df["original"]
df = df.dropna()

In [None]:
class EditDataset(Dataset):
    """Paired dataset of (original sentence, revised sentence).

    Args:
        original_sentences: Indexable sequence of source sentences.
        revised_sentences: Indexable sequence of target sentences, parallel to
            ``original_sentences``.

    Raises:
        ValueError: If the two sequences do not have the same length. Without
            this check a mismatch would only surface as an IndexError in the
            middle of an epoch, or silently drop trailing originals.
    """

    def __init__(self, original_sentences, revised_sentences):
        if len(original_sentences) != len(revised_sentences):
            raise ValueError(
                "original_sentences and revised_sentences must have the same length "
                "(got {} and {})".format(len(original_sentences), len(revised_sentences))
            )
        self.original_sentences = original_sentences
        self.revised_sentences = revised_sentences

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.revised_sentences)

    def __getitem__(self, i):
        """Return the ``(original, revised)`` pair stored at position ``i``."""
        return (self.original_sentences[i], self.revised_sentences[i])

In [None]:
def _encode_batch(sentences, mask_padding=False):
  """Tokenize a batch of sentences once and pad them to the longest one.

  Relies on the notebook-level globals ``tokenizer`` and ``device``.

  Args:
    sentences: Iterable of strings (one side of a batch yielded by the DataLoader).
    mask_padding: When True, padded positions in the returned ids are replaced
      by -100 so that the model's cross-entropy loss ignores them. Use this
      for labels only.

  Returns:
    Tuple ``(input_ids, attention_mask)`` of tensors already moved to ``device``.
  """
  encoded = tokenizer(list(sentences), padding="longest", return_tensors="pt")
  input_ids = encoded["input_ids"]
  attention_mask = encoded["attention_mask"]
  if mask_padding:
    input_ids = input_ids.masked_fill(attention_mask == 0, -100)
  return input_ids.to(device), attention_mask.to(device)


def train(model, trainloader):
  """Run one training epoch followed by one full validation pass.

  Relies on notebook-level globals: ``tokenizer``, ``optimizer``, ``scheduler``,
  ``valloader``, ``device`` and the loop variable ``epoch`` (read only for the
  progress messages).

  Args:
    model: The seq2seq model being fine-tuned; called with ``input_ids``,
      ``attention_mask`` and ``labels`` and expected to return the loss first.
    trainloader: DataLoader yielding ``(original_sentences, revised_sentences)``
      batches of strings.

  Returns:
    None. Progress, the mean training loss and the mean validation loss are
    printed. The model is left in eval mode when the function returns.
  """
  log_interval = 200
  running_loss = 0.
  epoch_loss = 0.
  start_time = time.time()

  # Validation below switches to eval mode, so re-enable dropout on every call.
  model.train()
  for batch, (data, target) in enumerate(trainloader):
    optimizer.zero_grad()

    input_ids, attention_mask = _encode_batch(data)
    # Padding in the labels is masked with -100 so it does not contribute to the loss.
    labels, _ = _encode_batch(target, mask_padding=True)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs[0]
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    # NOTE(review): scheduler.step() is never called anywhere, so the learning rate
    # printed below stays constant. Enable per-batch stepping only together with a
    # num_training_steps equal to the total number of batches over all epochs.

    batch_loss = loss.item()
    running_loss += batch_loss
    epoch_loss += batch_loss

    #Reuse GPU memory
    del input_ids, attention_mask, labels, outputs, loss
    torch.cuda.empty_cache()
    ##########################

    # Count completed batches so that every window averages exactly log_interval losses.
    completed = batch + 1
    if completed % log_interval == 0:
      cur_loss = running_loss / log_interval
      elapsed = time.time() - start_time
      print('| epoch {:3d} | batch {:5d} / {:5d} | '
            'lr {:05.5f} | ms/batch {:5.2f} | '
            'loss {:5.2f} | ppl {:8.2f}'.format(
              epoch + 1, completed, len(trainloader),
              scheduler.get_last_lr()[0],
              elapsed * 1000 / log_interval,
              cur_loss, math.exp(cur_loss)))
      running_loss = 0.
      start_time = time.time()

  ## Validation
  # Dropout off and no autograd graph: the loss is deterministic and cheaper to compute.
  model.eval()
  validation_loss = 0.
  with torch.no_grad():
    for data, target in valloader:
      input_ids, attention_mask = _encode_batch(data)
      labels, _ = _encode_batch(target, mask_padding=True)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
      validation_loss += outputs[0].item()

      #Reuse GPU memory
      del input_ids, attention_mask, labels, outputs
      torch.cuda.empty_cache()
      ##########################

  # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
  epoch_loss = epoch_loss / max(len(trainloader), 1)
  validation_loss = validation_loss / max(len(valloader), 1)
  print('############# Epoch {:3d} ##############'.format(epoch + 1))
  print('Train loss: {:5.2f}; Validation loss: {:5.2f}'.format(epoch_loss, validation_loss))

In [None]:
# Only pairs with a normalized edit distance above 0.3 are used;
# 10% of them are held out for validation, with a fixed seed.
df_train, df_val = train_test_split(
    df.loc[df["edit_distance"] > 0.3],
    test_size=0.1,
    random_state=42,
)

In [None]:
# Wrap the raw string arrays of each split in a paired dataset.
train_data = EditDataset(df_train.original.values, df_train.revised.values)
val_data = EditDataset(df_val.original.values, df_val.revised.values)

# dataloaders
# Training batches are reshuffled every epoch. Validation is read in a fixed order:
# shuffling it gains nothing and changes which samples land in the last, smaller batch,
# which makes the averaged validation loss vary from pass to pass.
trainloader = DataLoader(train_data, batch_size=8, shuffle=True)
valloader = DataLoader(val_data, batch_size=8, shuffle=False)

In [None]:
# The tokenizer and the seq2seq model are loaded from the same pretrained checkpoint.
checkpoint_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

Add missing tokens to the t5-base tokenizer

In [None]:
# Tokenize every source and target sentence with the pretrained tokenizer.
tokenized_original = df.original.apply(lambda x: tokenizer.tokenize(x)).values
tokenized_revised = df.revised.apply(lambda x: tokenizer.tokenize(x)).values

# Collect the distinct tokens seen anywhere in the corpus. Updating a set directly avoids
# building a dense (sentences x longest sentence) padded array just to take its set, and
# removes the None padding marker whose later removal failed when no padding was needed.
corpus_tokens = set()
for sentence_tokens in tokenized_original:
    corpus_tokens.update(sentence_tokens)
for sentence_tokens in tokenized_revised:
    corpus_tokens.update(sentence_tokens)

# Sorted so that the tokens are added in the same order on every run
# (set iteration order is not stable between Python processes).
vocab = sorted(corpus_tokens)
tokenizer_vocab = tokenizer.get_vocab()

missing = [token for token in vocab if token not in tokenizer_vocab]

print("A total of {} missing tokens will be added.".format(len(missing)))

In [None]:
# Register the tokens collected in `missing` with the tokenizer, then resize the model's
# token embeddings to the new tokenizer length so that every token id has an embedding.
tokenizer.add_tokens(missing)
model.resize_token_embeddings(len(tokenizer))

Fine-tune the model

In [None]:
num_epochs = 4

# Total number of optimizer updates over the whole run: batches per epoch times epochs.
# (Floor-dividing by num_epochs instead would make the linear schedule reach a zero
# learning rate a quarter of the way through the first epoch.)
t_total = len(trainloader) * num_epochs

optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
# Linear decay from the initial learning rate to 0 over t_total steps, without warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=t_total
)

In [None]:
# Move the model's parameters to the selected device before training.
model.to(device)

model.train() # Turn on the train mode

# The loop variable must keep the name `epoch`: train() reads it as a global
# when it prints its progress messages.
for epoch in range(num_epochs):
  train(model, trainloader)

In [None]:
# Create the target directories up front: some transformers releases require the
# directory to exist before save_pretrained() is called. exist_ok=True makes this a
# no-op when the notebook is re-run.
os.makedirs("editing_model/", exist_ok=True)
model.save_pretrained("editing_model/")

os.makedirs("editing_tokenizer/", exist_ok=True)
tokenizer.save_pretrained("editing_tokenizer/")

In [None]:
# model.train() was called before fine-tuning; make sure dropout is disabled for generation.
model.eval()
input_ids = tokenizer.encode("edit: Cancer patients was used as a control groups.", return_tensors="pt")  # Batch size 1
input_ids = input_ids.to(device)
# generate() stops after 20 tokens by default, which truncates longer revisions;
# allow up to twice the input length (never less than the previous default).
outputs = model.generate(input_ids, max_length=max(20, 2 * input_ids.shape[-1]))
# Drop padding / end-of-sequence markers from the decoded text.
tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# model.train() was called before fine-tuning; make sure dropout is disabled for generation.
model.eval()
input_ids = tokenizer.encode("edit: The tropical cyclone’s (TC’s) development and movement relate to its structure and background environmental flow.", return_tensors="pt")  # Batch size 1
input_ids = input_ids.to(device)
# generate() stops after 20 tokens by default, which truncates longer revisions;
# allow up to twice the input length (never less than the previous default).
outputs = model.generate(input_ids, max_length=max(20, 2 * input_ids.shape[-1]))
# Drop padding / end-of-sequence markers from the decoded text.
tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# model.train() was called before fine-tuning; make sure dropout is disabled for generation.
model.eval()
input_ids = tokenizer.encode("edit: The development and movement of the TCs relate to its structure and background environmental flow.", return_tensors="pt")  # Batch size 1
input_ids = input_ids.to(device)
# generate() stops after 20 tokens by default, which truncates longer revisions;
# allow up to twice the input length (never less than the previous default).
outputs = model.generate(input_ids, max_length=max(20, 2 * input_ids.shape[-1]))
# Drop padding / end-of-sequence markers from the decoded text.
tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# model.train() was called before fine-tuning; make sure dropout is disabled for generation.
model.eval()
input_ids = tokenizer.encode("edit: A sentence with two error.", return_tensors="pt")  # Batch size 1
input_ids = input_ids.to(device)
# generate() stops after 20 tokens by default, which truncates longer revisions;
# allow up to twice the input length (never less than the previous default).
outputs = model.generate(input_ids, max_length=max(20, 2 * input_ids.shape[-1]))
# Drop padding / end-of-sequence markers from the decoded text.
tokenizer.decode(outputs[0], skip_special_tokens=True)