In [14]:
import random
%matplotlib inline

Grammar Error Correction with BERT and Hugging Face Transformers
================================================================

This notebook shows how to train a GEC model based on transformers.

# Data Sourcing and Processing

The C4 200M dataset from Google Research is used in this notebook. You can find more information about the C4 200M dataset in Google Research's [BEA 2021 paper](https://aclanthology.org/2021.bea-1.4/).
The already [processed dataset](https://huggingface.co/datasets/liweili/c4_200m) was downloaded from Hugging Face and then converted to HDF5 format for better manageability. The conversion process was based on this [notebook](https://github.com/rasbt/deeplearning-models/blob/master/pytorch_ipynb/mechanics/custom-data-loader-csv.ipynb).
The final version of the dataset is uploaded on [Kaggle](https://www.kaggle.com/datasets/dariocioni/c4200m).

A custom class ``Hdf5Dataset`` based on ``torch.utils.data.Dataset`` is developed, which yields a pair of source-target raw sentences.

| source                                             | target                                                  |
|----------------------------------------------------|---------------------------------------------------------|
| Much many brands and sellers still in the market.  | Many brands and sellers still in the market.            |
| She likes playing in park and come here every week | She likes playing in the park and comes here every week |

In [15]:
# Import libraries
import torch
import pandas as pd
import numpy as np
import pathlib as pl

In [16]:
import h5py
from torch.utils.data import Dataset,IterableDataset
random.seed(42)

class Hdf5Dataset(Dataset):
    """Map-style dataset reading (source, target) sentence pairs from an HDF5 file.

    The file must contain two parallel datasets of UTF-8 encoded byte strings,
    ``'input'`` and ``'labels'``. Only a contiguous window of ``num_entries``
    rows is exposed. With ``randomized=True`` the start of that window is drawn
    at random among the non-overlapping windows that fit in the file; otherwise
    the window starts at row 0.

    Args:
        h5_path: Path of the HDF5 file; it is opened read-only.
        transform: Optional callable applied to the decoded source sentence.
        num_entries: Number of rows to expose. ``None`` exposes the whole file;
            values larger than the file are clamped to the file size.
        randomized: If True, choose a random window offset (see ``reshuffle``).
    """

    def __init__(self, h5_path, transform=None, num_entries=None, randomized=False):
        self.h5f = h5py.File(h5_path, 'r')
        self.size = self.h5f['labels'].shape[0]
        self.transform = transform
        self.randomized = randomized
        requested = self.size if num_entries is None else num_entries
        # Never advertise more rows than the file holds.
        self.max_index = max(0, min(requested, self.size))
        # The offset must exist even when randomized=False, because
        # __getitem__ always adds it to the requested index.
        self.offset = self._pick_offset() if randomized else 0

    def _pick_offset(self):
        """Return the start row of a randomly chosen window of ``max_index`` rows."""
        windows = self.size // self.max_index if self.max_index else 0
        if windows == 0:
            return 0
        return random.randrange(windows) * self.max_index

    def __getitem__(self, index):
        """Return the ``(source, target)`` pair stored at ``index`` of the window.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        if index < 0:
            index += self.max_index
        # Valid indices are 0 .. max_index - 1. IndexError (not StopIteration)
        # is the sequence protocol's signal for "past the end".
        if not 0 <= index < self.max_index:
            raise IndexError(f"index {index} out of range for dataset of size {self.max_index}")
        row = self.offset + index
        source = self.h5f['input'][row].decode('utf-8')
        label = self.h5f['labels'][row].decode('utf-8')
        if self.transform is not None:
            # Previously the transformed value was computed and then discarded.
            source = self.transform(source)
        return source, label

    def __len__(self):
        """Number of rows exposed by the current window."""
        return self.max_index

    def reshuffle(self):
        """Draw a new random window offset; only allowed when ``randomized=True``."""
        if self.randomized:
            self.offset = self._pick_offset()
        else:
            print("Please set randomized=True")

    def close(self):
        """Close the underlying HDF5 file handle."""
        self.h5f.close()

In [17]:
from typing import Iterable, List
from tqdm import tqdm
import pathlib as pl

# Names of the two "languages" of the GEC task: ungrammatical source
# sentences and their corrected targets.
SRC_LANGUAGE = 'incorrect'
TGT_LANGUAGE = 'correct'

# Place-holders
token_transform = {}
# vocab_transform = {}

# Raw string: the backslashes of a Windows path must not be read as escape
# sequences ('\D', '\c', '\d' and '\h' are invalid escapes that newer Python
# versions warn about).
folder = r'D:\Datasets\c4_200m\data\hdf5'
# Shards of the HDF5 dataset used for training and validation.
train_filename = 'C4_200M.hf5-00000-of-00010'
valid_filename = 'C4_200M.hf5-00001-of-00010'

# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    """Generate the tokenization of one side of every sample in ``data_iter``.

    Each sample is indexed with 0 for ``SRC_LANGUAGE`` and 1 for
    ``TGT_LANGUAGE``. Entries that are empty or not ``str`` are skipped; the
    rest are passed to ``token_transform[language]``. Note that this is a
    generator: results are yielded one at a time while ``tqdm`` reports progress.
    """
    column_of = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    for sample in tqdm(data_iter):
        sentence = sample[column_of[language]]
        if not sentence or not isinstance(sentence, str):
            continue
        yield token_transform[language](sentence)

## Tokenizing and Embedding
Data is then tokenized by a pre-trained ``BertTokenizer`` from Hugging Face's ``transformers`` library, which uses WordPiece tokenization.
The BERT model was pretrained on [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset consisting of 11,038 unpublished books and [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) (excluding lists, tables and headers).

In [18]:
from transformers import AutoTokenizer

# Ids of the special tokens in the 'bert-base-uncased' vocabulary.
# [CLS] doubles as beginning-of-sequence and [SEP] as end-of-sequence.
PAD_IDX, BOS_IDX, EOS_IDX = 0, 101, 102
# Token strings, listed in the same order as the ids above.
special_symbols = ['[PAD]', '[CLS]', '[SEP]']

# Options such as padding, truncation, add_special_tokens and return_tensors
# are arguments of the tokenizer *call*, not of from_pretrained, where they
# have no effect on encoding. Pass them when encoding text instead.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# Fail early if the hard-coded ids ever disagree with the loaded vocabulary.
assert (tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id) == (PAD_IDX, BOS_IDX, EOS_IDX)
token_transform = tokenizer

In [19]:
# Encode a correctly spelled sentence, then map the ids back to word pieces
# to see what the tokenizer produced.
text = 'data mining is awesome!'
encoded_input = tokenizer(text)
decoded = tokenizer.convert_ids_to_tokens(encoded_input["input_ids"])
print(encoded_input, decoded, sep="\n")

{'input_ids': [101, 2951, 5471, 2003, 12476, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'data', 'mining', 'is', 'awesome', '!', '[SEP]']


### Unknown words

In [20]:
# Same round trip with a misspelled word, to show how it is split into
# sub-word pieces.
text = 'data maining is awesome!'
encoded_input = tokenizer(text)
decoded = tokenizer.convert_ids_to_tokens(encoded_input["input_ids"])
print(encoded_input, decoded, sep="\n")

{'input_ids': [101, 2951, 2364, 2075, 2003, 12476, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'data', 'main', '##ing', 'is', 'awesome', '!', '[SEP]']


In [21]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    """Compose ``transforms`` into a single callable applied left to right.

    The returned function feeds its argument through every transform in the
    order given and returns the final result. With no transforms it returns
    its input unchanged.
    """
    def func(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result
    return func

# def glove_transform(tokens: List[str]):



# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    """Wrap ``token_ids`` in BOS/EOS ids and return them as a 1-D LongTensor.

    The dtype is pinned to ``torch.long`` so that an empty ``token_ids`` list
    still yields an integer tensor (``torch.tensor([])`` alone defaults to
    float32).

    Note: the BERT tokenizer in this notebook already emits [CLS]/[SEP] around
    every sentence by default, so apply this only to ids encoded without
    special tokens.
    """
    return torch.cat((torch.tensor([BOS_IDX], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([EOS_IDX], dtype=torch.long)))

# src and tgt language text transform: raw sentence -> 1-D tensor of token ids
def _sentence_to_ids(sentence: str) -> torch.Tensor:
    """Tokenize one raw sentence into a 1-D LongTensor of token ids.

    The tokenizer returns a mapping (input_ids, token_type_ids,
    attention_mask); only ``input_ids`` is kept so the result can be padded
    with ``pad_sequence``. No separate BOS/EOS step is needed because the
    tokenizer already wraps the sentence in [CLS] ... [SEP].
    """
    # truncation=True keeps sentences within the tokenizer's maximum length.
    encoded = token_transform(sentence, truncation=True)
    return torch.tensor(encoded["input_ids"], dtype=torch.long)


text_transform = sequential_transforms(_sentence_to_ids)


# function to collate data samples into batch tensors
def collate_fn(batch):
    """Turn a list of (source, target) sentence pairs into two padded id tensors.

    Returns:
        ``(src_batch, tgt_batch)``, each of shape (max_seq_len, batch_size)
        because ``pad_sequence`` is used with its default ``batch_first=False``.
        Shorter sequences are right-padded with ``PAD_IDX``.
    """
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform(src_sample.rstrip("\n")))
        tgt_batch.append(text_transform(tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

# Seq2Seq Network using BERT
The BERT model, proposed in [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova, is a bidirectional transformer pre-trained using a combination of masked language modeling and next sentence prediction objectives on a large corpus comprising the Toronto [Book Corpus](https://yknzhu.wixsite.com/mbweb) and [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia).

In [22]:
from transformers import BertConfig
from torch import nn
import torch.nn.functional as F

# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Number of sentence pairs per mini-batch handed to the DataLoader.
BATCH_SIZE = 16

# Loads a BERT configuration from 'bert_config.json', resolved relative to the
# current working directory.
# NOTE(review): `config` does not appear to be used anywhere else in the
# visible notebook — confirm whether this line is still needed.
config = BertConfig.from_pretrained('bert_config.json')

# encoded_input = tokenizer(text, return_tensors='pt')
# output = bert(**encoded_input)
# print(output)

In [23]:
from transformers import BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel

# Warm-start a BERT2BERT encoder-decoder from the public BERT checkpoint.
# BERT's [CLS] token acts as BOS and its [SEP] token as EOS on both sides.
encoder = BertGenerationEncoder.from_pretrained(
    "bert-base-uncased", bos_token_id=BOS_IDX, eos_token_id=EOS_IDX
)
# The decoder additionally gets cross-attention layers so it can attend to
# the encoder output.
decoder = BertGenerationDecoder.from_pretrained(
    "bert-base-uncased",
    add_cross_attention=True,
    is_decoder=True,
    bos_token_id=BOS_IDX,
    eos_token_id=EOS_IDX,
)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

You are using a model of type bert to instantiate a model of type bert-generation. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertGenerationEncoder: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.pooler.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertGenerationEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertGenerationEncoder from the checkpoint of a 

During training, we need a subsequent-word (causal) mask that prevents the model from looking at future words when making predictions. We also need masks that hide the source and target padding tokens. Before training, the cell below runs a single forward/backward pass and a generation step on a toy sentence pair as a quick sanity check of the encoder-decoder model.

In [38]:
# Smoke test of the encoder-decoder on a single toy sentence pair.
# The source is encoded without [CLS]/[SEP] (add_special_tokens=False) and
# returned as a PyTorch tensor.
input_ids = tokenizer(
    "This is a long article to summarize", add_special_tokens=False, return_tensors="pt"
).input_ids
# The target is encoded with the tokenizer's default special tokens.
labels = tokenizer("This is a short summary", return_tensors="pt").input_ids



# train...
# One forward pass; `labels` is passed both as decoder input and as the
# training target, and the loss is read from the model output.
loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss

# Generate from the same source and keep the first (only) sequence of ids.
output = bert2bert.generate(input_ids=input_ids).tolist()[0]

# Map the generated ids back to word pieces for inspection.
decoded = tokenizer.convert_ids_to_tokens(output)
print(decoded)

print(loss)
# Back-propagate the loss; no optimizer step follows in this cell, so the
# weights are not updated here.
loss.backward()

Setting `pad_token_id` to `eos_token_id`:102 for open-end generation.


['[CLS]', 'ware', '[CLS]', '##ndra', '[CLS]', 'ware', '[CLS]', 'ware', '[CLS]', 'ware', '[CLS]', 'ware', '[CLS]', 'ware', '[CLS]', 'ware', '[CLS]', 'ware', '[CLS]', 'ware']
tensor(9.0753, grad_fn=<NllLossBackward0>)


Let's now define the parameters of our model and instantiate it. Below, we also define our loss function, which is the cross-entropy loss, and the optimizer used for training.




In [None]:
from torch.utils.data import DataLoader
from torch.utils.data import IterableDataset

def _seq2seq_loss(model, src, tgt):
    """Teacher-forced cross-entropy loss of ``model`` on one collated batch.

    ``src`` and ``tgt`` are expected as produced by ``collate_fn``: token-id
    tensors of shape (seq_len, batch) padded with ``PAD_IDX``. They are
    transposed to (batch, seq_len), the layout the Hugging Face model expects.
    Padding positions are masked out of attention and ignored by the loss.
    """
    src = src.to(DEVICE).transpose(0, 1).contiguous()
    tgt = tgt.to(DEVICE).transpose(0, 1).contiguous()

    # The decoder reads tokens [0, T-1) and has to predict tokens [1, T).
    tgt_input = tgt[:, :-1]
    tgt_out = tgt[:, 1:]

    logits = model(
        input_ids=src,
        attention_mask=(src != PAD_IDX).long(),
        decoder_input_ids=tgt_input,
        decoder_attention_mask=(tgt_input != PAD_IDX).long(),
    ).logits
    # The loss is computed here rather than via the model's `labels` argument
    # so that the target shift and padding handling are explicit.
    return F.cross_entropy(
        logits.reshape(-1, logits.size(-1)),
        tgt_out.reshape(-1),
        ignore_index=PAD_IDX,
    )


def train_epoch(model, optimizer=None, lr=5e-5):
    """Train ``model`` for one pass over the training shard.

    Args:
        model: Encoder-decoder model to optimize (moved to ``DEVICE``).
        optimizer: Optimizer to step. If ``None``, a fresh ``AdamW`` over
            ``model.parameters()`` is created for this epoch; pass your own
            optimizer to keep its state across epochs.
        lr: Learning rate used only when the optimizer is created here.

    Returns:
        Mean training loss per batch.
    """
    model.to(DEVICE)
    # evaluate() switches to eval mode, so training mode must be restored.
    model.train()
    if optimizer is None:
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    losses = 0.0
    train_iter = Hdf5Dataset(pl.Path(folder)/train_filename,num_entries=1000000)
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in tqdm(train_dataloader):
        # Clear gradients left over from the previous step (or earlier cells).
        optimizer.zero_grad()
        loss = _seq2seq_loss(model, src, tgt)
        loss.backward()
        optimizer.step()

        losses += loss.item()

    return losses / max(len(train_dataloader), 1)


def evaluate(model):
    """Return the mean per-batch loss of ``model`` on the validation shard.

    Runs in eval mode and without gradient tracking.
    """
    model.to(DEVICE)
    model.eval()
    losses = 0.0

    val_iter = Hdf5Dataset(pl.Path(folder)/valid_filename,num_entries=10000)
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    with torch.no_grad():
        for src, tgt in val_dataloader:
            losses += _seq2seq_loss(model, src, tgt).item()

    return losses / max(len(val_dataloader), 1)

Now we have all the ingredients to train our model. Let's do it!




In [15]:
from timeit import default_timer as timer
NUM_EPOCHS = 1
CHECKPOINT_DIR = pl.Path('checkpoints')
# torch.save does not create missing parent directories.
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

for epoch in range(1, NUM_EPOCHS+1):
    # Only the training pass is timed; validation runs after the timer stops.
    start_time = timer()
    train_loss = train_epoch(bert2bert)
    end_time = timer()
    val_loss = evaluate(bert2bert)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
    # The checkpoint previously stored a second copy of the model weights
    # under 'optimizer_state_dict'. That entry could not be loaded into an
    # optimizer and doubled the file size, so it is no longer written.
    torch.save({
        'epoch': epoch,
        'model_state_dict': bert2bert.state_dict(),
        'loss': val_loss,
    }, CHECKPOINT_DIR/"model.pt")

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'D:\Dataset\c4200m\data\hdf5\C4_200M.hf5-00000-of-00010', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

References
----------

1. Attention is all you need paper.
   https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
2. The annotated transformer. https://nlp.seas.harvard.edu/2018/04/03/attention.html#positional-encoding

