In [16]:
import torchtext
from torchtext.data import Field,NestedField, BucketIterator
import underthesea
from underthesea import word_tokenize
from utils import read_file
import torch
from torch import nn
from torch.optim import Adam
import time
from torchcrf import CRF
class Dataset:
    def __init__(self, train_path, val_path, test_path, batch_size):
        self.word_field = Field(lower=True)
        self.tag_field = Field(unk_token=None)
        self.char_nesting_field = Field(tokenize=list)
        self.char_field = NestedField(self.char_nesting_field)  # [batch_size, sent len, word len]
        
        self.datafields = [(("word", "char"), (self.word_field, self.char_field)),
                ("tag", self.tag_field)]
        
        
        
        self.train_dataset = read_file(train_path, self.datafields)
        self.val_dataset = read_file(val_path, self.datafields)
        self.test_dataset = read_file(test_path, self.datafields)

        self.word_field.build_vocab(self.train_dataset.word)
        self.char_field.build_vocab(self.train_dataset.char)  # NEWLY ADDED
        self.tag_field.build_vocab(self.train_dataset.tag)

        self.train_iter, self.val_iter, self.test_iter = BucketIterator.splits(
            datasets=(self.train_dataset, self.val_dataset, self.test_dataset),
            batch_size=batch_size, sort=False)

        self.char_pad_idx = self.char_field.vocab.stoi[self.char_field.pad_token]  # NEWLY ADDED
        self.word_pad_idx = self.word_field.vocab.stoi[self.word_field.pad_token]
        self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]



In [3]:
corpus = Dataset(train_path='./PhoNER_COVID19/data/word/train_word.conll',
                    val_path='./PhoNER_COVID19/data/word/dev_word.conll',
                    test_path='./PhoNER_COVID19/data/word/test_word.conll',
                    batch_size=64)

print(f"Train set: {len(corpus.train_dataset)} sentences")
print(f"Val set: {len(corpus.val_dataset)} sentences")
print(f"Test set: {len(corpus.test_dataset)} sentences")

Train set: 5027 sentences
Val set: 2000 sentences
Test set: 3000 sentences


In [4]:
for x in corpus.train_dataset:
    print(x.word)
    print(x.tag)
    print(x.char)
    break

['đồng_thời', ',', 'bệnh_viện', 'tiếp_tục', 'thực_hiện', 'các', 'biện_pháp', 'phòng_chống', 'dịch_bệnh', 'covid', '-', '00', 'theo', 'hướng_dẫn', 'của', 'bộ', 'y_tế', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'O']
[['Đ', 'ồ', 'n', 'g', '_', 't', 'h', 'ờ', 'i'], [','], ['b', 'ệ', 'n', 'h', '_', 'v', 'i', 'ệ', 'n'], ['t', 'i', 'ế', 'p', '_', 't', 'ụ', 'c'], ['t', 'h', 'ự', 'c', '_', 'h', 'i', 'ệ', 'n'], ['c', 'á', 'c'], ['b', 'i', 'ệ', 'n', '_', 'p', 'h', 'á', 'p'], ['p', 'h', 'ò', 'n', 'g', '_', 'c', 'h', 'ố', 'n', 'g'], ['d', 'ị', 'c', 'h', '_', 'b', 'ệ', 'n', 'h'], ['C', 'O', 'V', 'I', 'D'], ['-'], ['0', '0'], ['t', 'h', 'e', 'o'], ['h', 'ư', 'ớ', 'n', 'g', '_', 'd', 'ẫ', 'n'], ['c', 'ủ', 'a'], ['B', 'ộ'], ['Y', '_', 't', 'ế'], ['.']]


In [5]:
 def normalize_word(word):
    new_word = ""
    for char in word:
        if char.isdigit():
            new_word += '0'
        else:
            new_word += char.lower()
    return new_word

def read_file(path):
    with open(path, encoding = 'utf-8') as f:
        examples = []
        words = []
        tags = []
        for line in f:
            line = line.strip()
            if not line:
                examples.append([words, tags])
                words = []
                tags = []
            else:
                columns = line.split()
                words.append(normalize_word(columns[0]))
                tags.append(columns[-1])
    return examples

data = read_file('./PhoNER_COVID19/data/word/train_word.conll')

In [6]:
print(data[3][0])

['bà', 'này', 'khi', 'trở', 'về', 'quá_cảnh', 'doha', '(', 'qatar', ')', ',', 'đáp', 'xuống', 'tân_sơn_nhất', 'sáng', '0/0', 'cùng', '00', 'hành_khách', ',', 'trong', 'đó', 'có', '00', 'người', 'nước_ngoài', '.']


In [7]:
sample_train = next(iter(corpus.train_iter))
print(sample_train.word.shape)
print("The shape of the character input follows [batch size, max number of words in the batch, max number of characters in a word within the batch]:")
print(sample_train.char.shape)

torch.Size([77, 64])
The shape of the character input follows [batch size, max number of words in the batch, max number of characters in a word within the batch]:
torch.Size([64, 77, 17])


In [8]:
sample_char_field = sample_train.char[0, :, :]
char_pad_id = corpus.char_pad_idx
print("Character input for a sample sentence in the first train batch:")
for word in sample_char_field:
  if word[0] != char_pad_id:
    print(" ".join([corpus.char_field.vocab.itos[char] for char in word if char != char_pad_id]))
print()
print("As you can see, we can preserve the capital cases as useful information with character-based representation.")

Character input for a sample sentence in the first train batch:
S a u
đ ó
,
m ẫ u
b ệ n h _ p h ẩ m
n à y
đ ư ợ c
g ử i
n g a y
đ ế n
C D C
H à _ N ộ i
đ ể
l à m
x é t _ n g h i ệ m
k h ẳ n g _ đ ị n h
v à
c ó
k ế t _ q u ả
d ư ơ n g _ t í n h
.

As you can see, we can preserve the capital cases as useful information with character-based representation.


In [18]:
class BiLSTM(nn.Module):

    def __init__(self,
                 input_dim,
                 embedding_dim,
                 char_emb_dim,
                 char_input_dim,
                 char_cnn_filter_num,
                 char_cnn_kernel_size,
                 hidden_dim,
                 output_dim,
                 lstm_layers,
                 emb_dropout,
                 cnn_dropout,
                 lstm_dropout,
                 fc_dropout,
                 word_pad_idx,
                 char_pad_idx,
                 tag_pad_idx):
        super().__init__()
        # LAYER 1A: Word Embedding
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=embedding_dim,
            padding_idx=word_pad_idx
        )
        self.emb_dropout = nn.Dropout(emb_dropout)
        # LAYER 1B: Char Embedding-CNN
        self.char_emb_dim = char_emb_dim
        self.char_emb = nn.Embedding(
            num_embeddings=char_input_dim,
            embedding_dim=char_emb_dim,
            padding_idx=char_pad_idx
        )
        self.char_cnn = nn.Conv1d(
            in_channels=char_emb_dim,
            out_channels=char_emb_dim * char_cnn_filter_num,
            kernel_size=char_cnn_kernel_size,
            groups=char_emb_dim  # different 1d conv for each embedding dim
        )
        self.cnn_dropout = nn.Dropout(cnn_dropout)
        # LAYER 2: BiLSTM
        self.lstm = nn.LSTM(
            input_size=embedding_dim + (char_emb_dim * char_cnn_filter_num),
            hidden_size=hidden_dim,
            num_layers=lstm_layers,
            bidirectional=True,
            dropout=lstm_dropout if lstm_layers > 1 else 0
        )
        # LAYER 3: Fully-connected
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # times 2 for bidirectional
        ### BEGIN MODIFIED SECTION: CRF OUTPUT ###
        # LAYER 4: CRF
        self.tag_pad_idx = tag_pad_idx
        self.crf = CRF(num_tags=output_dim)
        ### END MODIFIED SECTION ###
        # init weights from normal distribution
        for name, param in self.named_parameters():
            nn.init.normal_(param.data, mean=0, std=0.1)

    def forward(self, words, chars, tags=None):
        # words = [sentence length, batch size]
        # chars = [batch size, sentence length, word length)
        # tags = [sentence length, batch size]
        # embedding_out = [sentence length, batch size, embedding dim]
        embedding_out = self.emb_dropout(self.embedding(words))
        # character cnn layer forward
        # reference: https://github.com/achernodub/targer/blob/master/src/layers/layer_char_cnn.py
        # char_emb_out = [batch size, sentence length, word length, char emb dim]
        char_emb_out = self.emb_dropout(self.char_emb(chars))
        batch_size, sent_len, word_len, char_emb_dim = char_emb_out.shape
        char_cnn_max_out = torch.zeros(batch_size, sent_len, self.char_cnn.out_channels)
        for sent_i in range(sent_len):
            # sent_char_emb = [batch size, word length, char emb dim]
            sent_char_emb = char_emb_out[:, sent_i, :, :]
            # sent_char_emb_p = [batch size, char emb dim, word length]
            sent_char_emb_p = sent_char_emb.permute(0, 2, 1)
            # char_cnn_sent_out = [batch size, out channels * char emb dim, word length - kernel size + 1]
            char_cnn_sent_out = self.char_cnn(sent_char_emb_p)
            char_cnn_max_out[:, sent_i, :], _ = torch.max(char_cnn_sent_out, dim=2)
        char_cnn = self.cnn_dropout(char_cnn_max_out)
        # concat word and char embedding
        # char_cnn_p = [sentence length, batch size, char emb dim * num filter]
        char_cnn_p = char_cnn.permute(1, 0, 2)
        word_features = torch.cat((embedding_out, char_cnn_p), dim=2)
        # lstm_out = [sentence length, batch size, hidden dim * 2]
        lstm_out, _ = self.lstm(word_features)
        # ner_out = [sentence length, batch size, output dim]
        fc_out = self.fc(self.fc_dropout(lstm_out))
        ### BEGIN MODIFIED SECTION: CRF ###
        if tags is not None:
            mask = tags != self.tag_pad_idx
            crf_out = self.crf.decode(fc_out, mask=mask)
            crf_loss = -self.crf(fc_out, tags=tags, mask=mask)
        else:
            crf_out = self.crf.decode(fc_out)
            crf_loss = None
        ### END MODIFIED SECTION ###
        return crf_out, crf_loss

    def init_embeddings(self, char_pad_idx, word_pad_idx):
        # initialize embedding for padding as zero
        self.embedding.weight.data[word_pad_idx] = torch.zeros(self.embedding_dim)
        self.char_emb.weight.data[char_pad_idx] = torch.zeros(self.char_emb_dim)

    ### BEGIN MODIFIED SECTION: CRF OUTPUT ###
    def init_crf_transitions(self, tag_names, imp_value=-1e4):
        num_tags = len(tag_names)
        for i in range(num_tags):
            tag_name = tag_names[i]
            # I and L and <pad> impossible as a start
            if tag_name[0] in ("I") or tag_name == "<pad>":
                torch.nn.init.constant_(self.crf.start_transitions[i], imp_value)
            # B and I impossible as an end
            if tag_name[0] in ("B"):
                torch.nn.init.constant_(self.crf.end_transitions[i], imp_value)
        
        # init impossible transitions between positions
        tag_is = {}
        for tag_position in ("B", "I", "O"):
            tag_is[tag_position] = [i for i, tag in enumerate(tag_names) if tag[0] == tag_position]
        tag_is["P"] = [i for i, tag in enumerate(tag_names) if tag == "tag"]
        
        print("tag_is: ")
        print(tag_is)
        
#         impossible_transitions_position = {
#             "B": "BOUP",
#             "I": "BOUP",
#             "O": "IL",
#             "U": "IL",
#             "L": "IL"
#         }
#         for from_tag, to_tag_list in impossible_transitions_position.items():
#             to_tags = list(to_tag_list)
#             for from_tag_i in tag_is[from_tag]:
#                 for to_tag in to_tags:
#                     for to_tag_i in tag_is[to_tag]:
#                         torch.nn.init.constant_(
#                             self.crf.transitions[from_tag_i, to_tag_i], imp_value
#                         )
#         # init impossible B and I transitions to different entity types
#         impossible_transitions_tags = {
#             "B": "IL",
#             "I": "IL"
#         }
#         for from_tag, to_tag_list in impossible_transitions_tags.items():
#             to_tags = list(to_tag_list)
#             for from_tag_i in tag_is[from_tag]:
#                 for to_tag in to_tags:
#                     for to_tag_i in tag_is[to_tag]:
#                         if tag_names[from_tag_i].split("-")[1] != tag_names[to_tag_i].split("-")[1]:
#                             torch.nn.init.constant_(
#                                 self.crf.transitions[from_tag_i, to_tag_i], imp_value
#                             )
    ### END MODIFIED SECTION: CRF OUTPUT ###

    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [19]:
bilstm = BiLSTM(
    input_dim=len(corpus.word_field.vocab),
    embedding_dim=300,
    char_emb_dim=25,
    char_input_dim=len(corpus.char_field.vocab),
    char_cnn_filter_num=5,
    char_cnn_kernel_size=3,
    hidden_dim=64,
    output_dim=len(corpus.tag_field.vocab),
    lstm_layers=2,
    emb_dropout=0.5,
    cnn_dropout=0.25,
    lstm_dropout=0.1,
    fc_dropout=0.25,
    word_pad_idx=corpus.word_pad_idx,
    char_pad_idx=corpus.char_pad_idx,
    tag_pad_idx=corpus.tag_pad_idx
)

bilstm.init_embeddings(
    char_pad_idx=corpus.char_pad_idx,
    word_pad_idx=corpus.word_pad_idx
)

# CRF transitions initialization for impossible transitions
bilstm.init_crf_transitions(
    tag_names=corpus.tag_field.vocab.itos
)
# print(f"The model has {bilstm.count_parameters():,} trainable parameters.")
# print(bilstm)

tag_is: 
{'B': [2, 4, 5, 9, 10, 11, 12, 13, 14, 15], 'I': [3, 6, 7, 8, 16, 17, 18, 19, 20], 'O': [1], 'P': []}


In [55]:
class Trainer(object):

    def __init__(self, model, data, optimizer_cls, loss_fn_cls):
        self.model = model
        self.data = data
        self.optimizer = optimizer_cls(model.parameters())
        self.loss_fn = loss_fn_cls(ignore_index=self.data.tag_pad_idx)

    @staticmethod
    def epoch_time(start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def accuracy(self, preds, y):
        max_preds = preds.argmax(dim=1, keepdim=True)  # get the index of the max probability
        non_pad_elements = (y != self.data.tag_pad_idx).nonzero()  # prepare masking for paddings
        correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
        return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]])

    def epoch(self):
        epoch_loss = 0
        epoch_acc = 0
        self.model.train()
        for batch in self.data.train_iter:
            # words = [sent len, batch size]
            words = batch.word
            # chars = [batch size, sent len, char len]
            chars = batch.char  # NEWLY ADDED
            # tags = [sent len, batch size]
            true_tags = batch.tag
            self.optimizer.zero_grad()
            pred_tags = self.model(words, chars)  # MODIFIED
            # to calculate the loss and accuracy, we flatten both prediction and true tags
            # flatten pred_tags to [sent len, batch size, output dim]
            pred_tags = pred_tags.view(-1, pred_tags.shape[-1])
            # flatten true_tags to [sent len * batch size]
            true_tags = true_tags.view(-1)
            batch_loss = self.loss_fn(pred_tags, true_tags)
            batch_acc = self.accuracy(pred_tags, true_tags)
            batch_loss.backward()
            self.optimizer.step()
            epoch_loss += batch_loss.item()
            epoch_acc += batch_acc.item()
        return epoch_loss / len(self.data.train_iter), epoch_acc / len(self.data.train_iter)

    def evaluate(self, iterator):
        epoch_loss = 0
        epoch_acc = 0
        self.model.eval()
        with torch.no_grad():
            # similar to epoch() but model is in evaluation mode and no backprop
            for batch in iterator:
                words = batch.word
                chars = batch.char  # NEWLY ADDED
                true_tags = batch.tag
                pred_tags = self.model(words, chars)  # MODIFIED
                pred_tags = pred_tags.view(-1, pred_tags.shape[-1])
                true_tags = true_tags.view(-1)
                batch_loss = self.loss_fn(pred_tags, true_tags)
                batch_acc = self.accuracy(pred_tags, true_tags)
                epoch_loss += batch_loss.item()
                epoch_acc += batch_acc.item()
        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    def train(self, n_epochs):
        for epoch in range(n_epochs):
            start_time = time.time()
            train_loss, train_acc = self.epoch()
            end_time = time.time()
            epoch_mins, epoch_secs = Trainer.epoch_time(start_time, end_time)
            print(f"Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
            print(f"\tTrn Loss: {train_loss:.3f} | Trn Acc: {train_acc * 100:.2f}%")
            val_loss, val_acc = self.evaluate(self.data.val_iter)
            print(f"\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc * 100:.2f}%")
        test_loss, test_acc = self.evaluate(self.data.test_iter)
        print(f"Test Loss: {test_loss:.3f} |  Test Acc: {test_acc * 100:.2f}%")

    def infer(self, sentence, true_tags=None):
        self.model.eval()
        # tokenize sentence
        nlp = Indonesian()
        tokens = [token.text for token in nlp(sentence)]
        max_word_len = max([len(token) for token in tokens])
        # transform to indices based on corpus vocab
        numericalized_tokens = [self.data.word_field.vocab.stoi[token.lower()] for token in tokens]
        ### BEGIN MODIFIED SECTION: CHARACTER EMBEDDING ###
        numericalized_chars = []
        char_pad_id = self.data.char_pad_idx
        for token in tokens:
            numericalized_chars.append(
                [self.data.char_field.vocab.stoi[char] for char in token]
                + [char_pad_id for _ in range(max_word_len - len(token))]
            )
        ### END MODIFIED SECTION ###
        # find unknown words
        unk_idx = self.data.word_field.vocab.stoi[self.data.word_field.unk_token]
        unks = [t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx]
        # begin prediction
        token_tensor = torch.as_tensor(numericalized_tokens)
        token_tensor = token_tensor.unsqueeze(-1)
        char_tensor = torch.as_tensor(numericalized_chars)  # NEWLY ADDED
        char_tensor = char_tensor.unsqueeze(0)  # NEWLY ADDED: batch size at the beginning
        predictions = self.model(token_tensor, char_tensor)  # MODIFIED
        # convert results to tags
        top_predictions = predictions.argmax(-1)
        predicted_tags = [self.data.tag_field.vocab.itos[t.item()] for t in top_predictions]
        # print inferred tags
        max_len_token = max([len(token) for token in tokens] + [len('word')])
        max_len_tag = max([len(tag) for tag in predicted_tags] + [len('pred')])
        print(
            f"{'word'.ljust(max_len_token)}\t{'unk'.ljust(max_len_token)}\t{'pred tag'.ljust(max_len_tag)}"
            + ("\ttrue tag" if true_tags else "")
        )
        for i, token in enumerate(tokens):
            is_unk = "✓" if token in unks else ""
            print(
                f"{token.ljust(max_len_token)}\t{is_unk.ljust(max_len_token)}\t{predicted_tags[i].ljust(max_len_tag)}"
                + (f"\t{true_tags[i]}" if true_tags else "")
            )
        return tokens, predicted_tags, unks

In [58]:
trainer = Trainer(
    model=bilstm,
    data=corpus,
    optimizer_cls=Adam,
    loss_fn_cls=nn.CrossEntropyLoss
)
trainer.train(5)

Epoch: 01 | Epoch Time: 1m 26s
	Trn Loss: 1.199 | Trn Acc: 76.50%
	Val Loss: 0.852 | Val Acc: 75.48%
Epoch: 02 | Epoch Time: 1m 21s
	Trn Loss: 0.431 | Trn Acc: 87.97%
	Val Loss: 0.294 | Val Acc: 92.74%
Epoch: 03 | Epoch Time: 1m 25s
	Trn Loss: 0.179 | Trn Acc: 95.61%
	Val Loss: 0.187 | Val Acc: 95.18%


KeyboardInterrupt: 