In [15]:
from helper.dataset_reader import read_tsv
import time
import torch
from torch import nn
from torch.optim import Adam
from torchtext.data import Field, BucketIterator
from torchtext.datasets import SequenceTaggingDataset
from corpus import Corpus
from spacy.lang.id import Indonesian

In [16]:
crp = Corpus(
	input_path = '../../dataset/',
	min_word_freq = 2,
	batch_size = 32
)
print(f"Train set: {len(crp.train_data)} sentences")
print(f"Val set: {len(crp.val_data)} sentences")
print(f"Test set: {len(crp.test_data)} sentences")

Train set: 2889 sentences
Val set: 723 sentences
Test set: 1781 sentences


In [17]:
class BiLSTM(nn.Module):

  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, lstm_layers,
               emb_dropout, lstm_dropout, fc_dropout, word_pad_idx):
    super().__init__()
    self.embedding_dim = embedding_dim
    # LAYER 1: Embedding
    self.embedding = nn.Embedding(
        num_embeddings=input_dim,
        embedding_dim=embedding_dim,
        padding_idx=word_pad_idx
    )
    self.emb_dropout = nn.Dropout(emb_dropout)
    # LAYER 2: BiLSTM
    self.lstm = nn.LSTM(
        input_size=embedding_dim,
        hidden_size=hidden_dim,
        num_layers=lstm_layers,
        bidirectional=True,
        dropout=lstm_dropout if lstm_layers > 1 else 0
    )
    # LAYER 3: Fully-connected
    self.fc_dropout = nn.Dropout(fc_dropout)
    self.fc = nn.Linear(hidden_dim * 2, output_dim)  # times 2 for bidirectional

  def forward(self, sentence):
    # sentence = [sentence length, batch size]
    # embedding_out = [sentence length, batch size, embedding dim]
    embedding_out = self.emb_dropout(self.embedding(sentence))
    # lstm_out = [sentence length, batch size, hidden dim * 2]
    lstm_out, _ = self.lstm(embedding_out)
    # ner_out = [sentence length, batch size, output dim]
    lid_out = self.fc(self.fc_dropout(lstm_out))
    return lid_out

  def init_weights(self):
    # to initialize all parameters from normal distribution
    # helps with converging during training
    for name, param in self.named_parameters():
      nn.init.normal_(param.data, mean=0, std=0.1)

  def init_embeddings(self, word_pad_idx):
    # initialize embedding for padding as zero
    self.embedding.weight.data[word_pad_idx] = torch.zeros(self.embedding_dim)

  def count_parameters(self):
    return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [18]:
bilstm = BiLSTM(
    input_dim=len(crp.word_field.vocab),
    embedding_dim=100,
    hidden_dim=64,
    output_dim=len(crp.tag_field.vocab),
    lstm_layers=2,
    emb_dropout=0.5,
    lstm_dropout=0.1,
    fc_dropout=0.25,
    word_pad_idx=crp.word_pad_idx
)
bilstm.init_weights()
bilstm.init_embeddings(word_pad_idx=crp.word_pad_idx)
print(f"The model has {bilstm.count_parameters():,} trainable parameters.")
print(bilstm)

The model has 709,352 trainable parameters.
BiLSTM(
  (embedding): Embedding(5240, 100, padding_idx=1)
  (emb_dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(100, 64, num_layers=2, dropout=0.1, bidirectional=True)
  (fc_dropout): Dropout(p=0.25, inplace=False)
  (fc): Linear(in_features=128, out_features=8, bias=True)
)


In [19]:
class LID(object):

    def __init__(self, model, data, optimizer_cls, loss_fn_cls):
        self.model = model
        self.data = data
        self.optimizer = optimizer_cls(model.parameters())
        self.loss_fn = loss_fn_cls(ignore_index=self.data.tag_pad_idx)

    @staticmethod
    def epoch_time(start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def accuracy(self, preds, y):
        max_preds = preds.argmax(dim=1, keepdim=True)  # get the index of the max probability
        non_pad_elements = (y != self.data.tag_pad_idx).nonzero()  # prepare masking for paddings
        correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
        return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]])

    def epoch(self):
        epoch_loss = 0
        epoch_acc = 0
        self.model.train()
        for batch in self.data.train_iter:
            # text = [sent len, batch size]
            text = batch.word
            # tags = [sent len, batch size]
            true_tags = batch.tag
            self.optimizer.zero_grad()
            pred_tags = self.model(text)
            # to calculate the loss and accuracy, we flatten both prediction and true tags
            # flatten pred_tags to [sent len, batch size, output dim]
            pred_tags = pred_tags.view(-1, pred_tags.shape[-1])
            # flatten true_tags to [sent len * batch size]
            true_tags = true_tags.view(-1)
            batch_loss = self.loss_fn(pred_tags, true_tags)
            batch_acc = self.accuracy(pred_tags, true_tags)
            batch_loss.backward()
            self.optimizer.step()
            epoch_loss += batch_loss.item()
            epoch_acc += batch_acc.item()

        return epoch_loss / len(self.data.train_iter), epoch_acc / len(self.data.train_iter)

    def evaluate(self, iterator):
        epoch_loss = 0
        epoch_acc = 0
        self.model.eval()
        with torch.no_grad():
        # similar to epoch() but model is in evaluation mode and no backprop
            for batch in iterator:
                text = batch.word
                true_tags = batch.tag
                pred_tags = self.model(text)
                pred_tags = pred_tags.view(-1, pred_tags.shape[-1])
                true_tags = true_tags.view(-1)
                batch_loss = self.loss_fn(pred_tags, true_tags)
                batch_acc = self.accuracy(pred_tags, true_tags)
                epoch_loss += batch_loss.item()
                epoch_acc += batch_acc.item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    # main training sequence
    def train(self, n_epochs):
        for epoch in range(n_epochs):
            start_time = time.time()
            train_loss, train_acc = self.epoch()
            end_time = time.time()
            epoch_mins, epoch_secs = LID.epoch_time(start_time, end_time)
            print(f"Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
            print(f"\tTrn Loss: {train_loss:.3f} | Trn Acc: {train_acc * 100:.2f}%")
            val_loss, val_acc = self.evaluate(self.data.val_iter)
            print(f"\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc * 100:.2f}%")
        test_loss, test_acc = self.evaluate(self.data.test_iter)
        print(f"Test Loss: {test_loss:.3f} |  Test Acc: {test_acc * 100:.2f}%")

    def infer(self, sentence, true_tags=None):
        self.model.eval()
        # tokenize sentence
        nlp = Indonesian()
        tokens = [token.text.lower() for token in nlp(sentence)]
        # transform to indices based on corpus vocab
        numericalized_tokens = [self.data.word_field.vocab.stoi[t] for t in tokens]
        # find unknown words
        unk_idx = self.data.word_field.vocab.stoi[self.data.word_field.unk_token]
        unks = [t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx]
        # begin prediction
        token_tensor = torch.LongTensor(numericalized_tokens)
        token_tensor = token_tensor.unsqueeze(-1)
        predictions = self.model(token_tensor)
        # convert results to tags
        top_predictions = predictions.argmax(-1)
        predicted_tags = [self.data.tag_field.vocab.itos[t.item()] for t in top_predictions]
        # print inferred tags
        max_len_token = max([len(token) for token in tokens] + [len("word")])
        max_len_tag = max([len(tag) for tag in predicted_tags] + [len("pred")])
        print(
            f"{'word'.ljust(max_len_token)}\t{'unk'.ljust(max_len_token)}\t{'pred tag'.ljust(max_len_tag)}"
            + ("\ttrue tag" if true_tags else "")
            )
        for i, token in enumerate(tokens):
            is_unk = "✓" if token in unks else ""
            print(
              f"{token.ljust(max_len_token)}\t{is_unk.ljust(max_len_token)}\t{predicted_tags[i].ljust(max_len_tag)}"
              + (f"\t{true_tags[i]}" if true_tags else "")
              )

        return tokens, predicted_tags, unks

In [20]:
lid = LID(
  model=bilstm,
  data=crp,
  optimizer_cls=Adam,
  loss_fn_cls=nn.CrossEntropyLoss
)
lid.train(10)

Epoch: 01 | Epoch Time: 0m 7s
	Trn Loss: 1.317 | Trn Acc: 52.65%
	Val Loss: 1.025 | Val Acc: 66.39%
Epoch: 02 | Epoch Time: 0m 7s
	Trn Loss: 0.684 | Trn Acc: 76.94%
	Val Loss: 0.588 | Val Acc: 80.06%
Epoch: 03 | Epoch Time: 0m 6s
	Trn Loss: 0.460 | Trn Acc: 84.39%
	Val Loss: 0.468 | Val Acc: 83.72%
Epoch: 04 | Epoch Time: 0m 7s
	Trn Loss: 0.366 | Trn Acc: 87.55%
	Val Loss: 0.399 | Val Acc: 86.19%
Epoch: 05 | Epoch Time: 0m 6s
	Trn Loss: 0.302 | Trn Acc: 89.65%
	Val Loss: 0.352 | Val Acc: 87.98%
Epoch: 06 | Epoch Time: 0m 6s
	Trn Loss: 0.263 | Trn Acc: 91.06%
	Val Loss: 0.327 | Val Acc: 88.55%
Epoch: 07 | Epoch Time: 0m 6s
	Trn Loss: 0.236 | Trn Acc: 91.89%
	Val Loss: 0.315 | Val Acc: 89.27%
Epoch: 08 | Epoch Time: 0m 7s
	Trn Loss: 0.221 | Trn Acc: 92.51%
	Val Loss: 0.314 | Val Acc: 89.45%
Epoch: 09 | Epoch Time: 0m 7s
	Trn Loss: 0.208 | Trn Acc: 93.00%
	Val Loss: 0.302 | Val Acc: 89.61%
Epoch: 10 | Epoch Time: 0m 6s
	Trn Loss: 0.196 | Trn Acc: 93.30%
	Val Loss: 0.301 | Val Acc: 89.65%


In [25]:
sentence = "If I am happy, aku akan ngesave semua aja haha !! kowe ki piye"
tags = ["EN","EN","EN","EN","O","ID","ID","MIX-ID-EN","ID","ID","O","O","JV","JV","JV"]
words, infer_tags, unknown_tokens = lid.infer(sentence=sentence, true_tags=tags)

word   	unk    	pred tag 	true tag
if     	       	EN       	EN
i      	       	EN       	EN
am     	       	EN       	EN
happy  	       	EN       	EN
,      	       	O        	O
aku    	       	ID       	ID
akan   	       	ID       	ID
ngesave	       	MIX-ID-EN	MIX-ID-EN
semua  	       	ID       	ID
aja    	       	ID       	ID
haha   	       	O        	O
!      	       	O        	O
!      	       	O        	JV
kowe   	       	JV       	JV
ki     	       	JV       	JV


IndexError: list index out of range