In [73]:
import torch
import torch.nn
import torch.optim

from torchtext import data
from torchtext import datasets

import spacy
import numpy as np

import time
import random

In [129]:
train_file = "data/train.tsv"
test_file = ""
val_split = 0.2

In [74]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [75]:
TEXT = data.Field(lower = True)
UD_TAGS = data.Field(unk_token = None)
PTB_TAGS = data.Field(unk_token = None)

In [97]:
fields = (("text", TEXT), ("tags", UD_TAGS), ("ptbtags", PTB_TAGS))

In [130]:
train_data = datasets.SequenceTaggingDataset(train_file, fields)
val_data = datasets.SequenceTaggingDataset(train_file, fields)
test_data = datasets.SequenceTaggingDataset(train_file, fields)



In [99]:
print("Training samples: {}".format(len(train_data)))
print("Validation samples: {}".format(len(val_data)))
print("Testing samples: {}".format(len(train_data)))

Training samples: 12543
Validation samples: 2002
Testing samples: 12543


In [100]:
print(vars(train_data.examples[0]))

{'text': ['al', '-', 'zaman', ':', 'american', 'forces', 'killed', 'shaikh', 'abdullah', 'al', '-', 'ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'qaim', ',', 'near', 'the', 'syrian', 'border', '.'], 'tags': ['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT'], 'ptbtags': ['NNP', 'HYPH', 'NNP', ':', 'JJ', 'NNS', 'VBD', 'NNP', 'NNP', 'NNP', 'HYPH', 'NNP', ',', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'NNP', ',', 'IN', 'DT', 'JJ', 'NN', '.']}


In [101]:
MIN_FREQ = 2

TEXT.build_vocab(train_data,
                 min_freq = MIN_FREQ)
UD_TAGS.build_vocab(train_data)
PTB_TAGS.build_vocab(train_data)

In [102]:
print("Unique tokens in TEXT: {}".format(len(TEXT.vocab)))
print("Unique tokens in UD_TAG: {}".format(len(UD_TAGS.vocab)))
print("Unique tokens in PTB_TAG: {}".format(len(PTB_TAGS.vocab)))

Unique tokens in TEXT: 8866
Unique tokens in UD_TAG: 18
Unique tokens in PTB_TAG: 51


In [103]:
print(TEXT.vocab.itos)



In [104]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size = BATCH_SIZE,
    device = device
)



In [105]:
class BiLSTMPOSTagger(nn.Module):
    def __init__(self,
                 input_dim,
                 embedding_dim,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout,
                 pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)

        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers = n_layers,
                            bidirectional = bidirectional,
                            dropout = dropout if n_layers > 1 else 0)

        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        #text = [sent len, batch size]

        embedded = self.dropout(self.embedding(text))

        #embedded = [sent len, batch size, emb size]

        outputs, (hidden, cell) = self.lstm(embedded)

        predictions = self.fc(self.dropout(outputs))

        return predictions


In [106]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = len(UD_TAGS.vocab)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = BiLSTMPOSTagger(INPUT_DIM,
                        EMBEDDING_DIM,
                        HIDDEN_DIM,
                        OUTPUT_DIM,
                        N_LAYERS,
                        BIDIRECTIONAL,
                        DROPOUT,
                        PAD_IDX)

In [107]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean = 0, std = 0.1)

model.apply(init_weights)

BiLSTMPOSTagger(
  (embedding): Embedding(8866, 100, padding_idx=1)
  (lstm): LSTM(100, 128, num_layers=2, dropout=0.25, bidirectional=True)
  (fc): Linear(in_features=256, out_features=18, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [109]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print("The model has {} trainable parameters".format(count_parameters(model)))

The model has 1522010 trainable parameters


In [110]:
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)


tensor([[ 0.0070,  0.0035, -0.0052,  ..., -0.0327,  0.1718, -0.1129],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0742,  0.0897, -0.1725,  ...,  0.0885, -0.1142,  0.1798],
        ...,
        [-0.0940, -0.0592, -0.0242,  ..., -0.1438, -0.0581,  0.0398],
        [ 0.2084,  0.0594, -0.1474,  ..., -0.0104,  0.0608, -0.0354],
        [-0.2271,  0.1532, -0.0793,  ...,  0.0072,  0.1115, -0.1867]])


In [111]:
optimizer = torch.optim.Adam(model.parameters())

In [112]:
TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

In [113]:
model = model.to(device)
criterion.to(device)

CrossEntropyLoss()

In [114]:
def categorical_accuracy(preds, y, tag_pad_idx):
    max_preds = preds.argmax(dim = 1, keepdim = True)
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]])

In [121]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        text = batch.text
        tags = batch.tags

        optimizer.zero_grad()

        predictions = model(text)

        # reshape predictions since pytorch can't handle 3-dimensional predictions
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)

        loss = criterion(predictions, tags)

        acc = categorical_accuracy(predictions, tags, tag_pad_idx)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [124]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            text = batch.text
            tags = batch.tags

            predictions = model(text)

            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)

            loss = criterion(predictions, tags)
            acc = categorical_accuracy(predictions, tags, tag_pad_idx)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [126]:
N_EPOCHS = 10

best_val_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    val_loss, val_acc = evaluate(model, val_iterator, criterion, TAG_PAD_IDX)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'model.pt')

    print("Epoch: {}".format(epoch+1))
    print(f"Train Loss: {train_loss:.3f} | Train Acc: {train_acc:.3f}")
    print(f"Val Loss: {val_loss:.3f} | Val Acc: {val_acc:.3f}")



Epoch: 1
Train Loss: 0.309 | Train Acc: 0.906
Val Loss: 0.479 | Val Acc: 0.854
Epoch: 2
Train Loss: 0.240 | Train Acc: 0.926
Val Loss: 0.444 | Val Acc: 0.868
Epoch: 3
Train Loss: 0.204 | Train Acc: 0.936
Val Loss: 0.430 | Val Acc: 0.870
Epoch: 4
Train Loss: 0.180 | Train Acc: 0.943
Val Loss: 0.421 | Val Acc: 0.869
Epoch: 5
Train Loss: 0.163 | Train Acc: 0.949
Val Loss: 0.409 | Val Acc: 0.867
Epoch: 6
Train Loss: 0.148 | Train Acc: 0.953
Val Loss: 0.411 | Val Acc: 0.874
Epoch: 7
Train Loss: 0.136 | Train Acc: 0.957
Val Loss: 0.404 | Val Acc: 0.877
Epoch: 8
Train Loss: 0.125 | Train Acc: 0.960
Val Loss: 0.410 | Val Acc: 0.875
Epoch: 9
Train Loss: 0.115 | Train Acc: 0.963
Val Loss: 0.412 | Val Acc: 0.884
Epoch: 10
Train Loss: 0.108 | Train Acc: 0.966
Val Loss: 0.413 | Val Acc: 0.883
