In [1]:
import torch
import warnings
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchtext.vocab import build_vocab_from_iterator, GloVe

### Data

In [3]:
data = pd.read_csv("./data/ner_datasetreference.csv", encoding='latin1')


def getData(data):
    data = data.fillna(method = 'ffill')
    data = data.drop(['POS'], axis = 1)
    agg_function = lambda s: [(w,t) for w,t in zip(s["Word"].values.tolist(),
                                                        s["Tag"].values.tolist())]
    group = data.groupby('Sentence #').apply(agg_function)
    sentence = [s for s in group]
    return [[word[0] for word in s] for s in sentence], [[lab[1] for lab in s] for s in sentence]

sentences, labels = getData(data)
split_idx = int(0.9 * len(sentences))
train, train_labels, test, test_labels = sentences[:split_idx], labels[:split_idx], sentences[split_idx:], labels[split_idx:]


def iterator(ss):
    for s in ss:
        yield s

vocab = build_vocab_from_iterator(iterator(train), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])
labels_dict = build_vocab_from_iterator(iterator(train_labels), specials=["<pad>"])
glove_vocab = GloVe(name='42B', dim=300)
torch.save(vocab, './vocabs/vocab_obj.pth')
torch.save(labels_dict, './vocabs/labels_dict.pth')
torch.save(glove_vocab, './vocabs/glove_vocab.pth')
use_glove = False


# map-style
class Data(Dataset):
    def __init__(self, d, l, m_len, use_glove=False):
        self.x_train = []
        self.y_train = []
        self.pad_idx = []
        self.l = l
        for i in range(len(d)):
            padding = ["<pad>"] * (m_len - len(vocab(d[i])))
            self.pad_idx.append(len(vocab(d[i])))
            new_x = d[i] + padding
            if use_glove:
                self.x_train.append(glove_vocab.get_vecs_by_tokens(new_x))
            else:
                self.x_train.append(vocab(new_x))
            if l:
                new_y = l[i] + padding
                self.y_train.append(labels_dict(new_y))
            else:
                pass
        if use_glove:
            self.x_train = torch.stack(self.x_train)
        else:
            self.x_train = torch.tensor(self.x_train)
        if l:
            self.y_train = torch.tensor(self.y_train)

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, idx):
        if self.l:
            return self.x_train[idx], self.y_train[idx], self.pad_idx[idx]
        else:
            return self.x_train[idx], self.pad_idx[idx]

max_len = max([len(s) for s in sentences])

train_pipeline = Data(train, train_labels, max_len, use_glove=use_glove)
test_pipeline = Data(test, test_labels, max_len, use_glove=use_glove)

train_loader = DataLoader(train_pipeline, batch_size=64, shuffle=True)
test_loader = DataLoader(test_pipeline)

In [27]:
print(max_len)

104


### Hyperparameters

In [20]:
vocab_size = len(vocab)
tag_size = len(labels_dict)
embed_size = 300
num_layers = 5
hidden_size = 64
n_epochs = 20
learning_rate = 1e-3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Model

In [21]:
class NER(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers, num_classes, fc_dropout=0.3, embed_dropout=0.5, use_glove=False):
        super(NER, self).__init__()
        self.use_glove = use_glove
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers=num_layers, dropout=0.2, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        self.classifier = nn.Softmax(dim=2)
        self.embed_dropout = nn.Dropout(embed_dropout)
        self.fc_dropout = nn.Dropout(fc_dropout)

    def forward(self, x):
        if not self.use_glove:
            x = self.embed_dropout(self.embed(x))
        x, _ = self.lstm(x)
        x = self.fc_dropout(self.fc(x))
        return x

### Training

In [22]:
model = NER(vocab_size, embed_size, hidden_size, num_layers, tag_size, use_glove=use_glove).to(device=device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

def train_and_eval(model, train, test, optimizer, criterion):
    epoch_loss = 0
    acc = 0
    count = 0
    
    model.train()
    for x, y, pad_idx in train:
        x = x.to(device=device)
        y = y.to(device=device)
        optimizer.zero_grad()
        predictions = model(x)
        predictions = predictions.view(-1, predictions.shape[-1])
        y = y.view(-1)
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'training loss of epoch {epoch + 1} is {(epoch_loss / len(train)):.5f}')
    
    model.eval()
    with torch.inference_mode():
        for x, y, pad_idx in test:
            x = x.to(device=device)
            y = y.to(device=device).float().squeeze(1)
            pad_idx = pad_idx[0]
            count += pad_idx
            predictions = torch.argmax(model(x), 2)
            correct = torch.sum((predictions[0, :pad_idx] == y[0, :pad_idx]))
            acc += correct
    print(f'test acc after epoch {epoch + 1} is {(acc / count):.5f}')

for epoch in range(n_epochs):
    train_and_eval(model, train_loader, test_loader, optimizer, criterion)

training loss of epoch 1 is 0.69603
test acc after epoch 1 is 0.88480
training loss of epoch 2 is 0.59994
test acc after epoch 2 is 0.90695
training loss of epoch 3 is 0.58486
test acc after epoch 3 is 0.94190
training loss of epoch 4 is 0.57192
test acc after epoch 4 is 0.95497
training loss of epoch 5 is 0.56491
test acc after epoch 5 is 0.95944
training loss of epoch 6 is 0.56110
test acc after epoch 6 is 0.96179
training loss of epoch 7 is 0.55819
test acc after epoch 7 is 0.96340
training loss of epoch 8 is 0.55636
test acc after epoch 8 is 0.96483
training loss of epoch 9 is 0.55445
test acc after epoch 9 is 0.96433
training loss of epoch 10 is 0.55299
test acc after epoch 10 is 0.96564
training loss of epoch 11 is 0.55132
test acc after epoch 11 is 0.96591
training loss of epoch 12 is 0.55060
test acc after epoch 12 is 0.96594
training loss of epoch 13 is 0.54960
test acc after epoch 13 is 0.96763
training loss of epoch 14 is 0.54884
test acc after epoch 14 is 0.96730
training l

### Testing

In [25]:
def test_model(model, sentence, use_glove):
    with torch.inference_mode():
        temp = Data([sentence.split()], None, max_len, use_glove)
        for x, pad_idx in temp:
            x = x.to(device=device)
            if use_glove:
                x = x.reshape((1, x.shape[0], x.shape[1]))
            else:
                x = x.reshape((1, -1))
            predictions = torch.argmax(model(x), 2).squeeze().tolist()
        return labels_dict.lookup_tokens(predictions)[:pad_idx]


print(test_model(model, "Alex and Alice expected the Tibetan leader to return", use_glove))

['B-per', 'O', 'B-per', 'O', 'O', 'B-gpe', 'O', 'O', 'O']


### Saving Model

In [26]:
torch.save(model.state_dict(), '.\saved_no_glove.pth')