In [6]:
import numpy as np
import pandas as pd
from tqdm import trange
from typing import Tuple
import warnings
warnings.filterwarnings('ignore')

from nltk.util import ngrams
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score


# Special symbols: sentence start marker, sentence end marker, and the "unknown" marker.
START, END, UNK = '<s>', '</s>', 'UNK'

# 1

We train a feed-forward neural network classifier to predict the POS tag of a word in its context. The input
should be the word embedding for the center word concatenated with the word embeddings for words
in a context window. We’ll define a context window as the sequence of words containing w words
to either side of the center word and including the center word itself, so the context window contains
1 + 2w words in total. For example, if w = 1 and the word embedding dimensionality is d, the total
dimensionality of the input will be 3d. For words near the sentence boundaries, pad the sentence with
beginning-of-sentence and end-of-sentence tokens (`<s>` and `</s>`). The word embeddings should be randomly initialized and learned along with all other parameters in the model.

The input is the concatenation of word embeddings in the context window, with the word to be tagged in the center. We use a single hidden layer of width 128 with a tanh nonlinearity. The hidden layer is then fed to an affine transformation which will produce scores for all possible POS tags. Finally, we use a softmax transformation on the scores to produce a probability distribution over tags.

In [None]:
# Load the train / dev / devtest splits as headerless tables.
# The same parsing options apply to every split, so they are kept in one place.
# Blank lines are kept as rows (skip_blank_lines=False) -- presumably they mark
# sentence boundaries; confirm against the code that consumes these frames.
READ_OPTIONS = dict(sep=r'\t', header=None, skip_blank_lines=False)

train = pd.read_table('data/twpos-data/twpos-train.tsv', **READ_OPTIONS)
dev = pd.read_table('data/twpos-data/twpos-dev.tsv', **READ_OPTIONS)
devtest = pd.read_table('data/twpos-data/twpos-devtest.tsv', **READ_OPTIONS)

In [None]:
# Feed-forward POS tagger over a fixed-size context window
class POS_Tagger(nn.Module):
    """Window-based tagger: embeddings -> tanh hidden layer -> log-probabilities over tags.

    Args:
        vocab_size: number of rows in the embedding table.
        tag_size: number of output tags.
        window: number of context positions on each side of the center word.
        embedding_dim: size of each word embedding.
        hidden_dim: width of the single hidden layer.
    """

    def __init__(self, vocab_size, tag_size, window, embedding_dim, hidden_dim):
        super().__init__()
        # The context holds the center word plus `window` words on each side.
        context_len = 1 + 2 * window

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Overwrite the default initialisation with small uniform values.
        nn.init.uniform_(self.embeddings.weight, -0.01, 0.01)
        self.fc1 = nn.Linear(embedding_dim * context_len, hidden_dim)
        self.tanh = nn.Tanh()
        self.fc2 = nn.Linear(hidden_dim, tag_size)

    def forward(self, inputs):
        """Return log-softmax tag scores (normalised along dim 1) for a batch of index windows."""
        window_vectors = self.embeddings(inputs)
        # Concatenate the embeddings of each window into one row per example.
        flat = window_vectors.view(window_vectors.size(0), -1)
        hidden = self.tanh(self.fc1(flat))
        return F.log_softmax(self.fc2(hidden), dim=1)

In [None]:
# Hyperparameters
VOCAB_SIZE = len(VOCAB_TO_IDX)
TAG_SIZE = len(TAG_TO_IDX)
EMBEDDING_DIM = 50
HIDDEN_DIM = 128
WINDOW = 1
LEARNING_RATE = 0.1
LEARING_RATE = LEARNING_RATE  # alias kept so references to the original misspelled name still work
BATCH_SIZE = 16
EPOCHS = 10

# Initialize the model. NLLLoss is paired with the log-probabilities that
# POS_Tagger.forward returns (it ends in log_softmax).
model = POS_Tagger(VOCAB_SIZE, TAG_SIZE, WINDOW, EMBEDDING_DIM, HIDDEN_DIM)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

# Create dataloaders
train_dl = create_dataloader(train, WINDOW, BATCH_SIZE)
dev_dl = create_dataloader(dev, WINDOW, BATCH_SIZE)
devtest_dl = create_dataloader(devtest, WINDOW, BATCH_SIZE)  # built here but not consumed in this cell

# Training loop: each epoch is one pass over train_dl followed by a dev evaluation
for epoch in trange(EPOCHS):

    # Training phase
    model.train()
    total_loss = 0  # sum of the per-batch losses over the epoch
    for context_tensor, target in train_dl:
        # Clear gradients left over from the previous step before backpropagating.
        optimizer.zero_grad()
        tag_scores = model(context_tensor)
        loss = loss_function(tag_scores, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Eval phase: no gradient tracking, collect predictions over the whole dev set
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for context_tensor, target in dev_dl:
            tag_scores = model(context_tensor)
            # Predicted tag = index of the highest score in each row.
            _, preds = torch.max(tag_scores, 1)
            all_preds.extend(preds.tolist())
            all_targets.extend(target.tolist())

    # Calculate accuracy
    dev_acc = accuracy_score(all_targets, all_preds)
    print(f'Epoch {epoch}: Dev Accuracy: {round(dev_acc, 3)}, Total Loss: {round(total_loss, 3)}')

### The best accuracy achieved in the final evaluation on DEVTEST is 82.14%.

## 1.2

In [148]:
# Feature function: each training word is mapped to 50 random values with the
# number of characters in the word appended as a 51st component.
# NOTE(review): `random` is not among the imports in the visible import cell --
# confirm it is imported earlier in the notebook.
def _random_vector(size):
    """Return a list of `size` floats, each produced by random.uniform(-0.1, 0.1)."""
    return [random.uniform(-0.1, 0.1) for _ in range(size)]


emb = {}
for word in train_dataset[0].values:
    emb[word] = _random_vector(50) + [len(word)]

# The two special entries get 51 random values (no length component).
for special_token in ('</s>', 'UUUNKKK'):
    emb[special_token] = _random_vector(51)

In [149]:
# Build the training dataset (argument 1 is presumably the window size w -- confirm
# against RandomEmbeddingsDataSet) and serve it one shuffled example at a time.
train_data_feat = RandomEmbeddingsDataSet(1)
train_loader_feat = DataLoader(dataset=train_data_feat, shuffle=True, batch_size=1)

100%|██████████████████████████████████| 10274/10274 [00:00<00:00, 50642.22it/s]


In [150]:
# Create vector embeddings for the test set by reusing the training embeddings.
# Words that never occur in the training column get an all-zero 51-dimensional vector.
# NOTE(review): the zero vector also zeroes the length component for unseen words,
# and the 'UUUNKKK' entry is not used for them here -- confirm this is intended.

# Build the training vocabulary once: set membership is a hash lookup, whereas
# `word in train_dataset[0].values` rescans the whole array for every test word.
train_vocab = set(train_dataset[0].values)

test_emb = {}
for word in test_dataset[0].values:
    if word in train_vocab:
        test_emb[word] = emb[word]
    else:
        test_emb[word] = [0] * 51

# The special entries always come straight from the training table.
test_emb['</s>'] = emb['</s>']
test_emb['UUUNKKK'] = emb['UUUNKKK']

In [151]:
# Build the test dataset (argument 1 is presumably the window size w -- confirm
# against TestEmbeddingsDataSet) and serve it one example at a time.
# NOTE(review): shuffle=True on an evaluation loader -- confirm the shuffling is intended.
test_data_feat = TestEmbeddingsDataSet(1)
test_loader_feat = DataLoader(dataset=test_data_feat, shuffle=True, batch_size=1)

100%|████████████████████████████████████| 4650/4650 [00:00<00:00, 40323.68it/s]


In [152]:
class NeuralNetFeat(nn.Module):
    """Feed-forward tagger over concatenated, feature-augmented window vectors.

    Architecture: Linear -> Tanh -> Linear -> LogSoftmax over the last axis.

    The network returns log-probabilities rather than probabilities. Feeding
    softmax probabilities to ``CrossEntropyLoss`` normalises twice and flattens
    the gradients; log-probabilities are already normalised, so both
    ``CrossEntropyLoss`` and ``NLLLoss`` give the true negative log-likelihood.
    The argmax is unchanged, and ``torch.exp(output)`` recovers the probabilities.

    Args:
        w: number of context positions on each side of the center word.
        input_dim: size of each per-word vector (default 51).
        hidden_dim: width of the hidden layer (default 128).
        n_tags: number of output tags (default 25).
    """

    def __init__(self, w, input_dim=51, hidden_dim=128, n_tags=25):
        super().__init__()

        self.stack = nn.Sequential(
            nn.Linear((2 * w + 1) * input_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, n_tags),
            # Explicit last-axis normalisation: no implicit-dim deprecation
            # warning, and the tag axis is used whatever the input rank.
            nn.LogSoftmax(dim=-1),
        )

        # Small Gaussian weights and zero biases for every linear layer.
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, mean=0.0, std=0.05)
                nn.init.zeros_(m.bias)

    def forward(self, input):
        """Return log-probabilities over tags for each row of `input`."""
        return self.stack(input)

In [None]:
# Training setup for the feature-augmented model: SGD with momentum 0.9.
# NOTE(review): CrossEntropyLoss applies log-softmax internally -- confirm the
# model's output layer is compatible with that.
lr = 0.01
criterion = nn.CrossEntropyLoss()
model = NeuralNetFeat(1)
# NOTE(review): early_stopping is not passed to train(); presumably train() reads
# it as a global -- confirm.
early_stopping = EarlyStopping(patience=2, path='my_model_checkpoint.pt')
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
# The leading 10 is presumably the number of epochs -- confirm against train().
train_errs, test_errs = train(10, model, criterion, optimizer, train_loader_feat, test_loader_feat)