In [1]:
import numpy as np
import pandas as pd

In [2]:
# when quoting=3, '"' will be interpreted as an ordinary char, not a quote character
train = pd.read_csv('hw4/data/train', sep=' ', quoting=3, names=['index', 'word', 'ner'])
dev = pd.read_csv('hw4/data/dev', sep=' ', quoting=3, names=['index', 'word', 'ner'])

In [3]:
# split training data into individual training samples
word_train = []
ner_train = []

word = []
ner = []

for i in range(len(train)-1):
    if train.loc[i]['index'] < train.loc[i+1]['index']:
        word.append(train.loc[i]['word'])
        ner.append(train.loc[i]['ner'])
    else:
        word.append(train.loc[i]['word'])
        ner.append(train.loc[i]['ner'])
        word_train.append(word)
        ner_train.append(ner)
        word = []
        ner = []

# split dev data into individual samples
word_dev = []
ner_dev = []

word = []
ner = []

for i in range(len(dev)-1):
    if dev.loc[i]['index'] < dev.loc[i+1]['index']:
        word.append(dev.loc[i]['word'])
        ner.append(dev.loc[i]['ner'])
    else:
        word.append(dev.loc[i]['word'])
        ner.append(dev.loc[i]['ner'])
        word_dev.append(word)
        ner_dev.append(ner)
        word = []
        ner = []

In [4]:
print(word_train[:5])
print(ner_train[:5])

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn'], ['BRUSSELS', '1996-08-22'], ['The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.'], ['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']]
[['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'], ['B-PER', 'I-PER'], ['B-LOC', 'O'], ['O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-LOC', 'O

## Task1: Simple Bidirectional LSTM model

In [5]:
# Create a vocabulary using the training data
vocab = train['word'].unique().tolist()

# Add a token '<unk>' to the vocabulary in case that there is unknown words
# in the dev and testing data
vocab.append('<unk>')
# Add a token '<pad>' to the vocabulary to pad shorter sentences in a batch
vocab.append('<pad>')

# Create a dictionary with integer as key and named entity as value
ner_ls = train['ner'].unique().tolist()
# Add to ner_ls a fake ne 'PAD' corresponding to the '<pad>' in the vocabulary
ner_ls.append('PAD')
ner_dict, ner_dict2 = {}, {}
for i, ner in enumerate(ner_ls):
    ner_dict[ner] = i
    ner_dict2[i] = ner

# Convert the ner_train and ner_dev to label_train and label_dev
label_train = [[ner_dict[ner] for ner in ner_train[i]] for i in range(len(ner_train))]
label_dev = [[ner_dict[ner] for ner in ner_dev[i]] for i in range(len(ner_dev))]

In [6]:
ner_ls

['B-ORG',
 'O',
 'B-MISC',
 'B-PER',
 'I-PER',
 'B-LOC',
 'I-ORG',
 'I-MISC',
 'I-LOC',
 'PAD']

In [7]:
ner_dict

{'B-ORG': 0,
 'O': 1,
 'B-MISC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-LOC': 5,
 'I-ORG': 6,
 'I-MISC': 7,
 'I-LOC': 8,
 'PAD': 9}

In [8]:
ner_dict2

{0: 'B-ORG',
 1: 'O',
 2: 'B-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-LOC',
 6: 'I-ORG',
 7: 'I-MISC',
 8: 'I-LOC',
 9: 'PAD'}

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.optim.lr_scheduler import ReduceLROnPlateau
import functools as fc
from sklearn.metrics import accuracy_score, f1_score

In [10]:
# create a custom dataset
class CustomDataset(Dataset):
    def __init__(self, features, labels, vocab):
        self.features = features
        self.labels = labels
        self.vocab = vocab
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        # For each word in the vocab, we will represent it using a one-hot vector.
        # For instance, for the first word 'EU' in the vocabulary, its one-hot vector
        # will be [1, 0, 0, ..., 0], and the size of the one-hot vector is the size of
        # the vocabualry
        
        sentence = self.features[idx]
        label = self.labels[idx]
        
        vectors = []
        # keep track of word index
        word_ids = []
        
        for _, word in enumerate(sentence):
            vector = [0 for i in range(len(self.vocab))]
            # handle (possibly unknown words in the dev or testing data)
            if word in vocab:
                vector[vocab.index(word)] = 1
                word_id = vocab.index(word)
            else:
                vector[vocab.index('<unk>')] = 1
                word_id = vocab.index('<unk>')
            vectors.append(vector)
            word_ids.append(word_id)
        vectors = torch.Tensor(vectors)
        
        return vectors, label, word_ids     

In [11]:
# define a collate_fn to pad sequences in a batch to the same length
# https://discuss.pytorch.org/t/how-to-create-batches-of-a-list-of-varying-dimension-tensors/50773/14
def collate_fn(data):
    """
       data: is a list of tuples with (vectors, label, word_ids) returned
             the CustomDataset, where vectors is a tensor with shape of
             seq_len * word_embeddings, label a list of named entities,
             and word_ids the word indices in the vocabulary.  
    """
    labels = [data[i][1] for i in range(len(data))]
    word_ids = [data[i][2] for i in range(len(data))]
    # the length of an example corresponds to the first dimension of its tensor
    lengths = [data[i][0].size(0) for i in range(len(data))]
    max_len = max(lengths)
    # the number of features is the second dimension of the tensor:
    num_features = data[0][0].size(1)
    # initialize a zero tensor of shape batch_size * max_len * n_ftrs
    features = torch.zeros((len(data), max_len, num_features))
    
    # pad the tensor of each sequence with zeros, the label of each sequence with the
    # label of 'PAD', and the index of each sequence with the index of '<pad>' in the
    # vocabulary.
    for i in range(len(data)):
        j = data[i][0].size(0)
        features[i] = torch.cat([data[i][0], torch.zeros((max_len-j, num_features))])
        labels[i] = labels[i] + [ner_dict['PAD'] for _ in range(max_len-j)]
        word_ids[i] = word_ids[i] + [vocab.index('<pad>') for _ in range(max_len-j)]
    labels = torch.Tensor(labels)
    word_ids = torch.Tensor(word_ids)
    lengths = torch.Tensor(lengths)
    
    return features.float(), labels.long(), lengths.long(), word_ids.long()

In [12]:
# create a dataloader for training data
train_dataset = CustomDataset(word_train, label_train, vocab)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True,
                              collate_fn=collate_fn)

# create a dataloader for dev data
dev_dataset = CustomDataset(word_dev, label_dev, vocab)
dev_dataloader = DataLoader(dev_dataset, batch_size=64, shuffle=True,
                            collate_fn=collate_fn)

In [18]:
# define the architecture of the simple bidirectional LSTM model
class BiLSTM(nn.Module):
    def __init__(self, input_dim=len(vocab), embed_dim=100, lstm_dim=256, 
                 dropout=0.33, linear_dim=128, output_dim=len(ner_ls)):
        super().__init__()
        # dimension of inputs
        self.input_dim = input_dim
        # dimension of embeddings
        self.embed_dim = embed_dim
        # linear layer from input one hot vectors to word embeddings
        self.fc1 = nn.Linear(self.input_dim, self.embed_dim)
        # dimension of the lstm layer
        self.lstm_dim = lstm_dim
        # lstm layer
        self.lstm = nn.LSTM(self.embed_dim, self.lstm_dim, batch_first=True,
                            dropout=0.33, bidirectional=True)
        # dimension of the linear layer following lstm
        self.linear_dim = linear_dim
        # linear layer following lstm
        self.fc2 = nn.Linear(self.lstm_dim * 2, self.linear_dim)
        # elu layer following linear layer
        self.elu = nn.ELU()
        # dimension of outputs
        self.output_dim = output_dim
        # linear layer following elu
        self.fc3 = nn.Linear(self.linear_dim, self.output_dim)
        
    def forward(self, batched_data, lengths):
        
        # batched data is the tensor loaded from dataloader
        # shape: batch_size * sequence length * feature number
        batch_size = batched_data.shape[0]
        
        # convert one hot vectors of words in a sequence to word embeddings
        batched_data = self.fc1(batched_data[:, :,])
        
        # pack padded sequences to their original lengths
        # lengths must be on cpu even if it is a tensor. Therefore, in case
        # that lengths is on gpu, we need to transfer it (with batched_data)
        # to cpu, and transfer packed_data back to gpu
        packed_data = pack_padded_sequence(batched_data, lengths,
                                           batch_first=True,
                                           enforce_sorted=False)
        h0 = torch.randn(2, batch_size, self.lstm_dim)
        c0 = torch.randn(2, batch_size, self.lstm_dim)
        '''if lengths.is_cuda:
            batched_data, lengths = batched_data.cpu(), lengths.cpu()
            packed_data = pack_padded_sequence(batched_data, lengths,
                                               batch_first=True,
                                               enforce_sorted=False)
            packed_data = packed_data.to(device)
        else:
            packed_data = pack_padded_sequence(batched_data, lengths,
                                               batch_first=True,
                                               enforce_sorted=False)
        
        h0 = torch.randn(2, batch_size, self.lstm_dim).to(device)
        c0 = torch.randn(2, batch_size, self.lstm_dim).to(device)'''
        
        # the output contains the output features (h_t) and has a shape of 
        # batch_size * seq_len * (2(for bidirectional)*num_lstm_layers)
        # https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        output, (hn, cn) = self.lstm(packed_data, (h0, c0))
        # the output is a PackedSequence object, need to convert it back to
        # a Variable object using pad_packed_sequence
        output, _ = pad_packed_sequence(output, batch_first=True)
            
        # pass hidden state through linear layer following lstm
        linear_out = self.fc2(output)
            
        # pass output from linear layer through elu
        elu_out = self.elu(linear_out)
            
        # pass output from elu through another linear layer
        # the shape of results: batch_size * seq_len * num_of_ners
        results = self.fc3(elu_out)
        
        return results

In [14]:
# initialize a BiLSTM model
model = BiLSTM()
print(model)

BiLSTM(
  (fc1): Linear(in_features=23304, out_features=100, bias=True)
  (lstm): LSTM(100, 256, batch_first=True, dropout=0.33, bidirectional=True)
  (fc2): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
)




In [25]:
# specify cross entropy loss as loss function
criterion = nn.CrossEntropyLoss(ignore_index=ner_dict['PAD'])

# specify optimizer (stochastic gradient descent) and initial learning rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# adjust learning rate using learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, min_lr=0.001)

In [16]:
# train model on gpu
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [28]:
# transfer model to GPU
# model.to(device)

# train the network
max_epochs = 100

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf

for epoch in range(max_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ## train the model
    model.train()
    
    for features, labels, lengths, _ in train_dataloader:
        
        # transfer to GPU
        #features, labels, lengths = features.to(device), labels.to(device), lengths.to(device)
        
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        
        # forward pass: compute predictions by passing inputs to the model
        output = model(features, lengths)
        
        # the shape of output: batch_size * seq_len * num_of_ners
        # need: batch_size * num_of_ners * seq_len to fit in the loss function
        # https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
        output = torch.transpose(output, 1, 2)
        
        # calculate the loss
        loss = criterion(output, labels)
        
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        
        # perform a single optimization step (parameter update)
        optimizer.step()
        
        # update running training loss
        train_loss += loss.item() * features.size(0)
        
    ## validate the model
    model.eval()
    
    for features, labels, lengths, _ in dev_dataloader:
        # transfer to GPU
        #features, labels, lengths = features.to(device), labels.to(device), lengths.to(device)
        
        # forward pass
        output = model(features, lengths)
        
        # the shape of output: batch_size * seq_len * num_of_ners
        # need: batch_size * num_of_ners * seq_len to fit in the loss function
        # https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
        output = torch.transpose(output, 1, 2)
        
        # calculate the loss
        loss = criterion(output, labels)
        
        # update running validation loss
        valid_loss += loss.item() * features.size(0)
    
    # scheduler should be called after validation
    scheduler.step(valid_loss)
    
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_dataloader.dataset)
    valid_loss = valid_loss/len(dev_dataloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.470560 	Validation Loss: 0.478189
Validation loss decreased (0.500000 --> 0.478189).  Saving model ...
Epoch: 2 	Training Loss: 0.460265 	Validation Loss: 0.738701
Epoch: 3 	Training Loss: 0.453596 	Validation Loss: 0.439980
Validation loss decreased (0.478189 --> 0.439980).  Saving model ...
Epoch: 4 	Training Loss: 0.440506 	Validation Loss: 0.451953
Epoch: 5 	Training Loss: 0.430057 	Validation Loss: 0.505304
Epoch: 6 	Training Loss: 0.422528 	Validation Loss: 0.419714
Validation loss decreased (0.439980 --> 0.419714).  Saving model ...
Epoch: 7 	Training Loss: 0.410976 	Validation Loss: 0.529148
Epoch: 8 	Training Loss: 0.403119 	Validation Loss: 0.412609
Validation loss decreased (0.419714 --> 0.412609).  Saving model ...
Epoch: 9 	Training Loss: 0.386645 	Validation Loss: 0.399781
Validation loss decreased (0.412609 --> 0.399781).  Saving model ...
Epoch: 10 	Training Loss: 0.381126 	Validation Loss: 0.419596
Epoch: 11 	Training Loss: 0.370905 	Validati

In [17]:
def predict(model, dataloader):
    preds = torch.Tensor()
    labels = torch.Tensor()
    word_ids = torch.Tensor()
    lengths = torch.Tensor()
    for features, label, length, word_id in dataloader:
        # the shape of output: batch_size * seq_len * num_of_ners
        output = model(features, length)
        # torch.max returns values (0) and indices (1) of the max, here we
        # only need indices (1)
        _, pred = torch.max(output, 2)
        # Paddings should be excluded from the computation of the prediction
        # accuracy
        pred = pack_padded_sequence(pred, length, batch_first=True,
                                    enforce_sorted=False).data
        label = pack_padded_sequence(label, length, batch_first=True,
                                    enforce_sorted=False).data
        word_id = pack_padded_sequence(word_id, length, batch_first=True,
                                    enforce_sorted=False).data
        # concatenate batches
        preds = torch.cat((preds, pred))
        labels = torch.cat((labels, label))
        word_ids = torch.cat((word_ids, word_id))
        lengths = torch.cat((lengths, length))
    preds = preds.reshape(-1).tolist()
    labels = labels.reshape(-1).tolist()
    word_ids = word_ids.reshape(-1).tolist()
    lengths = lengths.reshape(-1).tolist()
    return preds, labels, lengths, word_ids

In [29]:
model.load_state_dict(torch.load('model.pt'))
preds, labels, lengths, word_ids = predict(model, dev_dataloader)

In [30]:
# create a results list holding the required information (idx, word, gold, pred)
# results is a list of sublists, with sublist being a list of (idx, word, gold, pred)
# for each word in a sentence
results = []
pos = 0
for _, length in enumerate(lengths):
    # sentence-level result
    result = []
    idx = 0
    for i in range(pos, pos+int(length)):
        # word-level result
        result_word = [idx, vocab[int(word_ids[i])], ner_dict2[int(labels[i])], ner_dict2[int(preds[i])]]
        result.append(result_word)
        idx += 1
    results.append(result)
    pos += int(length)

# convert results (a list of sublists) to a single list
results = fc.reduce(lambda a, b: a + b, results)
# convert results to a dataframe
results = pd.DataFrame(data=results, columns=['idx', 'word', 'gold', 'pred'])
# write results to a file with the required format:
# idx word gold pred
results.to_csv('recap/dev_results', sep=' ', header=False, index=False)

In [31]:
# convert results (a list of sublists) to a single list
results = fc.reduce(lambda a, b: a + b, results)
# convert results to a dataframe
results = pd.DataFrame(data=results, columns=['idx', 'word', 'gold', 'pred'])
# write results to a file with the required format:
# idx word gold pred
results.to_csv('recap/dev_results', sep=' ', header=False, index=False)

In [32]:
f1_score(labels, preds, average='macro')

0.6877308453562512

In [33]:
accuracy_score(labels, preds)

0.933765075425602

## Task 2: Using GloVe word embeddings

In [11]:
## import the pretrained glove model

# each line in the glove file is in the format:
# word dim0 dim1 dim2 ...
# for example:
# after 0.38315 -0.3561 -0.1283 -0.19527 0.047629...
glove = {}
with open('hw4/glove.6B.100d/glove.6B.100d.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], 'float32')
        glove[word] = vector
f.close()

In [12]:
# create a vocabulary from glove
vocab_glove = list(glove.keys())
# for words that are included in the training data but not in vocab_glove, that is, 
# 1) words whose case-insensitive counterparts are included in the vocab_glove
# 2) words whose case-insensitive counterparts are not included in the vocab_glove,
# we will add them to vocab_glove
for word in train['word'].unique().tolist():
    if word not in vocab_glove:
        vocab_glove.append(word)

# Add a token '<unk>' to the vocabulary in case that there is unknown words
# in the dev and testing data
vocab_glove.append('<unk>')
# Add a token '<pad>' to the vocabulary to pad shorter sentences in a batch
vocab_glove.append('<pad>')

In [13]:
# because the pretrain glove model is case-insensitive, while for ner task,
# case sensitivity is critical. Therefore, we can train the embedding for
# those words (e.g. China) based on the embedding of their case-insensitive
# counterparts (e.g. china) in the glove. That is, for words in glove, we use
# the embedding from glove, while for words having their counterparts in
# glove, we will initialize their embedding with the embedding of their
# counterparts, and train it during the training of the entire lstm model.
# in order to do this, we are going to modify the custom dataset class used
# in the previous task:

# create a custom dataset compatible with glove
class CustomDataset2(Dataset):
    def __init__(self, features, labels, vocab, wvm):
        self.features = features
        self.labels = labels
        self.vocab = vocab
        self.wvm = wvm # wvm is the word embedding model (e.g. glove), in a dictionary
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        # For words that the glove contains, we will use the pretrained word embedding.
        # For words whose case-insensitive counterparts are in the glove, we will
        # initialize their word embedding with the word embedding of their counterparts.
        # For words whose case-insensitive counterparts are not in the glove, we will
        # initialize their word embedding randomly.
        # For instance, word 'after' is in the glove, so we will use the pretrained
        # word embedding glove['after']; word 'China' is not in the glove, but 'china'
        # is, so we will initialize the word bedding for 'China' with glove['china'],
        # and update it during the training stage; word 'nihao' is not in the glove,
        # we will initialize its word embedding randomly.
        
        sentence = self.features[idx]
        label = self.labels[idx]
        
        vectors = []
        # keep track of word index
        word_ids = []
        
        for _, word in enumerate(sentence):
            # handle (possibly unknown words in the dev or testing data)
            if word in self.vocab:
                if word in self.wvm.keys():
                    vector = self.wvm[word]
                    word_id = self.vocab.index(word)
                elif word.lower() in self.wvm.keys():
                    vector = self.wvm[word.lower()]
                    word_id = self.vocab.index(word)
                else:
                    vector = np.random.rand(100)
                    word_id = self.vocab.index(word)
            else:
                vector = np.random.rand(100)
                word_id = self.vocab.index('<unk>')
                
            vectors.append(vector)
            word_ids.append(word_id)   
        vectors = torch.Tensor(vectors)
        
        return vectors, label, word_ids  

In [15]:
# define a collate_fn to pad sequences in a batch to the same length
# The collate_fn is almost identical to the one we defined for the simple bidirectional lstm model,
# the only difference is that the vocabulary here is vocab_glove, rather than vocab.
# https://discuss.pytorch.org/t/how-to-create-batches-of-a-list-of-varying-dimension-tensors/50773/14
def collate_fn(data):
    """
       data: is a list of tuples with (vectors, label, word_ids) returned
             the CustomDataset, where vectors is a tensor with shape of
             seq_len * word_embeddings, label a list of named entities,
             and word_ids the word indices in the vocabulary.  
    """
    labels = [data[i][1] for i in range(len(data))]
    word_ids = [data[i][2] for i in range(len(data))]
    # the length of an example corresponds to the first dimension of its tensor
    lengths = [data[i][0].size(0) for i in range(len(data))]
    max_len = max(lengths)
    # the number of features is the second dimension of the tensor:
    num_features = data[0][0].size(1)
    # initialize a zero tensor of shape batch_size * max_len * n_ftrs
    features = torch.zeros((len(data), max_len, num_features))
    
    # pad the tensor of each sequence with zeros, the label of each sequence with the
    # label of 'PAD', and the index of each sequence with the index of '<pad>' in the
    # vocabulary.
    for i in range(len(data)):
        j = data[i][0].size(0)
        features[i] = torch.cat([data[i][0], torch.zeros((max_len-j, num_features))])
        labels[i] = labels[i] + [ner_dict['PAD'] for _ in range(max_len-j)]
        word_ids[i] = word_ids[i] + [vocab_glove.index('<pad>') for _ in range(max_len-j)]
    labels = torch.Tensor(labels)
    word_ids = torch.Tensor(word_ids)
    lengths = torch.Tensor(lengths)
    
    return features.float(), labels.long(), lengths.long(), word_ids.long()

In [16]:
# create a dataloader for training data
train_dataset = CustomDataset2(word_train, label_train, vocab_glove, glove)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True,
                              collate_fn=collate_fn)

# create a dataloader for dev data
dev_dataset = CustomDataset2(word_dev, label_dev, vocab_glove, glove)
dev_dataloader = DataLoader(dev_dataset, batch_size=64, shuffle=True,
                            collate_fn=collate_fn)

In [19]:
# initialize a glove-based BiLSTM model and its architecture is the same
# as the simple BiLSTM model.
# because the dimension of pretrained word embedding in glove is also 100,
# we need to assign it to input_dim
model_glove = BiLSTM(input_dim=100)
print(model_glove)

BiLSTM(
  (fc1): Linear(in_features=100, out_features=100, bias=True)
  (lstm): LSTM(100, 256, batch_first=True, dropout=0.33, bidirectional=True)
  (fc2): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
)




In [20]:
# specify cross entropy loss as loss function
criterion = nn.CrossEntropyLoss(ignore_index=ner_dict['PAD'])

# specify optimizer (stochastic gradient descent) and initial learning rate = 0.1
optimizer = torch.optim.SGD(model_glove.parameters(), lr=0.1)

# adjust learning rate using learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, min_lr=0.001)

In [None]:
# transfer model to GPU
# model_glove.to(device)

# train the network
max_epochs = 100

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf

for epoch in range(max_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ## train the model
    model_glove.train()
    
    for features, labels, lengths, _ in train_dataloader:
        
        # transfer to GPU
        #features, labels, lengths = features.to(device), labels.to(device), lengths.to(device)
        
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        
        # forward pass: compute predictions by passing inputs to the model
        output = model_glove(features, lengths)
        
        # the shape of output: batch_size * seq_len * num_of_ners
        # need: batch_size * num_of_ners * seq_len to fit in the loss function
        # https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
        output = torch.transpose(output, 1, 2)
        
        # calculate the loss
        loss = criterion(output, labels)
        
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        
        # perform a single optimization step (parameter update)
        optimizer.step()
        
        # update running training loss
        train_loss += loss.item() * features.size(0)
        
    ## validate the model
    model_glove.eval()
    
    for features, labels, lengths, _ in dev_dataloader:
        # transfer to GPU
        #features, labels, lengths = features.to(device), labels.to(device), lengths.to(device)
        
        # forward pass
        output = model_glove(features, lengths)
        
        # the shape of output: batch_size * seq_len * num_of_ners
        # need: batch_size * num_of_ners * seq_len to fit in the loss function
        # https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
        output = torch.transpose(output, 1, 2)
        
        # calculate the loss
        loss = criterion(output, labels)
        
        # update running validation loss
        valid_loss += loss.item() * features.size(0)
    
    # scheduler should be called after validation
    scheduler.step(valid_loss)
    
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_dataloader.dataset)
    valid_loss = valid_loss/len(dev_dataloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model_glove.state_dict(), 'model_glove.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.317559 	Validation Loss: 0.387928
Validation loss decreased (0.413378 --> 0.387928).  Saving model ...
Epoch: 2 	Training Loss: 0.298991 	Validation Loss: 0.402841
Epoch: 3 	Training Loss: 0.283400 	Validation Loss: 0.367822
Validation loss decreased (0.387928 --> 0.367822).  Saving model ...
Epoch: 4 	Training Loss: 0.270355 	Validation Loss: 0.350736
Validation loss decreased (0.367822 --> 0.350736).  Saving model ...
Epoch: 5 	Training Loss: 0.259955 	Validation Loss: 0.338315
Validation loss decreased (0.350736 --> 0.338315).  Saving model ...
Epoch: 6 	Training Loss: 0.249380 	Validation Loss: 0.329682
Validation loss decreased (0.338315 --> 0.329682).  Saving model ...
Epoch: 7 	Training Loss: 0.241916 	Validation Loss: 0.318010
Validation loss decreased (0.329682 --> 0.318010).  Saving model ...
Epoch: 8 	Training Loss: 0.233156 	Validation Loss: 0.320361
Epoch: 9 	Training Loss: 0.225202 	Validation Loss: 0.309085
Validation loss decreased (0.318010 -

In [None]:
model_glove.load_state_dict(torch.load('model_glove.pt'))
preds, labels, lengths, word_ids = predict(model_glove, dev_dataloader)

In [None]:
# create a results list holding the required information (idx, word, gold, pred)
# results is a list of sublists, with sublist being a list of (idx, word, gold, pred)
# for each word in a sentence
results = []
pos = 0
for _, length in enumerate(lengths):
    # sentence-level result
    result = []
    idx = 0
    for i in range(pos, pos+int(length)):
        # word-level result
        result_word = [idx, vocab_glove[int(word_ids[i])], ner_dict2[int(labels[i])], ner_dict2[int(preds[i])]]
        result.append(result_word)
        idx += 1
    results.append(result)
    pos += int(length)

# convert results (a list of sublists) to a single list
results = fc.reduce(lambda a, b: a + b, results)
# convert results to a dataframe
results = pd.DataFrame(data=results, columns=['idx', 'word', 'gold', 'pred'])
# write results to a file with the required format:
# idx word gold pred
results.to_csv('dev_results2', sep=' ', header=False, index=False)