In [25]:
pip install --upgrade gensim

Note: you may need to restart the kernel to use updated packages.


In [26]:
import gensim.downloader
word2vec = gensim.downloader.load('word2vec-google-news-300')

##### Question 1.1
Based on word2vec embeddings you have downloaded, use cosine similarity to find the most similar
word to each of these words: (a) “student”; (b) “Apple”; (c) “apple”. Report the most similar word
and its cosine similarity

In [27]:
similar_to_student = word2vec.most_similar('student', topn=1)
print(similar_to_student)
similar_to_Apple = word2vec.most_similar('Apple', topn=1)
print(similar_to_Apple)
similar_to_apple = word2vec.most_similar('apple', topn=1)
print(similar_to_apple)

[('students', 0.7294867038726807)]
[('Apple_AAPL', 0.7456986308097839)]
[('apples', 0.720359742641449)]


##### Question 1.2
(a) Describe the size (number of sentences) of the training, development and test file for CoNLL2003.
Specify the complete set of all possible word labels based on the tagging scheme (IO, BIO,
etc.) you chose.

In [28]:
with open('eng.train', 'r') as train_file:
    train_data = train_file.read()

with open('eng.testa', 'r') as testa_file:
    dev_data = testa_file.read()

with open('eng.testb', 'r') as testb_file:
    test_data = testb_file.read()

train_sentences = train_data.split('\n\n')
print("Size of training files:", len(train_sentences))

dev_sentences = dev_data.split('\n\n')
print("Size of development files:", len(dev_sentences))

test_sentences = test_data.split('\n\n')
print("Size of test files:", len(test_sentences))

print("Include last marker or not:", train_sentences[-1])

label_set = set()
for sentence in train_sentences:
    lines = sentence.split('\n')
    for line in lines:
        if line.strip(): 
            parts = line.split()
            label = parts[-1]
            label_set.add(label)

label_list = list(label_set)
print("Labels:", label_list)

Size of training files: 14987
Size of development files: 3466
Size of test files: 3684
Include last marker or not: -DOCSTART- -X- O O

Labels: ['B-LOC', 'B-MISC', 'I-LOC', 'O', 'I-PER', 'B-ORG', 'I-MISC', 'I-ORG']


(b) Choose an example sentence from the training set of CoNLL2003 that has at least two named
entities with more than one word. Explain how to form complete named entities from the label
for each word, and list all the named entities in this sentence.

In [29]:
print(train_sentences[144])

" " O O
Rajavi NNP I-NP I-MISC
emphasised VBD I-VP O
that IN I-SBAR O
the DT I-NP O
Iranian JJ I-NP I-MISC
Resistance NN I-NP I-ORG
would MD I-VP O
continue VB I-VP O
to TO I-VP O
stand VB I-VP O
side NN I-NP O
by IN I-PP O
side NN I-NP O
with IN I-PP O
their PRP$ I-NP O
Kurdish JJ I-NP I-MISC
compatriots NNS I-NP O
and CC O O
the DT I-NP O
resistance NN I-NP O
movement NN I-NP O
in IN I-PP O
Iranian NNP I-NP I-LOC
Kurdistan NNP I-NP I-LOC
, , O O
" " O O
it PRP I-NP O
said VBD I-VP O
. . O O


##### Question 1.3

In [30]:
import pandas as pd

def convert_label(label):
    return label_list.index(label)

train_pairs = [
    (line.split()[0], convert_label(line.split()[-1].split()[-1]))
    for sentence in train_sentences
    for line in sentence.split('\n')
    if line.strip()
]

train_df = pd.DataFrame(train_pairs, columns=['word', 'label'])

dev_pairs = [
    (line.split()[0], convert_label(line.split()[-1].split()[-1]))
    for sentence in dev_sentences
    for line in sentence.split('\n')
    if line.strip()
]

dev_df = pd.DataFrame(dev_pairs, columns=['word', 'label'])

test_pairs = [
    (line.split()[0], convert_label(line.split()[-1].split()[-1]))
    for sentence in test_sentences
    for line in sentence.split('\n')
    if line.strip()
]

test_df = pd.DataFrame(test_pairs, columns=['word', 'label'])
print("Labels:", label_list)
print(train_sentences[0])
train_df.head(9)

Labels: ['B-LOC', 'B-MISC', 'I-LOC', 'O', 'I-PER', 'B-ORG', 'I-MISC', 'I-ORG']
EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
call NN I-NP O
to TO I-VP O
boycott VB I-VP O
British JJ I-NP I-MISC
lamb NN I-NP O
. . O O


Unnamed: 0,word,label
0,EU,7
1,rejects,3
2,German,6
3,call,3
4,to,3
5,boycott,3
6,British,6
7,lamb,3
8,.,3


In [31]:
import torch
import torch.nn as nn
import numpy as np

embedding_dim = word2vec.vector_size
np.random.seed(0)
unknown = np.random.rand(embedding_dim)

class NER(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_size, word2vec):
        super(NER, self).__init__()
        self.word2vec = word2vec
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.reLU = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.outputLayer = nn.Linear(2 * hidden_dim, output_size)

    def forward(self, word):
        word_embeddings = [self.word2vec[word] if word in self.word2vec else unknown for word in word]
        word_embeddings_array = np.stack(word_embeddings)
        word_vectors = torch.tensor(word_embeddings_array).to(dtype=torch.float32)
        lstm_output, _ = self.lstm(word_vectors)
        lstm_output = self.dropout(lstm_output)
        output = self.outputLayer(lstm_output)
        return output

In [32]:
from torch.utils.data import Dataset, DataLoader

class NERDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        word = self.data.iloc[idx]['word']
        label = self.data.iloc[idx]['label']
        return word, torch.tensor(label)
    
train_dataset = NERDataset(train_df)
test_dataset = NERDataset(dev_df)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [33]:
model = NER(embedding_dim=embedding_dim, hidden_dim=150, output_size=8, word2vec=word2vec)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

In [34]:
from sklearn.metrics import f1_score

def calculate_f1_score(predictions, labels):
    return f1_score(labels, predictions, average='micro')

best_f1_score = 0.0
best_model_state = None
patience = 5

for epoch in range(100):
    model.train() 
    total_loss = 0.0

    for batch in train_dataloader:
        words, labels = batch
        tag_score = model(words)
        loss = loss_fn(tag_score.view(-1, model.outputLayer.out_features), labels.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval() 
    dev_predictions = []
    dev_labels = []

    with torch.no_grad():
        for batch in test_dataloader:
            words, labels = batch
            tag_score = model(words)
            _, predicted = torch.max(tag_score, 1)
            dev_predictions.extend(predicted.view(-1).tolist())
            dev_labels.extend(labels.view(-1).tolist())

    f1_dev = calculate_f1_score(dev_predictions, dev_labels)

    print(f"Epoch {epoch + 1}: Loss={total_loss:.4f}, F1 Dev={f1_dev:.4f}")

    if f1_dev > best_f1_score:
        best_f1_score = f1_dev
        best_model_state = model.state_dict()
        no_improvement = 0
    else:
        no_improvement += 1

    if no_improvement >= patience:
        print(f"No improvement for {patience} epochs. Training stopped.")
        break

model.load_state_dict(best_model_state)

Epoch 1: Loss=1649.9436, F1 Dev=0.9736
Epoch 2: Loss=904.2576, F1 Dev=0.9779
Epoch 3: Loss=590.1367, F1 Dev=0.9787
Epoch 4: Loss=393.8825, F1 Dev=0.9788
Epoch 5: Loss=290.0817, F1 Dev=0.9784
Epoch 6: Loss=224.3826, F1 Dev=0.9781
Epoch 7: Loss=188.8434, F1 Dev=0.9785
Epoch 8: Loss=160.5177, F1 Dev=0.9795
Epoch 9: Loss=145.1345, F1 Dev=0.9788
Epoch 10: Loss=131.1922, F1 Dev=0.9784
Epoch 11: Loss=113.5317, F1 Dev=0.9789
Epoch 12: Loss=112.5197, F1 Dev=0.9792
Epoch 13: Loss=94.4065, F1 Dev=0.9791
No improvement for 5 epochs. Training stopped.


<All keys matched successfully>

In [39]:
model.eval()
test_output = model(["This", "is", "CZ4045", "NLP", "group", "project", "from", "NTU", "'s", "School", "of", "Computer", "Science", "and", "Engineering" "."])
_, predicted = torch.max(test_output, 1)

predicted_labels = [label_list[i] for i in predicted.tolist()]

print("Predicted:", predicted)
print("Predicted Labels:", predicted_labels)

Predicted: tensor([3, 3, 3, 3, 3, 3, 3, 7, 3, 3, 3, 7, 7, 3, 3])
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'O', 'O', 'O', 'I-ORG', 'I-ORG', 'O', 'O']


(a) Discuss how you deal with new words in the training set which are not found in the pretrained
dictionary. Likewise, how do you deal with new words in the test set which are not found in
either the pretrained dictionary or the training set? Show the corresponding code snippet.

(b) Describe what neural network you used to produce the final vector representation of each
word and what are the mathematical functions used for the forward computation (i.e., from
the pretrained word vectors to the final label of each word). Give the detailed setting of the
network including which parameters are being updated, what are their sizes, and what is the
length of the final vector representation of each word to be fed to the softmax classifier.


(c) Report how many epochs you used for training, as well as the running time

(d) Report the f1 score on the test set, as well as the f1 score on the development set for each
epoch during training.