In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Colab Notebooks/hw3

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/hw3


# **Task 1:  Sequence Tagging with RNNs**

In this task, you will implement LSTM and Bi-LSTM architectures with PyTorch to perform part-of-speech tagging (a sequence tagging task).

### **Data**
We use a subset of the data from the CoNLL-2003 shared task on Named Entity Recognition (provided in the zip). It is pre-partioned into a training, development and test set.

The dataset consists of pre-tokenized sentences where every token is annotated with a part-of-speech tag, a syntactic chunk tag and a named entity tag. In this home exercise, we only use the IOB named entity recognition tag.

In [2]:
import torch
import torch.nn as nn
import os
import re
import numpy as np
from collections import defaultdict
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import random

In [3]:
def seed_everything(seed: int):

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    torch.set_default_tensor_type(torch.FloatTensor) #added to avoid datatype problems

In [4]:
seed_everything(seed=999)

## **Task 1.1: Pretrained Embeddings (5p)**

Download the pretrained, uncased GloVe embeddings with 6B tokens [glove.6B.zip](https://nlp.stanford.edu/projects/glove/) from Stanford.

For performance reasons, we will only use the 50-dimensional embeddings **glove.6B.50d.txt**.

Implement a function to read the embedding and another function to read the dataset.

In [5]:
# https://medium.com/analytics-vidhya/ner-tensorflow-2-2-0-9f10dcf5a0a
"""
  Store each sentence seperately. Each token of the sentences is stored with its corresponding IOB named entity recognition tag
"""

def split_text_label(filename):
  f = open(filename)
  split_labeled_text = []
  sentence = []
  for line in f:
    if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n":
       if len(sentence) > 0:
         split_labeled_text.append(sentence)
         sentence = []
       continue
    splits = line.split(' ')
    sentence.append([splits[0],splits[-1].rstrip("\n")])
  if len(sentence) > 0:
    split_labeled_text.append(sentence)
    sentence = []
  return split_labeled_text

In [6]:
def read_data(path):
    data = defaultdict(list)
    label = defaultdict(list)
    # TODO: YOUR CODE HERE

    text_label = split_text_label(filename=path)
    for idx, sent in enumerate(text_label):
      words = []
      iob_labels = []
      for word_iob in sent:
        words.append(word_iob[0]) #add word
        iob_labels.append(word_iob[1]) #add iob tag

      data[idx] = words
      label[idx] = iob_labels

    return data, label

def get_pretrained_embeddings(embedding_path):
    embeddings = defaultdict(list)
    # TODO: YOUR CODE HERE
    with open("glove.6B.50d.txt", "r") as file:
      for line in file:
        line = line.strip()
        word = line.split(" ")[0]
        vector = np.array([float(n) for n in line.split(" ")[1:]])

        embeddings[word] = vector

    return embeddings

In [7]:
embed = get_pretrained_embeddings(embedding_path="glove.6B.50d.txt")

In [8]:
embed["get"].shape[0]

50

In [9]:
out= split_text_label("data/ner_eng_bio.train")

In [10]:
out[2]

[['BRUSSELS', 'B-LOC'], ['1996-08-22', 'O']]

In [11]:
data, label = read_data("data/ner_eng_bio.train")

In [12]:
data[0]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [13]:
label[0]

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [14]:
counter = 0
with open("data/ner_eng_bio.train", "r") as file:
    for line in file:
      print(line.strip().split(" "))
      if counter == 10:
        break
      counter += 1

['EU', 'NNP', 'B-NP', 'B-ORG']
['rejects', 'VBZ', 'B-VP', 'O']
['German', 'JJ', 'B-NP', 'B-MISC']
['call', 'NN', 'I-NP', 'O']
['to', 'TO', 'B-VP', 'O']
['boycott', 'VB', 'I-VP', 'O']
['British', 'JJ', 'B-NP', 'B-MISC']
['lamb', 'NN', 'I-NP', 'O']
['.', '.', 'O', 'O']
['']
['Peter', 'NNP', 'B-NP', 'B-PER']


In [15]:
"""dataset = defaultdict(list)
labels = defaultdict(list)
with open("data/ner_eng_bio.train", "r") as file:
    sentence = []
    sentence_labels = []
    for line in file:
        line = line.strip()
        if line == "":
            if sentence:
                for word, label in zip(sentence, sentence_labels):
                    dataset["words"].append(word)
                    labels["tags"].append(label)
            sentence = []
            sentence_labels = []
        else:
            word, _, _, label = line.split()
            sentence.append(word)
            sentence_labels.append(label)"""

'dataset = defaultdict(list)\nlabels = defaultdict(list)\nwith open("data/ner_eng_bio.train", "r") as file:\n    sentence = []\n    sentence_labels = []\n    for line in file:\n        line = line.strip()\n        if line == "":\n            if sentence:\n                for word, label in zip(sentence, sentence_labels):\n                    dataset["words"].append(word)\n                    labels["tags"].append(label)\n            sentence = []\n            sentence_labels = []\n        else:\n            word, _, _, label = line.split()\n            sentence.append(word)\n            sentence_labels.append(label)'

## **Task 1.2: LSTM and Bi-LSTM Model (10p)**

We will use PyTorch to build our LSTM. Complete the `__init__()` and the `forward()` function of the CustomLSTM class. The model will have the following components:
- A single [LSTM](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html) layer which takes the embeddings as input and has 100-dimensional hidden layer. The LSTM is **not** bidirectional.
- A dropout layer with probability 0.1
- A linear layer with input size of 100 (the hidden layer size of the LSTM layer) and output size of the number of labels
- A [Sigmoid](https://pytorch.org/docs/stable/generated/torch.nn.Sigmoid.html) activation function

In [16]:
from torch.nn.modules import dropout
class CustomLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_labels):
        super(CustomLSTM, self).__init__()
        # TODO: YOUR CODE HERE
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_labels = num_labels

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=0.1, bidirectional=False)
        self.linear = nn.Linear(in_features=100, out_features=num_labels) # replace 100 with hidden_size maybe
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # TODO: YOUR CODE HERE
        #LSTM’s output corresponding to all timesteps
        output, (h_n, c_n) = self.lstm(x)
        last_hidden_state = output[-1] # can also be h_n   see https://towardsdatascience.com/implementation-differences-in-lstm-layers-tensorflow-vs-pytorch-77a31d742f74#:~:text=The%20output%20of%20the%20Pytorch,another%20tuple%20with%20two%20elements.
        linear_output = self.linear(last_hidden_state)
        output = self.sigmoid(linear_output)
        
        return output

## **Task 1.3: Training Model (10p)**

Complete the function `train` to train your model. The model will train with batch size of 1 (each sentence split by "\n" is a sample) for 10 epochs. You will train the model with the train dataset and use dev dataset to check the model's performance after each epoch. Calculate the macro f1 score of the model on the dev set. Return the losses and f1 scores for plotting.

**Hint**: you can check out this [link](https://pytorch.org/tutorials/beginner/introyt/trainingyt.html#the-training-loop) to get to know more about how to train model with pytorch. For the f1 score you can use `sklearn.metrics.f1_score`, remember to set the `average` parameter to "macro".


In [17]:
def label_encode(labels):
    labels_list = {'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7,
                   'O': 8}
    labels_encode = [labels_list[label] for label in labels]
    return labels_encode

In [18]:
# Change this to your path to the dataset
train = "data/ner_eng_bio.train" 
test = "data/ner_eng_bio.test"
dev = "data/ner_eng_bio.dev"

embeddings = get_pretrained_embeddings("embeddings/glove.6B.50d.txt")

train_data, train_label = read_data(train)
test_data, test_label = read_data(test)
dev_data, dev_label = read_data(dev)

EMBEDDING_SIZE = embeddings["get"].shape[0]

# Change the Hyperparameters here
input_size = EMBEDDING_SIZE
hidden_size = 100
num_layers = 2
num_labels = 9
epochs = 10

In [33]:
dev_data[0], dev_label[0]

(['SOCCER',
  '-',
  'JAPAN',
  'GET',
  'LUCKY',
  'WIN',
  ',',
  'CHINA',
  'IN',
  'SURPRISE',
  'DEFEAT',
  '.'],
 ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O'])

In [19]:
test_data[0]

['CRICKET',
 '-',
 'LEICESTERSHIRE',
 'TAKE',
 'OVER',
 'AT',
 'TOP',
 'AFTER',
 'INNINGS',
 'VICTORY',
 '.']

In [20]:
test_label[0]

['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [21]:
aa = []
bb = [1, 2, 3]
aa.append(bb)
aa

[[1, 2, 3]]

In [22]:
array = np.random.random((1, 3, 50))
array

array([[[0.80342804, 0.5275223 , 0.11911147, 0.63968144, 0.09092526,
         0.33222568, 0.42738095, 0.55438581, 0.62812652, 0.69739294,
         0.78994969, 0.13189035, 0.34277045, 0.20155961, 0.70732423,
         0.03339926, 0.90925004, 0.40516066, 0.76043547, 0.47375838,
         0.28671892, 0.75129249, 0.09708994, 0.41235779, 0.28163896,
         0.39027778, 0.87110921, 0.08124512, 0.55793117, 0.54753428,
         0.33220307, 0.97326881, 0.2862761 , 0.5082575 , 0.14795074,
         0.19643398, 0.84082001, 0.0037532 , 0.78262101, 0.83347772,
         0.93790734, 0.97260166, 0.83282304, 0.06581761, 0.40379256,
         0.37479349, 0.50750135, 0.97787696, 0.81899021, 0.18754124],
        [0.69804812, 0.68261077, 0.99909815, 0.48263116, 0.73059268,
         0.79518236, 0.26139168, 0.16107376, 0.69850315, 0.89950917,
         0.91515562, 0.31244902, 0.95412616, 0.7242641 , 0.02091039,
         0.72554552, 0.58165923, 0.9545687 , 0.74233195, 0.19750339,
         0.94900651, 0.85836332, 

In [23]:
def get_sentence_embeddings(sent: list, embeddings_dict: dict):
  out_sentence = []
  for token in sent:
    if token in embeddings_dict.keys():
      out_sentence.append(embeddings_dict[token])
    else:
      out_sentence.append(np.array([0]*50))
  return out_sentence

In [24]:
outt = get_sentence_embeddings(sent=train_data[0], embeddings_dict=embeddings)

In [25]:
len(train_data)

4434

In [26]:
torch.tensor(np.array([outt])).size()

torch.Size([1, 9, 50])

In [27]:
torch.tensor([[[1, 2, 3, 4]]]).size()

torch.Size([1, 1, 4])

In [31]:
model = CustomLSTM(input_size, hidden_size, num_layers, num_labels)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters()) 

def train(model, loss_fn, optimizer, train_data, dev_data, train_label, dev_label, epochs):
    logs_loss = []
    logs_f1_score = []

    # Training
    for epoch in range(epochs):
        # Train with train set
        model.train()
        # TODO: YOUR CODE HERE
        print('EPOCH {}:'.format(epoch + 1))
        train_loss = 0.0

        for k, sent in train_data.items():
          optimizer.zero_grad()


          sent_embedding = get_sentence_embeddings(sent=sent, embeddings_dict=embeddings)
          input_data = torch.tensor(np.array([sent_embedding]), dtype=torch.float32) # Remove the last label
          targets = torch.tensor(label_encode(train_label[k]))

          # Forward pass
          outputs = model(input_data)
          loss = loss_fn(outputs, targets)

          # Backward pass and optimization
          loss.backward()
          optimizer.step()

          train_loss += loss.item()
        average_loss = train_loss / len(train_data) #calculate average loss value of the whole dataset
        logs_loss.append(average_loss)

        print(f"Finished epoch {epoch+1}")

        


        # Evaluate with dev set
        """with torch.no_grad():
            model.eval()
            # TODO: YOUR CODE HERE
            dev_loss = 0.0
            dev_preds = []
            dev_targets = []

            for k, sent in dev_data.items():
              sent_embedding_dev = get_sentence_embeddings(sent=sent, embeddings_dict=embeddings)
              input_data_dev = torch.tensor(np.array([sent_embedding_dev]), dtype=torch.float32) # Remove the last label
              targets = torch.tensor(label_encode(dev_label[k]))

              # Forward pass
              outputs = model(input_data)
              loss = loss_fn(outputs, targets)
              dev_loss += loss.item()

              # Convert predictions and targets to numpy arrays
              predictions = torch.argmax(outputs, dim=1).numpy()
              targets = targets.numpy()

              dev_preds.extend(predictions)
              dev_targets.extend(targets)
            
            average_dev_loss = dev_loss / len(dev_data)

            # Calculate macro F1 score
            f1 = f1_score(dev_targets, dev_preds, average='macro')
            logs_f1_score.append(f1)

            print(f"Epoch {epoch+1}/{epochs}: Train Loss: {average_loss:.4f} | Dev Loss: {average_dev_loss:.4f} | Macro F1 Score: {f1:.4f}")"""

            # Evaluate with dev set
        with torch.no_grad():
            model.eval()
            dev_loss = 0.0
            dev_preds = []
            dev_targets = []

            for k, sent in dev_data.items():
                sent_embedding_dev = get_sentence_embeddings(sent=sent, embeddings_dict=embeddings)
                input_data_dev = torch.tensor(np.array([sent_embedding_dev]), dtype=torch.float32)
                targets = torch.tensor(label_encode(dev_label[k]))
                print(f"targets == {targets}")
                print(f"outputs == {outputs}")

                print(f"targets shape == {targets.size()}")
                print(f"outputs shape== {outputs.size()}")

                # Forward pass
                outputs = model(input_data_dev)
                loss = loss_fn(outputs, targets)
                dev_loss += loss.item()

                # Convert predictions and targets to numpy arrays
                predictions = torch.argmax(outputs, dim=1).numpy()
                targets = targets.squeeze().numpy()  # Squeeze the tensor to match the batch size

                dev_preds.extend(predictions)
                dev_targets.extend(targets)
            
            average_dev_loss = dev_loss / len(dev_data)

            # Calculate macro F1 score
            f1 = f1_score(dev_targets, dev_preds, average='macro')
            logs_f1_score.append(f1)

            print(f"Epoch {epoch+1}/{epochs}: Train Loss: {average_loss:.4f} | Dev Loss: {average_dev_loss:.4f} | Macro F1 Score: {f1:.4f}")


        
    return logs_loss, logs_f1_score

logs_loss, logs_f1_score = train(model, loss_fn, optimizer, train_data, dev_data, train_label, dev_label, epochs)

EPOCH 1:
Finished epoch 1
targets == tensor([8, 8, 0, 8, 8, 8, 8, 3, 8, 8, 8, 8])
outputs == tensor([[9.8053e-01, 2.8851e-04, 8.1864e-03, 1.9058e-01, 1.5434e-04, 1.4847e-04,
         2.4445e-04, 2.6630e-02, 9.9980e-01],
        [2.8801e-03, 4.4715e-07, 1.4070e-04, 1.0527e-05, 1.6215e-07, 1.1793e-07,
         2.4897e-07, 2.8863e-07, 1.0000e+00],
        [1.9258e-03, 3.6324e-07, 1.2095e-04, 7.3201e-06, 1.2927e-07, 9.3104e-08,
         1.9860e-07, 1.8727e-07, 1.0000e+00],
        [1.6932e-03, 2.9398e-07, 1.0501e-04, 6.0248e-06, 1.0274e-07, 7.3586e-08,
         1.5913e-07, 1.4235e-07, 1.0000e+00],
        [1.7331e-03, 3.0385e-07, 1.0737e-04, 6.2259e-06, 1.0656e-07, 7.6331e-08,
         1.6494e-07, 1.4907e-07, 1.0000e+00],
        [1.8395e-03, 3.6686e-07, 1.2166e-04, 7.1260e-06, 1.3075e-07, 9.4020e-08,
         1.9944e-07, 1.8347e-07, 1.0000e+00],
        [1.6599e-03, 2.9173e-07, 1.0445e-04, 5.9321e-06, 1.0178e-07, 7.2927e-08,
         1.5786e-07, 1.3977e-07, 1.0000e+00],
        [9.7832e-0

TypeError: ignored

## **Task 1.4: Visualizing (5p)**

Check the performance of the model on the test set and plot the training loss using `matplotlib.pyplot.plot`.

In [None]:
# Evaluate with test set
with torch.no_grad():
    model.eval()
    preds = []
    targets = []
    # TODO: YOUR CODE HERE

# Plot with matplotlib
