### Import Package

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/493.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m491.5/493.7 kB[0m [31m14.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.

In [2]:
import datasets
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import itertools
from torch.utils.data import DataLoader
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from google.colab import drive
import sys
drive.mount('/content/drive')
sys.path.insert(0,'/content/drive/My Drive/Colab Notebooks')
from conlleval import evaluate

Mounted at /content/drive


In [None]:
dataset = datasets.load_dataset("conll2003")

In [4]:
word_frequency = Counter(itertools.chain(*dataset['train']['tokens']))
word_frequency = {token: freq for token, freq in word_frequency.items() if freq > 3}
word2idx = {word: index for index, word in enumerate(word_frequency.keys(), start=2)}
word2idx['[PAD]'] = 0
word2idx['[UNK]'] = 1
print(word2idx)



In [None]:
new_dataset = (dataset.map(lambda x: {'input_ids': [word2idx.get(word, word2idx['[UNK]']) for word in x['tokens']]}))
new_dataset = new_dataset.remove_columns(['id', 'tokens', 'pos_tags', 'chunk_tags']).rename_column('ner_tags', 'labels')
print(new_dataset['train']['input_ids'][:3])
display(new_dataset)
train_data = new_dataset['train']
validation_data = new_dataset['validation']
test_data = new_dataset['test']
print(train_data)

# Task1: Bidirectional LSTM model

In [6]:
class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, num_labels, linear_out, lstm_layers=1, dropout_value=0.33):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=lstm_hidden_dim,
                            num_layers=lstm_layers,
                            batch_first=True,
                            bidirectional=True)
        self.linear = nn.Linear(in_features=lstm_hidden_dim * 2, out_features=linear_out)
        self.dropout = nn.Dropout(dropout_value)
        self.classifier = nn.Linear(in_features=linear_out, out_features=num_labels)
        self.elu = nn.ELU()

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        lstm_out, (hidden_states, cell_states) = self.lstm(embedded)
        hidden_states = torch.cat((hidden_states[-2,:,:], hidden_states[-1,:,:]), dim=1)
        dropout = self.dropout(lstm_out)
        linear_out = self.elu(self.linear(dropout))
        classifier_out = self.classifier(linear_out)

        return classifier_out  # Output scores for each label in the NER task

def collate_fn(data):
    labels = [torch.tensor(item['labels']) for item in data]
    input_ids = [torch.tensor(item['input_ids']) for item in data]
    # Pad sequences
    targets = pad_sequence(labels, batch_first=True, padding_value=9)
    features = pad_sequence(input_ids, batch_first=True, padding_value=0)
    return features, targets


In [7]:
vocab_size = len(word2idx)
embedding_dim = 100
lstm_hidden_dim = 256
linear_out = 128
num_labels = 10
batch_size = 4
num_epochs = 20
learning_rate = 0.001

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMModel(vocab_size, embedding_dim, lstm_hidden_dim, num_labels, linear_out).to(device)
train_dataloader = DataLoader(train_data, collate_fn=collate_fn, batch_size=batch_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    total_loss = 0
    for batch_idx, (input_data, labels) in enumerate(train_dataloader):
        input_data, labels = input_data.to(device), labels.to(device)

        # Zero the gradients, forward pass, compute loss, backward pass, optimize
        optimizer.zero_grad()
        outputs = model(input_data)
        predictions = outputs.view(-1, num_labels)
        labels = labels.view(-1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f'Epoch {epoch + 1}, Batch {batch_idx + 1}, Loss: {total_loss / 100:.4f}')
    total_loss = 0

Epoch 1, Batch 3511, Loss: 10.2834
Epoch 2, Batch 3511, Loss: 3.9538
Epoch 3, Batch 3511, Loss: 2.3364
Epoch 4, Batch 3511, Loss: 1.5898
Epoch 5, Batch 3511, Loss: 1.1364
Epoch 6, Batch 3511, Loss: 0.8870
Epoch 7, Batch 3511, Loss: 0.7232
Epoch 8, Batch 3511, Loss: 0.5984
Epoch 9, Batch 3511, Loss: 0.5382
Epoch 10, Batch 3511, Loss: 0.4643
Epoch 11, Batch 3511, Loss: 0.3902
Epoch 12, Batch 3511, Loss: 0.3928
Epoch 13, Batch 3511, Loss: 0.3622
Epoch 14, Batch 3511, Loss: 0.3329
Epoch 15, Batch 3511, Loss: 0.3172
Epoch 16, Batch 3511, Loss: 0.3050
Epoch 17, Batch 3511, Loss: 0.3055
Epoch 18, Batch 3511, Loss: 0.2726
Epoch 19, Batch 3511, Loss: 0.2505
Epoch 20, Batch 3511, Loss: 0.2530


### What are the precision, recall, and F1 score on the validation data?

In [9]:
model_save_name = 'biLSTM.pt'
path = F"/content/drive/My Drive/Colab Notebooks/{model_save_name}"
torch.save(model.state_dict(), path)

In [10]:
model = BiLSTMModel(vocab_size, embedding_dim, lstm_hidden_dim, num_labels, linear_out).to(device)
model_save_name = 'biLSTM.pt'
path = F"/content/drive/My Drive/Colab Notebooks/{model_save_name}"
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [11]:
idx2tag = {0:'O', 1:'B-PER', 2:'I-PER', 3:'B-ORG', 4:'I-ORG', 5:'B-LOC', 6:'I-LOC', 7:'B-MISC', 8:'I-MISC', 9:'O'}
val_predicts = []
val_labels = []
model.eval()
validation_dataloader = DataLoader(validation_data, collate_fn=collate_fn, batch_size=1)
for batch_idx, (input_data, labels) in enumerate(validation_dataloader):
    input_data, labels = input_data.to(device), labels.to(device)
    predictions = model(input_data).view(-1, num_labels)
    labels = labels.view(-1)
    _, predicted = torch.max(predictions, 1)
    val_predicts.append([idx2tag[label] for label in predicted.tolist()])
    val_labels.append([idx2tag[label] for label in labels.tolist()])

precision, recall, f1 = evaluate(itertools.chain(*val_labels), itertools.chain(*val_predicts))


processed 51362 tokens with 5942 phrases; found: 5584 phrases; correct: 4579.
accuracy:  79.29%; (non-O)
accuracy:  95.85%; precision:  82.00%; recall:  77.06%; FB1:  79.46
              LOC: precision:  88.43%; recall:  82.42%; FB1:  85.32  1712
             MISC: precision:  82.13%; recall:  73.75%; FB1:  77.71  828
              ORG: precision:  73.71%; recall:  71.51%; FB1:  72.60  1301
              PER: precision:  81.81%; recall:  77.42%; FB1:  79.55  1743


### What are the precision, recall, and F1 score on the test data?

In [12]:
test_predicts = []
test_labels = []
model.eval()
test_dataloader = DataLoader(test_data, collate_fn=collate_fn, batch_size=1)
for batch_idx, (input_data, labels) in enumerate(test_dataloader):
    input_data, labels = input_data.to(device), labels.to(device)
    predictions = model(input_data).view(-1, num_labels)
    labels = labels.view(-1)
    _, predicted = torch.max(predictions, 1)
    test_predicts.append([idx2tag[label] for label in predicted.tolist()])
    test_labels.append([idx2tag[label] for label in labels.tolist()])

precision, recall, f1 = evaluate(itertools.chain(*test_labels), itertools.chain(*test_predicts))

processed 46435 tokens with 5648 phrases; found: 5229 phrases; correct: 3830.
accuracy:  73.05%; (non-O)
accuracy:  93.87%; precision:  73.25%; recall:  67.81%; FB1:  70.42
              LOC: precision:  83.05%; recall:  76.98%; FB1:  79.90  1546
             MISC: precision:  67.38%; recall:  62.96%; FB1:  65.10  656
              ORG: precision:  65.62%; recall:  59.42%; FB1:  62.37  1504
              PER: precision:  73.34%; recall:  69.08%; FB1:  71.15  1523


# Task2: Using GloVe word embeddings

In [13]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!ls -lat

--2023-11-08 01:33:48--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-11-08 01:33:48--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-11-08 01:33:49--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [15]:
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_file_path = 'glove.6B.100d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path)
glove_embeddings['[PAD]'] = np.zeros((100,), dtype="float32")
glove_vec = np.array([glove_embeddings[key] for key in glove_embeddings])
glove_embeddings['[UNK]'] = np.mean(glove_vec, axis=0, keepdims=True).reshape(100,)

vocab_size = len(word2idx)
embedding_dim = 100

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, index in word2idx.items():
    if word in glove_embeddings:
      embedding_matrix[index] = glove_embeddings[word]
    elif word.lower() in glove_embeddings:
      embedding_matrix[index] = (glove_embeddings[word.lower()] + model.embedding.weight[index].cpu().detach().numpy()) / 2
    else:
      embedding_matrix[index] = model.embedding.weight[index].cpu().detach().numpy()


In [16]:
class gloveBiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, num_labels, linear_out, lstm_layers=1, dropout_value=0.33):
        super(gloveBiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=lstm_hidden_dim,
                            num_layers=lstm_layers,
                            batch_first=True,
                            bidirectional=True)
        self.dropout = nn.Dropout(dropout_value)
        self.linear = nn.Linear(in_features=lstm_hidden_dim * 2, out_features=linear_out)
        self.classifier = nn.Linear(in_features=linear_out, out_features=num_labels)
        self.elu = nn.ELU()

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        lstm_out, (hidden_states, cell_states) = self.lstm(embedded)
        hidden_states = torch.cat((hidden_states[-2,:,:], hidden_states[-1,:,:]), dim=1)
        dropout = self.dropout(lstm_out)
        linear_out = self.elu(self.linear(dropout))
        classifier_out = self.classifier(linear_out)

        return classifier_out  # Output scores for each label in the NER task

def collate_fn(data):
    labels = [torch.tensor(item['labels']) for item in data]
    input_ids = [torch.tensor(item['input_ids']) for item in data]
    # Pad sequences
    targets = pad_sequence(labels, batch_first=True, padding_value=9)
    features = pad_sequence(input_ids, batch_first=True, padding_value=0)
    return features, targets

In [22]:
vocab_size = len(word2idx)
embedding_dim = 100
lstm_hidden_dim = 256
linear_out = 128
num_labels = 10
batch_size = 64
num_epochs = 40
learning_rate = 0.01

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
glove_model = gloveBiLSTMModel(vocab_size, embedding_dim, lstm_hidden_dim, num_labels, linear_out).to(device)
train_dataloader = DataLoader(train_data, collate_fn=collate_fn, batch_size=batch_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(glove_model.parameters(), lr=learning_rate)
#optimizer = torch.optim.SGD(glove_model.parameters(), lr=learning_rate, momentum=0.9)
scheduler = StepLR(optimizer, step_size=15, gamma=0.1)

for epoch in range(num_epochs):
    total_loss = 0
    for batch_idx, (input_data, labels) in enumerate(train_dataloader):
        input_data, labels = input_data.to(device), labels.to(device)

        # Zero the gradients, forward pass, compute loss, backward pass, optimize
        optimizer.zero_grad()
        outputs = glove_model(input_data)
        predictions = outputs.view(-1, num_labels)
        labels = labels.view(-1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()

    print(f'Epoch {epoch + 1}, Batch {batch_idx + 1}, Loss: {total_loss / 100:.4f}')
    total_loss = 0

Epoch 1, Batch 220, Loss: 0.4292
Epoch 2, Batch 220, Loss: 0.1310
Epoch 3, Batch 220, Loss: 0.0914
Epoch 4, Batch 220, Loss: 0.0716
Epoch 5, Batch 220, Loss: 0.0613
Epoch 6, Batch 220, Loss: 0.0533
Epoch 7, Batch 220, Loss: 0.0486
Epoch 8, Batch 220, Loss: 0.0427
Epoch 9, Batch 220, Loss: 0.0415
Epoch 10, Batch 220, Loss: 0.0361
Epoch 11, Batch 220, Loss: 0.0364
Epoch 12, Batch 220, Loss: 0.0376
Epoch 13, Batch 220, Loss: 0.0370
Epoch 14, Batch 220, Loss: 0.0314
Epoch 15, Batch 220, Loss: 0.0288
Epoch 16, Batch 220, Loss: 0.0198
Epoch 17, Batch 220, Loss: 0.0148
Epoch 18, Batch 220, Loss: 0.0122
Epoch 19, Batch 220, Loss: 0.0110
Epoch 20, Batch 220, Loss: 0.0096
Epoch 21, Batch 220, Loss: 0.0084
Epoch 22, Batch 220, Loss: 0.0079
Epoch 23, Batch 220, Loss: 0.0071
Epoch 24, Batch 220, Loss: 0.0064
Epoch 25, Batch 220, Loss: 0.0058
Epoch 26, Batch 220, Loss: 0.0055
Epoch 27, Batch 220, Loss: 0.0051
Epoch 28, Batch 220, Loss: 0.0045
Epoch 29, Batch 220, Loss: 0.0042
Epoch 30, Batch 220, Lo

In [None]:
# model_save_name = 'glove_biLSTM.pt'
# path = F"/content/drive/My Drive/Colab Notebooks/{model_save_name}"
# torch.save(glove_model.state_dict(), path)

In [None]:
# glove_model = gloveBiLSTMModel(vocab_size, embedding_dim, lstm_hidden_dim, num_labels, linear_out).to(device)
# model_save_name = 'glove_biLSTM.pt'
# path = F"/content/drive/My Drive/Colab Notebooks/{model_save_name}"
# glove_model.load_state_dict(torch.load(path))

### What is the precision, recall, and F1 score on the validation data?

In [23]:
idx2tag = {0:'O', 1:'B-PER', 2:'I-PER', 3:'B-ORG', 4:'I-ORG', 5:'B-LOC', 6:'I-LOC', 7:'B-MISC', 8:'I-MISC', 9:'O'}
val_predicts = []
val_labels = []
glove_model.eval()
validation_dataloader = DataLoader(validation_data, collate_fn=collate_fn, batch_size=1)
for batch_idx, (input_data, labels) in enumerate(validation_dataloader):
    input_data, labels = input_data.to(device), labels.to(device)
    predictions = glove_model(input_data).view(-1, num_labels)
    labels = labels.view(-1)
    _, predicted = torch.max(predictions, 1)
    val_predicts.append([idx2tag[label] for label in predicted.tolist()])
    val_labels.append([idx2tag[label] for label in labels.tolist()])

precision, recall, f1 = evaluate(itertools.chain(*val_labels), itertools.chain(*val_predicts))


processed 51362 tokens with 5942 phrases; found: 5818 phrases; correct: 4730.
accuracy:  81.99%; (non-O)
accuracy:  96.12%; precision:  81.30%; recall:  79.60%; FB1:  80.44
              LOC: precision:  90.12%; recall:  83.94%; FB1:  86.92  1711
             MISC: precision:  76.04%; recall:  75.05%; FB1:  75.55  910
              ORG: precision:  70.55%; recall:  73.97%; FB1:  72.22  1406
              PER: precision:  83.98%; recall:  81.65%; FB1:  82.80  1791


### What are the precision, recall, and F1 score on the test data?

In [19]:
test_predicts = []
test_labels = []
glove_model.eval()
test_dataloader = DataLoader(test_data, collate_fn=collate_fn, batch_size=1)
for batch_idx, (input_data, labels) in enumerate(test_dataloader):
    input_data, labels = input_data.to(device), labels.to(device)
    predictions = glove_model(input_data).view(-1, num_labels)
    labels = labels.view(-1)
    _, predicted = torch.max(predictions, 1)
    test_predicts.append([idx2tag[label] for label in predicted.tolist()])
    test_labels.append([idx2tag[label] for label in labels.tolist()])

precision, recall, f1 = evaluate(itertools.chain(*test_labels), itertools.chain(*test_predicts))

processed 46435 tokens with 5648 phrases; found: 5099 phrases; correct: 3997.
accuracy:  75.14%; (non-O)
accuracy:  94.52%; precision:  78.39%; recall:  70.77%; FB1:  74.38
              LOC: precision:  87.53%; recall:  77.04%; FB1:  81.95  1468
             MISC: precision:  72.24%; recall:  59.69%; FB1:  65.37  580
              ORG: precision:  73.50%; recall:  66.29%; FB1:  69.71  1498
              PER: precision:  76.75%; recall:  73.72%; FB1:  75.21  1553


### BiLSTM with Glove Embeddings outperforms the model without. Can you provide a rationale for this? <br>
ANS: Glove embeddings are pre-trained on large corpus and capture semantic relationships between words. By using these pre-trained word vectors as input to BiLSTM model, the model can benefit from these rich semantic representations.
