In [None]:
# class BILSTMForNER(nn.Module):
#     def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, num_layers=1, dropout=0.33):
#         super(BILSTMForNER, self).__init__()
#         self.embedding = nn.Embedding(input_dim, embedding_dim)
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True, dropout=dropout if num_layers > 1 else 0)
#         self.fc = nn.Linear(hidden_dim * 2, 128)  # Linear output dim is 128
#         self.classifier = nn.Linear(128, output_dim)  # Adjusted to match the number of unique NER tags
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, x):
#         embedded = self.dropout(self.embedding(x))  # Apply dropout to embeddings
#         lstm_output, _ = self.lstm(embedded)
#         out = self.fc(lstm_output)
#         out = F.elu(out)  
#         out = self.classifier(out)
#         return out

# def task1_collate(data: list[dict], use_targets: bool) -> dict[str, torch.Tensor]:
#     word_encodings = [torch.Tensor(x['word_encodings']) for x in data]
#     word_encodings = pad_sequence(word_encodings, batch_first=True)

#     lengths = torch.IntTensor([len(x['word_encodings']) for x in data])

#     # capital_mask = [torch.Tensor(x['capital_mask']) for x in data]
#     # capital_mask = pad_sequence(capital_mask, batch_first = True)

#     item: dict[str, torch.Tensor] = {
#         'word_encodings': word_encodings,
#         'lengths': lengths,
#     }

#     if use_targets: 
#         tag_encodings = [torch.Tensor(x['tag_encodings']) for x in data] # type: ignore
#         tag_encodings = pad_sequence(tag_encodings, batch_first=True)
#         item |= {'tag_encodings': tag_encodings}
    
#     return item
    

In [1]:
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
from typing import Tuple, List 
import numpy as np
import torch.optim as optim
from tqdm import tqdm
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.dataloader import default_collate
from sklearn.metrics import precision_score, recall_score, f1_score

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [2]:
def compute_metrics(preds, labels):
    """
    Computes the precision, recall, and F1 score from predictions and labels.
    
    Args:
    - preds (list of int): The list of predictions.
    - labels (list of int): The list of true labels.
    
    Returns:
    - precision (float): The precision of the predictions.
    - recall (float): The recall of the predictions.
    - f1 (float): The F1 score of the predictions.
    """
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    return precision, recall, f1


def build_vocab(data_files):
    word_vocab = {'<PAD>': 0, '<UNK>': 1}
    tag_vocab = {}
    word_idx, tag_idx = 2, 0  

    for file_path in data_files:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if line:
                    _, word, tag = line.split()
                    if word not in word_vocab:
                        word_vocab[word] = word_idx
                        word_idx += 1
                    if tag not in tag_vocab:
                        tag_vocab[tag] = tag_idx
                        tag_idx += 1
    return word_vocab, tag_vocab


class BILSTMForNER(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, num_layers=1, dropout=0.33):
        super(BILSTMForNER, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, 
                            batch_first=True, bidirectional=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, 128)  
        self.classifier = nn.Linear(128, output_dim) 
        self.elu = nn.ELU() 

    def forward(self, x):
        # print(x.shape)
        x = self.embedding(x) #batch, sequence, outdim 
        # print('post lstm: ', x.shape)
        lstm_out, _ = self.lstm(x)
        out = self.elu(self.fc(lstm_out))  
        logits = self.classifier(out)  
        return logits

# batc , seq, embeddim

# class IndexedNERDataset(Dataset):
#     def __init__(self, file_path, word_vocab, tag_vocab):
#         self.word_vocab = word_vocab
#         self.tag_vocab = tag_vocab
#         self.data = []
#         self._load_data(file_path)
        
#     def _load_data(self, file_path):
#         with open(file_path, 'r', encoding='utf-8') as f:
#             sentence, tags = [], []
#             for line in f:
#                 line = line.strip()
#                 if line:
#                     _, word, tag = line.split()
#                     sentence.append(self.word_vocab.get(word, self.word_vocab['<UNK>']))
#                     tags.append(self.tag_vocab[tag])
#                 else:
#                     self.data.append((sentence, tags))
#                     sentence, tags = [], []
#             if sentence and tags:
#                 self.data.append((sentence, tags))
                
#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         sentence, tags = self.data[idx]
#         return torch.tensor(sentence, dtype=torch.long), torch.tensor(tags, dtype=torch.long)

class IndexedNERDataset(Dataset):
    def __init__(self, file_path, word_vocab, tag_vocab=None, use_tags=True):
        self.word_vocab = word_vocab
        self.tag_vocab = tag_vocab if use_tags else None
        self.use_tags = use_tags
        self.data = []
        self._load_data(file_path)
        
    def _load_data(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            sentence = []
            for line in f:
                line = line.strip()
                if line:
                    if self.use_tags:
                        _, word, tag = line.split()
                        tag_idx = self.tag_vocab.get(tag, -1)  # or some other default index
                    else:
                        word = line
                        tag_idx = -1  # Placeholder for test data without tags
                    sentence.append((self.word_vocab.get(word, self.word_vocab['<UNK>']), tag_idx))
                else:
                    self.data.append(sentence)
                    sentence = []
            if sentence:  # Handle the case where the file doesn't end with a newline
                self.data.append(sentence)
                
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence, tags = zip(*self.data[idx])
        return torch.tensor(sentence, dtype=torch.long), torch.tensor(tags, dtype=torch.long)


def pad_collate(batch):
    sentences, tags = zip(*batch)
    sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=word_vocab['<PAD>'])
    tags_padded = pad_sequence(tags, batch_first=True, padding_value=-1)  # Use -1 or another unique index for padding in tags
    return sentences_padded, tags_padded


def evaluate_model(model, dataloader, criterion, device):
    """
    Evaluates the model on the given dataloader.
    """
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for inputs, labels in dataloader:
            # (batch sz, seq len) 
            # print('Val Input: ', inputs.shape)
            # print('Val label: ', labels.shape)
            # print('Val Input: ', inputs)
            # print('Val label: ', labels)
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # print(torch.max(outputs, dim=2))
            loss = criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
            # print(outputs.view(-1, outputs.shape[-1]).shape,  labels.view(-1).shape)
            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=2)
            all_preds.extend(preds.view(-1).cpu().numpy())
            all_labels.extend(labels.view(-1).cpu().numpy())

    valid_indices = [i for i, label in enumerate(all_labels) if label != -1]
    valid_preds = [all_preds[i] for i in valid_indices]
    valid_labels = [all_labels[i] for i in valid_indices]

    accuracy = np.mean(np.array(valid_preds) == np.array(valid_labels))
    precision, recall, f1 = compute_metrics(valid_preds, valid_labels)

    return total_loss / len(dataloader), accuracy, precision, recall, f1


def train_model(model, dataloaders, optimizer, criterion, device, num_epochs=50, patience=10):
    best_val_f1 = -float('inf')
    patience_counter = 0
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in dataloaders['train']:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)

            a_max, a_indx = torch.max(outputs, dim=2)
            # print(torch.max(outputs, dim=2))
            # print('predition shape ',outputs.shape, 'label shape', labels.shape)
            # print('predition',outputs, 'label', labels)
            loss = criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
            loss.backward()
            optimizer.step()

            # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        if epoch % 2 == 0: 
            print('indxs predicted: ', a_indx)
            
        train_loss, train_acc, train_precision, train_recall, train_f1 = evaluate_model(model, dataloaders['train'], criterion, device)
        val_loss, val_acc, val_precision, val_recall, val_f1 = evaluate_model(model, dataloaders['val'], criterion, device)

        # print(f'Epoch {epoch+1}:')
        print(f'Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}')
        print(f'Val - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}')

        # Early stopping based on validation F1 score
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data_files = ['../../data/lstm-data/train', '../../data/lstm-data/dev']
word_vocab, tag_vocab = build_vocab(data_files)

input_dim = len(word_vocab)  
output_dim = len(tag_vocab)

print('input dim: ', input_dim, 'output dim: ', output_dim)

model = BILSTMForNER(input_dim=input_dim, embedding_dim=100, hidden_dim=256, dropout=0.33, output_dim=output_dim)
criterion = nn.CrossEntropyLoss(ignore_index=tag_vocab.get('<PAD>', -1))  
# optimizer = optim.SGD(model.parameters(), lr = 0.00005)
optimizer = optim.SGD(model.parameters(), lr = 0.1)

train_dataset = IndexedNERDataset('../../data/lstm-data/train', word_vocab, tag_vocab)
dev_dataset = IndexedNERDataset('../../data/lstm-data/dev', word_vocab, tag_vocab)
# test_dataset = IndexedNERDataset('../../data/lstm-data/test', word_vocab, tag_vocab)

dataloaders = {
    'train': DataLoader(train_dataset, batch_size = 16, shuffle=True, collate_fn=pad_collate),
    'val': DataLoader(dev_dataset, batch_size = 16, shuffle=False, collate_fn=pad_collate),
    # 'test': DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=pad_collate)
}

train_model(model, dataloaders, optimizer, criterion, device, num_epochs = 10)

input dim:  26886 output dim:  9




indxs predicted:  tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

Task 2: Using GloVe word embeddings
Task is to use the GloVe word embeddings to improve the BLSTM
in Task 1. The way we use the GloVe word embeddings is straight forward:
we initialize the embeddings in our neural network with the corresponding
vectors in GloVe. Note that GloVe is case-insensitive, but our NER model
should be case-sensitive because capitalization is an important information
for NER. You are asked to find a way to deal with this conflict. What are
the precision, recall and F1 score on the dev data? (hint: the reasonable F1
score on dev is 88%.

In [4]:
def load_glove_embeddings(path, word_vocab, embedding_dim):
    embedding_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]  
            vector = np.asarray(values[1:], "float32")
            embedding_dict[word] = vector
    
    vocab_size = len(word_vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    
    for word, idx in word_vocab.items():
        embedding_vector = embedding_dict.get(word, embedding_dict.get(word.lower()))
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
        else:
            embedding_matrix[idx] = np.random.randn(embedding_dim) 
    
    return torch.tensor(embedding_matrix, dtype=torch.float)

class IndexedNERDataset(Dataset):
    def __init__(self, file_path, word_vocab, tag_vocab=None, use_tags=True):
        self.word_vocab = word_vocab
        self.tag_vocab = tag_vocab
        self.use_tags = use_tags  
        self.data = []
        self._load_data(file_path)
        
    def _load_data(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            sentence, tags = [], []
            for line in f:
                line = line.strip()
                if line:
                    parts = line.split()
                    word = parts[1] if self.use_tags else parts[0]
                    sentence.append(self.word_vocab.get(word, self.word_vocab['<UNK>']))
                    if self.use_tags:
                        tag = parts[2]
                        tags.append(self.tag_vocab[tag])
                else:
                    if self.use_tags:
                        self.data.append((sentence, tags))
                    else:
                        self.data.append((sentence,))  # Only add the sentence for test data
                    sentence, tags = [], []
            if sentence and (tags if self.use_tags else True):
                if self.use_tags:
                    self.data.append((sentence, tags))
                else:
                    self.data.append((sentence,))  # Only add the sentence for test data
                
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.use_tags:
            sentence, tags = self.data[idx]
            return torch.tensor(sentence, dtype=torch.long), torch.tensor(tags, dtype=torch.long)
        else:
            sentence = self.data[idx][0]
            return torch.tensor(sentence, dtype=torch.long), None  # No tags for test data


class BILSTMGlove(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, glove_embeddings = None, num_layers = 1, dropout = 0.33):
        super(BILSTMGlove, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        if glove_embeddings is not None:
            self.embedding.weight = nn.Parameter(glove_embeddings)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first = True, bidirectional = True, dropout = dropout)
        self.fc = nn.Linear(hidden_dim * 2, 128)
        self.classifier = nn.Linear(128, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.elu = nn.ELU()

    def forward(self, x):
        x = self.dropout(self.embedding(x))
        lstm_out, _ = self.lstm(x)
        out = self.elu(self.fc(lstm_out))  
        logits = self.classifier(out)  
        return logits

In [5]:
glove_path = '../../data/lstm-data/glove.6B.100d/glove.6B.100d.txt' 
glove_embeddings = load_glove_embeddings(glove_path, word_vocab, embedding_dim = 100)

In [7]:
glove_model = BILSTMGlove(input_dim = input_dim, embedding_dim=100, hidden_dim = 256, glove_embeddings = glove_embeddings, dropout=0.33, output_dim = output_dim)
criterion = nn.CrossEntropyLoss(ignore_index=tag_vocab.get('<PAD>', -1))  
optimizer = optim.Adam(glove_model.parameters(), lr = 0.001)

train_model(glove_model, dataloaders, optimizer, criterion, device, num_epochs = 5)



indxs predicted:  tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [3, 1, 1, 1, 1, 1, 5, 8, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 3, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 5, 1, 3, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 2, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 2, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

## EVAL

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

def custom_eval(model, file_paths, word_vocab, output_paths, device, idx_to_tag):
    model.eval()
    model.to(device)
    
    # Loop over each file path and corresponding output path
    for file_path, output_path in zip(file_paths, output_paths):
        with open(file_path, 'r', encoding='utf-8') as f, open(output_path, 'w', encoding='utf-8') as out_f:
            sentences = []
            current_sentence = []
            for line in f:
                if line.strip():  # Non-empty line
                    parts = line.strip().split()
                    word = parts[-1]  # Get the last element, whether it's word or tag
                    current_sentence.append(word_vocab.get(word, word_vocab['<UNK>']))
                elif current_sentence:  # Empty line and current sentence is not empty
                    sentences.append(current_sentence)
                    current_sentence = []

            # Add the last sentence if the file doesn't end with a newline
            if current_sentence:
                sentences.append(current_sentence)

            # Predict and write to file
            for sentence in sentences:
                sentence_tensor = torch.tensor([sentence], dtype=torch.long, device=device)
                outputs = model(sentence_tensor)
                _, preds = torch.max(outputs, dim=2)
                pred_tags = [idx_to_tag[pred.item()] for pred in preds[0]]  # Convert indices to tags

                # Write predictions to the output file
                for i, word_idx in enumerate(sentence):
                    word = list(word_vocab.keys())[list(word_vocab.values()).index(word_idx)]  # Inverse lookup
                    tag = pred_tags[i]
                    out_f.write(f"{i+1}\t{word}\t{tag}\n")
                out_f.write("\n")  # New line after each sentence

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load('best_model.pth')
model.load_state_dict(checkpoint)

idx_to_tag = {idx: tag for tag, idx in tag_vocab.items()}  # Create reverse mapping
file_paths = ['../../data/lstm-data/test']#, '../../data/lstm-data/dev']
output_paths = ['./test_preds']#,'./dev_preds']
custom_eval(model, file_paths, word_vocab, output_paths, device, idx_to_tag)


Bonus: LSTM-CNN model 
The bonus task is to equip the BLSTM model in Task 2 with a CNN module
to capture character-level information (see slides page 45 in lecture 12 for the
network architecture). The character embedding dimension is set to 30. You
need to tune other hyper-parameters of CNN module, such as the number of
CNN layers, the kernel size and output dimension of each CNN layer. What
are the precision, recall and F1 score on the dev data? Predicting the NER
tags of the sentences in the test data and output the predictions in a file
named pred, in the same format of training data. (hint: the bonus points are
assigned based on the ranking of your model F1 score on the test data).