<h1> fastText Dataset 2</h1>

In [1]:
# import libraries

import os
import sys
import numpy as np
import pandas as pd
import matplotlib as plt
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from transformers import TrainingArguments
from sklearn.metrics import f1_score
import json
import wandb
from tqdm import tqdm

# import warnings
# warnings.filterwarnings(action='ignore')

In [2]:
mps_device = torch.device("mps")
# define helper functions

def argmax(vec):

    """ 
    argmax as the name suggests finds the value 
    of the argument that gives the maximum value
    of a target function
    """

    # return max value of all elements in the input tensor
    _, idx = torch.max(vec, 1) 
    return idx.item()

def prepare_sequence(seq, to_ix):
    idxs = []
    for word in seq:
        if word not in to_ix.keys():
            idxs.append(to_ix['<UNK>'])
        else:
            idxs.append(to_ix[word])
            
    return torch.tensor(idxs, dtype=torch.long, device=mps_device)

# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]

    # view returns a new tensor w/ a different shape
    # maintaining the number of elements

    # expand simply expands a tensor to a larger size
    # say ([1], [2], [3]).expand(3,3) would give
    # ([1,1,1],
    #  [2,2,2],
    #  [3,3,3])

    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [3]:
mps_device = torch.device("mps")

class BiLSTM_CRF(nn.Module):
    #BILSTM CRF is a subclass inheriting from the (nn.Module) superclass

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, word_embeds):
        super(BiLSTM_CRF, self).__init__() #call the init method of the superclass (nn.Module)

        START_TAG = "<START>"
        STOP_TAG = "<STOP>"
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        # lookup table that stores embeddings
        # self.word_embeds = nn.Embedding(vocab_size, embedding_dim).to(mps_device)
        self.word_embeds = word_embeds

        #define the lstm
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True).to(mps_device)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size).to(mps_device)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size)).to(mps_device)

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(mps_device),
                torch.randn(2, 1, self.hidden_dim // 2).to(mps_device))

    def _forward_alg(self, feats):

        START_TAG = "<START>"
        STOP_TAG = "<STOP>"
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(mps_device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)

        return lstm_feats

    def _score_sentence(self, feats, tags):

        START_TAG = "<START>"
        STOP_TAG = "<STOP>"

        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(mps_device)
        # tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])

        tags_tensor = torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long, device=mps_device)
        tags_tensor = torch.cat([tags_tensor, tags.to(mps_device)])
        
        for i, feat in enumerate(feats):

            score = score + self.transitions[tags_tensor[i + 1], tags_tensor[i]] + feat[tags_tensor[i + 1]]

        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags_tensor[-1]]
        return score

    def _viterbi_decode(self, feats):

        START_TAG = "<START>"
        STOP_TAG = "<STOP>"
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000., device=mps_device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
            
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [4]:
mps_device = torch.device("mps")

START_TAG = "<START>"
STOP_TAG = "<STOP>"

# Load training data
training_data = []

# getting training sentence tokens
training_file_path = "/Users/mo/Desktop/repos/nlp_a2/task_2/dataset/train_bio.json"

with open(training_file_path, "r") as f:
    data = json.load(f)
    for entry in data.values():
        sentence = entry["text"].split()
        tags = entry["labels"]
        training_data.append((sentence, tags))

word_to_ix = {"<UNK>": 0}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

tag_to_ix = {
    "B": 0,
    "I": 1,
    "O": 2,
    START_TAG: 3,
    STOP_TAG: 4
}

print(word_to_ix)

#Load validation data
validation_data = []

# getting validation sentence tokens
val_file_path = "/Users/mo/Desktop/repos/nlp_a2/task_2/dataset/val_bio.json"

with open(val_file_path, "r") as f:
    data = json.load(f)
    for entry in data.values():
        sentence = entry["text"].split()
        tags = entry["labels"]
        validation_data.append((sentence, tags))


#sanity check
train_check = True
for sentence, tags in training_data:
    if (len(sentence) != len(tags)):
        train_check = False
        break

if (train_check):
    print("good 2 go")
else:
    print("train prob")

val_check = True
for sentence, tags in validation_data:
    if (len(sentence) != len(tags)):
        val_check = False
        break

if (val_check):
    print("good 2 go")
else:
    print("val prob")

good 2 go
good 2 go


In [5]:
from gensim.models import KeyedVectors

EMBEDDING_DIM = 100
HIDDEN_DIM = 4

# Load FastText embeddings from .vec file
fasttext_model = KeyedVectors.load_word2vec_format('/Users/mo/Downloads/wiki-news-300d-1M-subword.vec')

# Init torch embedding layer
word_embeds = nn.Embedding(len(word_to_ix), EMBEDDING_DIM).to(mps_device)

# Load FastText embeddings of size EMBEDDING_DIM into torch embedding layer
for word, idx in word_to_ix.items():
    if word in fasttext_model:
        word_embeds.weight.data[idx] = torch.tensor(fasttext_model[word][:EMBEDDING_DIM], device=mps_device)


In [6]:
# load the model

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, word_embeds).to(mps_device)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)

In [7]:
#check predictions before training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix).to(mps_device)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long).to(mps_device)
    print(model(precheck_sent))

(tensor(12.8562, device='mps:0'), [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


In [8]:
def run_epochs(model, optimizer, word_to_ix, tag_to_ix, train_data, val_data, run_name):

    # train_loader, val_loader = get_data(tokenizer)

    # Initialize W&B
    wandb.login(key="7ef2e84866a68a6cd33c90b1fa55c8cf8ab2d6e7", relogin=True)
    wandb.init(project="nlp_a2", name="BiLSTM_CRF_fasttext_t2")
    wandb.watch(model)

    # Train and evaluate the model
    num_epochs = 10
    best_val_f1 = 0
    best_val_loss = 100

    for epoch in tqdm(range(num_epochs), desc="Epochs"):

        model.train()
        train_loss = 0
        train_f1 = 0

        for sentence, tags in train_data:

            model.zero_grad()

            sentence_in = prepare_sequence(sentence, word_to_ix).to(mps_device)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long).to(mps_device)

            loss = model.neg_log_likelihood(sentence_in, targets)

            loss.backward()
            optimizer.step()

            train_loss += loss

            # Calculate F1-score
            predicted = model(sentence_in)

            targets = targets.cpu().numpy()

            train_f1 += f1_score(targets, predicted[1], average="macro")

        model.eval()
        val_loss = 0
        val_f1 = 0
        with torch.no_grad():

            for sentence, tags in val_data:
                    
                    sentence_in = prepare_sequence(sentence, word_to_ix).to(mps_device)
                    targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long).to(mps_device)
    
                    loss = model.neg_log_likelihood(sentence_in, targets)
    
                    val_loss += loss
    
                    # Calculate F1-score
                    predicted = model(sentence_in)
                    targets = targets.cpu().numpy()
                    val_f1 += f1_score(targets, predicted[1], average="macro")


        # Log metrics to W&B
        train_loss /= len(train_data)
        val_loss /= len(val_data)
        train_f1 /= len(train_data)
        val_f1 /= len(val_data)

        wandb.log(
            {
                "epoch": epoch + 1,
                "train_loss": train_loss,
                "val_loss": val_loss,
                "train_f1": train_f1,
                "val_f1": val_f1,
            }
        )

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        # Check if training should be stopped
        if epochs_without_improvement >= 3:
            print(f"Stopping early at epoch {epoch+1} due to no improvement.")
            break

        print(
            f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}, Train F1: {train_f1}, Val F1: {val_f1}"
        )

    wandb.finish()
    torch.save(model.state_dict(), run_name + ".pt")


In [9]:
run_epochs(model, optimizer, word_to_ix, tag_to_ix, training_data, validation_data, "BiLSTM_CRF_fasttext_t2")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/mo/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkaif21067[0m ([33mbigmeow[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epochs:  10%|█         | 1/10 [04:27<40:11, 267.93s/it]

Epoch 1/10, Train Loss: tensor([6.3614], device='mps:0', grad_fn=<DivBackward0>), Val Loss: tensor([4.7120], device='mps:0'), Train F1: 0.6227587885717877, Val F1: 0.6498382833790032


Epochs:  20%|██        | 2/10 [08:33<33:57, 254.67s/it]

Epoch 2/10, Train Loss: tensor([3.7270], device='mps:0', grad_fn=<DivBackward0>), Val Loss: tensor([4.8503], device='mps:0'), Train F1: 0.7082883130830572, Val F1: 0.6632981456393727


Epochs:  30%|███       | 3/10 [12:43<29:29, 252.82s/it]

Epoch 3/10, Train Loss: tensor([3.1330], device='mps:0', grad_fn=<DivBackward0>), Val Loss: tensor([5.2668], device='mps:0'), Train F1: 0.7651903102347027, Val F1: 0.6786104702216371


Epochs:  30%|███       | 3/10 [16:18<38:03, 326.28s/it]

Stopping early at epoch 4 due to no improvement.





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▃▆█
train_f1,▁▄▆█
train_loss,█▃▂▁
val_f1,▁▄▇█
val_loss,▁▂▇█

0,1
epoch,4.0
train_f1,0.81638
train_loss,2.70271
val_f1,0.68216
val_loss,5.41003


In [10]:
# load test data

test_data = []

# getting test sentence tokens
test_file_path = "/Users/mo/Desktop/repos/nlp_a2/task_2/dataset/test_bio.json"

with open(test_file_path, "r") as f:
    data = json.load(f)
    for entry in data.values():
        sentence = entry["text"].split()
        tags = entry["labels"]
        test_data.append((sentence, tags))

# load the model
model.load_state_dict(torch.load("BiLSTM_CRF_fasttext_t2.pt"))

# calculate f1 score

test_f1 = 0
with torch.no_grad():
    for sentence, tags in test_data:
        sentence_in = prepare_sequence(sentence, word_to_ix).to(mps_device)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long).to(mps_device)

        predicted = model(sentence_in)
        targets = targets.cpu().numpy()

        test_f1 += f1_score(targets, predicted[1], average="macro")

test_f1 /= len(test_data)
print(f"Test F1: {test_f1}")

Test F1: 0.65668631505872
