# Install, Imports & `Init`

In [1]:
!pip install transformers datasets evaluate accelerate seqeval sklearn-crfsuite mlflow ipywidgets tqdm -qq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.1/519.1 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m57.5 MB/s[0m 

In [72]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from collections import Counter


from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import pipeline
import evaluate
from datasets import Dataset

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

import sklearn_crfsuite

import random
import os
import mlflow

# MLflow settings are read from the environment by the HF/MLflow integration.
os.environ.update({
    "MLFLOW_EXPERIMENT_NAME": "NER Task",
    "MLFLOW_FLATTEN_PARAMS": "1",
})

# seqeval metric (entity-level precision / recall / F1) and compute device.
seqeval = evaluate.load("seqeval")
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed every random number generator used in this notebook for reproducibility.
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# BIO tag inventory; the position of a tag in this list is its integer id.
label_list = [
    'O',
    'B-corporation', 'I-corporation',
    'B-creative-work', 'I-creative-work',
    'B-group', 'I-group',
    'B-location', 'I-location',
    'B-person', 'I-person',
    'B-product', 'I-product',
]

label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = dict(enumerate(label_list))


# Hyper-parameters for the transformer fine-tuning run.
NUM_LABELS = len(label_list)
MAX_LEN = 196
BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 2e-05

MODEL_NAME = "distilbert-base-uncased"

# Helpers

In [74]:
def load_split_data(file, SEED, return_splits = True):
    """
    Load a tab-separated ``word<TAB>tag`` file and return it in List of List format.

    Each non-blank line holds one token and its entity tag; a blank line ends
    the current sentence.  Lines that do not split into exactly two
    tab-separated fields are reported with ``print`` and skipped (best effort).

    Args:
        file: Path of the text file to read (decoded as UTF-8).
        SEED: ``random_state`` forwarded to ``train_test_split``.
        return_splits: When True, return an 80/20 train/validation split.

    Returns:
        ``(train_tokens, val_tokens, train_ent, val_ent)`` when
        ``return_splits`` is True, otherwise ``(tweets_list, entities_list)``.
    """
    tweets_list = []
    entities_list = []

    temp_words = []
    temp_ent = []

    # Explicit encoding: the default depends on the platform locale.
    with open(file, "r", encoding="utf-8") as f:
        for index, line in enumerate(f):
            lis = line.strip().split("\t")

            if lis == [""]:  # blank line -> sentence boundary
                tweets_list.append(temp_words)
                entities_list.append(temp_ent)
                temp_words = []
                temp_ent = []
                continue

            try:
                word, entity = lis
            except ValueError as e:  # not exactly two fields
                print(e, index, lis)
                continue

            word = word.strip()
            if not word:
                continue

            temp_words.append(word)
            temp_ent.append(entity.strip())

    # Flush the last sentence: without this, a file that does not end with a
    # blank line silently loses its final sentence.
    if temp_words:
        tweets_list.append(temp_words)
        entities_list.append(temp_ent)

    if return_splits:
        return train_test_split(tweets_list, entities_list, test_size=0.2, random_state = SEED)
    return tweets_list, entities_list


def convert_label_to_int(entities_list, label_map=None):
    """
    Convert ["O", "I-PER"...] to their respective ids

    Args:
        entities_list: List of sentences, each a list of tag strings.
        label_map: Optional tag -> id mapping.  Defaults to the module-level
            ``label2id`` so existing callers are unaffected.

    Returns:
        A list of lists of integer ids with the same nesting as the input.

    Raises:
        KeyError: If a tag is missing from the mapping.
    """
    mapping = label2id if label_map is None else label_map
    # `sentence_labels` (not `label_list`) avoids shadowing the global tag list.
    return [[mapping[label] for label in sentence_labels] for sentence_labels in entities_list]


def aligned_tokenization_for_NER(input_data, label_all_tokens = False):
    """
    Tokenize pre-split words and align the word-level tags to the sub-tokens.

    Reads ``input_data["tokens"]`` and ``input_data["ner_tags"]`` and uses the
    module-level ``tokenizer``.  Special tokens (no source word) get -100 so
    the loss ignores them.  The first sub-token of a word gets the word's tag.
    Later sub-tokens get the same tag when ``label_all_tokens`` is true and
    -100 otherwise.

    Returns the tokenizer output with an added ``"labels"`` entry.
    """
    tokenized_inputs = tokenizer(input_data["tokens"], truncation = True, is_split_into_words = True, )

    aligned = []
    for batch_idx, word_labels in enumerate(input_data["ner_tags"]):
        row = []
        last_word = None
        # word_ids maps every sub-token back to the index of its source word.
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word_idx is None:
                tag = -100
            elif word_idx != last_word or label_all_tokens:
                tag = word_labels[word_idx]
            else:
                tag = -100
            row.append(tag)
            last_word = word_idx
        aligned.append(row)

    tokenized_inputs["labels"] = aligned
    return tokenized_inputs


def compute_metrics(p):
    """
    Turn a ``(predictions, labels)`` pair into a flat dict of seqeval scores.

    Predictions are arg-maxed over axis 2; positions whose gold label is -100
    are dropped before ids are mapped to tag strings through ``label_list``.
    The result holds the four ``overall_*`` scores plus one ``<key>_f1`` entry
    for every other key reported by seqeval.
    """
    logits, gold = p
    pred_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold):
        kept = [(pi, gi) for pi, gi in zip(pred_row, gold_row) if gi != -100]
        true_predictions.append([label_list[pi] for pi, _ in kept])
        true_labels.append([label_list[gi] for _, gi in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    flattened_results = {
        key: results[key]
        for key in ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    }

    # piece taken from https://www.freecodecamp.org/news/getting-started-with-ner-models-using-huggingface/
    for name, scores in results.items():
        if name in flattened_results:
            continue
        flattened_results[name + "_f1"] = scores["f1"]

    return flattened_results

# Data Pre-Processing, Sanity checking

In [75]:
# Load the corpus and split it into train / validation parts.
train_tokens, val_tokens, train_ent, val_ent = load_split_data("./train.txt", SEED)

# Integer-encoded copies of the tag sequences.
train_labels = convert_label_to_int(train_ent)
val_labels = convert_label_to_int(val_ent)

# Sanity Checking
# The first check previously lacked the `assert` keyword and was a no-op tuple.
assert len(train_tokens) == len(train_labels) == len(train_ent), "Sanity Check failed"
for sent_tokens, sent_labels, sent_ents in zip(train_tokens, train_labels, train_ent):
    assert len(sent_tokens) == len(sent_labels) == len(sent_ents), "Sanity Check Failed"


## [`sklearn` + `CRF`](https://github.com/TeamHG-Memex/sklearn-crfsuite) (Not maintained. Won't run)

In [10]:
# def run_CRF(train_tokens, val_tokens, train_ent, val_ent, model):
#   train_sentences = [list(zip(train_tokens[i],train_ent[i])) for i in range(len(train_tokens))]
#   val_sentences = [list(zip(val_tokens[i],val_ent[i])) for i in range(len(val_tokens))]


#   X_train = [sent2features(s) for s in train_sentences]
#   y_train = [sent2labels(s) for s in train_sentences]

#   X_val = [sent2features(s) for s in train_sentences]
#   y_val = [sent2labels(s) for s in train_sentences]

#   model.fit(X_train, y_train)
#   preds = model.predict(X_val)

#   classes = np.unique(y_train)
#   classes = classes.tolist()
#   new_classes = classes.copy()
#   new_classes.pop() # remove "O" for better visibility

#   print(classification_report(y_pred=preds, y_true=y_val, labels=new_classes), "\n","-"*50)
#   return preds


# def word2features(sent, i):
#     word = sent[i][0]

#     features = {
#         'bias': 1.0,
#         'word.lower()': word.lower(),
#         'word[-3:]': word[-3:],
#         'word[-2:]': word[-2:],
#         'word.isupper()': word.isupper(),
#         'word.istitle()': word.istitle(),
#         'word.isdigit()': word.isdigit(),
#     }
#     if i > 0:
#         word1 = sent[i-1][0]
#         postag1 = sent[i-1][1]
#         features.update({
#             '-1:word.lower()': word1.lower(),
#             '-1:word.istitle()': word1.istitle(),
#             '-1:word.isupper()': word1.isupper(),
#         })
#     else:
#         features['BOS'] = True
#     if i < len(sent)-1:
#         word1 = sent[i+1][0]
#         postag1 = sent[i+1][1]
#         features.update({
#             '+1:word.lower()': word1.lower(),
#             '+1:word.istitle()': word1.istitle(),
#             '+1:word.isupper()': word1.isupper(),
#         })
#     else:
#         features['EOS'] = True
#     return features


# def sent2features(sent):
#     return [word2features(sent, i) for i in range(len(sent))]

# def sent2tokens(sent):
#     return [item[0] for item in sent]

# def sent2labels(sent):
#     return [item[1] for item in sent]

In [11]:
# model = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=20, all_possible_transitions = False)

# preds = run_CRF(train_tokens, val_tokens, train_ent, val_ent, model)

  ar = np.asanyarray(ar)


ValueError: ignored

# Initial training attempt using [Bi-LSTM + CRF + Viterbi Decoding](https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html)

Training the first Deep Learning model from scratch to check if there are any gains using [Bi-LSTM + CRF + Viterbi Decoding](https://jovian.com/abdulmajee/bilstm-crf)

**TOO SLOW TO USE**


## Helpers

In [66]:
def argmax(vec):
    """Return, as a python int, the index along dim 1 of the largest entry of ``vec``.

    ``.item()`` requires exactly one index, so ``vec`` must have a single row.
    """
    best = vec.max(dim=1)
    return best.indices.item()


def prepare_sequence(seq, to_ix, unk_ix=None):
    """Map a sequence of items to a 1-D ``torch.long`` tensor of indices.

    Args:
        seq: Iterable of hashable items (e.g. the words of one sentence).
        to_ix: Mapping from item to integer index.
        unk_ix: Optional index used for items missing from ``to_ix``.  When
            left as ``None`` (the default) a missing item raises ``KeyError``,
            exactly as before.

    Returns:
        ``torch.Tensor`` of dtype ``torch.long`` with one index per item.

    Raises:
        KeyError: If an item is not in ``to_ix`` and ``unk_ix`` is ``None``.
    """
    if unk_ix is None:
        idxs = [to_ix[w] for w in seq]
    else:
        # Out-of-vocabulary items fall back to the caller-chosen index.
        idxs = [to_ix.get(w, unk_ix) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    """Return ``log(sum(exp(vec)))`` of a single-row 2-D tensor as a 0-dim tensor.

    Uses ``torch.logsumexp``, which applies the max-subtraction trick
    internally.  Unlike the previous hand-rolled version it does not go
    through ``argmax(...).item()``, so no device-to-host synchronisation
    happens on each call.

    Args:
        vec: Tensor of shape ``(1, n)``.

    Returns:
        0-dim tensor holding the log-sum-exp of the row.
    """
    # Reducing dim 1 gives shape (1,); squeeze keeps the original 0-dim result.
    return torch.logsumexp(vec, dim=1).squeeze(0)


class BiLSTM_CRF(nn.Module):
    """Bidirectional LSTM encoder followed by a CRF layer for sequence tagging.

    The model handles one sentence per call (the embeddings are viewed with a
    batch dimension of 1).  It reads the module-level names ``START_TAG`` and
    ``STOP_TAG``, which must exist and be keys of ``tag_to_ix`` before the
    class is instantiated.  It also relies on the module-level helpers
    ``argmax`` and ``log_sum_exp``.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, device):
        """Build the embedding, BiLSTM, projection and transition parameters.

        Args:
            vocab_size: Number of rows in the word-embedding table.
            tag_to_ix: Mapping tag -> index; must include START_TAG and STOP_TAG.
            embedding_dim: Size of each word embedding.
            hidden_dim: Size of the concatenated BiLSTM output (each direction
                gets ``hidden_dim // 2`` units).
            device: Device on which helper tensors are created.
        """
        super(BiLSTM_CRF, self).__init__()
        self.device = device
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a pair of random tensors of shape (2, 1, hidden_dim // 2) on ``self.device``.

        The pair is passed to ``self.lstm`` as its initial state.
        """
        a = torch.randn(2, 1, self.hidden_dim // 2).to(self.device)
        b = torch.randn(2, 1, self.hidden_dim // 2).to(self.device)
        return (a,b)

    def _forward_alg(self, feats):
        """Run the forward algorithm over ``feats`` and return the log partition score.

        Args:
            feats: Per-token emission scores; iterated row by row, each row
                indexed by tag id.
        """
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(self.device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Running forward variables, updated once per token below
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        # Add the transition into STOP_TAG before the final reduction
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """Embed ``sentence``, run the BiLSTM and project to tag space.

        Args:
            sentence: Tensor of word indices for one sentence.

        Returns:
            Emission scores of shape (len(sentence), tagset_size).
        """
        # A fresh random initial state is drawn for every sentence
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the score (shape (1,)) of the given tag sequence under ``feats``.

        Args:
            feats: Per-token emission scores, one row per token.
            tags: Tensor of gold tag ids, one per token (START_TAG is
                prepended here, so the caller must not include it).
        """
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(self.device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(self.device), tags])
        for i, feat in enumerate(feats):
            # transition (tags[i] -> tags[i + 1]) plus emission of tags[i + 1]
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag path for ``feats``.

        Returns:
            ``(path_score, best_path)`` where ``best_path`` is a list of tag
            ids, one per token, with the START_TAG entry removed.
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(self.device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Return the training loss: forward (partition) score minus the gold-path score."""
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Decode ``sentence`` and return ``(score, tag_seq)`` from Viterbi decoding."""
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence).to(self.device)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

## Data Formatting

In [71]:
# Sentinel tags marking the start / end of a sentence in the CRF.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 128
HIDDEN_DIM = 64
BATCH = 64  # NOTE(review): not referenced in the visible part of the notebook


# Pair every sentence with its tag sequence.
training_data = list(zip(train_tokens, train_ent)) # in the format [([word1, word2],[tag1,tag2]),([word1,word2,word3],[tag1,tag2,tag3])]
validation_data = list(zip(val_tokens, val_ent))

# Vocabulary built from the training sentences only; ids follow first-seen order.
word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

# Tag ids follow label_list order; START/STOP take the next free ids so they
# stay correct if label_list changes (previously hard-coded as 13 and 14).
tag_to_ix = {label: idx for idx, label in enumerate(label_list)}
tag_to_ix[START_TAG] = len(tag_to_ix)
tag_to_ix[STOP_TAG] = len(tag_to_ix)


class CustomDataset(torch.utils.data.Dataset):
    """Dataset of (word-index tensor, tag-index tensor) pairs, one per sentence.

    Words are encoded with the module-level ``word_to_ix`` and tags with the
    module-level ``tag_to_ix``; all tensors are built once in ``__init__``.
    """

    def __init__(self, training_data):
        self.sentences = []
        self.targets = []
        for words, labels in training_data:
            # Input to tensors
            self.sentences.append(prepare_sequence(words, word_to_ix))
            tag_ids = [tag_to_ix[label] for label in labels]
            self.targets.append(torch.tensor(tag_ids, dtype=torch.long))

        # One entry per sentence.
        self.length = len(self.sentences)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        return self.sentences[idx], self.targets[idx]


# Wrap the training pairs in a Dataset and serve them one sentence per step,
# in shuffled order.
t_d = CustomDataset(training_data)
train_data_loader = DataLoader(dataset=t_d, shuffle=True, batch_size=1)
# val_data_loader = CustomDataset(validation_data)

## Training

In [None]:
# Build the tagger and move it to the selected device.
model = BiLSTM_CRF(
    len(word_to_ix),
    tag_to_ix,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    device=device,
).to(device)

optimizer = optim.Adam(model.parameters(), weight_decay=1e-2, lr=2e-4)

# INIT model once for loading everything beforehand
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
    print(model(precheck_sent.to(device)))


for epoch in range(3): # epochs
    for s, t in tqdm(train_data_loader):
        # The loader yields batches of size 1; index 0 drops the batch dimension.
        sentence_ids = s[0].to(device)
        gold_tags = t[0].to(device)

        model.zero_grad() # clear gradients from the previous step

        # forward -> loss
        loss = model.neg_log_likelihood(sentence_ids, gold_tags)

        loss.backward()
        optimizer.step()

# # Check predictions after training
# with torch.no_grad():
#     precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
#     print(model(precheck_sent))