<a href="https://colab.research.google.com/github/gupta24789/named-entity-recognition/blob/main/ner_char_lstm_crf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !wget https://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip -d embeddings/

In [None]:
# Expose only GPU index 0 to CUDA. This cell runs before the cell that imports torch.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [None]:
import random
import itertools
import pandas as pd
import numpy as np


from pathlib import Path
from pprint import pprint


import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
from torch import optim
import torch.autograd as autograd
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from seqeval import metrics

## Download Data

In [None]:
# Path('data/train').mkdir(parents = True, exist_ok= True)
# Path('data/val').mkdir(parents = True, exist_ok= True)
# Path('data/test').mkdir(parents = True, exist_ok= True)

# os.system("cd data/train && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/train/sentences.txt")
# os.system("cd data/train && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/train/labels.txt")
# os.system("cd data/val && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/val/sentences.txt")
# os.system("cd data/val && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/val/labels.txt")
# os.system("cd data/test && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/test/sentences.txt")
# os.system("cd data/test && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/test/labels.txt")

## Set Seed

In [None]:
# One seed value is shared by Python's `random`, torch, and Lightning's seeding helper.
seed = 121
random.seed(seed)
torch.manual_seed(seed)
# Last expression of the cell: its return value is what the cell displays.
pl.seed_everything(seed)

Seed set to 121


121

## Load Data

In [None]:
def _read_lines(path):
    """Return every line of the text file at `path`, newline terminators kept.

    The file is opened inside a `with` block so the handle is closed as soon
    as the lines have been read, and it is decoded as UTF-8 instead of the
    platform's default encoding.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return handle.readlines()


## train
train_sents = _read_lines("data/train/sentences.txt")
train_tags = _read_lines("data/train/labels.txt")
## val
val_sents = _read_lines("data/val/sentences.txt")
val_tags = _read_lines("data/val/labels.txt")
## test
test_sents = _read_lines("data/test/sentences.txt")
test_tags = _read_lines("data/test/labels.txt")

In [None]:
train_sents[:2]

['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .\n',
 'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "\n']

In [None]:
train_tags[:2]

['O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O\n',
 'O O O O O O O O O O O O O O O O O O B-per O O O O O O O O O O O\n']

In [None]:
def _split_tokens(lines):
    """Strip each raw line and split it on single spaces into a token list."""
    return [line.strip().split(" ") for line in lines]


# Sentences and their tag sequences are tokenized the same way for every split.
X_train, y_train = _split_tokens(train_sents), _split_tokens(train_tags)

X_val, y_val = _split_tokens(val_sents), _split_tokens(val_tags)

X_test, y_test = _split_tokens(test_sents), _split_tokens(test_tags)

In [None]:
pprint((X_train[:2]), compact=True)

[['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London',
  'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the',
  'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'],
 ['Families', 'of', 'soldiers', 'killed', 'in', 'the', 'conflict', 'joined',
  'the', 'protesters', 'who', 'carried', 'banners', 'with', 'such', 'slogans',
  'as', '"', 'Bush', 'Number', 'One', 'Terrorist', '"', 'and', '"', 'Stop',
  'the', 'Bombings', '.', '"']]


In [None]:
pprint(y_train[:1], compact=True)

[['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O',
  'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']]


## Create Vocab

In [None]:
## char vocab
special_words = ['__PAD__','__UNK__']
# Every distinct character that occurs in any word of any split.
# The set is sorted before ids are assigned: iterating a set of strings has no
# stable order across interpreter runs, which would make the char -> id mapping
# (and anything trained or saved against it) change on each kernel restart.
char_vocab = sorted({char
                     for sent in X_train + X_val + X_test
                     for word in sent
                     for char in word})
char_vocab = special_words + char_vocab
print(f'Char Vocab : {len(char_vocab)}')

# The special tokens come first, so '__PAD__' is id 0 and '__UNK__' is id 1.
char2idx = {char:i for i, char in enumerate(char_vocab)}
idx2char = {i:char for char,i in char2idx.items()}

CHAR_UNK_ID = char2idx['__UNK__']
CHAR_PAD_ID = char2idx['__PAD__']

Char Vocab : 100


In [None]:
special_words = ['__PAD__','__UNK__']
# Distinct words over all three splits, sorted so that word ids are identical on
# every run (set iteration order of strings is not stable between interpreter runs).
vocab = sorted(set(itertools.chain.from_iterable(X_train + X_val + X_test)))
vocab = special_words + vocab
word2idx = {w:i for i,w in enumerate(vocab)}
idx2word = {i:w for w,i in word2idx.items()}

## TAGS
# Tags are collected from the training labels only and sorted for the same reason;
# '__PAD__' is prepended so the padding tag is id 0.
tags = sorted(set(itertools.chain.from_iterable(y_train)))
tags = ['__PAD__'] + tags
tag2idx = {w:i for i,w in enumerate(tags)}
idx2tag = {i:w for w,i in tag2idx.items()}


print(f"vocab size : {len(vocab)}")
print(f"tags : {len(tag2idx)}")
pprint(tag2idx, compact=True)

PAD_ID = word2idx['__PAD__']
UNK_ID = word2idx['__UNK__']

print(f"PAD ID : {PAD_ID}")

vocab size : 35180
tags : 18
{'B-art': 10,
 'B-eve': 1,
 'B-geo': 14,
 'B-gpe': 8,
 'B-nat': 15,
 'B-org': 17,
 'B-per': 12,
 'B-tim': 2,
 'I-art': 5,
 'I-eve': 9,
 'I-geo': 16,
 'I-gpe': 7,
 'I-nat': 11,
 'I-org': 13,
 'I-per': 6,
 'I-tim': 4,
 'O': 3,
 '__PAD__': 0}
PAD ID : 0


## Encode sent & tags

In [None]:
def to_sent_number(sent_list):
    """Map each word in `sent_list` to its id in `word2idx`.

    Words missing from `word2idx` are mapped to `UNK_ID`. Returns a list of
    ids with the same length and order as the input.
    """
    return [word2idx.get(word, UNK_ID) for word in sent_list]


def to_tag_number(tag_list):
    """Map each tag in `tag_list` to its id in `tag2idx`.

    Raises KeyError when a tag is not present in `tag2idx`.
    """
    return [tag2idx[label] for label in tag_list]


In [None]:
def to_char_number(sent_list):
    """Encode a tokenized sentence as one list of character ids per word.

    Each character is looked up in `char2idx`; characters that are not in the
    mapping become `CHAR_UNK_ID`. The outer list has one entry per word.
    """
    return [
        [char2idx.get(symbol, CHAR_UNK_ID) for symbol in token]
        for token in sent_list
    ]

In [None]:
# For every split: words -> word ids, words -> per-word character ids, tags -> tag ids.
X_train_encoded = list(map(to_sent_number, X_train))
X_train_char_encoded = list(map(to_char_number, X_train))
y_train_encoded = list(map(to_tag_number, y_train))

X_val_encoded = list(map(to_sent_number, X_val))
X_val_char_encoded = list(map(to_char_number, X_val))
y_val_encoded = list(map(to_tag_number, y_val))

X_test_encoded = list(map(to_sent_number, X_test))
X_test_char_encoded = list(map(to_char_number, X_test))
y_test_encoded = list(map(to_tag_number, y_test))

In [None]:
pprint(X_train_encoded[:2], compact=True)

[[33310, 24860, 5335, 10800, 18229, 14130, 23898, 3316, 10469, 21081, 1959,
  26865, 11063, 16187, 16721, 21081, 512, 24860, 5134, 6718, 23818, 30872,
  31208, 13700],
 [26277, 24860, 25750, 18760, 26865, 21081, 25119, 929, 21081, 2960, 9586,
  22833, 18919, 24136, 16845, 22997, 11629, 18016, 5771, 13250, 33204, 25971,
  18016, 16187, 18016, 1152, 21081, 11259, 13700, 18016]]


In [None]:
pprint(X_train_char_encoded[:2], compact=True)

[[[44, 75, 51, 37, 78, 33, 36, 28, 78], [51, 54],
  [28, 85, 25, 51, 36, 78, 58, 19, 33, 58, 51, 19, 78], [75, 33, 43, 85],
  [25, 33, 19, 11, 75, 85, 28], [58, 75, 19, 51, 37, 59, 75],
  [50, 51, 36, 28, 51, 36], [58, 51], [26, 19, 51, 58, 85, 78, 58],
  [58, 75, 85], [79, 33, 19], [29, 36], [38, 19, 33, 89], [33, 36, 28],
  [28, 85, 25, 33, 36, 28], [58, 75, 85],
  [79, 29, 58, 75, 28, 19, 33, 79, 33, 10], [51, 54],
  [53, 19, 29, 58, 29, 78, 75], [58, 19, 51, 51, 26, 78], [54, 19, 51, 25],
  [58, 75, 33, 58], [11, 51, 37, 36, 58, 19, 72], [48]],
 [[87, 33, 25, 29, 10, 29, 85, 78], [51, 54], [78, 51, 10, 28, 29, 85, 19, 78],
  [82, 29, 10, 10, 85, 28], [29, 36], [58, 75, 85],
  [11, 51, 36, 54, 10, 29, 11, 58], [17, 51, 29, 36, 85, 28], [58, 75, 85],
  [26, 19, 51, 58, 85, 78, 58, 85, 19, 78], [79, 75, 51],
  [11, 33, 19, 19, 29, 85, 28], [61, 33, 36, 36, 85, 19, 78], [79, 29, 58, 75],
  [78, 37, 11, 75], [78, 10, 51, 59, 33, 36, 78], [33, 78], [32],
  [53, 37, 78, 75], [70, 37, 25, 

In [None]:
pprint(y_train_encoded[:2], compact=True)

[[3, 3, 3, 3, 3, 3, 14, 3, 3, 3, 3, 3, 14, 3, 3, 3, 3, 3, 8, 3, 3, 3, 3, 3],
 [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 12, 3, 3, 3, 3, 3, 3, 3,
  3, 3, 3, 3]]


## Data Loaders

In [None]:
def pad_char(chars):
    """Pad a batch of per-word character id lists into one LongTensor.

    args:
        chars : list (one entry per sentence) of lists (one per word) of character ids
    return:
        LongTensor of shape (batch, longest sentence, longest word); sentences are
        right-padded with single-character `[0]` words and words are right-padded
        with zeros.
    """
    n_sents = len(chars)
    longest_sent = max(len(sent) for sent in chars)
    # Bring every sentence to the same number of words using [0] placeholder words.
    padded = [sent + [[0]] * (longest_sent - len(sent)) for sent in chars]
    longest_word = max(max(len(word) for word in sent) for sent in padded)

    out = torch.zeros((n_sents, longest_sent, longest_word)).long()
    for row, sent in enumerate(padded):
        for col, word in enumerate(sent):
            # Only the leading len(word) slots are written; the rest stay zero.
            out[row, col, :len(word)] = torch.LongTensor(word)

    return out

In [None]:
def custom_collate(batch):
    """Collate (word ids, char ids, tag ids) samples into one padded batch dict.

    Returns a dict with:
        "sent"    : word ids, padded with PAD_ID, batch first
        "tag"     : tag ids, padded with PAD_ID, batch first
        "char"    : character ids as produced by pad_char
        "lengths" : number of words of each sample before padding
    """
    word_tensors, char_lists, tag_tensors, sent_lengths = [], [], [], []
    for item in batch:
        word_tensors.append(torch.tensor(item[0]))
        char_lists.append(item[1])
        tag_tensors.append(torch.tensor(item[2]))
        sent_lengths.append(len(item[0]))

    pad = nn.utils.rnn.pad_sequence
    return {
        "sent": pad(word_tensors, batch_first=True, padding_value=PAD_ID),
        "tag": pad(tag_tensors, batch_first=True, padding_value=PAD_ID),
        "char": pad_char(char_lists),
        "lengths": torch.tensor(sent_lengths),
    }

In [None]:
train_dl = DataLoader(list(zip(X_train_encoded,X_train_char_encoded, y_train_encoded)), batch_size = 2, shuffle = False, collate_fn = custom_collate )

In [None]:
example = next(iter(train_dl))

In [None]:
example['sent'].shape, example['tag'].shape,  example['char'].shape, example['lengths'].shape

(torch.Size([2, 30]),
 torch.Size([2, 30]),
 torch.Size([2, 30, 13]),
 torch.Size([2]))

In [None]:
example['sent']

tensor([[33310, 24860,  5335, 10800, 18229, 14130, 23898,  3316, 10469, 21081,
          1959, 26865, 11063, 16187, 16721, 21081,   512, 24860,  5134,  6718,
         23818, 30872, 31208, 13700,     0,     0,     0,     0,     0,     0],
        [26277, 24860, 25750, 18760, 26865, 21081, 25119,   929, 21081,  2960,
          9586, 22833, 18919, 24136, 16845, 22997, 11629, 18016,  5771, 13250,
         33204, 25971, 18016, 16187, 18016,  1152, 21081, 11259, 13700, 18016]])

In [None]:
example['tag']

tensor([[ 3,  3,  3,  3,  3,  3, 14,  3,  3,  3,  3,  3, 14,  3,  3,  3,  3,  3,
          8,  3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0],
        [ 3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         12,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3]])

In [None]:
example['char']

tensor([[[44, 75, 51, 37, 78, 33, 36, 28, 78,  0,  0,  0,  0],
         [51, 54,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [28, 85, 25, 51, 36, 78, 58, 19, 33, 58, 51, 19, 78],
         [75, 33, 43, 85,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [25, 33, 19, 11, 75, 85, 28,  0,  0,  0,  0,  0,  0],
         [58, 75, 19, 51, 37, 59, 75,  0,  0,  0,  0,  0,  0],
         [50, 51, 36, 28, 51, 36,  0,  0,  0,  0,  0,  0,  0],
         [58, 51,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [26, 19, 51, 58, 85, 78, 58,  0,  0,  0,  0,  0,  0],
         [58, 75, 85,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [79, 33, 19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [29, 36,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [38, 19, 33, 89,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [33, 36, 28,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [28, 85, 25, 33, 36, 28,  0,  0,  0,  0,  0,  0,  0],
         [58, 75, 85,  0,  0,  0,  0,  0,  0,  0,  0,  

In [None]:
example['lengths']

tensor([24, 30])

In [None]:
## dataloaders
batch_size = 32


def _make_loader(sents, chars, labels, shuffle):
    """Zip the three encoded views of a split and wrap them in a DataLoader using custom_collate."""
    samples = list(zip(sents, chars, labels))
    return DataLoader(samples, batch_size = batch_size, shuffle = shuffle, collate_fn = custom_collate)


# Only the training loader is shuffled.
train_dl = _make_loader(X_train_encoded, X_train_char_encoded, y_train_encoded, shuffle = True)
val_dl = _make_loader(X_val_encoded, X_val_char_encoded, y_val_encoded, shuffle = False)
test_dl = _make_loader(X_test_encoded, X_test_char_encoded, y_test_encoded, shuffle = False)

## Pretrained Vec

In [None]:
## Pretrained Vectors
def load_pretrain_emb(filepath):
    """Read a whitespace-separated embedding text file into a dict.

    Each useful line has the form ``word v1 v2 ... vn`` (single spaces).

    args:
        filepath : path of the embedding text file (read as UTF-8)
    return:
        dict mapping word -> 1-D float64 numpy array
    """
    embedd_dict = {}
    # Stream the file line by line inside a `with` block: the handle is always
    # closed and the whole file is never held in memory as a list of lines.
    with open(filepath, "r", encoding="utf-8") as handle:
        for line in handle:
            tokens = line.strip().split(" ")
            # A raw line always contains at least its newline, so emptiness has to
            # be judged after stripping; lines without any vector component
            # (blank lines included) are skipped.
            if len(tokens) < 2:
                continue
            embedd_dict[tokens[0]] = np.array(tokens[1:]).astype(float)

    return embedd_dict

def build_pretrain_embedding(filepath, vocab, emb_dim):
    """Build the embedding matrix for `vocab` from a pretrained embedding file.

    For each word the exact form is tried first, then its lower-cased form;
    words found under neither get a vector drawn from a standard normal.

    args:
        filepath : embedding text file, parsed by load_pretrain_emb
        vocab    : dict word -> row index; indices must lie in range(len(vocab))
        emb_dim  : dimension of the vectors in the file
    return:
        float64 tensor of shape (len(vocab), emb_dim) whose row i is the vector
        of the word with index i
    raises:
        ValueError if a pretrained vector does not have `emb_dim` components
    """
    embedd_dict = load_pretrain_emb(filepath)

    # One preallocated numpy table: all rows share a type, so the final
    # conversion is a single cheap call rather than a slow conversion of a list
    # mixing tensors and arrays.
    table = np.zeros((len(vocab), emb_dim), dtype=float)

    for w, i in vocab.items():
        if w in embedd_dict:
            vec = embedd_dict[w]
        elif w.lower() in embedd_dict:
            vec = embedd_dict[w.lower()]
        else:
            vec = np.random.normal(size = (emb_dim))
        # Write by the word's own index so the row order never depends on the
        # iteration order of `vocab`.
        table[i] = vec

    return torch.from_numpy(table)



weights = build_pretrain_embedding("embeddings/glove.6B.100d.txt", word2idx, emb_dim=100)
weights.shape

torch.Size([35180, 100])

## CharCNN

In [None]:
class CharCNN(nn.Module):
    """Character-level feature extractor for words.

    Despite the class name, no convolution is applied: character embeddings go
    through dropout, are averaged over dimension 2, and the average is passed
    through a linear layer of size `hidden_dim`.
    """

    def __init__(self, alphabet_size, embedding_dim, hidden_dim, dropout):
        super().__init__()
        print("build char sequence feature extractor: CNN ...")
        self.hidden_dim = hidden_dim
        self.char_drop = nn.Dropout(dropout)
        self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim, padding_idx= CHAR_PAD_ID)
        # Overwrite the default embedding init with the uniform table built below.
        initial_table = CharCNN.random_embedding(alphabet_size, embedding_dim)
        self.char_embeddings.weight.data.copy_(torch.from_numpy(initial_table))
        self.linear = nn.Linear(embedding_dim, hidden_dim)

    @staticmethod
    def random_embedding(vocab_size, embedding_dim):
        """Return a (vocab_size, embedding_dim) array drawn uniformly from
        [-sqrt(3/embedding_dim), sqrt(3/embedding_dim)], with row 0 set to zero."""
        bound = np.sqrt(3.0 / embedding_dim)
        table = np.empty([vocab_size, embedding_dim])
        for row in range(vocab_size):
            table[row, :] = np.random.uniform(-bound, bound, [1, embedding_dim])
        table[0, :] = 0.0
        return table

    def forward(self, input):
        """Embed the character ids, apply dropout, average over dim 2 and project linearly."""
        embedded = self.char_embeddings(input)
        dropped = self.char_drop(embedded)
        pooled = dropped.mean(dim = 2)
        return self.linear(pooled)

## CRF

In [None]:
# Positions of the two extra states in the CRF's (tagset_size + 2)-sized transition
# matrix. They are negative, so they index its last two rows/columns.
START_TAG = -2
STOP_TAG = -1


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec, m_size):
    """
    Log of the sum of exponentials along dimension 1.
    args:
        vec (batch_size, vanishing_dim, hidden_dim) : input tensor
        m_size : hidden_dim
    return:
        tensor of shape (batch_size, hidden_dim)
    """
    # Subtract the per-column maximum before exponentiating, then add it back.
    peak = torch.max(vec, 1, keepdim=True)[0].view(-1, 1, m_size)  # B * 1 * M
    shifted = vec - peak.expand_as(vec)
    log_of_sum = torch.log(torch.exp(shifted).sum(1)).view(-1, m_size)  # B * M
    return peak.view(-1, m_size) + log_of_sum


class CRF(nn.Module):
    """Linear-chain CRF layer: batched partition function, gold-path score and Viterbi decoding.

    The transition matrix has ``tagset_size + 2`` rows and columns; entry (i, j) is the
    score of moving from tag i to tag j. The last two indices are the START_TAG and
    STOP_TAG states. Every helper tensor is allocated on the device of the tensors
    passed in, so the layer works wherever its inputs live.
    """

    def __init__(self, tagset_size, gpu):
        """
            input:
                tagset_size: number of real tags (START/STOP are added internally)
                gpu: kept for backward compatibility and stored as ``self.gpu``;
                     device placement now follows the input tensors
        """
        super(CRF, self).__init__()
        print("build CRF...")
        self.gpu = gpu
        self.tagset_size = tagset_size
        # +2 for START_TAG and STOP_TAG.
        init_transitions = torch.zeros(self.tagset_size + 2, self.tagset_size + 2)
        # Nothing may transition into START or out of STOP ...
        init_transitions[:, START_TAG] = -10000.0
        init_transitions[STOP_TAG, :] = -10000.0
        # ... and index 0 is strongly penalised both as a source and as a target.
        init_transitions[:, 0] = -10000.0
        init_transitions[0, :] = -10000.0
        self.transitions = nn.Parameter(init_transitions)

    def _calculate_PZ(self, feats, mask):
        """
            Forward algorithm.
            input:
                feats: (batch, seq_len, self.tag_size+2)
                mask: (batch, seq_len), true/non-zero on real tokens
            output:
                (sum over the batch of the log partition function,
                 scores of shape (seq_len, batch, tag_size, tag_size) where
                 scores[t, b, i, j] = feats[b, t, j] + transitions[i, j])
        """
        batch_size = feats.size(0)
        seq_len = feats.size(1)
        tag_size = feats.size(2)
        assert (tag_size == self.tagset_size + 2)
        # masked_select / masked_scatter_ need a boolean mask; (seq_len, batch) layout.
        mask = mask.bool().transpose(1, 0).contiguous()
        ins_num = seq_len * batch_size
        ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1)
        feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size)
        scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size)
        scores = scores.view(seq_len, batch_size, tag_size, tag_size)

        seq_iter = enumerate(scores)
        _, inivalues = next(seq_iter)  # batch * from_tag * to_tag
        # At the first position the only possible source is START_TAG.
        partition = inivalues[:, START_TAG, :].clone().view(batch_size, tag_size, 1)

        for idx, cur_values in seq_iter:
            # The previous step's target tags are this step's source tags.
            cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(
                batch_size, tag_size, tag_size)
            cur_partition = log_sum_exp(cur_values, tag_size)

            mask_idx = mask[idx, :].view(batch_size, 1).expand(batch_size, tag_size)
            ## only positions with mask = 1 contribute an updated partition value
            masked_cur_partition = cur_partition.masked_select(mask_idx)
            mask_idx = mask_idx.contiguous().view(batch_size, tag_size, 1)
            ## padded positions keep the partition of their last real token
            partition.masked_scatter_(mask_idx, masked_cur_partition)

        # Add the transition into every tag once more and read off the STOP_TAG column.
        cur_values = self.transitions.view(1, tag_size, tag_size).expand(
            batch_size, tag_size, tag_size) + partition.contiguous().view(
            batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size)
        cur_partition = log_sum_exp(cur_values, tag_size)
        final_partition = cur_partition[:, STOP_TAG]
        return final_partition.sum(), scores

    def _viterbi_decode(self, feats, mask):
        """
            input:
                feats: (batch, seq_len, self.tag_size+2)
                mask: (batch, seq_len), true/non-zero on real tokens
            output:
                path_score: always None (not implemented)
                decode_idx: (batch, seq_len) decoded tag ids
        """
        batch_size = feats.size(0)
        seq_len = feats.size(1)
        tag_size = feats.size(2)
        assert (tag_size == self.tagset_size + 2)
        # Remember the device before `feats` is rebound to its expanded form.
        device = feats.device
        mask = mask.bool()
        ## number of real tokens in each sentence
        length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long()
        ## mask to (seq_len, batch_size)
        mask = mask.transpose(1, 0).contiguous()
        ins_num = seq_len * batch_size
        ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1)
        feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size)
        scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size)
        scores = scores.view(seq_len, batch_size, tag_size, tag_size)

        seq_iter = enumerate(scores)
        ## best source tag for every (position, target tag)
        back_points = list()
        partition_history = list()
        # Boolean mask that is True on padding (masked_fill_ requires a bool mask).
        pad_mask = ~mask
        _, inivalues = next(seq_iter)  # batch * from_tag * to_tag
        # At the first position the only possible source is START_TAG.
        partition = inivalues[:, START_TAG, :].clone().view(batch_size, tag_size)
        partition_history.append(partition)

        for idx, cur_values in seq_iter:
            cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(
                batch_size, tag_size, tag_size)
            partition, cur_bp = torch.max(cur_values, 1)
            partition_history.append(partition)
            ## back pointers of padded positions are set to 0
            cur_bp.masked_fill_(pad_mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)
            back_points.append(cur_bp)

        ## (batch_size, seq_len, tag_size)
        partition_history = torch.cat(partition_history, 0).view(seq_len, batch_size, -1).transpose(
            1, 0).contiguous()
        ### pick the partition at the last real token of each sentence
        last_position = length_mask.view(batch_size, 1, 1).expand(batch_size, 1, tag_size) - 1
        last_partition = torch.gather(partition_history, 1, last_position).view(batch_size, tag_size, 1)
        ### score of moving from the last real token into every tag; STOP_TAG is selected below
        last_values = last_partition.expand(batch_size, tag_size, tag_size) + self.transitions.view(
            1, tag_size, tag_size).expand(batch_size, tag_size, tag_size)
        _, last_bp = torch.max(last_values, 1)
        pad_zero = torch.zeros(batch_size, tag_size, dtype=torch.long, device=device)
        back_points.append(pad_zero)
        back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size)

        ## best final tag = best source of STOP_TAG
        pointer = last_bp[:, STOP_TAG]
        insert_last = pointer.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, tag_size)
        back_points = back_points.transpose(1, 0).contiguous()
        ## write the final tag at each sentence's last real position, replacing the 0 placeholders
        back_points.scatter_(1, last_position, insert_last)
        back_points = back_points.transpose(1, 0).contiguous()
        ## decode from the end; every row is assigned below, so an uninitialised buffer is safe
        decode_idx = torch.empty((seq_len, batch_size), dtype=torch.long, device=device)
        decode_idx[-1] = pointer.detach()
        for idx in range(len(back_points) - 2, -1, -1):
            pointer = torch.gather(back_points[idx], 1, pointer.contiguous().view(batch_size, 1))
            decode_idx[idx] = pointer.detach().view(batch_size)
        path_score = None
        decode_idx = decode_idx.transpose(1, 0)
        return path_score, decode_idx

    def _score_sentence(self, scores, mask, tags):
        """
            input:
                scores: (seq_len, batch, tag_size, tag_size), as returned by _calculate_PZ
                mask: (batch, seq_len), true/non-zero on real tokens
                tags: tensor (batch, seq_len) of gold tag ids
            output:
                score: sum of the gold-sequence scores over the whole batch
        """
        batch_size = scores.size(1)
        seq_len = scores.size(0)
        tag_size = scores.size(2)
        mask = mask.bool()
        ## encode each (previous tag, current tag) pair as one index into the
        ## flattened tag_size * tag_size score table; every column is assigned below
        new_tags = torch.empty((batch_size, seq_len), dtype=torch.long, device=tags.device)
        ## first position: the previous tag is START_TAG (index tag_size - 2)
        new_tags[:, 0] = (tag_size - 2) * tag_size + tags[:, 0]
        new_tags[:, 1:] = tags[:, :-1] * tag_size + tags[:, 1:]

        ## transition from every tag into STOP_TAG
        end_transition = self.transitions[:, STOP_TAG].contiguous().view(1, tag_size).expand(batch_size, tag_size)
        ## sentence lengths; last real position = length - 1
        length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long()
        ## gold tag of the last real token
        end_ids = torch.gather(tags, 1, length_mask - 1)
        ## score of that tag -> STOP_TAG
        end_energy = torch.gather(end_transition, 1, end_ids)

        ## (seq_len, batch_size, 1)
        new_tags = new_tags.transpose(1, 0).contiguous().view(seq_len, batch_size, 1)
        tg_energy = torch.gather(scores.view(seq_len, batch_size, -1), 2, new_tags).view(
            seq_len, batch_size)
        ## drop padded positions
        tg_energy = tg_energy.masked_select(mask.transpose(1, 0))

        ## the START transition is already part of scores[0], so only the STOP term is added
        gold_score = tg_energy.sum() + end_energy.sum()
        return gold_score

    def neg_log_likelihood_loss(self, feats, mask, tags):
        """Negative log-likelihood of `tags`, summed over the batch:
        log partition function minus the score of the gold sequences."""
        forward_score, scores = self._calculate_PZ(feats, mask)
        gold_score = self._score_sentence(scores, mask, tags)
        return forward_score - gold_score

## Build Model

In [None]:
class NERModel(pl.LightningModule):
    """
    Word-level LSTM tagger with optional character features and an optional CRF output layer.

    Word embeddings (optionally concatenated with `CharCNN` character features) are fed
    to an LSTM; a linear layer maps the LSTM output to per-token tag scores. With
    `use_crf=True` the loss is `CRF.neg_log_likelihood_loss` and predictions come from
    `CRF._viterbi_decode`; otherwise the loss is `nn.CrossEntropyLoss` and predictions
    are the arg-max over the tag dimension.

    Note: if you use NLLLoss you have to apply log_softmax in forward; otherwise use
    CrossEntropyLoss (which is what the non-CRF path does).

    Relies on module-level names defined elsewhere in the notebook:
    `CRF`, `CharCNN`, `PAD_ID`, `idx2tag` and (when `use_pretrained=True`) `weights`.
    """
    def __init__(self, vocab_size, word_emb_dim, hidden_dim, n_tags, alphabet_size, char_embedding_dim, char_hidden_dim,
                learning_rate, dropout, bidirectional = False, n_layers = 1,
                 use_pretrained = True, use_crf = True, use_gpu = True, use_char = True):
        """
        Args:
            vocab_size: number of rows of the word embedding table.
            word_emb_dim: word embedding size.
            hidden_dim: LSTM hidden size (per direction).
            n_tags: number of tags.
            alphabet_size, char_embedding_dim, char_hidden_dim: forwarded to `CharCNN`.
            learning_rate: Adam learning rate.
            dropout: dropout probability (LSTM, output dropout and `CharCNN`).
            bidirectional: whether the LSTM is bidirectional.
            n_layers: number of LSTM layers.
            use_pretrained: copy the global `weights` into the embedding table;
                otherwise use `random_embedding`.
            use_crf: use the CRF loss / Viterbi decoding instead of cross-entropy / arg-max.
            use_gpu: forwarded to `CRF(gpu=...)`.
            use_char: concatenate character features to the word embeddings.
        """
        super().__init__()
        self.use_crf = use_crf
        self.crf = CRF(tagset_size= n_tags, gpu= use_gpu)
        self.use_char = use_char
        self.char_feature = CharCNN(alphabet_size, char_embedding_dim, char_hidden_dim, dropout)
        self.input_dim = word_emb_dim
        self.learning_rate = learning_rate
        self.bidirectional = bidirectional

        if self.use_char:
            self.input_dim += char_hidden_dim

        # per-batch metric buffers, emptied at the end of every epoch
        self.train_f1 = []
        self.val_f1 = []
        self.val_loss = []
        self.test_f1 =[]
        self.test_precision = []
        self.test_recall = []

        ## define loss
        if self.use_crf:
            self.loss_fn = self.crf.neg_log_likelihood_loss
        else:
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)

        ## embedding layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim= word_emb_dim, padding_idx= PAD_ID)
        if use_pretrained:
            self.embedding.weight.data.copy_(weights)
        else:
            self.embedding.weight.data.copy_(torch.from_numpy(self.random_embedding(vocab_size, word_emb_dim)))

        ## lstm layer
        self.lstm = nn.LSTM(self.input_dim, hidden_dim, batch_first=True, bidirectional=bidirectional, dropout = dropout, num_layers = n_layers)

        ## last layer
        # The LSTM output width depends on the direction count for BOTH output heads
        # (the CRF head used to assume a bidirectional LSTM unconditionally).
        lstm_out_dim = hidden_dim * 2 if bidirectional else hidden_dim
        if self.use_crf:
            # presumably the two extra scores are the CRF's START/STOP tags — confirm against CRF
            self.hidden2tag = nn.Linear(lstm_out_dim, n_tags + 2)
        else:
            self.hidden2tag = nn.Linear(lstm_out_dim, n_tags)

        ## other layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def random_embedding(self, vocab_size, embedding_dim):
        """
        Build a (vocab_size, embedding_dim) matrix with rows 1.. drawn uniformly from
        [-sqrt(3/embedding_dim), sqrt(3/embedding_dim)]; row 0 is all zeros.
        """
        # np.zeros (not np.empty) so that row 0, which the loop skips, is deterministic
        # instead of uninitialised memory.
        pretrain_emb = np.zeros([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(1, vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
        return pretrain_emb

    def forward(self, sent, char_inputs, lengths, verbose = False):
        """
        Compute per-token tag scores.

        Args:
            sent: word ids, [batch size, seq_len].
            char_inputs: character ids consumed by `CharCNN`.
            lengths: true (unpadded) length of every sentence in the batch.
            verbose: print intermediate tensor shapes.

        Returns:
            CRF path: scores of shape [batch, max_len, n_tags + 2].
            Otherwise: scores permuted to [batch, n_tags, max_len] (the layout
            `nn.CrossEntropyLoss` expects). `max_len` is the longest length in the batch.
        """
        ## layers
        word_embedding = self.embedding(sent)
        # word_embedding : [batch size, seq_len, emb dim]

        word_list = [word_embedding]
        if self.use_char:
            char_features = self.char_feature(char_inputs)
            word_list.append(char_features)
        embedded = torch.cat(word_list, 2)

        # Run the LSTM on the *packed* batch so padding positions are never fed to it
        # (previously the batch was packed and immediately unpacked, so the LSTM,
        # in particular its backward direction, still consumed the padding).
        packed_input = nn.utils.rnn.pack_padded_sequence(embedded, lengths.to('cpu'), batch_first = True, enforce_sorted = False)
        packed_output, (hidden, cell) = self.lstm(packed_input)
        output, xlengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first = True)

        output = self.dropout(output)
        logits = self.hidden2tag(output)

        if not self.use_crf:
            logits = logits.permute(0,2,1)

        if verbose:
            # shape of the padded LSTM input, trimmed to the longest sequence in the batch
            x_shape = torch.Size((embedded.size(0), int(xlengths.max()), embedded.size(2)))
            print(f"Sent : {sent.shape}")
            print(f'length : {lengths.shape}')
            print(f'x : {x_shape}')
            print(f'xlengths : {xlengths.shape}')
            print(f'embedded : {embedded.shape}')
            print(f'output : {output.shape}')
            print(f'hidden : {hidden.shape}')
            print(f'cell : {cell.shape}')
            print(f'logits : {logits.shape}')

        return logits

    def calculate_metrics(self, y_true, y_pred, mask):
        """
        Entity-level F1 / precision / recall (seqeval) for one batch.

        Args:
            y_true: gold tag ids, [batch, seq_len].
            y_pred: predicted tag ids, same shape.
            mask: truthy where a real token is present.

        Returns:
            (f1_score, precision, recall)
        """
        # Multiplying by the mask turns every masked-out position into id 0 in both
        # tensors, so those positions are looked up as idx2tag[0] on both sides.
        y_true = y_true  * mask
        y_pred = y_pred * mask

        ## metrics
        y_true = y_true.cpu().numpy().tolist()
        y_pred = y_pred.cpu().numpy().tolist()
        y_true_label = [[idx2tag[tag] for tag in sent_tag] for sent_tag in y_true]
        y_pred_label = [[idx2tag[tag] for tag in sent_tag] for sent_tag in y_pred]

        f1_score = metrics.f1_score(y_true_label, y_pred_label)
        precision = metrics.precision_score(y_true_label, y_pred_label)
        recall = metrics.recall_score(y_true_label, y_pred_label)
        return f1_score, precision, recall

    def _decode(self, logits, mask):
        """Turn the output of `forward` into predicted tag ids (Viterbi for CRF, arg-max otherwise)."""
        if self.use_crf:
            _, preds = self.crf._viterbi_decode(logits, mask)
        else:
            # non-CRF logits are [batch, n_tags, seq_len], so the tag axis is dim 1
            _ , preds = torch.max(logits, dim = 1)
        return preds

    def _shared_step(self, batch):
        """Forward pass, loss and metrics shared by the training and validation steps."""
        sents, tags, char, lengths = batch['sent'], batch['tag'],batch['char'], batch['lengths']
        mask = (tags != PAD_ID)
        logits = self(sents, char, lengths)

        if self.use_crf:
            loss = self.loss_fn(logits, mask, tags)
        else:
            loss = self.loss_fn(logits, tags)
        preds = self._decode(logits, mask)

        ## calculate metrics
        # gold tags first, predictions second: the reverse order swaps precision and recall
        f1_score, precision, recall = self.calculate_metrics(tags, preds, mask)
        return loss, f1_score, precision, recall

    def training_step(self, batch):
        """Lightning training step: returns the loss and logs the running mean of the batch F1."""
        loss, f1_score, precision, recall = self._shared_step(batch)
        self.train_f1.append(f1_score)
        self.log_dict({"train_loss": loss, "train_f1": np.mean(self.train_f1)}, on_step = False, on_epoch = True, prog_bar=  True)
        return loss

    def validation_step(self, batch):
        """Lightning validation step: records loss / F1 for the epoch summary and logs them."""
        loss, f1_score, precision, recall = self._shared_step(batch)
        self.val_f1.append(f1_score)
        self.val_loss.append(loss.cpu().item())
        self.log_dict({"val_loss": loss, "val_f1": np.mean(self.val_f1)}, on_step = False, on_epoch = True, prog_bar=  True)
        return loss

    def on_train_epoch_end(self):
        """Lightning hook: empty the training F1 buffer so every epoch starts fresh."""
        self.train_f1 =[]

    def on_training_epoch_end(self):
        """
        Backward-compatible alias. This name is not a Lightning hook, so Lightning never
        called it and `train_f1` kept growing across epochs; the real hook is
        `on_train_epoch_end`.
        """
        self.on_train_epoch_end()

    def on_validation_epoch_end(self):
        """Print the mean validation loss / F1 of the epoch and empty the buffers."""
        print(f'Epoch : {self.current_epoch} Loss : {np.mean(self.val_loss)} F1 : {np.mean(self.val_f1)}')
        self.val_f1 =[]
        self.val_loss = []

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        optimizer = optim.Adam(self.parameters(), lr = self.learning_rate)
        return optimizer

    def test_step(self, batch, batch_idx):
        """Lightning test step: decode the batch and store its F1 / precision / recall."""
        sents, tags, char, lengths = batch['sent'], batch['tag'], batch['char'], batch['lengths']
        mask = (tags != PAD_ID)
        logits = self(sents, char, lengths)
        preds = self._decode(logits, mask)

        ## calculate metrics
        # gold tags first, predictions second: the reverse order swaps precision and recall
        f1_score, precision, recall = self.calculate_metrics(tags, preds, mask)
        self.test_f1.append(f1_score)
        self.test_precision.append(precision)
        self.test_recall.append(recall)

    def on_test_epoch_end(self):
        """Print the mean of the per-batch test metrics and empty the buffers."""
        print(f'F1 : {np.mean(self.test_f1)} Precision : {np.mean(self.test_precision)} Recall : {np.mean(self.test_recall)}')
        self.test_f1 = []
        self.test_precision = []
        self.test_recall = []

In [None]:
# model= NERModel(vocab_size = len(word2idx),
#                 word_emb_dim = 100,
#                 hidden_dim = 64,
#                 n_tags = len(tag2idx),
#                 alphabet_size = len(char2idx),
#                 char_embedding_dim = 40,
#                 char_hidden_dim = 50,
#                 learning_rate = 1e-3,
#                 dropout = 0.3,
#                 bidirectional = True,
#                 n_layers = 2,
#                 use_pretrained= True,
#                 use_crf= True
#                 )


# logits = model(example['sent'], example['char'], example['lengths'], verbose = True)
# true_label = example['tag']
# print(f"True label shape : {true_label.shape}")

# model.crf.gpu = False
# mask = (true_label != PAD_ID)
# loss = model.loss_fn(logits, mask, true_label)
# print(loss)

# scores, tag_seq = model.crf._viterbi_decode(logits, mask)
# print(tag_seq)

In [None]:
## Model Training
# Hyper-parameters of the tagger, gathered in one place so they are easy to tweak.
model_kwargs = dict(
    vocab_size = len(word2idx),
    word_emb_dim = 100,
    hidden_dim = 64,
    n_tags = len(tag2idx),
    alphabet_size = len(char2idx),
    char_embedding_dim = 40,
    char_hidden_dim = 50,
    learning_rate = 1e-3,
    dropout = 0.3,
    bidirectional = True,
    n_layers = 2,
    use_pretrained= True,
    use_crf= True,
    use_char=True,
)
model = NERModel(**model_kwargs)

# Checkpointing: monitor the validation loss (lower is better), keep every
# checkpoint (save_top_k=-1) and additionally the last one.
checkpoint_kwargs = dict(
    dirpath = "checkpoints_logs",
    filename = '{epoch}-{val_loss:.2f}-{val_f1:.2f}',
    mode = "min",
    monitor = "val_loss",
    save_last = True,
    save_top_k=-1,
)
callbacks = pl.callbacks.ModelCheckpoint(**checkpoint_kwargs)

# Train on the GPU for 5 epochs, validating after every epoch.
trainer = pl.Trainer(
    accelerator= "gpu",
    max_epochs=5,
    check_val_every_n_epoch = 1,
    callbacks = [callbacks],
)

trainer.fit(model, train_dl, val_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /home/saurabh/mydata/checkpoints_logs exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type      | Params
-------------------------------------------
0 | crf          | CRF       | 400   
1 | char_feature | CharCNN   | 6.0 K 
2 | embedding    | Embedding | 3.5 M 
3 | lstm         | LSTM      | 209 K 
4 | hidden2tag   | Linear    | 2.6 K 
5 | relu         | ReLU      | 0     
6 | dropout      | Dropout   | 0     
-------------------------------------------
3.7 M     Trainable params
0         Non-trainable params
3.7 M     Total params
14.948    Total estimated model params size (MB)


build CRF...
build char sequence feature extractor: CNN ...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
  cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)


/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch : 0 Loss : 2029.4188232421875 F1 : 0.11759074259074259


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 0 Loss : 69.60270399305556 F1 : 0.8696362915523735


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 1 Loss : 58.025661892361114 F1 : 0.876385727250741


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 2 Loss : 52.52622178819445 F1 : 0.8808323863121444


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 3 Loss : 49.305659722222224 F1 : 0.8860889184786327


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch : 4 Loss : 49.89532552083333 F1 : 0.88397360896727


In [None]:
## Evaluate the trained model on the test dataloader.
## Reference result recorded from an earlier run:
## F1 : 0.8868604345604159 Precision : 0.886774525815462 Recall : 0.8872570023491803
trainer.test(model, dataloaders= test_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

  cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)


F1 : 0.8868604345604159 Precision : 0.886774525815462 Recall : 0.8872570023491803


[{}]

## Predict

In [None]:
# Switch the model to evaluation mode (dropout disabled) before running predictions.
model = model.eval()

In [None]:
def process_data(text):
    """
    Encode one space-separated sentence into the tensors the model's forward pass takes.

    The sentence is stripped and split on single spaces. Every token is looked up in
    `word2idx`; tokens that are not in the vocabulary fall back to `PAD_ID`.
    Character ids come from `to_char_number` and are padded by `pad_char`
    (both defined elsewhere in the notebook).

    Returns:
        (text_tensor, char_tensor, lengths) where text_tensor has shape (1, n_tokens),
        lengths is a one-element tensor holding n_tokens, and char_tensor is whatever
        `pad_char` returns for this single-sentence batch.
    """
    tokens = text.strip().split(" ")
    n_tokens = len(tokens)

    ## words
    word_ids = [word2idx.get(token, PAD_ID) for token in tokens]
    text_tensor = torch.tensor(word_ids).view(1, -1)
    lengths = torch.tensor([n_tokens])

    ## char
    sentence_char_ids = to_char_number(tokens)
    # wrap the single sentence in a list so it is handled as a batch of size one
    char_batch = [
        [torch.tensor(token_char_ids) for token_char_ids in sentence]
        for sentence in [sentence_char_ids]
    ]
    char_tensor = pad_char(char_batch)

    return text_tensor, char_tensor, lengths


# Pick one random test sentence and compare the predicted tags with the gold tags.
i = random.choices(range(len(test_sents)))[0]
text = test_sents[i]
true_label = test_tags[i].strip().split(" ")
text_tensor, char_tensor, lengths = process_data(text)
print(text_tensor.shape, char_tensor.shape, lengths.shape)

# The tensors built above live on the CPU.
# NOTE(review): `crf.gpu = False` presumably makes the CRF allocate its work tensors
# on the CPU as well — confirm against the CRF class.
model.crf.gpu = False

# Single unpadded sentence: every position is a real token. A bool mask matches
# what the training / test steps pass to the CRF (`tags != PAD_ID`).
mask = torch.ones_like(text_tensor, dtype = torch.bool)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    logits = model(text_tensor, char_tensor, lengths)
    _, preds = model.crf._viterbi_decode(logits, mask)

preds = preds.numpy()[0]
pred_labels = [idx2tag[p] for p in preds]

# Tokenise exactly like process_data (strip first) so the last word does not
# carry the trailing newline of the raw line.
words = text.strip().split(" ")
for w, p, t in zip(words, pred_labels, true_label):
    print(f"{w:<20}  -->  {p:<5} --> {t:<5}")

torch.Size([1, 33]) torch.Size([1, 33, 10]) torch.Size([1])
Rashid                -->  B-org --> B-per
Abu                   -->  I-org --> I-per
Shbak                 -->  I-org --> I-per
,                     -->  O     --> O    
a                     -->  O     --> O    
senior                -->  O     --> O    
security              -->  O     --> O    
official              -->  O     --> O    
said                  -->  O     --> O    
Saturday              -->  B-tim --> B-tim
the                   -->  O     --> O    
measures              -->  O     --> O    
include               -->  O     --> O    
the                   -->  O     --> O    
disbanding            -->  O     --> O    
of                    -->  O     --> O    
the                   -->  O     --> O    
Department            -->  B-org --> B-org
of                    -->  I-org --> I-org
Protection            -->  I-org --> I-org
and                   -->  I-org --> I-org
Security              -->  I-org --> 

  cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)
