<a href="https://colab.research.google.com/github/gupta24789/named-entity-recognition/blob/main/ner_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !wget https://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip -d embeddings/

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [None]:
import random
import itertools
import pandas as pd
import numpy as np


from pathlib import Path
from pprint import pprint


import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
from torch import optim
import torch.autograd as autograd
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from seqeval import metrics

## Download Data

In [None]:
# Path('data/train').mkdir(parents = True, exist_ok= True)
# Path('data/val').mkdir(parents = True, exist_ok= True)
# Path('data/test').mkdir(parents = True, exist_ok= True)

# os.system("cd data/train && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/train/sentences.txt")
# os.system("cd data/train && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/train/labels.txt")
# os.system("cd data/val && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/val/sentences.txt")
# os.system("cd data/val && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/val/labels.txt")
# os.system("cd data/test && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/test/sentences.txt")
# os.system("cd data/test && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/test/labels.txt")

## Set Seed

In [None]:
seed = 121
random.seed(seed)
torch.manual_seed(seed)
pl.seed_everything(seed)

Seed set to 121


121

## Load Data

In [None]:
## train
train_sents = open("data/train/sentences.txt","r").readlines()
train_tags = open("data/train/labels.txt","r").readlines()
## val
val_sents = open("data/val/sentences.txt","r").readlines()
val_tags = open("data/val/labels.txt","r").readlines()
## test
test_sents = open("data/test/sentences.txt","r").readlines()
test_tags = open("data/test/labels.txt","r").readlines()

In [None]:
train_sents[:2]

['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .\n',
 'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "\n']

In [None]:
train_tags[:2]

['O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O\n',
 'O O O O O O O O O O O O O O O O O O B-per O O O O O O O O O O O\n']

In [None]:
X_train = [sent.strip().split(" ") for sent in train_sents]
y_train = [tag.strip().split(" ") for tag in train_tags]

X_val = [sent.strip().split(" ") for sent in val_sents]
y_val = [tag.strip().split(" ") for tag in val_tags]

X_test = [sent.strip().split(" ") for sent in test_sents]
y_test = [tag.strip().split(" ") for tag in test_tags]

In [None]:
pprint((X_train[:2]), compact=True)

[['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London',
  'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the',
  'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'],
 ['Families', 'of', 'soldiers', 'killed', 'in', 'the', 'conflict', 'joined',
  'the', 'protesters', 'who', 'carried', 'banners', 'with', 'such', 'slogans',
  'as', '"', 'Bush', 'Number', 'One', 'Terrorist', '"', 'and', '"', 'Stop',
  'the', 'Bombings', '.', '"']]


In [None]:
pprint(y_train[:1], compact=True)

[['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O',
  'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']]


## Create Vocab

In [None]:
special_words = ['__PAD__','__UNK__']
vocab = list(set(itertools.chain.from_iterable(X_train + X_val + X_test)))
vocab = special_words + vocab
word2idx = {w:i for i,w in enumerate(vocab)}
idx2word = {i:w for w,i in word2idx.items()}

## TAGS
tags = list(set(itertools.chain.from_iterable(y_train)))
tags = ['__PAD__'] + tags
tag2idx = {w:i for i,w in enumerate(tags)}
idx2tag = {i:w for w,i in tag2idx.items()}


print(f"vocab size : {len(vocab)}")
print(f"tags : {len(tag2idx)}")
pprint(tag2idx, compact=True)

PAD_ID = word2idx['__PAD__']
UNK_ID = word2idx['__UNK__']

print(f"PAD ID : {PAD_ID}")

vocab size : 35180
tags : 18
{'B-art': 2,
 'B-eve': 8,
 'B-geo': 7,
 'B-gpe': 17,
 'B-nat': 16,
 'B-org': 12,
 'B-per': 3,
 'B-tim': 14,
 'I-art': 1,
 'I-eve': 9,
 'I-geo': 15,
 'I-gpe': 6,
 'I-nat': 10,
 'I-org': 5,
 'I-per': 4,
 'I-tim': 11,
 'O': 13,
 '__PAD__': 0}
PAD ID : 0


## Encode sent & tags

In [None]:
def to_sent_number(sent_list):
    encoded = []
    for w in sent_list:
        encoded.append(word2idx.get(w, UNK_ID))
    return encoded


def to_tag_number(tag_list):
    encoded = []
    for tag in tag_list:
        encoded.append(tag2idx[tag])
    return encoded


In [None]:
X_train_encoded = [to_sent_number(sent) for sent in X_train]
y_train_encoded = [to_tag_number(tags) for tags in y_train]

X_val_encoded = [to_sent_number(sent) for sent in X_val]
y_val_encoded = [to_tag_number(tags) for tags in y_val]

X_test_encoded = [to_sent_number(sent) for sent in X_test]
y_test_encoded = [to_tag_number(tags) for tags in y_test]

In [None]:
pprint(X_train_encoded[:2], compact=True)

[[7138, 23213, 31249, 28708, 19796, 21565, 6163, 24430, 33070, 16029, 19865,
  17668, 13723, 32161, 26917, 16029, 8684, 23213, 15795, 13467, 20324, 13974,
  6309, 17678],
 [11001, 23213, 4125, 12289, 17668, 16029, 20597, 10298, 16029, 29374, 7042,
  13520, 33280, 28632, 10046, 10710, 28399, 7218, 28474, 15327, 18029, 5774,
  7218, 32161, 7218, 31619, 16029, 33168, 17678, 7218]]


In [None]:
pprint(y_train_encoded[:2], compact=True)

[[13, 13, 13, 13, 13, 13, 7, 13, 13, 13, 13, 13, 7, 13, 13, 13, 13, 13, 17, 13,
  13, 13, 13, 13],
 [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 3, 13,
  13, 13, 13, 13, 13, 13, 13, 13, 13, 13]]


## Data Loaders

In [None]:
def custom_collate(batch):
    sent = [torch.tensor(item[0]) for item in batch]
    tag = [torch.tensor(item[1]) for item in batch]
    lengths = torch.tensor([len(item[0]) for item in batch])

    padded_sent = nn.utils.rnn.pad_sequence(sent, batch_first=True, padding_value=PAD_ID)
    padded_tag = nn.utils.rnn.pad_sequence(tag, batch_first=True, padding_value=PAD_ID)

    batch = {"sent": padded_sent, "tag": padded_tag, "lengths": lengths}
    return batch

In [None]:
train_dl = DataLoader(list(zip(X_train_encoded,y_train_encoded)), batch_size = 2, shuffle = False, collate_fn = custom_collate )

In [None]:
example = next(iter(train_dl))
example['sent'].shape, example['tag'].shape,  example['lengths'].shape

(torch.Size([2, 30]), torch.Size([2, 30]), torch.Size([2]))

In [None]:
example['sent']

tensor([[ 7138, 23213, 31249, 28708, 19796, 21565,  6163, 24430, 33070, 16029,
         19865, 17668, 13723, 32161, 26917, 16029,  8684, 23213, 15795, 13467,
         20324, 13974,  6309, 17678,     0,     0,     0,     0,     0,     0],
        [11001, 23213,  4125, 12289, 17668, 16029, 20597, 10298, 16029, 29374,
          7042, 13520, 33280, 28632, 10046, 10710, 28399,  7218, 28474, 15327,
         18029,  5774,  7218, 32161,  7218, 31619, 16029, 33168, 17678,  7218]])

In [None]:
example['tag']

tensor([[13, 13, 13, 13, 13, 13,  7, 13, 13, 13, 13, 13,  7, 13, 13, 13, 13, 13,
         17, 13, 13, 13, 13, 13,  0,  0,  0,  0,  0,  0],
        [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
          3, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]])

In [None]:
example['lengths']

tensor([24, 30])

In [None]:
## dataloaders
batch_size = 32
train_dl = DataLoader(list(zip(X_train_encoded,y_train_encoded)), batch_size = batch_size, shuffle = True, collate_fn = custom_collate )
val_dl = DataLoader(list(zip(X_val_encoded,y_val_encoded)), batch_size = batch_size, shuffle = False, collate_fn = custom_collate )
test_dl = DataLoader(list(zip(X_test_encoded,y_test_encoded)), batch_size = batch_size, shuffle = False, collate_fn = custom_collate )

In [None]:
## Pretrained Vectors
def load_pretrain_emb(filepath):
    lines = open(filepath,"r").readlines()
    embedd_dict = {}
    for line in lines:
        if len(line)>0:
            tokens = line.strip().split(" ")
            word = tokens[0]
            vec = tokens[1:]
            vec = np.array(vec).astype(float)
            embedd_dict[word]= vec

    return embedd_dict

def build_pretrain_embedding(filepath, vocab, emb_dim):
    embedd_dict = load_pretrain_emb(filepath)

    df_list = []

    for w,i in vocab.items():
        if w in embedd_dict:
            df_list.append(torch.tensor(embedd_dict[w]))
        elif w.lower() in embedd_dict:
            df_list.append(embedd_dict[w.lower()])
        else:
            random_vec = np.random.normal(size = (emb_dim))
            df_list.append(random_vec)


    return torch.tensor(df_list)



weights = build_pretrain_embedding("embeddings/glove.6B.100d.txt", word2idx, emb_dim=100)
weights.shape

  return torch.tensor(df_list)


torch.Size([35180, 100])

## Build Model

In [None]:
class NERModel(pl.LightningModule):
    """
    if you will you NLLLoss then you have to use log_softmax in forward else use CrossEntropy
    """
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_tags, learning_rate, dropout, bidirectional = False, n_layers = 1, use_pretrained = False):
        super().__init__()
        self.learning_rate = learning_rate
        self.bidirectional = bidirectional

        # metrics
        self.train_f1 = []
        self.val_f1 = []
        self.val_loss = []
        self.test_f1 =[]
        self.test_precision = []
        self.test_recall = []

        ## define loss
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)


        ## layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim= emb_dim, padding_idx= PAD_ID)
        if use_pretrained:
            self.embedding.weight.data.copy_(weights)
        else:
            self.embedding.weight.data.copy_(torch.from_numpy(self.random_embedding(vocab_size, emb_dim)))

        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=bidirectional, dropout = dropout, num_layers = n_layers)
        self.hidden2tag = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, n_tags)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def random_embedding(self, vocab_size, embedding_dim):
        pretrain_emb = np.empty([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(1, vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
        return pretrain_emb

    def forward(self, sent, lengths, verbose = False):

        packed_input = nn.utils.rnn.pack_padded_sequence(sent, lengths.to('cpu'), batch_first = True, enforce_sorted = False)
        x, xlengths = nn.utils.rnn.pad_packed_sequence(packed_input, batch_first = True)

        ## layers
        embedded = self.embedding(x)
        # embedded : [batch size, seq_len, emb dim]
        output, (hidden, cell) = self.lstm(embedded)
        # output : [batch size,seq_len,  2*hidden dim]
        # hidden : [bidirectional * num layers, batch size, hidden dim]  ### if bidirectional == True then multiply by 2 else 1
        # cell : [bidirectional * num layers, batch size, hidden dim]


        output = self.dropout(output)
        logits = self.hidden2tag(output)
         # logits : [ batch size, seq_len, num_class]
        logits = logits.permute(0,2,1)
        # logits :[batch size, num class, seq len]
        # logits = F.log_softmax(logits, dim=1)

        if verbose:
            print(f"Sent : {sent.shape}")
            print(f'length : {lengths.shape}')
            print(f'x : {x.shape}')
            print(f'xlengths : {xlengths.shape}')
            print(f'embedded : {embedded.shape}')
            print(f'output : {output.shape}')
            print(f'hidden : {hidden.shape}')
            print(f'cell : {cell.shape}')
            print(f'logits : {logits.shape}')

        return logits

    def calculate_metrics(self, y_true, y_pred, mask):

        y_true = y_true  * mask
        y_pred = y_pred * mask

        ## metrics
        y_true = y_true.cpu().numpy().tolist()
        y_pred = y_pred.cpu().numpy().tolist()
        y_true_label = [[idx2tag[tag] for tag in sent_tag] for sent_tag in y_true]
        y_pred_label = [[idx2tag[tag] for tag in sent_tag] for sent_tag in y_pred]

        f1_score = metrics.f1_score(y_true_label, y_pred_label)
        precision = metrics.precision_score(y_true_label, y_pred_label)
        recall = metrics.recall_score(y_true_label, y_pred_label)
        return f1_score, precision, recall

    def _shared_step(self, batch):
        sents, tags, lengths = batch['sent'], batch['tag'], batch['lengths']
        mask = (tags != PAD_ID)
         # mask = (y_true != PAD_ID) * (y_true != tag2idx['O'])
        logits = self(sents, lengths)

        loss = self.loss_fn(logits, tags)
        _ , preds = torch.max(logits, dim = 1)

        ## calculate metrics
        f1_score, precision, recall = self.calculate_metrics(preds, tags, mask)
        return loss, f1_score, precision, recall

    def training_step(self, batch):
        loss, f1_score, precision, recall = self._shared_step(batch)
        self.train_f1.append(f1_score)
        self.log_dict({"train_loss": loss, "train_f1": np.mean(self.train_f1)}, on_step = False, on_epoch = True, prog_bar=  True)
        return loss

    def validation_step(self, batch):
        loss, f1_score, precision, recall = self._shared_step(batch)
        self.val_f1.append(f1_score)
        self.val_loss.append(loss.cpu().item())
        self.log_dict({"val_loss": loss, "val_f1": np.mean(self.val_f1)}, on_step = False, on_epoch = True, prog_bar=  True)
        return loss

    def on_training_epoch_end(self):
        self.train_f1 =[]

    def on_validation_epoch_end(self):
        print(f'Epoch : {self.current_epoch} Loss : {np.mean(self.val_loss)} F1 : {np.mean(self.val_f1)}')
        self.val_f1 =[]
        self.val_loss = []

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr = self.learning_rate)
        return optimizer

    def test_step(self, batch, batch_idx):
        sents, tags, lengths = batch['sent'], batch['tag'], batch['lengths']
        mask = (tags != PAD_ID)
        logits = self(sents, lengths)
        _ , preds = torch.max(logits, dim = 1)

        ## calculate metrics
        f1_score, precision, recall = self.calculate_metrics(preds, tags, mask)
        self.test_f1.append(f1_score)
        self.test_precision.append(precision)
        self.test_recall.append(recall)

    def on_test_epoch_end(self):
        print(f'F1 : {np.mean(self.test_f1)} Precision : {np.mean(self.test_precision)} Recall : {np.mean(self.test_recall)}')
        self.test_f1 = []
        self.test_precision = []
        self.test_recall = []

In [None]:
# model= NERModel(vocab_size = len(word2idx),
#                 emb_dim = 100,
#                 hidden_dim = 64,
#                 n_tags = len(tag2idx),
#                 learning_rate = 1e-3,
#                 dropout = 0.3,
#                 bidirectional = True,
#                 n_layers = 2,
#                 use_pretrained=True
#                 )

# logits = model(example['sent'], example['lengths'], verbose = True)
# true_label = example['tag']
# print(f"True label shape : {true_label.shape}")
# loss = model.loss_fn(logits, true_label)
# print(loss)
# _ , pred_label = torch.max(logits, dim = 1)

In [None]:
## Model Training
model= NERModel(vocab_size = len(word2idx),
                emb_dim = 100,
                hidden_dim = 100,
                n_tags = len(tag2idx),
                learning_rate = 1e-3,
                dropout = 0.5,
                bidirectional = True,
                n_layers = 2,
                use_pretrained= True
                )

callbacks = pl.callbacks.ModelCheckpoint(dirpath = "checkpoints_logs",
                                         filename = '{epoch}-{val_loss:.2f}-{val_f1:.2f}',
                                          mode = "min",
                                          monitor = "val_loss",
                                          save_last = True,
                                          save_top_k=-1)


trainer = pl.Trainer(accelerator= "gpu",
           max_epochs=5,
           check_val_every_n_epoch = 1,
           callbacks = [callbacks])

trainer.fit(model, train_dl, val_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /home/saurabh/mydata/checkpoints_logs exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
---------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch : 0 Loss : 2.8791871070861816 F1 : 0.1052684133559239


/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 0 Loss : 0.12112652530272802 F1 : 0.8512130343027688


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 1 Loss : 0.1085385206176175 F1 : 0.8686529266721666


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 2 Loss : 0.09986486341390345 F1 : 0.8730747940696316


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 3 Loss : 0.09947697059147888 F1 : 0.8763026366238288


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch : 4 Loss : 0.10616053546054496 F1 : 0.8730404871427234


In [None]:
## F1 : 0.8752422424553526 Precision : 0.8752619576276566 Recall : 0.8756151814430803
trainer.test(model, dataloaders= test_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

F1 : 0.8752422424553526 Precision : 0.8752619576276566 Recall : 0.8756151814430803


[{}]

## Predict

In [None]:
model = model.eval()

In [None]:
def process_data(text):
    text = text.strip().split(" ")
    lengths = len(text)
    encoded = []
    for w in text:
        encoded.append(word2idx.get(w, PAD_ID))

    text_tensor = torch.tensor(encoded).view(1, -1)
    lengths = torch.tensor([lengths])
    return text_tensor, lengths

In [None]:
i = random.choices(list(range(len(test_sents))))[0]
text = test_sents[i]
true_label = test_tags[i].strip().split(" ")
text_tensor, lengths = process_data(text)
print(text_tensor.shape, lengths.shape)

logits = model(text_tensor, lengths)
_ , preds = torch.max(logits, dim = 1)
preds = preds.numpy()[0]
pred_labels = [idx2tag[p] for p in preds]

for w, p, t in zip(text.split(" "), pred_labels, true_label):
    print(f"{w:<10}  -->  {p:<5} --> {t:<5}")

torch.Size([1, 27]) torch.Size([1])
Chinese     -->  B-gpe --> B-gpe
worker      -->  O     --> O    
sews        -->  O     --> O    
clothing    -->  O     --> O    
at          -->  O     --> O    
a           -->  O     --> O    
garment     -->  O     --> O    
factory     -->  O     --> O    
in          -->  O     --> O    
Beijing     -->  B-geo --> B-geo
China       -->  I-geo --> I-geo
is          -->  O     --> O    
criticizing  -->  O     --> O    
the         -->  O     --> O    
European    -->  B-org --> B-org
Union       -->  I-org --> I-org
's          -->  O     --> O    
decision    -->  O     --> O    
to          -->  O     --> O    
investigate  -->  O     --> O    
surging     -->  O     --> O    
imports     -->  O     --> O    
of          -->  O     --> O    
Chinese     -->  B-gpe --> B-gpe
textile     -->  O     --> O    
products    -->  O     --> O    
.
          -->  O     --> O    
