<a href="https://colab.research.google.com/github/ducpc99/RNNs_for_Named_Entity_Recognition/blob/main/RNNs_for_Named_Entity_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def load_conll2003_data(file_path):
    sentences, tags = [], []
    sentence, sentence_tags = [], []
    with open(file_path, 'r') as f:
        for line in f:
            if line.strip() == "":
                if sentence:
                    sentences.append(sentence)
                    tags.append(sentence_tags)
                    sentence, sentence_tags = [], []
            else:
                word, _, _, tag = line.split()
                sentence.append(word)
                sentence_tags.append(tag)
    return sentences, tags

train_sentences, train_tags = load_conll2003_data('/content/drive/MyDrive/Colab Notebooks/Deeplearning/Data/Named-Entity/train.txt')
test_sentences, test_tags = load_conll2003_data('/content/drive/MyDrive/Colab Notebooks/Deeplearning/Data/Named-Entity/test.txt')

In [4]:
def build_vocab(data):
    vocab = {"": 0, "": 1}
    for sentence in data:
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab

word_vocab = build_vocab(train_sentences)
tag_vocab = build_vocab(train_tags)
tag_reverse_vocab = {v: k for k, v in tag_vocab.items()}

def encode_data(sentences, tags, word_vocab, tag_vocab, max_len=50):
    encoded_sentences, encoded_tags = [], []
    for sent, tag in zip(sentences, tags):
        encoded_sentences.append([word_vocab.get(w, 1) for w in sent][:max_len] + [0] * (max_len - len(sent)))
        encoded_tags.append([tag_vocab[t] for t in tag][:max_len] + [0] * (max_len - len(tag)))
    return np.array(encoded_sentences), np.array(encoded_tags)

train_inputs, train_labels = encode_data(train_sentences, train_tags, word_vocab, tag_vocab)
test_inputs, test_labels = encode_data(test_sentences, test_tags, word_vocab, tag_vocab)

In [5]:
class NERDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return torch.tensor(self.inputs[idx]), torch.tensor(self.labels[idx])

train_dataset = NERDataset(train_inputs, train_labels)
test_dataset = NERDataset(test_inputs, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

class RNNs_NER(nn.Module):
    def __init__(self, vocab_size, tag_size, embedding_dim, hidden_dim):
        super(RNNs_NER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tag_size)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x

In [6]:
vocab_size = len(word_vocab)
tag_size = len(tag_vocab)
model = RNNs_NER(vocab_size, tag_size, embedding_dim=100, hidden_dim=128)

criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

def evaluate_model(model, data_loader, tag_reverse_vocab):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in data_loader:
            outputs = model(inputs)
            preds = outputs.argmax(dim=-1).view(-1).cpu().numpy()
            labels = labels.view(-1).cpu().numpy()
            mask = labels != 0
            all_preds.extend(preds[mask])
            all_labels.extend(labels[mask])

    unique_labels = list(set(all_labels))

    target_names = [tag_reverse_vocab[label] for label in unique_labels if label in tag_reverse_vocab]

    print(classification_report(all_labels, all_preds, target_names=target_names, labels=unique_labels))

def train_model(model, data_loader, criterion, optimizer, num_epochs=30):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, labels in tqdm(data_loader):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(data_loader)}")

In [7]:
train_model(model, train_loader, criterion, optimizer)

100%|██████████| 469/469 [00:20<00:00, 22.87it/s]


Epoch 1, Loss: 0.2399472160252935


100%|██████████| 469/469 [00:19<00:00, 24.27it/s]


Epoch 2, Loss: 0.11619721662038679


100%|██████████| 469/469 [00:18<00:00, 25.35it/s]


Epoch 3, Loss: 0.07554435961123214


100%|██████████| 469/469 [00:18<00:00, 25.36it/s]


Epoch 4, Loss: 0.05127805513518451


100%|██████████| 469/469 [00:18<00:00, 25.98it/s]


Epoch 5, Loss: 0.035656885194308215


100%|██████████| 469/469 [00:19<00:00, 24.20it/s]


Epoch 6, Loss: 0.024763380953752155


100%|██████████| 469/469 [00:18<00:00, 25.87it/s]


Epoch 7, Loss: 0.01723615110301768


100%|██████████| 469/469 [00:18<00:00, 25.79it/s]


Epoch 8, Loss: 0.012004453826622208


100%|██████████| 469/469 [00:19<00:00, 23.78it/s]


Epoch 9, Loss: 0.008263017616367765


100%|██████████| 469/469 [00:19<00:00, 24.30it/s]


Epoch 10, Loss: 0.005844222363218594


100%|██████████| 469/469 [00:18<00:00, 25.75it/s]


Epoch 11, Loss: 0.004281552743662133


100%|██████████| 469/469 [00:18<00:00, 25.80it/s]


Epoch 12, Loss: 0.0032225267953926457


100%|██████████| 469/469 [00:18<00:00, 25.06it/s]


Epoch 13, Loss: 0.002542340305203429


100%|██████████| 469/469 [00:19<00:00, 24.46it/s]


Epoch 14, Loss: 0.00214735919265012


100%|██████████| 469/469 [00:18<00:00, 25.84it/s]


Epoch 15, Loss: 0.001982301802385344


100%|██████████| 469/469 [00:18<00:00, 25.48it/s]


Epoch 16, Loss: 0.0018900798013516222


100%|██████████| 469/469 [00:18<00:00, 25.14it/s]


Epoch 17, Loss: 0.0018384496290817546


100%|██████████| 469/469 [00:18<00:00, 24.81it/s]


Epoch 18, Loss: 0.0016784007882239915


100%|██████████| 469/469 [00:18<00:00, 25.77it/s]


Epoch 19, Loss: 0.001726276150316686


100%|██████████| 469/469 [00:17<00:00, 26.19it/s]


Epoch 20, Loss: 0.0015055545704650394


100%|██████████| 469/469 [00:18<00:00, 25.45it/s]


Epoch 21, Loss: 0.0014832711253230629


100%|██████████| 469/469 [00:19<00:00, 24.25it/s]


Epoch 22, Loss: 0.0014501084693612457


100%|██████████| 469/469 [00:18<00:00, 25.79it/s]


Epoch 23, Loss: 0.0014056410268582066


100%|██████████| 469/469 [00:18<00:00, 25.98it/s]


Epoch 24, Loss: 0.001348549639739442


100%|██████████| 469/469 [00:18<00:00, 25.03it/s]


Epoch 25, Loss: 0.0013483718045687958


100%|██████████| 469/469 [00:18<00:00, 24.72it/s]


Epoch 26, Loss: 0.0013320337779686733


100%|██████████| 469/469 [00:18<00:00, 25.42it/s]


Epoch 27, Loss: 0.0014096482243322306


100%|██████████| 469/469 [00:18<00:00, 24.89it/s]


Epoch 28, Loss: 0.002674466759678342


100%|██████████| 469/469 [00:18<00:00, 24.79it/s]


Epoch 29, Loss: 0.0014446133466106458


100%|██████████| 469/469 [00:18<00:00, 25.21it/s]

Epoch 30, Loss: 0.0012652704427464627





In [8]:
evaluate_model(model, test_loader, tag_reverse_vocab)

              precision    recall  f1-score   support

           O       0.93      0.99      0.96     38378
       B-ORG       0.79      0.57      0.66      1658
      B-MISC       0.72      0.65      0.68       701
       B-PER       0.86      0.46      0.60      1580
       I-PER       0.93      0.25      0.40      1111
       B-LOC       0.84      0.77      0.80      1656
       I-ORG       0.87      0.52      0.65       827
      I-MISC       0.65      0.54      0.59       216
       I-LOC       0.72      0.58      0.64       255

    accuracy                           0.92     46382
   macro avg       0.81      0.59      0.67     46382
weighted avg       0.91      0.92      0.91     46382

