# Named Entity Recognition

In [1]:
! pip install seqeval==0.0.10

Collecting seqeval==0.0.10
  Downloading seqeval-0.0.10-py3-none-any.whl.metadata (3.4 kB)
Downloading seqeval-0.0.10-py3-none-any.whl (7.5 kB)
Installing collected packages: seqeval
Successfully installed seqeval-0.0.10


In [2]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report, f1_score
from tqdm import tqdm

## Read and Preproc data

In [3]:
def loading_data(data_path):
    
    data = pd.read_csv(data_path)
    
    data.dropna(inplace=True)
    print("Number of rows : ",data.shape[0]," and the number of columns : ",data.shape[1])
    
    return data

In [4]:
data = loading_data("/kaggle/input/named-entity-recognition-ner-corpus/ner.csv")

Number of rows :  47959  and the number of columns :  4


In [5]:
data.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [6]:
def preproc(data):
    data['POS'] = data['POS'].apply(lambda x: eval(x))
    data['Tag'] = data['Tag'].apply(lambda x: eval(x))
    
    return data

In [7]:
data = preproc(data)

In [8]:
data.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","[NNS, VBD, DT, NN, IN, NNS, IN, CD, IN, NNS, V...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,Sentence: 5,The protest comes on the eve of the annual con...,"[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,..."


In [9]:
X = data['Sentence'].values
y_pos = data['POS'].values
y_tag = data['Tag'].values

## Распознование именнованных сущностей

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y_tag,test_size=0.2,random_state=42)
print("Train Data size:", len(X_train))
print("Test Data size", len(X_test))

Train Data size: 38367
Test Data size 9592


In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter

In [12]:
class MyTokenizer():
    def __init__(self, max_words):
        self.max_words = max_words
    
    def fit(self, texts):
        words = Counter()
        for text in texts:
            for word in text.split():
                words[word] += 1
        top_words = sorted(words.items(), key=lambda item: item[1], reverse=True)[:self.max_words]
        
        self.vocab = {word[0]: i+1 for i, word in enumerate(top_words)}
        self.vocab['<PAD>'] = 0
        self.vocab['<UNK>'] = self.max_words+1
        self.decode_vocab = {v: k for k, v in self.vocab.items()}

    def token_to_id(self, token):
        if token in self.vocab:
            return self.vocab[token]
        else:
            return self.vocab['<UNK>']
    
    def encode(self, text):
        arr = []
        for word in text.split():
            arr.append(self.token_to_id(word))
        return arr
    
    def decode(self, sequence):
        arr = []
        for token in sequence:
            if token in self.decode_vocab:
                arr.append(self.decode_vocab[token])
            else:
                arr.append('<UNK>')
        return ' '.join(arr)

In [13]:
tokenizer = MyTokenizer(25000)

In [14]:
tokenizer.fit(X_train)

In [15]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, tag2idx, max_len=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
        tags = set([val for sublist in self.labels for val in sublist])
        self.tag2idx = tag2idx

    def __len__(self):
        return len(self.texts)
    
    def encode_labels(self, labels):
        arr = []
        for label in labels:
            arr.append(self.tag2idx[label])
        return arr

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoded_text = self.tokenizer.encode(text)
        encoded_label = self.encode_labels(label)
        
        if len(encoded_text) > self.max_len:
            encoded_text = encoded_text[:self.max_len]
            encoded_label = encoded_label[:self.max_len]
        else: 
            encoded_text += [self.tokenizer.token_to_id("<PAD>")] * (self.max_len - len(encoded_text))
            encoded_label += [self.tag2idx['O']] * (self.max_len - len(encoded_label))
        
        return torch.tensor(encoded_text), torch.tensor(encoded_label)

In [16]:
tags = set([val for sublist in y_train for val in sublist])
tag2idx = {k: v for v,k in enumerate(tags)}
idx2tag = {v: k for k, v in tag2idx.items()}
tag2idx

{'B-geo': 0,
 'B-art': 1,
 'B-eve': 2,
 'I-art': 3,
 'O': 4,
 'I-geo': 5,
 'I-org': 6,
 'B-gpe': 7,
 'B-tim': 8,
 'B-per': 9,
 'I-eve': 10,
 'B-org': 11,
 'I-tim': 12,
 'B-nat': 13,
 'I-nat': 14,
 'I-gpe': 15,
 'I-per': 16}

In [17]:
train_dataset = TextDataset(X_train, y_train, tokenizer, tag2idx)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True)

test_dataset = TextDataset(X_test, y_test, tokenizer, tag2idx)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True, drop_last=True)

In [18]:
train_dataset[14931]

(tensor([12600,    23,     1,    24,   977,   200,  2178,    10, 23702, 23703,
          2590,    28,  7644,     1,  5968,     8,    25,    35,   336,  1987,
           669, 16575,   436,     5,  3542,    22,   308,   429,     2,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 tensor([ 4,  4,  4,  0,  4,  4, 11,  4, 11,  6,  6,  4,  4,  4,  4,  4,  4,  4,
          4,  4,  4,  4,  4,  4,  0,  4,  8, 12,  4,  4,  4,  4,  4,  4,  4,  4,
          4,  4,  4,  4,  4,  4,  4,  4,  4,  

In [19]:
class NER_LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1000, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, output_dim)

    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        return self.fc(output)

In [20]:
INPUT_DIM = len(tokenizer.vocab)+1
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = len(train_dataset.tag2idx)
N_LAYERS = 2
PAD_IDX = tokenizer.token_to_id("<PAD>")
UNDEF_IDX = tag2idx['O']

model_lstm = NER_LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, PAD_IDX)

In [21]:
model_lstm

NER_LSTM(
  (embedding): Embedding(26003, 100, padding_idx=0)
  (lstm): LSTM(100, 256, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=17, bias=True)
)

In [22]:
myit = iter(train_dataloader)
t = next(myit)

In [23]:
lengths = torch.tensor([100 for _ in range(16)])
a, b = t
predictions = model_lstm(a, lengths)

In [24]:
predictions.argmax(dim=-1)

tensor([[13, 13, 13,  ...,  2, 13, 13],
        [ 2,  2,  2,  ...,  2, 13, 13],
        [ 3,  3,  3,  ...,  2, 13, 13],
        ...,
        [13, 13, 13,  ...,  2, 13, 13],
        [13, 13, 13,  ...,  2, 13, 13],
        [ 2,  2,  2,  ...,  2, 13, 13]])

In [25]:
model_lstm

NER_LSTM(
  (embedding): Embedding(26003, 100, padding_idx=0)
  (lstm): LSTM(100, 256, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=17, bias=True)
)

In [26]:
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    lengths = torch.tensor([100 for _ in range(16)])
    for batch in tqdm(dataloader):
        texts, labels = batch
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(texts, lengths)
        loss = criterion(predictions.transpose(-1, -2), labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

In [27]:
UNDEF_IDX

4

In [28]:
idx2tag

{0: 'B-geo',
 1: 'B-art',
 2: 'B-eve',
 3: 'I-art',
 4: 'O',
 5: 'I-geo',
 6: 'I-org',
 7: 'B-gpe',
 8: 'B-tim',
 9: 'B-per',
 10: 'I-eve',
 11: 'B-org',
 12: 'I-tim',
 13: 'B-nat',
 14: 'I-nat',
 15: 'I-gpe',
 16: 'I-per'}

In [29]:
def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    lengths = torch.tensor([100 for _ in range(16)])
    with torch.no_grad():
        for batch in tqdm(dataloader):
            texts, labels = batch
            texts, labels = texts.to(device), labels.to(device)
            predictions = model(texts, lengths)
            
            loss = criterion(predictions.transpose(-1, -2), labels)
            epoch_loss += loss.item()
            
            predictions = predictions.argmax(dim=-1).cpu().numpy()
            labels = labels.cpu().numpy()
            for pred, label in zip(predictions, labels):
                all_preds.append([idx2tag[p] for p in pred ])
                all_labels.append([idx2tag[l] for l in label])
    f1 = f1_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds)
    
    return epoch_loss / len(dataloader), f1, report

In [30]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [31]:
optimizer = optim.Adam(model_lstm.parameters())
criterion = nn.CrossEntropyLoss()
model_lstm = model_lstm.to(device)
criterion = criterion.to(device)

In [32]:
max(list(tokenizer.vocab.values()))

25001

In [33]:
test_loss, f1, report = evaluate_model(model_lstm, test_dataloader, criterion, device)

100%|██████████| 599/599 [00:05<00:00, 111.09it/s]


In [34]:
print(report)

           precision    recall  f1-score   support

      geo       0.00      0.00      0.00      7658
      tim       0.03      0.00      0.00      4044
      org       0.00      0.00      0.00      3904
      per       0.00      0.01      0.01      3386
      gpe       0.01      0.03      0.01      3170
      nat       0.00      0.22      0.00        50
      eve       0.00      0.13      0.00        60
      art       0.00      0.00      0.00        86

micro avg       0.00      0.01      0.00     22358
macro avg       0.01      0.01      0.00     22358



In [35]:
NUM_EPOCHS = 4
for epoch in range(NUM_EPOCHS):
    train_loss = train_model(model_lstm, train_dataloader, optimizer, criterion, device)
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Training Loss: {train_loss:.4f}')

    test_loss, f1, report = evaluate_model(model_lstm, test_dataloader, criterion, device)
    print(f'Test Loss: {test_loss:.4f}, Test f1: {f1:.4f}')
    print(report)

100%|██████████| 2397/2397 [00:42<00:00, 56.62it/s]


Epoch 1/4, Training Loss: 0.0600


100%|██████████| 599/599 [00:04<00:00, 122.26it/s]


Test Loss: 0.0298, Test f1: 0.7753
           precision    recall  f1-score   support

      geo       0.84      0.84      0.84      7658
      tim       0.83      0.80      0.82      4046
      per       0.64      0.67      0.66      3386
      gpe       0.93      0.91      0.92      3172
      art       0.00      0.00      0.00        86
      org       0.67      0.55      0.60      3910
      nat       0.00      0.00      0.00        50
      eve       0.67      0.13      0.22        60

micro avg       0.79      0.76      0.78     22368
macro avg       0.79      0.76      0.77     22368



100%|██████████| 2397/2397 [00:42<00:00, 56.49it/s]


Epoch 2/4, Training Loss: 0.0231


100%|██████████| 599/599 [00:04<00:00, 121.38it/s]


Test Loss: 0.0243, Test f1: 0.8037
           precision    recall  f1-score   support

      org       0.68      0.60      0.64      3910
      geo       0.85      0.87      0.86      7660
      gpe       0.95      0.93      0.94      3171
      tim       0.85      0.85      0.85      4046
      per       0.69      0.72      0.71      3383
      eve       0.28      0.18      0.22        60
      art       0.00      0.00      0.00        86
      nat       0.26      0.12      0.16        50

micro avg       0.81      0.80      0.80     22366
macro avg       0.81      0.80      0.80     22366



100%|██████████| 2397/2397 [00:43<00:00, 55.73it/s]


Epoch 3/4, Training Loss: 0.0156


100%|██████████| 599/599 [00:04<00:00, 121.68it/s]


Test Loss: 0.0247, Test f1: 0.8085
           precision    recall  f1-score   support

      geo       0.86      0.87      0.86      7659
      gpe       0.95      0.93      0.94      3172
      org       0.66      0.65      0.65      3913
      tim       0.88      0.83      0.85      4043
      per       0.73      0.70      0.71      3387
      nat       0.50      0.24      0.32        50
      eve       0.34      0.20      0.25        60
      art       0.00      0.00      0.00        86

micro avg       0.82      0.80      0.81     22370
macro avg       0.81      0.80      0.81     22370



100%|██████████| 2397/2397 [00:42<00:00, 55.98it/s]


Epoch 4/4, Training Loss: 0.0105


100%|██████████| 599/599 [00:05<00:00, 119.29it/s]


Test Loss: 0.0263, Test f1: 0.8058
           precision    recall  f1-score   support

      gpe       0.96      0.92      0.94      3173
      geo       0.83      0.88      0.86      7662
      tim       0.85      0.84      0.85      4047
      org       0.67      0.63      0.65      3913
      per       0.71      0.72      0.71      3386
      nat       0.62      0.26      0.37        50
      art       0.13      0.07      0.09        86
      eve       0.31      0.20      0.24        60

micro avg       0.81      0.81      0.81     22377
macro avg       0.80      0.81      0.80     22377



## 

#### BERT

In [36]:
from transformers import BertTokenizer, BertForTokenClassification

In [37]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=OUTPUT_DIM)  # Set NUM_LABELS according to your dataset


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
class BERT_CLF(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.lin1 = nn.Linear(input_dim, input_dim)
        self.lin2 = nn.Linear(input_dim, output_dim)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, seq):
        output = self.lin2(self.dropout(self.lin1(seq)))
        return output

In [39]:
bert_clf = BERT_CLF(768,17)
bert_model.classifier=bert_clf

In [40]:
bert_model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [41]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence.split(), text_labels):
        tokenized_word = bert_tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [42]:
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(X, y_tag)]

In [43]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [44]:
import tensorflow as tf

2024-07-12 14:00:17.221128: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-12 14:00:17.221239: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-12 14:00:17.398506: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [45]:
MAX_LEN = 100

In [46]:
input_ids = tf.keras.utils.pad_sequences([bert_tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen=MAX_LEN, dtype="long", value=0.0, truncating="post", padding="post")
tags = tf.keras.utils.pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels], maxlen=MAX_LEN, value=tag2idx["O"], padding="post", dtype="long", truncating="post")

In [47]:
X_train, X_test, y_train, y_test = train_test_split(input_ids, tags,test_size=0.2,random_state=42)
print("Train Data size:", len(X_train))
print("Test Data size", len(X_test))

Train Data size: 38367
Test Data size 9592


In [48]:
class BertDataset(Dataset):
    def __init__(self, inputs_ids, labels):
        super(BertDataset, self).__init__()
        
        self.texts = inputs_ids
        self.target = labels
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        
        inputs = self.texts[index]
        
        mask = [float(i != 0.0) for i in inputs]

        return {
            'ids': torch.tensor(inputs, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'target': torch.tensor(self.target[index], dtype=torch.long)
            }

    

bert_dataset_train = BertDataset(X_train, y_train)
bert_dataset_test = BertDataset(X_test, y_test)

bert_dataloader_train = DataLoader(dataset=bert_dataset_train,batch_size=16)
bert_dataloader_test = DataLoader(dataset=bert_dataset_test,batch_size=16)

In [49]:
bert_dataset_train[0]

{'ids': tensor([ 1996,  5388,  1011,  2095,  1011,  2214,  2280, 12941,  2758,  2002,
          3024,  2592,  2000,  2019,  2880,  2012,  1996,  5611,  8408,  1998,
          2000,  2048,  2372,  1997,  1037, 19670,  2177,  2170,  1996,  2137,
          3956,  2270,  3821,  2837,  1012,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0

In [50]:
bert_model = bert_model.to(device)

In [51]:
for param in bert_model.bert.parameters():
    param.requires_grad = False

In [52]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [53]:
NUM_EPOCHS = 10
optimizer_bert = AdamW(bert_model.parameters(), lr=3e-5, eps=1e-8)
total_steps = len(train_dataloader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer_bert, num_warmup_steps=0, num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss()



In [54]:
def train_model_bert(model, dataloader, optimizer, device):
    model.train()
    epoch_loss = 0
    for batch in tqdm(dataloader):
        b_input_ids = batch['ids']
        b_masks = batch['mask']
        b_labels = batch['target']
        
        outputs = model(b_input_ids.to(device), token_type_ids=None, attention_mask=b_masks.to(device), labels=b_labels.to(device))
        loss = criterion(outputs[1].transpose(-1, -2), b_labels.to(device))
        loss.backward()
        
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(dataloader)

In [55]:
def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels 
      
    # Get predictions from the logits
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens)
    str_labels = [
        [label_names[t] for t in label if t!=-100] for label in labels
    ]

    str_preds = [
        [label_names[p] for (p, t) in zip(prediction, label) if t != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Compute metrics
    results = metric.compute(predictions=str_preds, references=str_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"], 
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]  
    }

In [56]:
def evaluate_model_bert(model, dataloader, device):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    lengths = torch.tensor([100 for _ in range(16)])
    with torch.no_grad():
        for batch in tqdm(dataloader):
            b_input_ids = batch['ids']
            b_masks = batch['mask']
            b_labels = batch['target']

            outputs = model(b_input_ids.to(device), token_type_ids=None, attention_mask=b_masks.to(device), labels=b_labels.to(device))
            
            loss = criterion(outputs[1].transpose(-1, -2), b_labels.to(device))
            epoch_loss += loss.item()
            
            predictions = outputs[1].argmax(dim=-1).cpu().numpy()
            labels = b_labels.cpu().numpy()
            
            for pred, label in zip(predictions, labels):
                all_preds.append([idx2tag[p] for p in pred ])
                all_labels.append([idx2tag[l] for l in label])
    
    f1 = f1_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds)
    
    return epoch_loss / len(dataloader), f1, report

In [57]:
myit = iter(bert_dataloader_train)
batch = next(myit)
b_input_ids = batch['ids']
b_masks = batch['mask']
b_labels = batch['target']

In [58]:
outputs = bert_model(b_input_ids.to(device), token_type_ids=None, attention_mask=b_masks.to(device), labels=b_labels.to(device))

In [59]:
outputs[1].argmax(dim=-1)

tensor([[ 3,  9,  9,  ...,  8,  8,  8],
        [ 6,  1,  8,  ...,  9,  9, 11],
        [ 4,  9,  4,  ...,  4,  4,  4],
        ...,
        [ 8,  8,  9,  ...,  8,  3,  9],
        [ 4,  3,  9,  ...,  9,  9,  8],
        [ 0,  4,  4,  ...,  4,  4,  2]], device='cuda:0')

In [60]:
for epoch in range(NUM_EPOCHS):
    train_loss = train_model_bert(bert_model, bert_dataloader_train, optimizer_bert, device)
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Training Loss: {train_loss:.4f}')

    test_loss, f1, report = evaluate_model_bert(bert_model, bert_dataloader_test, device)
    print(f'Test Loss: {test_loss:.4f}, Test f1: {f1:.4f}')
    print(report)

100%|██████████| 2398/2398 [03:48<00:00, 10.49it/s]


Epoch 1/10, Training Loss: 0.2385


100%|██████████| 600/600 [00:55<00:00, 10.80it/s]


Test Loss: 0.1087, Test f1: 0.4285
           precision    recall  f1-score   support

      per       0.33      0.27      0.30      5186
      art       0.00      0.00      0.00       146
      geo       0.51      0.68      0.58     11526
      org       0.31      0.14      0.19      6390
      tim       0.53      0.31      0.39      4380
      gpe       0.75      0.23      0.35      3453
      nat       0.00      0.00      0.00        98
      eve       0.00      0.00      0.00        68

micro avg       0.47      0.39      0.43     31247
macro avg       0.46      0.39      0.40     31247



100%|██████████| 2398/2398 [03:48<00:00, 10.49it/s]


Epoch 2/10, Training Loss: 0.1170


100%|██████████| 600/600 [00:55<00:00, 10.79it/s]


Test Loss: 0.0883, Test f1: 0.5387
           precision    recall  f1-score   support

      per       0.44      0.43      0.44      5186
      art       0.00      0.00      0.00       146
      geo       0.59      0.73      0.65     11526
      org       0.37      0.24      0.29      6390
      tim       0.63      0.55      0.59      4380
      gpe       0.71      0.57      0.63      3453
      nat       0.00      0.00      0.00        98
      eve       0.00      0.00      0.00        68

micro avg       0.55      0.53      0.54     31247
macro avg       0.53      0.53      0.52     31247



100%|██████████| 2398/2398 [03:48<00:00, 10.47it/s]


Epoch 3/10, Training Loss: 0.1057


100%|██████████| 600/600 [00:55<00:00, 10.80it/s]


Test Loss: 0.0823, Test f1: 0.5691
           precision    recall  f1-score   support

      per       0.49      0.48      0.48      5186
      art       0.00      0.00      0.00       146
      geo       0.62      0.74      0.67     11526
      org       0.40      0.27      0.32      6390
      tim       0.66      0.59      0.63      4380
      gpe       0.73      0.61      0.67      3453
      nat       0.00      0.00      0.00        98
      eve       0.00      0.00      0.00        68

micro avg       0.58      0.56      0.57     31247
macro avg       0.57      0.56      0.56     31247



100%|██████████| 2398/2398 [03:49<00:00, 10.46it/s]


Epoch 4/10, Training Loss: 0.1017


100%|██████████| 600/600 [00:55<00:00, 10.85it/s]


Test Loss: 0.0798, Test f1: 0.5805
           precision    recall  f1-score   support

      per       0.50      0.49      0.49      5186
      art       0.00      0.00      0.00       146
      geo       0.63      0.74      0.68     11526
      org       0.41      0.28      0.33      6390
      tim       0.68      0.61      0.64      4380
      gpe       0.74      0.64      0.68      3453
      nat       0.00      0.00      0.00        98
      eve       0.00      0.00      0.00        68

micro avg       0.59      0.57      0.58     31247
macro avg       0.58      0.57      0.57     31247



100%|██████████| 2398/2398 [03:48<00:00, 10.48it/s]


Epoch 5/10, Training Loss: 0.0999


100%|██████████| 600/600 [00:55<00:00, 10.72it/s]


Test Loss: 0.0783, Test f1: 0.5878
           precision    recall  f1-score   support

      per       0.50      0.50      0.50      5186
      art       0.00      0.00      0.00       146
      geo       0.64      0.75      0.69     11526
      org       0.42      0.30      0.35      6390
      tim       0.69      0.62      0.65      4380
      gpe       0.74      0.65      0.69      3453
      nat       0.00      0.00      0.00        98
      eve       0.00      0.00      0.00        68

micro avg       0.60      0.58      0.59     31247
macro avg       0.58      0.58      0.58     31247



100%|██████████| 2398/2398 [03:48<00:00, 10.48it/s]


Epoch 6/10, Training Loss: 0.0987


100%|██████████| 600/600 [00:55<00:00, 10.81it/s]


Test Loss: 0.0774, Test f1: 0.5945
           precision    recall  f1-score   support

      per       0.50      0.52      0.51      5186
      art       0.00      0.00      0.00       146
      geo       0.65      0.75      0.69     11526
      org       0.42      0.30      0.35      6390
      tim       0.70      0.62      0.66      4380
      gpe       0.74      0.66      0.70      3453
      nat       0.00      0.00      0.00        98
      eve       0.00      0.00      0.00        68

micro avg       0.60      0.59      0.59     31247
macro avg       0.59      0.59      0.58     31247



100%|██████████| 2398/2398 [03:49<00:00, 10.47it/s]


Epoch 7/10, Training Loss: 0.0978


100%|██████████| 600/600 [00:55<00:00, 10.83it/s]


Test Loss: 0.0770, Test f1: 0.5968
           precision    recall  f1-score   support

      per       0.51      0.53      0.52      5186
      art       0.00      0.00      0.00       146
      geo       0.65      0.75      0.69     11526
      org       0.42      0.31      0.36      6390
      tim       0.70      0.63      0.66      4380
      gpe       0.74      0.66      0.70      3453
      nat       0.00      0.00      0.00        98
      eve       0.00      0.00      0.00        68

micro avg       0.60      0.59      0.60     31247
macro avg       0.59      0.59      0.59     31247



100%|██████████| 2398/2398 [03:48<00:00, 10.49it/s]


Epoch 8/10, Training Loss: 0.0974


100%|██████████| 600/600 [00:55<00:00, 10.80it/s]


Test Loss: 0.0766, Test f1: 0.5993
           precision    recall  f1-score   support

      per       0.51      0.53      0.52      5186
      art       0.00      0.00      0.00       146
      geo       0.65      0.75      0.70     11526
      org       0.42      0.32      0.37      6390
      tim       0.70      0.63      0.66      4380
      gpe       0.74      0.66      0.70      3453
      nat       0.00      0.00      0.00        98
      eve       0.33      0.01      0.03        68

micro avg       0.61      0.59      0.60     31247
macro avg       0.59      0.59      0.59     31247



100%|██████████| 2398/2398 [03:48<00:00, 10.48it/s]


Epoch 9/10, Training Loss: 0.0967


100%|██████████| 600/600 [00:55<00:00, 10.82it/s]


Test Loss: 0.0762, Test f1: 0.5996
           precision    recall  f1-score   support

      per       0.51      0.53      0.52      5186
      art       0.00      0.00      0.00       146
      geo       0.66      0.75      0.70     11526
      org       0.42      0.33      0.37      6390
      tim       0.71      0.63      0.67      4380
      gpe       0.74      0.67      0.70      3453
      nat       0.00      0.00      0.00        98
      eve       0.33      0.01      0.03        68

micro avg       0.61      0.59      0.60     31247
macro avg       0.60      0.59      0.59     31247



100%|██████████| 2398/2398 [03:48<00:00, 10.49it/s]


Epoch 10/10, Training Loss: 0.0967


100%|██████████| 600/600 [00:55<00:00, 10.81it/s]


Test Loss: 0.0761, Test f1: 0.6001
           precision    recall  f1-score   support

      per       0.52      0.53      0.52      5186
      art       0.00      0.00      0.00       146
      geo       0.66      0.74      0.70     11526
      org       0.42      0.33      0.37      6390
      tim       0.71      0.63      0.67      4380
      gpe       0.74      0.67      0.70      3453
      nat       0.00      0.00      0.00        98
      eve       0.33      0.01      0.03        68

micro avg       0.61      0.59      0.60     31247
macro avg       0.60      0.59      0.59     31247

