In [2]:
import pandas as pd
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
    pipeline, TrainingArguments, Trainer, DataCollatorForTokenClassification)
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("ner.csv")
data.dropna(inplace=True)
del data['POS']
del data['Sentence #']
data = data.rename(columns={'Sentence': 'tokens','Tag': 'ner_tags'})

In [4]:
%%capture output
import ast

for i in range(len(data)):
    tags = ast.literal_eval(data['ner_tags'][i])
    data['ner_tags'][i] = [str(word.upper()) for word in tags]

In [5]:
entity_codes = set([val for sublist in data['ner_tags'].values for val in sublist])
entity_codes

{'B-ART',
 'B-EVE',
 'B-GEO',
 'B-GPE',
 'B-NAT',
 'B-ORG',
 'B-PER',
 'B-TIM',
 'I-ART',
 'I-EVE',
 'I-GEO',
 'I-GPE',
 'I-NAT',
 'I-ORG',
 'I-PER',
 'I-TIM',
 'O'}

In [6]:
label2id = {}
dict([ (elem, 0) for elem in entity_codes ])
k=0
for i in entity_codes:
    label2id[i]=k
    k+=1
label2id['UNK']=k 
id2label = {y: x for x, y in label2id.items()}
print(label2id,"\n",id2label)

{'B-PER': 0, 'I-GPE': 1, 'I-EVE': 2, 'B-EVE': 3, 'B-ART': 4, 'B-GEO': 5, 'I-GEO': 6, 'I-ART': 7, 'B-GPE': 8, 'B-NAT': 9, 'O': 10, 'I-PER': 11, 'I-NAT': 12, 'B-TIM': 13, 'B-ORG': 14, 'I-TIM': 15, 'I-ORG': 16, 'UNK': 17} 
 {0: 'B-PER', 1: 'I-GPE', 2: 'I-EVE', 3: 'B-EVE', 4: 'B-ART', 5: 'B-GEO', 6: 'I-GEO', 7: 'I-ART', 8: 'B-GPE', 9: 'B-NAT', 10: 'O', 11: 'I-PER', 12: 'I-NAT', 13: 'B-TIM', 14: 'B-ORG', 15: 'I-TIM', 16: 'I-ORG', 17: 'UNK'}


In [7]:
%%capture output
data['labels'] = data['ner_tags']
for i in range(len(data)):
    data['tokens'][i] = data['tokens'][i].lower().split()
    data['labels'][i] = [label2id[x] for x in data['labels'][i]]
    if len(data['ner_tags'][i]) != len(data['tokens'][i]):
        data.drop([i],inplace=True)
data['labels'] = data['labels'].apply(lambda x: [int(i) for i in x])

In [8]:
from sklearn.model_selection import train_test_split

data_train, data_test = train_test_split(data, test_size=0.2)

In [9]:
import torch
from torchtext.vocab import build_vocab_from_iterator

vocab = build_vocab_from_iterator(data['tokens'], specials=["<unk>"])

X = torch.nn.utils.rnn.pad_sequence([torch.tensor([vocab[y] for y in x]) for x in data_train['tokens']], batch_first=True, padding_value = vocab["<unk>"]).to("cuda:0")
Y = torch.nn.utils.rnn.pad_sequence([torch.tensor(x) for x in data_train['labels'].to_list()], batch_first=True, padding_value = 17).to("cuda:0")

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import torch.nn.functional as F

batch_size=128
dataset = TensorDataset(X, Y)
train_loader = DataLoader(dataset, batch_size = batch_size, shuffle=True)

class LSTMModel(nn.Module):
    def __init__(self, input_dim = len(vocab), embedding_dim = 200, hidden_dim = 200, output_dim = len(label2id)):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim,  batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=True)
        self.lstm3 = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=True)
        self.lstm4 = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm1_out, _ = self.lstm1(embedded)
        lstm2_out, _ = self.lstm2(lstm1_out)
        lstm3_out, _ = self.lstm3(lstm2_out)
        lstm4_out, _ = self.lstm4(lstm3_out)
        tag_space = self.fc(lstm4_out)
        return tag_space

# Инициализация модели, функции потерь и оптимизатора
model = LSTMModel()
model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Обучение модели
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for sentences, tags in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(sentences)
        loss = criterion(outputs.view(-1, len(label2id)), tags.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * batch_size
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss}')

100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:03<00:00,  4.73it/s]


Epoch 1/10, Loss: 3141.937636613846


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:03<00:00,  4.74it/s]


Epoch 2/10, Loss: 982.0556359291077


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:03<00:00,  4.73it/s]


Epoch 3/10, Loss: 692.4617780447006


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:04<00:00,  4.67it/s]


Epoch 4/10, Loss: 532.6275644302368


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:05<00:00,  4.59it/s]


Epoch 5/10, Loss: 427.3578935265541


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:04<00:00,  4.65it/s]


Epoch 6/10, Loss: 353.73111110925674


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:04<00:00,  4.62it/s]


Epoch 7/10, Loss: 302.8136973977089


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:06<00:00,  4.50it/s]


Epoch 8/10, Loss: 273.10426196455956


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:06<00:00,  4.50it/s]


Epoch 9/10, Loss: 243.9323025047779


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:05<00:00,  4.61it/s]

Epoch 10/10, Loss: 214.25425785779953





In [10]:
model.eval()
predicted_labels = []

for i in tqdm(data_test['tokens'].to_list()):
    a = torch.tensor([vocab[y.lower()] for y in i]).to("cuda:0")
    with torch.no_grad():
        outputs = model(a)
    _, predicted = torch.max(outputs, 1)
    predicted_labels.append([id2label[x] for x in predicted.tolist()])


100%|█████████████████████████████████████████████████████████████████████████████| 9591/9591 [00:56<00:00, 168.36it/s]


In [11]:
k=0
for v1,v2 in zip(data_test['ner_tags'].to_list(), predicted_labels):
    if v1==v2:
        k+=1
"{:.2%}".format(k/len(data_test.values))

'57.56%'