In [48]:
%%capture
!pip install razdel
!pip install compress-fasttext

In [3]:
from typing import Dict, List, Tuple, Union, Callable, DefaultDict, Optional
from datasets import load_dataset
from razdel import tokenize
import string
from tqdm import tqdm
from sklearn.metrics import f1_score
import compress_fasttext
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from random import random



In [4]:
# Hyper-parameters and run configuration shared by the cells below.
# NOTE(review): `truncate` is not referenced in any visible cell; the sentence
# length actually applied comes from the default `padding` of pad_sentence.
truncate = 25
batch_size = 32  # batch size of both DataLoaders
embedding_dim = 300  # passed as input_size of the recurrent layer
hidden_size = 128  # hidden size of the recurrent layer (per direction)
num_layers = 1  # number of stacked recurrent layers
dropout = 0  # dropout passed to the recurrent layer
bidirectional = True  # run the recurrent layer in both directions
lr = 0.001  # Adam learning rate
weight_decay = 0  # Adam weight decay
amsgrad = False  # Adam amsgrad flag
clip_grad_norm = 0.1  # max_norm used for gradient clipping in train_epoch
n_epoch = 10  # number of train/validate passes
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
def to_numpy(tensor):
    """Return the tensor's data as a NumPy array on the CPU.

    Tensors that require grad are detached first, because ``.numpy()`` cannot
    be called on a tensor that is still part of the autograd graph.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()
def func(element):
    """Remove the 'relations' and 'links' entries from a sample and return it.

    The sample is modified in place and the same object is returned.
    """
    for unwanted_key in ('relations', 'links'):
        del element[unwanted_key]
    return element
def get_ent_data(ent):
    """Split one tab-separated entity record into its parts.

    The second tab field is split on spaces into the entity type and two
    offset fields; the third tab field is the entity text.

    Returns:
        (words, entity, beg, end) -- all four values are strings.
    """
    fields = ent.split('\t')
    header = fields[1].split(' ')
    return fields[2], header[0], header[1], header[2]

def get_tokens(s):
    """Tokenize the lower-cased string and return the kept token texts.

    A token is dropped when its text occurs inside the ignore string, which is
    ``string.punctuation`` without '.', followed by '—«»ъ'. The full stop is
    therefore kept as a token.
    """
    ignored = string.punctuation.replace('.', '')+'—«»ъ'
    return [token.text for token in tokenize(s.lower()) if token.text not in ignored]

def _iter_offset_tokens(text):
    """Yield tokens (objects with .start/.stop/.text) of the lower-cased text.

    Applies the same filter as get_tokens, but keeps the token objects because
    the character offsets are needed to align tokens with entity spans.
    """
    ignored = string.punctuation.replace('.', '')+'—«»ъ'
    for token in tokenize(text.lower()):
        if token.text not in ignored:
            yield token


def _entity_pieces(raw_entities):
    """Expand entity records into (word, tag, start, stop) pieces.

    A single-word entity gives one piece tagged with the bare entity name and
    both offsets. A multi-word entity gives one piece per word: the first is
    tagged 'B_<name>' and carries the start offset, the last is tagged
    'E_<name>' and carries the end offset, the ones in between are tagged
    'M_<name>'. An offset of -1 means "not known for this piece".
    Records whose offsets are not plain integers are skipped.
    """
    pieces = []
    for ent in raw_entities:
        words, entity_name, beg, end = get_ent_data(ent)
        try:
            beg, end = int(beg), int(end)
        except ValueError:
            continue
        words_list = words.split(' ')
        if len(words_list) == 1:
            pieces.append((words, entity_name, beg, end))
            continue
        last = len(words_list) - 1
        for i, the_word in enumerate(words_list):
            if i == 0:
                pieces.append((the_word, 'B_'+entity_name, beg, -1))
            elif i == last:
                pieces.append((the_word, 'E_'+entity_name, -1, end))
            else:
                pieces.append((the_word, 'M_'+entity_name, -1, -1))
    return pieces


def _to_bio(tagged):
    """Turn (word, raw_tag_or_None) pairs into (word, B_/I_/O tag) pairs."""
    # Pass 1: an untagged token right after a B_/M_ token continues that entity.
    for idx in range(1, len(tagged)):
        previous_tag = tagged[idx-1][1]
        if (tagged[idx][1] is None and previous_tag is not None
                and previous_tag[:2] in ('B_', 'M_')):
            tagged[idx] = (tagged[idx][0], 'M_'+previous_tag[2:])
    # Pass 2: map the intermediate tags onto the final scheme.
    converted = []
    for word, tag in tagged:
        if tag is None:
            converted.append((word, 'O'))
        elif tag[:2] == 'B_':
            converted.append((word, tag))
        elif tag[:2] in ('M_', 'E_'):
            converted.append((word, 'I_'+tag[2:]))
        else:
            # bare entity name: a single-word entity
            converted.append((word, 'B_'+tag))
    return converted


def preprocess_and_write_in_file(filename, iterator):
    """Append 'token tag' lines for every sample of `iterator` to `filename`.

    Each sample must provide sample['text'] and sample['entities'] (records
    accepted by get_ent_data). Tokens are aligned with entities through their
    character offsets and written one per line as '<token> <tag>'.

    Args:
        filename: path of the output file; it is opened in append mode.
        iterator: iterable of samples.
    """
    # The context manager guarantees the file is flushed and closed.
    with open(filename, 'a+') as my_file:
        for sample in tqdm(iterator):
            entities = _entity_pieces(sample['entities'])

            tagged = []
            for token in _iter_offset_tokens(sample['text']):
                for _, name, start, stop in entities:
                    if token.start == start or token.stop == stop:
                        tagged.append((token.text, name))
                        break
                else:
                    # No entity matched; this also covers samples without any
                    # entities, whose tokens must still be written as 'O'.
                    tagged.append((token.text, None))

            for word, tag in _to_bio(tagged):
                my_file.write(word+' '+tag+'\n')
def to_indexes(element: Tuple[str,str]) -> Tuple[int, int]:
    """Map a (word, label) pair to (word_id, label_id).

    Words missing from ``token2id`` get id 0; the label must be a key of
    ``l2i``.
    """
    word, label = element[0], element[1]
    return token2id.get(word, 0), l2i[label]
    
def count_unks(elements: List[Tuple[str, str]]) -> int:
    """Count the elements whose first item equals 0.

    NOTE(review): the comparison with 0 suggests the elements are index pairs
    such as those produced by to_indexes, not the (str, str) pairs the
    annotation declares -- confirm against the callers.
    """
    return sum(1 for element in elements if element[0] == 0)

def pad_sentence(sentence, padding = 32):
    """Return `sentence` cut or right-padded to exactly `padding` items.

    Missing positions are filled with the pair (1, 0).
    """
    missing = padding - len(sentence)
    if missing <= 0:
        return sentence[:padding]
    return sentence + [(1, 0)] * missing

In [6]:
# Load compressed fastText keyed vectors from the compress-fasttext GitHub
# release; `fasttext[word]` is used below to look up word vectors.
# NOTE(review): the file name suggests 300-dimensional vectors, which would
# match embedding_dim -- confirm.
fasttext = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
    'https://github.com/avidale/compress-fasttext/releases/download/gensim-4-draft/geowac_tokens_sg_300_5_2020-100K-20K-100.bin'
)

In [7]:
def read_tagged_tokens(path):
    """Read a file with one 'token label' pair per line into a list of tuples."""
    pairs = []
    with open(path) as tagged_file:
        for line in tagged_file:
            # rstrip('\n') rather than line[:-1], so that a final line without
            # a trailing newline keeps its last character.
            fields = line.rstrip('\n').split(' ')
            pairs.append((fields[0], fields[1]))
    return pairs


train_data = read_tagged_tokens('/kaggle/input/nerdataset/train_ds.txt')
valid_data = read_tagged_tokens('/kaggle/input/nerdataset/val_ds.txt')
# Entity type names; they define the B_/I_ label set built further below.
label_names = load_dataset('MalakhovIlya/NEREL', 'ent_types')['ent_types']['type']

print('TRAIN_DATASET:')
print(train_data[20:25], 'etc')
print('LABEL_NAMES:')
print(label_names[:5], 'etc')

Downloading builder script:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Downloading and preparing dataset nerel_builder/ent_types to /root/.cache/huggingface/datasets/MalakhovIlya___nerel_builder/ent_types/1.1.0/e399bfb732badee345c987fd415ea0b17c085de7bd8c4a85b56d7c5e7205df2a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Generating ent_types split: 0 examples [00:00, ? examples/s]

Dataset nerel_builder downloaded and prepared to /root/.cache/huggingface/datasets/MalakhovIlya___nerel_builder/ent_types/1.1.0/e399bfb732badee345c987fd415ea0b17c085de7bd8c4a85b56d7c5e7205df2a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

TRAIN_DATASET:
[('правоохранительные', 'B_ORGANIZATION'), ('органы', 'I_ORGANIZATION'), ('киргизии', 'I_ORGANIZATION'), ('обнаружили', 'O'), ('в', 'O')] etc
LABEL_NAMES:
['AGE', 'AWARD', 'CITY', 'COUNTRY', 'CRIME'] etc


In [8]:
# Token frequencies over the training and validation data.
token2count = {}
# Ids 0 and 1 are reserved for the unknown-word and padding tokens.
token2id = {'unk' : 0, 'pad' : 1}
id2token = {0 : 'unk',1 : 'pad'}
# Vector for 'unk' is the mean over the fastText vocabulary.
# NOTE(review): the 'pad' vector is the embedding of an arbitrary real word --
# confirm this is intended (a zero vector is the more usual choice).
id2vec = {0 : fasttext.get_mean_vector(list(fasttext.key_to_index)), 1 : fasttext['пикассо']}

for word, entity_name in train_data+valid_data:
    token2count[word] = token2count.get(word, 0) + 1

# Optional vocabulary pruning (disabled):
#if token2count[word] > 1 or (token2count[word] == 1 and random() > 0.5):
for word in token2count:
    if word in token2id:
        # A corpus word equal to 'unk' or 'pad' must not overwrite the
        # reserved id: that would leave a gap in the id range and make
        # vocab_size smaller than the largest id + 1.
        continue
    new_id = len(token2id)
    token2id[word] = new_id
    id2token[new_id] = word

# Look vectors up by the real id rather than by the position in the dict.
for token, idx in token2id.items():
    if idx > 1:
        id2vec[idx] = fasttext[token]
vocab_size = len(token2id)

# Label ids: 'O' is 0; entity type number k (0-based) gets B_ = 2k+1, I_ = 2k+2.
l2i = {'O':0}
for k, label in enumerate(label_names):
    l2i['B_'+label] = 2*k + 1
    l2i['I_'+label] = 2*k + 2
i2l = {v: k for k, v in l2i.items()}

In [9]:
# Spot-check the lookup tables built above.
# NOTE(review): the ids 194 and 3 are hard-coded and depend on the order of the
# loaded data / label list; they only hold for the data this was run on.
print('tok2count:', token2count['дом'])
print('tok2id:', token2id['дом'])
print('id2tok:', id2token[194])
print('i2l:', i2l[3])
print('l2i:', l2i['B_AWARD'])

tok2count: 39
tok2id: 194
id2tok: дом
i2l: B_AWARD
l2i: 3


In [45]:
class NERDataset(Dataset):
    """Fixed-length sentences of (token id, label id) built from a token stream.

    The (word, label) stream is converted with ``to_indexes`` and cut after
    every '.' token. Each piece is truncated or padded with ``pad_sentence`` to
    ``max_len`` positions, so longer sentences lose their tail. Tokens that
    follow the last '.' form a final sentence instead of being discarded.

    Args:
        dataset: (word, label) pairs in text order.
        max_len: length every sentence is padded/truncated to.
    """

    def __init__(self, dataset: List[Tuple[str, str]], max_len: int = 32):
        self.dataset = list(map(to_indexes, dataset))
        # None when '.' is not in the vocabulary; no token id equals None, so
        # the whole stream then becomes a single (truncated) sentence.
        sentence_end = token2id.get('.')
        X = []
        y = []

        def add_sentence(chunk):
            padded = pad_sentence(chunk, max_len)
            X.append([token_id for token_id, _ in padded])
            y.append([label_id for _, label_id in padded])

        start = 0
        for idx, (token, _) in enumerate(self.dataset):
            if token == sentence_end:
                add_sentence(self.dataset[start:idx+1])
                start = idx + 1
        if start < len(self.dataset):
            # trailing tokens without a closing '.'
            add_sentence(self.dataset[start:])
        self.X = X
        self.y = y

    def __len__(self):
        """Number of sentences."""
        return len(self.X)

    def __getitem__(self, idx: int):
        """Return (token ids, label ids) of sentence `idx` as LongTensors."""
        return torch.LongTensor(self.X[idx]), torch.LongTensor(self.y[idx])

In [11]:
train_ds = NERDataset(train_data)
valid_ds = NERDataset(valid_data)
# Training batches are reshuffled every epoch so the model does not always see
# the sentences in corpus order.
train_loader = DataLoader(
    dataset = train_ds,
    batch_size = batch_size,
    shuffle = True,
    drop_last = True
)
# Every validation sentence is evaluated: the last, possibly smaller, batch is
# kept instead of being dropped.
valid_loader = DataLoader(
    dataset = valid_ds,
    batch_size = batch_size,
    drop_last = False
)

In [12]:
class EmbeddingPreTrained(nn.Module):
    """Embedding layer whose weights are taken from a given matrix.

    Args:
        embedding_matrix: array with one row per token id.
        freeze: when True the weights are not updated during training.
    """

    def __init__(self, embedding_matrix: np.ndarray, freeze: bool = True):
        super().__init__()
        weights = torch.tensor(embedding_matrix)
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=freeze)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Replace every token id in `x` by its embedding row."""
        return self.embedding(x)

In [13]:
class DynamicRNN(nn.Module):
    """Batch-first recurrent layer applied to padded sequences.

    Args:
        rnn_unit: recurrent layer class to instantiate (e.g. nn.GRU).
        input_size: size of each input vector.
        hidden_size: hidden size per direction.
        num_layers: number of stacked recurrent layers.
        dropout: dropout between stacked layers.
        bidirectional: whether to run the layer in both directions.
    """

    def __init__(self,
                 rnn_unit: nn.Module,
                 input_size: int,
                 hidden_size: int,
                 num_layers: int,
                 dropout: float,
                 bidirectional: bool):
        super(DynamicRNN, self).__init__()
        self.rnn = rnn_unit(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=bidirectional,
            batch_first = True)

    def forward(self, x: torch.Tensor, lengths=None) -> torch.Tensor:
        """Run the recurrent layer over a (batch, seq, feature) tensor.

        Args:
            x: padded input batch.
            lengths: optional true length of every sequence (list or 1-D
                tensor, all values >= 1). When omitted, every sequence is
                treated as spanning the full second dimension of `x`.

        Returns:
            The per-position outputs with the same batch and sequence
            dimensions as `x`; positions beyond a given length are zero.
        """
        seq_len = x.size(1)
        if lengths is None:
            # The length is read from the input instead of assuming 32.
            packed_x = pack_padded_sequence(
                x, [seq_len] * x.size(0), batch_first = True)
        else:
            if isinstance(lengths, torch.Tensor):
                lengths = lengths.cpu()  # pack_padded_sequence needs CPU lengths
            packed_x = pack_padded_sequence(
                x, lengths, batch_first = True, enforce_sorted = False)
        packed_rnn_out, _ = self.rnn(packed_x)
        # total_length keeps the output as long as the input even when the
        # longest sequence in the batch is shorter.
        rnn_out, _ = pad_packed_sequence(
            packed_rnn_out, batch_first = True, total_length = seq_len)
        return rnn_out

In [14]:
class Attention(nn.Module):
    """Multi-head self-attention over a batch-first tensor.

    Args:
        embed_dim: size of the feature dimension.
        num_heads: number of attention heads.
        dropout: dropout applied inside nn.MultiheadAttention.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float
    ):
        super().__init__()
        self.attention = nn.MultiheadAttention(
            embed_dim=embed_dim, num_heads=num_heads, dropout=dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend `x` to itself; input and output are both batch-first."""
        # The attention module is built with its default layout, so the first
        # two dimensions are swapped before and after the call.
        seq_first = x.transpose(0, 1)
        attended, _weights = self.attention(
            query=seq_first, key=seq_first, value=seq_first)
        return attended.transpose(0, 1)

In [15]:
class LinearHead(nn.Module):
    """Thin wrapper that applies the given module as the output head."""

    def __init__(self, linear_head: nn.Module):
        super().__init__()
        self.linear_head = linear_head

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the wrapped module's output for `x`."""
        head_output = self.linear_head(x)
        return head_output

In [16]:
class BiLSTM(nn.Module):
    """Sequence tagger: embedding -> recurrent layer -> output head.

    The three stages are supplied by the caller; an attention stage between
    the recurrent layer and the head is currently not wired in.

    Args:
        embedding_layer: maps token ids to vectors.
        rnn_layer: maps the embedded sequence to per-position features.
        linear_head: maps per-position features to label scores.
    """

    def __init__(
        self,
        embedding_layer: nn.Module,
        rnn_layer: nn.Module,
        linear_head: nn.Module
    ):
        super().__init__()
        self.embedding = embedding_layer
        self.rnn = rnn_layer
        self.linear_head = linear_head

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the three stages in order and return the head's output."""
        for stage in (self.embedding, self.rnn, self.linear_head):
            x = stage(x)
        return x

In [17]:
# Stack the vectors in id order so that row k is the vector of token id k.
embedding_matrix = np.array(
    [id2vec[token_id] for token_id in range(vocab_size)],
    dtype='float32',
)
embeddingLayer = EmbeddingPreTrained(embedding_matrix)

In [18]:
# Recurrent encoder built from the configuration values above; the recurrent
# unit used here is a GRU.
rnnLayer = DynamicRNN(
    rnn_unit = nn.GRU,
    input_size = embedding_dim,
    hidden_size = hidden_size,
    num_layers = num_layers,
    dropout = dropout,
    bidirectional = bidirectional
)

In [19]:
# The recurrent layer outputs both directions concatenated when it is
# bidirectional, so the head's input width must follow the `bidirectional` flag.
rnn_output_size = (2 if bidirectional else 1) * hidden_size
# One output score per label in l2i.
linearLayer = LinearHead(linear_head = nn.Linear(in_features = rnn_output_size, out_features = len(l2i)))
# Attention stage is disabled (num_heads is not defined in the config cell):
#attentionLayer = Attention(2*hidden_size, num_heads, dropout)

In [20]:
# Assemble the tagger and move it to the configured device.
# NOTE(review): the class is named BiLSTM, but the recurrent unit passed to
# rnnLayer is nn.GRU.
model = BiLSTM(embedding_layer = embeddingLayer,
               rnn_layer = rnnLayer,
               #attention_layer = attentionLayer,
               linear_head = linearLayer).to(device)
# NOTE(review): no ignore_index is set, and pad_sentence pads with label id 0
# (the id of 'O'), so padded positions contribute to the loss -- confirm this
# is intended.
criterion = nn.CrossEntropyLoss()

In [21]:
def train_epoch(
    model: nn.Module,
    dataloader: DataLoader,
    criterion: Callable,
    optimizer: torch.optim.Optimizer, 
    device: torch.device,
    clip_grad_norm: float
):
    """Train the model for one pass over `dataloader`.

    For every batch: forward pass, loss, backward pass, gradient clipping to
    an L2 norm of `clip_grad_norm`, optimizer step, gradient reset.

    Returns:
        (mean batch loss, mean batch weighted-F1). The F1 is computed over all
        positions of the batch, flattened.
    """
    model.train()
    batch_losses, batch_scores = [], []
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        logits = model(inputs)
        # The class dimension is moved to position 1, as the criterion expects.
        loss = criterion(logits.transpose(-1, -2), targets)
        loss.backward()
        nn.utils.clip_grad_norm_(
            model.parameters(), max_norm=clip_grad_norm, norm_type=2)
        optimizer.step()
        optimizer.zero_grad()

        batch_losses.append(loss.item())
        predictions = np.argmax(to_numpy(logits), axis=-1)
        batch_scores.append(
            f1_score(
                y_true=to_numpy(targets).flatten(),
                y_pred=predictions.flatten(),
                average='weighted',
            )
        )
    return np.mean(batch_losses), np.mean(batch_scores)

In [22]:
def validate_epoch(
    model: nn.Module,
    dataloader: DataLoader,
    criterion: Callable,
    device: torch.device
):
    """Evaluate the model on `dataloader` without updating it.

    The model is switched to eval mode and the forward pass runs under
    torch.no_grad().

    Returns:
        (mean batch loss, mean batch weighted-F1). The F1 is computed over all
        positions of the batch, flattened.
    """
    model.eval()
    losses = []
    scores = []
    for x,y in dataloader:
        x = x.to(device)
        y = y.to(device)
        with torch.no_grad():
            out = model(x)
            # The class dimension is moved to position 1, as the criterion expects.
            loss = criterion(out.transpose(-2, -1), y)
        y_true = to_numpy(y)
        y_pred = np.argmax(to_numpy(out), axis = -1)
        losses.append(loss.item())
        scores.append(f1_score(y_true = y_true.flatten(), y_pred = y_pred.flatten(), average = 'weighted'))
    return np.mean(losses), np.mean(scores)

In [23]:
def train(
    model: nn.Module, 
    train_loader: DataLoader,
    valid_loader: DataLoader,
    criterion: Callable,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
    clip_grad_norm: float,
    n_epoch: int,
    verbose: bool = True
):
    """Run `n_epoch` rounds of train_epoch followed by validate_epoch.

    With `verbose` set, the epoch number and the metrics returned by both
    steps are printed. Nothing is returned.
    """
    def report(*parts):
        # printing is the only thing `verbose` controls
        if verbose:
            print(*parts)

    for epoch in range(n_epoch):
        report('epoch:', epoch)
        train_metrics = train_epoch(
            model=model,
            dataloader=train_loader,
            criterion=criterion,
            optimizer=optimizer,
            device=device,
            clip_grad_norm=clip_grad_norm,
        )
        report('train_metrics:', train_metrics)
        valid_metrics = validate_epoch(
            model=model,
            dataloader=valid_loader,
            criterion=criterion,
            device=device,
        )
        report('valid_metrcs:', valid_metrics)

In [24]:
# Train with Adam configured from the hyper-parameters in the config cell;
# per-epoch (loss, F1) tuples for training and validation are printed.
train(model = model,
      train_loader = train_loader,
      valid_loader = valid_loader,
      criterion = criterion,
      optimizer = torch.optim.Adam(params = model.parameters(),
                                   lr = lr, weight_decay = weight_decay, 
                                   amsgrad = amsgrad),
      device = device,
      clip_grad_norm = clip_grad_norm,
      n_epoch = n_epoch)

epoch: 0
train_metrics: (0.62878701522036, 0.8239779305785878)
valid_metrcs: (0.3630113892066173, 0.8814987798913783)
epoch: 1
train_metrics: (0.31474846685335184, 0.8975659570426273)
valid_metrcs: (0.3018117409486037, 0.899474351384861)
epoch: 2
train_metrics: (0.26380026021858505, 0.9129870542471878)
valid_metrcs: (0.2777468367264821, 0.9073915741946922)
epoch: 3
train_metrics: (0.23265961722928993, 0.9225636692320576)
valid_metrcs: (0.2656653997225639, 0.9119592699360454)
epoch: 4
train_metrics: (0.20917786550004383, 0.9301064086429867)
valid_metrcs: (0.25935049718007064, 0.9150121148283549)
epoch: 5
train_metrics: (0.1893832782385242, 0.9366837356983286)
valid_metrcs: (0.2566666677594185, 0.916381345317771)
epoch: 6
train_metrics: (0.1714846767437228, 0.9428216003650559)
valid_metrcs: (0.256955603376413, 0.9170873969641647)
epoch: 7
train_metrics: (0.1546484922744645, 0.9485531356798376)
valid_metrcs: (0.26032004696436417, 0.9170569705128366)
epoch: 8
train_metrics: (0.138662627595

In [35]:
# Hand-written Russian sentences used to try the trained tagger on new text.
# Each one ends with a full stop.
examples = ['Я на днях побывал в Нижнем Новгороде и меня встретил местный администратор.',
            'В 12 вечера ко мне приехал сантехник Алексей и рассказал о том, что его сын болеет ветрянкой.',
            'На улице Ильича открылась новая компания по очистке воды "Байкал".',
            'В российском посольстве в США произошел теракт с использованием химического оружия.',
            'Якутский спортсмен занял третье место в соревнованиях по стрельбе из лука в Токио на прошлой неделе.',
            'Американский боксер Мохаммед Али умер в 2016 году от болезни Паркинсона.']

In [47]:
example = examples[4]
# NERDataset expects (token, label) pairs; the 'O' labels are placeholders and
# are not used for prediction.
example_ds = [(token, 'O') for token in get_tokens(example)]
example_dataset = NERDataset(example_ds)
example_loader = DataLoader(dataset = example_dataset)
# Do not rely on an earlier cell having left the model in eval mode.
model.eval()
for x, _ in example_loader:
    x = x.to(device)
    with torch.no_grad():
        out = model(x)
    y_pred = np.argmax(to_numpy(out), axis = -1)
    # Print token and predicted label for every position of the first sentence;
    # the number of positions is read from the batch rather than hard-coded.
    for k in range(x.size(1)):
        print(id2token[x[0][k].item()], i2l[y_pred[0][k]])
    # only the first sentence is shown
    break

unk O
спортсмен B_PROFESSION
занял O
третье B_ORDINAL
место I_AWARD
в O
соревнованиях B_EVENT
по I_EVENT
стрельбе I_EVENT
из I_EVENT
unk I_EVENT
в O
токио B_CITY
на B_DATE
прошлой I_DATE
неделе I_DATE
. O
pad O
pad O
pad O
pad O
pad O
pad O
pad O
pad O
pad O
pad O
pad O
pad O
pad O
pad O
pad O
