In [120]:
from gensim.models.fasttext import FastText
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from torchmetrics.classification import MulticlassAccuracy

In [117]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        self.hidden_size = hidden_size
        # RNN layer
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True, nonlinearity='tanh')
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, embedded_input):
        # Initialize hidden state with zeros
        hidden_state = torch.zeros(1, 1, self.hidden_size).to(device)
        # Pass the input sequence through the RNN layer
        rnn_output, hidden_state = self.rnn(embedded_input, hidden_state)
        # Reshape the output to be of shape (batch_size * sequence_length, hidden_size)
        rnn_output = rnn_output.contiguous().view(-1, self.hidden_size)
        # Pass the RNN output through the fully connected layer to get the predicted tags
        predicted_tags = self.fc(rnn_output)
        
        return predicted_tags

In [17]:
class NERDataset(Dataset):
    def __init__(self, embedded_sentences):
        self.embedded_sentences = embedded_sentences
    def __len__(self):
        return len(self.embedded_sentences)
    def __getitem__(self, idx):
        return self.embedded_sentences[idx]

In [18]:
data = []
labels = []
with open("rus.tsv", "r", encoding="utf-8") as file:
    for line in file:
        tokens = list(line.strip().split("\t"))
        data.append(tokens[0])
        labels.append(tokens[-1])

In [19]:
label_unique = list(set(labels))
label_unique.remove('')

In [20]:
model = FastText(sentences=data, window=5, min_count=1, workers=4, sg=1)

In [21]:
embedded_input = [model.wv.get_vector(word) for word in data]

In [22]:
targets = torch.Tensor(np.array(embedded_input))
labels = LabelEncoder.fit_transform(targets, labels)

In [23]:
targets = list(zip(targets, labels))

In [114]:
input_size = 100
hidden_size = 8
output_size = len(label_unique)+1
num_epochs = 1500
bs = 512
lr = 8e-3
wd = 6e-3
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [123]:
nn_model = RNN(input_size, hidden_size, output_size)
nn_model = nn_model.to(device)
dataloader = DataLoader(targets, batch_size=bs, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(nn_model.parameters(), lr=lr, weight_decay=wd)
acc = MulticlassAccuracy(num_classes=output_size).to(device)

In [124]:
for epoch in range(num_epochs):
    total_loss = 0
    total_acc = 0
    for batch, gt in dataloader:
        batch, gt = batch.to(device), gt.to(device)
        optimizer.zero_grad()
        batch = batch.reshape(1,-1,100)
        outputs = nn_model(batch)
        loss = criterion(outputs.reshape(-1, output_size), gt.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        print(acc(outputs, gt))
    print(f'Epoch {epoch+1}: Loss {total_loss/len(dataloader):.4f}')

tensor(0.0409, device='cuda:0')
tensor(0.0450, device='cuda:0')
tensor(0.0625, device='cuda:0')
tensor(0.0755, device='cuda:0')
tensor(0.0480, device='cuda:0')
tensor(0.0500, device='cuda:0')
tensor(0.0356, device='cuda:0')
tensor(0.0431, device='cuda:0')
tensor(0.0455, device='cuda:0')
tensor(0.0464, device='cuda:0')
tensor(0.0380, device='cuda:0')
tensor(0.0367, device='cuda:0')
tensor(0.0617, device='cuda:0')
tensor(0.0619, device='cuda:0')
tensor(0.0434, device='cuda:0')
tensor(0.0416, device='cuda:0')
tensor(0.0500, device='cuda:0')
tensor(0.0453, device='cuda:0')
tensor(0.0417, device='cuda:0')
tensor(0.0476, device='cuda:0')
tensor(0.0500, device='cuda:0')
tensor(0.0455, device='cuda:0')
tensor(0.0500, device='cuda:0')
tensor(0.0556, device='cuda:0')
tensor(0.0556, device='cuda:0')
tensor(0.0500, device='cuda:0')
tensor(0.0526, device='cuda:0')
tensor(0.0476, device='cuda:0')
tensor(0.0385, device='cuda:0')
tensor(0.0500, device='cuda:0')
tensor(0.0625, device='cuda:0')
tensor(0

KeyboardInterrupt: 