In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from collections import Counter

import torch.nn as nn
import torch.optim as optim
from torchvision.transforms import Compose

from tqdm import tqdm

In [None]:
class NamesDataset(Dataset):
    """Token-level dataset built from the rows of a CSV file.

    Each CSV row is one sample and each cell is one token. Tokens are turned
    into integer ids through ``sample_dict`` (see ``tokenize``); the target of
    a token is its position inside the row, or ``3`` when the token equals the
    last entry of ``labels`` (the string ``'NaN'``).

    NOTE(review): with default settings ``pd.read_csv`` is expected to parse
    empty cells as float ``nan`` rather than the string ``'NaN'``, in which
    case such cells would not receive the ``'NaN'`` class here -- confirm how
    missing values are stored in the CSV.
    """

    def __init__(self, file_path, transforms=None):
        """Read ``file_path`` and build the vocabulary.

        Args:
            file_path: path of the CSV file to load.
            transforms: optional iterable of callables, each taking
                ``(sample, target)`` and returning a new ``(sample, target)``.
        """
        frame = pd.read_csv(file_path)
        self.labels = ["Прізвище", "Ім'я", "По батькові", 'NaN']
        self.samples = frame.values
        self.transforms = transforms
        self.tokenize(self.samples)

    def encode_target(self, sample):
        """Return one class index per token: its position, or 3 for ``'NaN'``."""
        missing = self.labels[-1]
        return [
            3 if token == missing else position
            for position, token in enumerate(sample)
        ]

    def encode_sample(self, sample):
        """Map every token to its vocabulary id (``KeyError`` for unseen tokens)."""
        vocabulary = self.sample_dict
        return [vocabulary[token] for token in sample]

    def tokenize(self, samples):
        """Build ``sample_dict``: ``'NaN'`` -> 0, other tokens numbered in first-seen order."""
        self.sample_dict = {'NaN': 0}
        for row in samples:
            for token in row:
                # The dict size equals the next free id because ids start at 0.
                self.sample_dict.setdefault(token, len(self.sample_dict))

    def __getitem__(self, idx):
        """Return ``(token_ids, targets)`` tensors for row ``idx`` after all transforms."""
        row = self.samples[idx]
        sample, trg = self.encode_sample(row), self.encode_target(row)
        if self.transforms is not None:
            for transformation in self.transforms:
                sample, trg = transformation(sample, trg)
        return torch.tensor(sample), torch.tensor(trg)

    def __len__(self):
        """Number of rows read from the CSV."""
        return len(self.samples)

In [None]:
class RandomShuffle(object):
    """Augmentation that randomly reorders a sample's tokens together with their labels.

    With probability ``p`` a random permutation of the positions is drawn and
    applied to both ``sample`` and ``target``, so every token keeps its own
    label. The permutation is drawn over positions rather than derived from
    the label values, which keeps the transform valid when ``target`` holds
    labels that are not a permutation of ``0..n-1`` (for example the class 3).
    """

    def __init__(self, p=0.5):
        """Args:
            p: probability of shuffling a given sample.
        """
        self.prob = p

    def __call__(self, sample, target):
        """Return ``(sample, target)``, possibly reordered.

        Args:
            sample: list of token ids.
            target: list of labels, aligned with ``sample``.

        Returns:
            New lists in permuted order when the shuffle fires; otherwise the
            inputs themselves. The input lists are never modified.
        """
        if np.random.rand() < self.prob:
            # Plain Python ints so indexing works for any sequence type.
            order = np.random.permutation(len(sample)).tolist()
            aug_sample = [sample[i] for i in order]
            aug_target = [target[i] for i in order]
            return aug_sample, aug_target
        return sample, target

In [None]:
class RandomErase(object):
    """Augmentation that randomly blanks out one token of a sample.

    With probability ``p`` a random position is chosen; the token id at that
    position is set to 0 and its label to 3. Both lists are edited in place
    and returned.
    """

    def __init__(self, p=0.5):
        """Args:
            p: probability of erasing a token in a given sample.
        """
        self.prob = p

    def __call__(self, sample, target):
        """Return ``(sample, target)`` with at most one position erased."""
        if not np.random.rand() < self.prob:
            return sample, target

        position = np.random.randint(0, len(sample))
        sample[position] = 0
        target[position] = 3
        return sample, target

In [None]:
!ls

concat.csv  first_set.xlsx  sample_data  second_set.xlsx  third_set.xlsx


In [None]:
data = pd.read_csv('concat.csv')
data.head()

Unnamed: 0,Прізвище,Ім’я,По батькові
0,Король,Тетяна,Олександрівна
1,Король,Сергій,Володимирович
2,Пащенко,Катерина,Михайлівна
3,Щоголева,Лідія,Степанівна
4,Солоха,Олег,Миколайович


In [None]:
#data[data[data.columns[0]].isna()]

In [None]:
# Augmentations are applied in list order: shuffle first, then erase.
augmentations = [RandomShuffle(p=0.5), RandomErase(p=0.5)]
dataset = NamesDataset('concat.csv', transforms=augmentations)
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

In [None]:
sample, trg = next(iter(dataloader))
print(sample, trg, sep='\n')

tensor([[ 2,  1,  3],
        [ 0,  1,  5],
        [ 6,  7,  0],
        [ 0, 10, 11],
        [12, 13, 14],
        [12, 15, 16],
        [18, 17, 19],
        [17, 20, 21],
        [22,  7, 17],
        [24, 23,  0],
        [23, 26, 27],
        [28, 29,  0],
        [27, 31, 32],
        [34, 33, 31],
        [35, 31, 36],
        [37, 38, 39]])
tensor([[1, 0, 2],
        [3, 0, 2],
        [0, 1, 3],
        [3, 1, 2],
        [0, 1, 2],
        [0, 1, 2],
        [1, 0, 2],
        [0, 1, 2],
        [2, 1, 0],
        [1, 0, 3],
        [0, 1, 2],
        [0, 1, 3],
        [2, 0, 1],
        [2, 1, 0],
        [1, 0, 2],
        [0, 1, 2]])


In [None]:
class LSTMTagger(nn.Module):
    """Embedding -> LSTM -> linear tagger producing per-token log-probabilities.

    The model consumes batches shaped ``(batch, seq_len)`` (as produced by the
    default ``DataLoader`` collation of equal-length samples) and returns
    log-probabilities shaped ``(batch, seq_len, tagset_size)``.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Args:
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
            vocab_size: number of distinct token ids.
            tagset_size: number of output classes.
        """
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # batch_first=True: inputs arrive as (batch, seq, feature). Without it
        # the LSTM would treat the batch axis as time and carry hidden state
        # from one sample into the next.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        # Normalise over the tag axis (last), so each token gets a proper
        # distribution over classes.
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, sentence):
        """Return log-probabilities over tags for every token.

        Args:
            sentence: integer tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Float tensor of shape ``(batch, seq_len, tagset_size)`` whose last
            axis sums to 1 after exponentiation.
        """
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = self.log_softmax(tag_space)
        return tag_scores

In [None]:
EMBEDDING_DIM = 300
HIDDEN_DIM = 300
N_CLASSES = 4
SEED = 42

# Seed both RNGs before the model is created: torch drives weight
# initialisation, numpy drives the random augmentations.
torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The vocabulary size is taken from the dataset so every token id has an embedding row.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(dataset.sample_dict), N_CLASSES).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def train_model(model, dataloader, optim, criterion, device):
    """Run one training epoch and print the mean batch loss.

    Args:
        model: module mapping a ``(batch, seq)`` id tensor to
            ``(batch, seq, classes)`` scores.
        dataloader: iterable of ``(src, trg)`` batches; must support ``len``.
        optim: optimizer that updates ``model``'s parameters. This argument is
            the one actually stepped (no global optimizer is used).
        criterion: loss called as ``criterion(scores, trg)`` with scores laid
            out as ``(batch, classes, seq)``.
        device: device the batches are moved to.

    Returns:
        Mean loss over the batches of this epoch (0.0 for an empty loader).
    """
    model.train()

    epoch_loss = 0.0

    for src, trg in tqdm(dataloader, total=len(dataloader)):
        src, trg = src.to(device), trg.to(device)

        optim.zero_grad()
        output = model(src)
        # Move the class axis to position 1, as expected by the criterion
        # for sequence targets.
        loss = criterion(output.permute(0, 2, 1), trg)
        loss.backward()
        optim.step()

        epoch_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError on an empty loader.
    mean_loss = epoch_loss / max(len(dataloader), 1)
    print(f'\nTrain loss: {mean_loss}')
    return mean_loss

In [None]:
def compute_seq_acc(output, trg):
    """Token-level accuracy (in percent) of a batch of predictions.

    Args:
        output: score tensor of shape ``(batch, seq, classes)``; the predicted
            class of a token is the argmax over the last axis.
        trg: integer tensor of true classes, shape ``(batch, seq)``.

    Returns:
        Percentage (0-100, float) of tokens whose predicted class equals the
        target; 0.0 when ``trg`` is empty.
    """
    # Argmax over the class axis yields one predicted class per token.
    preds = torch.argmax(output, dim=-1)

    total_len = trg.numel()
    if total_len == 0:
        return 0.0

    total_correct = (preds == trg).sum().item()
    return total_correct / total_len * 100

In [None]:
def evaluate(model, dataloader, criterion, device, max_batches=100):
    """Evaluate ``model`` on the first batches of ``dataloader`` and print the results.

    Args:
        model: module mapping a ``(batch, seq)`` id tensor to
            ``(batch, seq, classes)`` scores.
        dataloader: iterable of ``(src, trg)`` batches.
        criterion: loss called as ``criterion(scores, trg)`` with scores laid
            out as ``(batch, classes, seq)``.
        device: device the batches are moved to.
        max_batches: evaluate at most this many batches; ``None`` evaluates
            the whole loader.

    Returns:
        ``(mean_loss, mean_acc)`` averaged over the batches that were actually
        evaluated (both ``nan`` if no batch was evaluated).
    """
    model.eval()

    epoch_loss = 0.0
    batch_accs = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for idx, (src, trg) in enumerate(dataloader):
            if max_batches is not None and idx >= max_batches:
                break
            src, trg = src.to(device), trg.to(device)
            output = model(src)
            loss = criterion(output.permute(0, 2, 1), trg)
            epoch_loss += loss.item()
            batch_accs.append(compute_seq_acc(output, trg))

    # Average over the batches that were evaluated, not over the full loader
    # length, otherwise the loss is understated whenever the loop stops early.
    n_batches = len(batch_accs)
    mean_loss = epoch_loss / n_batches if n_batches else float('nan')
    mean_acc = float(np.mean(batch_accs)) if n_batches else float('nan')
    print(f'\nEval loss: {mean_loss}, Eval acc: {mean_acc}')
    return mean_loss, mean_acc

In [None]:
# Ten epochs; every training pass is followed by an evaluation pass over the
# same dataloader.
for _epoch in range(10):
    train_model(
        model=model,
        dataloader=dataloader,
        optim=optimizer,
        criterion=loss_function,
        device=device,
    )
    evaluate(
        model=model,
        dataloader=dataloader,
        criterion=loss_function,
        device=device,
    )

100%|██████████| 7465/7465 [00:28<00:00, 260.87it/s]



Train loss: 0.026122428348577184


  0%|          | 28/7465 [00:00<00:27, 275.08it/s]


Eval loss: 9.50609588408353e-05, Eval acc: 68.08333333333334


100%|██████████| 7465/7465 [00:28<00:00, 261.62it/s]



Train loss: 0.004422206421710651


  0%|          | 28/7465 [00:00<00:27, 272.84it/s]


Eval loss: 1.9180963774862224e-05, Eval acc: 70.08333333333331


100%|██████████| 7465/7465 [00:28<00:00, 261.28it/s]



Train loss: 0.0018380297137966265


  0%|          | 27/7465 [00:00<00:28, 265.02it/s]


Eval loss: 6.337339113523193e-06, Eval acc: 68.04166666666667


100%|██████████| 7465/7465 [00:28<00:00, 260.63it/s]



Train loss: 0.0009958164759140767


  0%|          | 27/7465 [00:00<00:27, 269.73it/s]


Eval loss: 6.883063572671969e-07, Eval acc: 69.95833333333334


100%|██████████| 7465/7465 [00:28<00:00, 261.59it/s]



Train loss: 0.000493311748143076


  0%|          | 27/7465 [00:00<00:28, 265.60it/s]


Eval loss: 1.3309826192815705e-06, Eval acc: 68.60416666666667


100%|██████████| 7465/7465 [00:28<00:00, 261.57it/s]



Train loss: 0.00023800085686351508


  0%|          | 27/7465 [00:00<00:27, 266.99it/s]


Eval loss: 3.096423058426244e-08, Eval acc: 67.9375


100%|██████████| 7465/7465 [00:28<00:00, 260.41it/s]



Train loss: 0.00024497623230811854


  0%|          | 28/7465 [00:00<00:27, 273.07it/s]


Eval loss: 4.210132712472243e-08, Eval acc: 67.83333333333334


100%|██████████| 7465/7465 [00:28<00:00, 261.04it/s]



Train loss: 8.349765471260367e-05


  0%|          | 26/7465 [00:00<00:29, 255.82it/s]


Eval loss: 3.448975787037093e-08, Eval acc: 69.10416666666667


100%|██████████| 7465/7465 [00:28<00:00, 259.59it/s]



Train loss: 0.00019807354210131978


  0%|          | 27/7465 [00:00<00:28, 263.18it/s]


Eval loss: 1.3929773233438586e-08, Eval acc: 68.3125


100%|██████████| 7465/7465 [00:28<00:00, 260.62it/s]



Train loss: 7.055089119029004e-05

Eval loss: 9.88428494533867e-08, Eval acc: 68.5625


In [None]:
def get_targets(model, text, encode_function, device, target_mapping):
    """Predict a label for every token of ``text``.

    Args:
        model: callable mapping a ``(1, seq)`` id tensor to scores of shape
            ``(1, seq, classes)``.
        text: sequence of tokens.
        encode_function: callable turning ``text`` into a list of token ids.
        device: device the encoded input is moved to.
        target_mapping: sequence mapping a class index to its label.

    Returns:
        List with one label per token of ``text``, in token order.
    """
    enc_text = torch.tensor(encode_function(text)).to(device)
    # Inference only: no gradient bookkeeping needed.
    with torch.no_grad():
        output = model(enc_text.unsqueeze(0))
    # Argmax over the class axis gives the predicted class of each token;
    # [0] drops the batch axis added by unsqueeze.
    trg_indices = torch.argmax(output, dim=-1)[0]
    return [target_mapping[i] for i in trg_indices.tolist()]

In [None]:
text1 = 'Шевченко Богдан Миколайович'.split()
targets1 = get_targets(model, text1, dataset.encode_sample, device, dataset.labels)
targets1

['Прізвище', "Ім'я", 'По батькові']

In [None]:
text1 = 'Трохим Бабич Миколайович'.split()
targets1 = get_targets(model, text1, dataset.encode_sample, device, dataset.labels)
targets1

["Ім'я", 'Прізвище', 'По батькові']

In [None]:
text1 = 'Тарас Назар Миколайович'.split()
targets1 = get_targets(model, text1, dataset.encode_sample, device, dataset.labels)
targets1

['Прізвище', "Ім'я", 'По батькові']

In [None]:
text1 = 'Тарас Назар NaN'.split()
targets1 = get_targets(model, text1, dataset.encode_sample, device, dataset.labels)
targets1

['Прізвище', "Ім'я", "Ім'я"]

# Evaluation

In [None]:
test1 = pd.read_excel('second_set.xlsx')
test1 = test1.astype(str)
test1 = test1.replace(to_replace =["nan"], 
                            value ="NaN")
test1.head()

Unnamed: 0,Прізвище,Ім’я,По батькові
0,Лещенко,Марина,Миколаївна
1,Міллєр,Катерина,Максимівна
2,Білоусов,Геннадій,Григорович
3,Левандровська,Алія,Геннадіївна
4,Піддубний,Сергій,Олександрович


In [None]:
test1[['A_pred', 'B_pred', 'C_pred']] = [np.nan, np.nan, np.nan]

In [None]:
# Predict the labels of every row first, then write all prediction columns in
# one assignment. Writing cell by cell through `test1[col].iloc[idx] = ...` is
# chained indexing: it may modify a temporary copy instead of `test1` and
# triggers SettingWithCopyWarning.
feature_cols = list(test1.columns[:3])
pred_cols = list(test1.columns[3:6])
predictions = [
    get_targets(model, row, dataset.encode_sample, device, dataset.labels)
    for row in test1[feature_cols].values
]
# Building the frame on test1's own index keeps rows aligned for any index.
test1[pred_cols] = pd.DataFrame(predictions, index=test1.index, columns=pred_cols)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
test1.head()

Unnamed: 0,Прізвище,Ім’я,По батькові,A_pred,B_pred,C_pred
0,Лещенко,Марина,Миколаївна,Прізвище,Ім'я,По батькові
1,Міллєр,Катерина,Максимівна,Прізвище,Ім'я,По батькові
2,Білоусов,Геннадій,Григорович,Прізвище,Ім'я,По батькові
3,Левандровська,Алія,Геннадіївна,Прізвище,Ім'я,По батькові
4,Піддубний,Сергій,Олександрович,Прізвище,Ім'я,По батькові


In [None]:
test1['A_pred'].value_counts()

Прізвище    43621
Ім'я            1
Name: A_pred, dtype: int64

In [None]:
test1['B_pred'].value_counts()

Ім'я        43620
Прізвище        2
Name: B_pred, dtype: int64

In [None]:
test1['C_pred'].value_counts()

По батькові    43618
Ім'я               3
Прізвище           1
Name: C_pred, dtype: int64

In [None]:
test1.shape

(43622, 6)

In [None]:
test1.to_csv('new_test6.csv')

In [None]:
test1[test1['C_pred'] == 'Прізвище'] 

Unnamed: 0,Прізвище,Ім’я,По батькові,A_pred,B_pred,C_pred
2403,Няссе,Жан Жак,,Прізвище,Ім'я,Прізвище
