In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from torchmetrics import Accuracy
from torchtext.vocab import build_vocab_from_iterator
from torchtext.transforms import ToTensor, PadTransform, Sequential, VocabTransform
from tqdm.notebook import tqdm
from utils import build_model, train, test, get_predictions
import yaml

In [3]:
# read data
with open("./data/pos_data.txt", encoding='UTF-8') as txt_file:
    data = txt_file.readlines()

In [4]:
# clean data 
data = list(
    filter(
        lambda x: x != [''],
        map(
            lambda x: x.rstrip().split('\t'),
            data
        )
    )
)

In [5]:
# collect input and label data
X, y = [], []
for row in data:
    if row[1] == '<beg>':
        x_element, y_element = [], []
        y_element.append(row[1])
    elif row[1] == '<end>':
        y_element.append(row[1])
        X.append(x_element), y.append(y_element)
    else:
        y_element.append(row[2].split('|')[0])

    x_element.append(row[1])

In [6]:
all_sent_lens = list(
    map(
        lambda x: len(x) - 2,
        X
    )
)
q_75 = int(np.quantile(all_sent_lens, q=0.75))

In [7]:
X = list(
    filter(
        lambda x: len(x) <= q_75 + 2,
        X
    )
)
y = list(
    filter(
        lambda x: len(x) <= q_75 + 2,
        y
    )
)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8)
vocab_X = build_vocab_from_iterator(X_train, specials=["<PAD>", "<UNK>", "<beg>", "<end>"], special_first=[0, 1, 2, 3])
vocab_X.set_default_index(1)
vocab_y = build_vocab_from_iterator(y_train, specials=["<PAD>", "<UNK>", "<beg>", "<end>"], special_first=[0, 1, 2, 3])
vocab_y.set_default_index(1)
print(f"Number of tokens in inputs data {len(vocab_X)}")
print(f"Number of parts of speech {len(vocab_y)}")

Number of tokens in inputs data 104763
Number of parts of speech 20


In [9]:
class POSTTaggingDataset(Dataset):
    def __init__(self, X_data, y_data, X_transforms, y_transforms):
        self.X_data = X_data
        self.y_data = y_data
        self.X_transforms = X_transforms
        self.y_transforms = y_transforms

    def __getitem__(self, index):
        return self.X_transforms(self.X_data[index]), self.y_transforms(self.y_data[index])
    
    def __len__(self):
        return len(self.X_data)

X_transforms = Sequential(
    VocabTransform(vocab_X),
    ToTensor(0),
    PadTransform(max_length=q_75 + 2, pad_value=0),
)

y_transforms = Sequential(
    VocabTransform(vocab_y),
    ToTensor(0),
    PadTransform(max_length=q_75 + 2, pad_value=0),
)

train_dataset = POSTTaggingDataset(X_data=X_train, y_data=y_train, X_transforms=X_transforms, y_transforms=y_transforms)
test_dataset = POSTTaggingDataset(X_data=X_test, y_data=y_test, X_transforms=X_transforms, y_transforms=y_transforms)

In [10]:
with open("train_config.yaml", "r") as stream:
    config = yaml.safe_load(stream)

In [11]:
model = build_model(config)

In [12]:
batch_size = config["batch_size"]
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
device = config["device"] if torch.cuda.is_available() else "cpu"
criterion = nn.CrossEntropyLoss(ignore_index=0)
lr = config["optimizer_lr"]
n_epochs = config["n_epochs"]
optimizer = optim.AdamW(model.parameters(), lr=lr)
model.to(device=device)
writer = SummaryWriter(config["log_dir"])

In [13]:
for epoch in tqdm(range(n_epochs)):
    train_loss = train(model,
                       train_dataloader,
                       criterion,
                       optimizer,
                       device=device)
    val_loss, acc_score_val = test(
        model, 
        test_dataloader,
        criterion,
        task_type='clf',
        metric=[Accuracy(task="multiclass", num_classes=len(vocab_y), ignore_index=0)],
        device=device
    )
    train_loss, acc_score_train = test(
        model, 
        train_dataloader,
        criterion,
        task_type='clf',
        metric=[Accuracy(task="multiclass", num_classes=len(vocab_y), ignore_index=0)],
        train_or_test_mode="train",
        device=device
    )
    writer.add_scalar('Loss/train', train_loss / len(train_dataloader), epoch)
    writer.add_scalar('Loss/test', val_loss / len(test_dataloader), epoch)
    writer.add_scalar('Acc/train',acc_score_train[0], epoch)
    writer.add_scalar('Acc/test', acc_score_val[0], epoch)

  0%|          | 0/5 [00:00<?, ?it/s]

In [14]:
y_true, y_pred = get_predictions(model, test_dataloader, device=device)
y_without_pad_mask = y_true != 0

  0%|          | 0/9701 [00:00<?, ?it/s]

In [15]:
print("Test sample")
print(f"Final Accuracy-score = {accuracy_score(y_true[y_without_pad_mask], y_pred[y_without_pad_mask])}")
print(classification_report(y_true, y_pred, zero_division=1))

Test sample
Final Accuracy-score = 0.9557736741588879
              precision    recall  f1-score   support

           0       1.00      0.00      0.00   2563801
           2       1.00      1.00      1.00    620802
           3       0.88      1.00      0.93    620802
           4       0.27      0.98      0.42    801896
           5       1.00      1.00      1.00    697183
           6       0.68      0.91      0.78    695566
           7       0.59      0.76      0.67    365388
           8       0.98      0.99      0.98    345793
           9       1.00      1.00      1.00    247222
          10       0.94      0.94      0.94    231953
          11       0.95      0.95      0.95    115777
          12       0.97      0.90      0.93    105956
          13       0.85      0.83      0.84     25943
          14       0.53      0.16      0.24      4192
          15       0.19      0.24      0.21      2443
          16       0.71      0.25      0.37      1819
          17       0.42    

In [16]:
torch.save(obj=model.state_dict(), f="weight.pth")

In [17]:
idx = 10
assert idx < len(test_dataset)
x_test_element, y_test_element = test_dataset[idx]
out = model(x_test_element.unsqueeze(0).to(device)).argmax(dim=1).cpu().flatten().numpy().tolist()
print(f"Input data: {vocab_X.lookup_tokens(x_test_element.tolist())}")
print(f"Model answer: {vocab_y.lookup_tokens(out)}")
print(f"Ground Truth: {vocab_y.lookup_tokens(y_test_element.tolist())}")

Input data: ['<beg>', '<UNK>', 'лучами', 'солнца', '<end>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
Model answer: ['<beg>', 'VERB', 'NOUN', 'NOUN', '<end>', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN']
Ground Truth: ['<beg>', 'VERB', 'NOUN', 'NOUN', '<end>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
