In [1]:
import json
import pickle
import statistics

import torch
import numpy as np

from spacy.lang.en import English
from sklearn.model_selection import train_test_split


from torch.utils.data import DataLoader
from torch import nn
from tqdm.notebook import tqdm

from utils.preprocessing import spacy_tokenize, dummy_fn
from utils.autoskill_torch import SkillDataset, collate_fn 
from utils.base_torch_utils import train_single_model, evaluate_single_model, BaseModel

In [2]:
def _read_json(path):
    """Parse and return the UTF-8 encoded JSON document stored at ``path``."""
    with open(path, 'r', encoding="utf8") as handle:
        return json.load(handle)


# Label mappings (e.g. the 'target_midas_and_entity2id' table that later
# sizes the classifier output) and the raw examples.
labels_map = _read_json('data/labels.json')
data = _read_json('data/dataset.json')

# Hold out 20% of the examples for evaluation; the fixed random_state keeps
# the split identical between runs.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [3]:
# Sanity check: number of examples in the (train, test) splits.
tuple(len(split) for split in (train, test))

(9644, 2411)

In [4]:
# Tokenizer taken from spaCy's English language class.
tokenizer = English().tokenizer

# Load the pre-fitted TF-IDF vectorizer. The `with` block guarantees the file
# handle is closed (the previous `pickle.load(open(...))` left it open).
# NOTE(security): unpickling can execute arbitrary code -- only load model
# files from a trusted source.
# NOTE(review): the displayed repr shows the vectorizer references `dummy_fn`,
# so that function presumably has to be importable for unpickling -- confirm.
with open("models/tfidf_3_08_300.pkl", 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)

tfidf

TfidfVectorizer(lowercase=False, max_df=0.8, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x000001D8C761F700>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x000001D8C761F700>)

In [5]:
# Wrap the training split in the project dataset, handing it the label
# mappings plus the tokenizer and TF-IDF model loaded above.
train_dataset = SkillDataset(train, labels_map, tokenizer=tokenizer, tfidf_model=tfidf)

# Should match the size of the training split.
len(train_dataset)

9644

In [6]:
# Inspect a single training example (index 42).
sample_x = train_dataset[42][0]  # x_vec
sample_y = train_dataset[42][1]  # y labels: ([midas_id, entity_id], midas_and_entity_id)

print(sample_x.shape, '\n')
print(sample_y, '\n')

(1023,) 

([2, 17], 47) 



In [7]:
# Same wrapping as for the training split, applied to the held-out examples.
test_dataset = SkillDataset(test, labels_map, tokenizer=tokenizer, tfidf_model=tfidf)

# Should match the size of the test split.
len(test_dataset)

2411

In [8]:
# Mini-batch size shared by both loaders.
BATCH_SIZE = 32

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

# Evaluation does not benefit from shuffling; a fixed order keeps per-batch
# results reproducible between runs.
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [9]:
# Pull one collated batch; `batch` is reused later to size the model input.
batch = next(iter(train_loader))

# Shapes of the three batch components, followed by the feature dtype.
(*(part.shape for part in batch[:3]), batch[0].dtype)

(torch.Size([32, 1023]), torch.Size([32, 2]), torch.Size([32]), torch.float32)

In [10]:
# Smoke test: iterate the whole training loader once to confirm every batch
# collates. The context manager closes the bar even if a batch raises.
with tqdm(total=len(train_loader.dataset), desc='Testing') as progress_bar:
    for x, y_m, y_s in train_loader:
        # Advance by the number of examples in this batch (the last batch
        # may be smaller than the configured batch size).
        progress_bar.update(x.size(0))

Testing:   0%|          | 0/9644 [00:00<?, ?it/s]

In [11]:
# Smoke test: iterate the whole test loader once to confirm every batch
# collates. The context manager closes the bar even if a batch raises.
with tqdm(total=len(test_loader.dataset), desc='Testing') as progress_bar:
    for x, y_m, y_s in test_loader:
        # Advance by the number of examples in this batch (the last batch
        # may be smaller than the configured batch size).
        progress_bar.update(x.size(0))

Testing:   0%|          | 0/2411 [00:00<?, ?it/s]

In [12]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

In [13]:
# Size the network from the data: the input width is the feature dimension of
# the batch sampled earlier, and the output width is the number of entries in
# the joint midas-and-entity label table.
model = BaseModel(
    batch_size=train_loader.batch_size,
    n_classes=len(labels_map['target_midas_and_entity2id']),
    hidden_size=512,
    input_size=batch[0].shape[1],
)

# Move the parameters to the selected device; as the last expression this
# also displays the module summary.
model.to(DEVICE)

BaseModel(
  (linear_in): Linear(in_features=1023, out_features=512, bias=True)
  (relu): ReLU()
  (clf): Linear(in_features=512, out_features=194, bias=True)
)

In [14]:
# Cross-entropy loss for the classification target.
criterion = nn.CrossEntropyLoss()

# Adam over every model parameter, left at its default hyperparameters.
optimizer = torch.optim.Adam(model.parameters())

In [19]:
# train loop
NUM_EPOCHS = 5  # Set the number of epochs

# Number of batches per pass over each loader. `len(loader)` is the loader's
# own batch count, so it stays correct if `drop_last` or a custom batch
# sampler is introduced later (the manual ceil(len(dataset) / batch_size)
# would not).
TRAIN_SIZE = len(train_loader)
TEST_SIZE = len(test_loader)

# History containers for losses collected during training/evaluation.
train_losses = []
test_losses = []

# History containers for evaluation metrics.
test_f1 = []
test_acc = []

In [20]:
# Alternate one training pass and one evaluation pass per epoch, recording the
# per-epoch results in the history lists declared in the setup cell.
# NOTE(review): the recorded outputs show a train loss of ~0.03 against a test
# loss above 6 already at epoch 1, which is consistent with this cell having
# been re-run on an already trained model -- re-create the model and optimizer
# before re-running if a clean history is wanted.
for e in range(1, NUM_EPOCHS + 1):

    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=train_loader,
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3.)

    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=test_loader,
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion)

    # Reduce the collected losses to one number per epoch.
    mean_train_loss = float(np.mean(epoch_train_losses))
    mean_test_loss = float(np.mean(epoch_test_losses))

    # Keep the history (previously these lists were declared but never filled).
    train_losses.append(mean_train_loss)
    test_losses.append(mean_test_loss)
    test_f1.append(epoch_f1)
    test_acc.append(epoch_acc)

    message = f'Epoch: {e}\n'
    message += f'Train loss - {mean_train_loss:.4f} | Test loss - {mean_test_loss:.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

Train epoch 1:   0%|          | 0/302 [00:00<?, ?it/s]

Val epoch 1:   0%|          | 0/76 [00:00<?, ?it/s]

Epoch: 1
Train loss - 0.0280 | Test loss - 6.1074
TEST: f1 weighted - 0.1493 | accuracy - 0.1651



Train epoch 2:   0%|          | 0/302 [00:00<?, ?it/s]

Val epoch 2:   0%|          | 0/76 [00:00<?, ?it/s]

Epoch: 2
Train loss - 0.0227 | Test loss - 6.1992
TEST: f1 weighted - 0.1514 | accuracy - 0.1676



Train epoch 3:   0%|          | 0/302 [00:00<?, ?it/s]

Val epoch 3:   0%|          | 0/76 [00:00<?, ?it/s]

Epoch: 3
Train loss - 0.0183 | Test loss - 6.3323
TEST: f1 weighted - 0.1505 | accuracy - 0.1672



Train epoch 4:   0%|          | 0/302 [00:00<?, ?it/s]

Val epoch 4:   0%|          | 0/76 [00:00<?, ?it/s]

Epoch: 4
Train loss - 0.0150 | Test loss - 6.4549
TEST: f1 weighted - 0.1495 | accuracy - 0.1667



Train epoch 5:   0%|          | 0/302 [00:00<?, ?it/s]

Val epoch 5:   0%|          | 0/76 [00:00<?, ?it/s]

Epoch: 5
Train loss - 0.0122 | Test loss - 6.5680
TEST: f1 weighted - 0.1495 | accuracy - 0.1672

