In [1]:
import json
import pickle
import statistics

from collections import Counter

import torch
import numpy as np

from catboost import CatBoostClassifier, Pool
from spacy.lang.en import English
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import DataLoader
from torch import nn
from tqdm.notebook import tqdm

from utils.preprocessing import spacy_tokenize, dummy_fn
from utils.autoskill_torch import SkillDataset, collate_fn 
from utils.base_torch_utils import train_single_model, evaluate_single_model, BaseModel

In [2]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
with open('data/daily_labels.json', 'r', encoding="utf8") as f:
    daily_labels_map = json.load(f)

with open('data/daily_dataset.json', 'r', encoding="utf8") as f:
    daily = json.load(f)

In [4]:
with open('data/topical_labels.json', 'r', encoding="utf8") as f:
    topical_labels_map = json.load(f)

with open('data/topical_dataset.json', 'r', encoding="utf8") as f:
    topical = json.load(f)

In [5]:
tokenizer = English().tokenizer

# TFIDF + Linear_ReLu_Linear

## Daily Dataset

In [4]:
daily_train, daily_test = train_test_split(daily, test_size=0.2, random_state=42)
len(daily_train), len(daily_test)

(1881, 471)

In [5]:
daily_tfidf = pickle.load(open("models/daily_tfidf_3_08_300.pkl", 'rb'))
daily_tfidf

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x0000022757040820>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x0000022757040820>)

In [6]:
daily_train_dataset = SkillDataset(
    daily_train, daily_labels_map, 
    tokenizer=tokenizer,
    tfidf_model=daily_tfidf
)

len(daily_train_dataset)

1881

In [7]:
print(daily_train_dataset[42][0].shape, '\n') # x_vec
print(daily_train_dataset[42][1], '\n') # y labels: ([midas_id, entity_id], midas_and_entity_id)

(999,) 

[0, 5, 17] 



In [8]:
daily_test_dataset = SkillDataset(
    daily_test, daily_labels_map, 
    tokenizer=tokenizer,
    tfidf_model=daily_tfidf
)

len(daily_test_dataset)

471

In [12]:
daily_train_loader = DataLoader(
    daily_train_dataset, batch_size=32, 
    shuffle=True, collate_fn=collate_fn)

daily_test_loader = DataLoader(
    daily_test_dataset, batch_size=32, 
    shuffle=False, collate_fn=collate_fn)

In [13]:
for x, y in daily_train_loader:
    break

x.shape, y.shape

(torch.Size([32, 999]), torch.Size([32, 3]))

In [14]:
progress_bar = tqdm(total=len(daily_train_loader.dataset), desc='Testing')

for batch in daily_train_loader:
    progress_bar.update(batch[0].size(0))
    
progress_bar.close()

Testing:   0%|          | 0/1881 [00:00<?, ?it/s]

In [15]:
progress_bar = tqdm(total=len(daily_test_loader.dataset), desc='Testing')

for batch in daily_test_loader:
    progress_bar.update(batch[0].size(0))
    
progress_bar.close()

Testing:   0%|          | 0/471 [00:00<?, ?it/s]

### Midas

In [16]:
model = BaseModel(
    input_size=batch[0].shape[1],
    hidden_size=512,
    n_classes=len(daily_labels_map['target_midas2id']),
    batch_size=daily_train_loader.batch_size
)

model.to(DEVICE)

BaseModel(
  (linear_in): Linear(in_features=999, out_features=512, bias=True)
  (relu): ReLU()
  (clf): Linear(in_features=512, out_features=12, bias=True)
)

In [17]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [18]:
# train loop
NUM_EPOCHS = 10  # Задайте количество эпох

TRAIN_SIZE = int(np.ceil(len(daily_train_loader.dataset) / daily_train_loader.batch_size))
TEST_SIZE = int(np.ceil(len(daily_test_loader.dataset) / daily_test_loader.batch_size))

train_losses = list()
test_losses = list()

test_f1 = list()
test_acc = list()

In [19]:
for e in range(1, NUM_EPOCHS+1):
    
    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=daily_train_loader, 
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3., label_type='midas')
    
    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=daily_test_loader, 
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion, label_type='midas')
    
    message = f'Epoch: {e}\n'
    message += f'Train loss - {np.mean(epoch_train_losses):.4f} | Test loss - {np.mean(epoch_test_losses):.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

Train epoch 1:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 1:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 1
Train loss - 1.5033 | Test loss - 1.1732
TEST: f1 weighted - 0.5133 | accuracy - 0.6476



Train epoch 2:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 2:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 2
Train loss - 1.0924 | Test loss - 1.0528
TEST: f1 weighted - 0.5780 | accuracy - 0.6709



Train epoch 3:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 3:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 3
Train loss - 0.9033 | Test loss - 1.0107
TEST: f1 weighted - 0.6155 | accuracy - 0.6815



Train epoch 4:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 4:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 4
Train loss - 0.7257 | Test loss - 1.0217
TEST: f1 weighted - 0.6440 | accuracy - 0.6837



Train epoch 5:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 5:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 5
Train loss - 0.5544 | Test loss - 1.0566
TEST: f1 weighted - 0.6632 | accuracy - 0.6879



Train epoch 6:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 6:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 6
Train loss - 0.4127 | Test loss - 1.0939
TEST: f1 weighted - 0.6518 | accuracy - 0.6773



Train epoch 7:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 7:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 7
Train loss - 0.2861 | Test loss - 1.1532
TEST: f1 weighted - 0.6645 | accuracy - 0.6815



Train epoch 8:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 8:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 8
Train loss - 0.1936 | Test loss - 1.2128
TEST: f1 weighted - 0.6671 | accuracy - 0.6879



Train epoch 9:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 9:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 9
Train loss - 0.1285 | Test loss - 1.2768
TEST: f1 weighted - 0.6695 | accuracy - 0.6900



Train epoch 10:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 10:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 10
Train loss - 0.0853 | Test loss - 1.3461
TEST: f1 weighted - 0.6674 | accuracy - 0.6879



### Entity

In [20]:
model = BaseModel(
    input_size=batch[0].shape[1],
    hidden_size=512,
    n_classes=len(daily_labels_map['target_entity2id']),
    batch_size=daily_train_loader.batch_size
)

model.to(DEVICE)

BaseModel(
  (linear_in): Linear(in_features=999, out_features=512, bias=True)
  (relu): ReLU()
  (clf): Linear(in_features=512, out_features=18, bias=True)
)

In [21]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [22]:
# train loop
NUM_EPOCHS = 10  # Задайте количество эпох

TRAIN_SIZE = int(np.ceil(len(daily_train_loader.dataset) / daily_train_loader.batch_size))
TEST_SIZE = int(np.ceil(len(daily_test_loader.dataset) / daily_test_loader.batch_size))

train_losses = list()
test_losses = list()

test_f1 = list()
test_acc = list()

In [23]:
for e in range(1, NUM_EPOCHS+1):
    
    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=daily_train_loader, 
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3., label_type='entity')
    
    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=daily_test_loader, 
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion, label_type='entity')
    
    message = f'Epoch: {e}\n'
    message += f'Train loss - {np.mean(epoch_train_losses):.4f} | Test loss - {np.mean(epoch_test_losses):.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

Train epoch 1:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 1:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 1
Train loss - 2.3557 | Test loss - 2.0531
TEST: f1 weighted - 0.2608 | accuracy - 0.3546



Train epoch 2:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 2:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 2
Train loss - 1.7832 | Test loss - 1.7193
TEST: f1 weighted - 0.4419 | accuracy - 0.4841



Train epoch 3:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 3:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 3
Train loss - 1.3127 | Test loss - 1.5092
TEST: f1 weighted - 0.5071 | accuracy - 0.5435



Train epoch 4:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 4:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 4
Train loss - 0.9755 | Test loss - 1.4482
TEST: f1 weighted - 0.5651 | accuracy - 0.5690



Train epoch 5:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 5:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 5
Train loss - 0.7284 | Test loss - 1.4348
TEST: f1 weighted - 0.5519 | accuracy - 0.5626



Train epoch 6:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 6:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 6
Train loss - 0.5268 | Test loss - 1.4381
TEST: f1 weighted - 0.5612 | accuracy - 0.5690



Train epoch 7:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 7:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 7
Train loss - 0.3768 | Test loss - 1.4948
TEST: f1 weighted - 0.5653 | accuracy - 0.5711



Train epoch 8:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 8:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 8
Train loss - 0.2693 | Test loss - 1.5586
TEST: f1 weighted - 0.5755 | accuracy - 0.5796



Train epoch 9:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 9:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 9
Train loss - 0.1872 | Test loss - 1.6070
TEST: f1 weighted - 0.5749 | accuracy - 0.5817



Train epoch 10:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 10:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 10
Train loss - 0.1360 | Test loss - 1.6682
TEST: f1 weighted - 0.5799 | accuracy - 0.5839



### Concatenation

In [24]:
model = BaseModel(
    input_size=batch[0].shape[1],
    hidden_size=512,
    n_classes=len(daily_labels_map['target_midas_and_entity2id']),
    batch_size=daily_train_loader.batch_size
)

model.to(DEVICE)

BaseModel(
  (linear_in): Linear(in_features=999, out_features=512, bias=True)
  (relu): ReLU()
  (clf): Linear(in_features=512, out_features=120, bias=True)
)

In [25]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [26]:
# train loop
NUM_EPOCHS = 10  # Задайте количество эпох

TRAIN_SIZE = int(np.ceil(len(daily_train_loader.dataset) / daily_train_loader.batch_size))
TEST_SIZE = int(np.ceil(len(daily_test_loader.dataset) / daily_test_loader.batch_size))

train_losses = list()
test_losses = list()

test_f1 = list()
test_acc = list()

In [27]:
for e in range(1, NUM_EPOCHS+1):
    
    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=daily_train_loader, 
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3., label_type='concatenation')
    
    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=daily_test_loader, 
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion, label_type='concatenation')
    
    message = f'Epoch: {e}\n'
    message += f'Train loss - {np.mean(epoch_train_losses):.4f} | Test loss - {np.mean(epoch_test_losses):.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

Train epoch 1:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 1:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 1
Train loss - 3.8350 | Test loss - 3.2671
TEST: f1 weighted - 0.1252 | accuracy - 0.2505



Train epoch 2:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 2:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 2
Train loss - 3.0736 | Test loss - 2.9968
TEST: f1 weighted - 0.2077 | accuracy - 0.3206



Train epoch 3:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 3:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 3
Train loss - 2.5982 | Test loss - 2.7862
TEST: f1 weighted - 0.2901 | accuracy - 0.3758



Train epoch 4:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 4:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 4
Train loss - 2.1431 | Test loss - 2.6670
TEST: f1 weighted - 0.3265 | accuracy - 0.3970



Train epoch 5:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 5:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 5
Train loss - 1.7331 | Test loss - 2.5791
TEST: f1 weighted - 0.3540 | accuracy - 0.4246



Train epoch 6:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 6:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 6
Train loss - 1.3598 | Test loss - 2.5301
TEST: f1 weighted - 0.4046 | accuracy - 0.4565



Train epoch 7:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 7:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 7
Train loss - 1.0080 | Test loss - 2.5171
TEST: f1 weighted - 0.4015 | accuracy - 0.4501



Train epoch 8:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 8:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 8
Train loss - 0.7256 | Test loss - 2.5165
TEST: f1 weighted - 0.4255 | accuracy - 0.4565



Train epoch 9:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 9:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 9
Train loss - 0.5017 | Test loss - 2.5462
TEST: f1 weighted - 0.4316 | accuracy - 0.4586



Train epoch 10:   0%|          | 0/59 [00:00<?, ?it/s]

Val epoch 10:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 10
Train loss - 0.3392 | Test loss - 2.6038
TEST: f1 weighted - 0.4323 | accuracy - 0.4565



## Topical Chat

In [6]:
topical_train, topical_test = train_test_split(topical, test_size=0.2, random_state=42)
len(topical_train), len(topical_test)

(8093, 2024)

In [7]:
tokenizer = English().tokenizer
topical_tfidf = pickle.load(open("models/topical_tfidf_3_08_300.pkl", 'rb'))
topical_tfidf

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x00000275C37A45E0>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x00000275C37A45E0>)

In [8]:
topical_train_dataset = SkillDataset(
    topical_train, topical_labels_map, 
    tokenizer=tokenizer,
    tfidf_model=topical_tfidf
)

len(topical_train_dataset)

8093

In [9]:
print(topical_train_dataset[42][0].shape, '\n') # x_vec
print(topical_train_dataset[42][1], '\n') # y labels: ([midas_id, entity_id], midas_and_entity_id)

(1005,) 

[2, 1, 6] 



In [10]:
topical_test_dataset = SkillDataset(
    topical_test, topical_labels_map, 
    tokenizer=tokenizer,
    tfidf_model=topical_tfidf
)

len(topical_test_dataset)

2024

In [11]:
topical_train_loader = DataLoader(
    topical_train_dataset, batch_size=32, 
    shuffle=True, collate_fn=collate_fn)

topical_test_loader = DataLoader(
    topical_test_dataset, batch_size=32, 
    shuffle=False, collate_fn=collate_fn)

In [12]:
for x, y in topical_train_loader:
    break

x.shape, y.shape

(torch.Size([32, 1005]), torch.Size([32, 3]))

In [13]:
progress_bar = tqdm(total=len(topical_train_loader.dataset), desc='Testing')

for batch in topical_train_loader:
    progress_bar.update(batch[0].size(0))
    
progress_bar.close()

Testing:   0%|          | 0/8093 [00:00<?, ?it/s]

In [14]:
progress_bar = tqdm(total=len(topical_test_loader.dataset), desc='Testing')

for batch in topical_test_loader:
    progress_bar.update(batch[0].size(0))
    
progress_bar.close()

Testing:   0%|          | 0/2024 [00:00<?, ?it/s]

### MIDAS

In [15]:
model = BaseModel(
    input_size=batch[0].shape[1],
    hidden_size=512,
    n_classes=len(topical_labels_map['target_midas2id']),
    batch_size=topical_train_loader.batch_size
)

model.to(DEVICE)

BaseModel(
  (linear_in): Linear(in_features=1005, out_features=512, bias=True)
  (relu): ReLU()
  (clf): Linear(in_features=512, out_features=13, bias=True)
)

In [16]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [17]:
# train loop
NUM_EPOCHS = 5  # Задайте количество эпох

TRAIN_SIZE = int(np.ceil(len(topical_train_loader.dataset) / topical_train_loader.batch_size))
TEST_SIZE = int(np.ceil(len(topical_test_loader.dataset) / topical_test_loader.batch_size))

train_losses = list()
test_losses = list()

test_f1 = list()
test_acc = list()

In [18]:
for e in range(1, NUM_EPOCHS+1):
    
    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=topical_train_loader, 
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3., label_type='midas')
    
    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=topical_test_loader, 
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion, label_type='midas')
    
    message = f'Epoch: {e}\n'
    message += f'Train loss - {np.mean(epoch_train_losses):.4f} | Test loss - {np.mean(epoch_test_losses):.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

Train epoch 1:   0%|          | 0/253 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Entity

In [None]:
model = BaseModel(
    input_size=batch[0].shape[1],
    hidden_size=512,
    n_classes=len(topical_labels_map['target_entity2id']),
    batch_size=topical_train_loader.batch_size
)

model.to(DEVICE)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [None]:
# train loop
NUM_EPOCHS = 5  # Задайте количество эпох

TRAIN_SIZE = int(np.ceil(len(topical_train_loader.dataset) / topical_train_loader.batch_size))
TEST_SIZE = int(np.ceil(len(topical_test_loader.dataset) / topical_test_loader.batch_size))

train_losses = list()
test_losses = list()

test_f1 = list()
test_acc = list()

In [None]:
for e in range(1, NUM_EPOCHS+1):
    
    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=topical_train_loader, 
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3., label_type='entity')
    
    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=topical_test_loader, 
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion, label_type='entity')
    
    message = f'Epoch: {e}\n'
    message += f'Train loss - {np.mean(epoch_train_losses):.4f} | Test loss - {np.mean(epoch_test_losses):.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

### Concatenation

In [None]:
model = BaseModel(
    input_size=batch[0].shape[1],
    hidden_size=512,
    n_classes=len(topical_labels_map['target_midas_and_entity2id']),
    batch_size=topical_train_loader.batch_size
)

model.to(DEVICE)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [None]:
# train loop
NUM_EPOCHS = 5  # Задайте количество эпох

TRAIN_SIZE = int(np.ceil(len(topical_train_loader.dataset) / topical_train_loader.batch_size))
TEST_SIZE = int(np.ceil(len(topical_test_loader.dataset) / topical_test_loader.batch_size))

train_losses = list()
test_losses = list()

test_f1 = list()
test_acc = list()

In [None]:
for e in range(1, NUM_EPOCHS+1):
    
    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=topical_train_loader, 
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3., label_type='concatenation')
    
    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=topical_test_loader, 
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion, label_type='concatenation')
    
    message = f'Epoch: {e}\n'
    message += f'Train loss - {np.mean(epoch_train_losses):.4f} | Test loss - {np.mean(epoch_test_losses):.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

# TFIDF + Catboost / LinearRegression / RandomForest

## DailyDialog

In [6]:
daily_tfidf = pickle.load(open("models/daily_tfidf_3_08_300.pkl", 'rb'))
daily_tfidf

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x00000266668A45E0>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x00000266668A45E0>)

In [7]:
daily_dataset = SkillDataset(
    daily, daily_labels_map, 
    tokenizer=tokenizer,
    tfidf_model=daily_tfidf
)

In [8]:
daily_loader = DataLoader(
    daily_dataset, batch_size=len(daily_dataset), 
    shuffle=False, collate_fn=collate_fn)

In [9]:
for X, y in daily_loader:
    break

In [10]:
X.shape, y.shape

(torch.Size([2352, 999]), torch.Size([2352, 3]))

In [11]:
X, y = X.numpy(), y.numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
y_train_midas, y_train_entity, y_train_concat = y_train[:,0], y_train[:,1], y_train[:,2]
y_test_midas, y_test_entity, y_test_concat = y_test[:,0], y_test[:,1], y_test[:,2]

### Midas

In [13]:
n_bins = len(daily_labels_map['target_midas2id'])
class_weights, _ = np.histogram(y_train_midas, bins=n_bins, density=True)
class_weights

array([0.72599681, 0.08867624, 0.05677831, 0.2169059 , 0.00574163,
       0.        , 0.01339713, 0.0414673 , 0.00574163, 0.03189793,
       0.00829346, 0.00510367])

#### Catboost

In [14]:
cb_train = Pool(X_train, label=y_train_midas)
cb_eval = Pool(X_test, label=y_test_midas)

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    'early_stopping_rounds': 5,
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

0:	learn: 0.6049973	total: 174ms	remaining: 2.43s
1:	learn: 0.6049973	total: 202ms	remaining: 1.31s
2:	learn: 0.6049973	total: 228ms	remaining: 913ms
3:	learn: 0.6049973	total: 254ms	remaining: 698ms
4:	learn: 0.6049973	total: 282ms	remaining: 564ms
5:	learn: 0.6049973	total: 308ms	remaining: 462ms
6:	learn: 0.6049973	total: 333ms	remaining: 380ms
7:	learn: 0.6049973	total: 359ms	remaining: 314ms
8:	learn: 0.6049973	total: 387ms	remaining: 258ms
9:	learn: 0.6049973	total: 415ms	remaining: 207ms
10:	learn: 0.6049973	total: 439ms	remaining: 160ms
11:	learn: 0.6049973	total: 464ms	remaining: 116ms
12:	learn: 0.6049973	total: 491ms	remaining: 75.6ms
13:	learn: 0.6049973	total: 517ms	remaining: 36.9ms
14:	learn: 0.6049973	total: 542ms	remaining: 0us
class =  (471, 1)


In [15]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (471, 1)


In [16]:
Counter(cb_pred.squeeze())

Counter({0: 471})

In [17]:
accuracy_score(y_test_midas, cb_pred.squeeze())

0.643312101910828

In [18]:
model.score(cb_eval)

0.643312101910828

In [19]:
f1_score(y_test_midas, cb_pred.squeeze(), average='weighted')

0.5036784673875475

#### LogReg

In [20]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, y_train_midas)

LogisticRegression(max_iter=500, random_state=42)

In [21]:
logreg_pred = lg.predict(X_test)

In [22]:
Counter(logreg_pred)

Counter({0: 361, 3: 62, 1: 33, 6: 6, 8: 1, 2: 5, 9: 2, 5: 1})

In [23]:
accuracy_score(y_test_midas, logreg_pred)

0.673036093418259

In [24]:
f1_score(y_test_midas, logreg_pred, average='weighted')

0.6356860621078079

#### RandomForest

In [25]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [26]:
rf.fit(X_train, y_train_midas)

RandomForestClassifier(max_depth=10, random_state=42)

In [27]:
rf_preds = rf.predict(X_test)

In [28]:
Counter(rf_preds)

Counter({0: 458, 3: 11, 9: 1, 6: 1})

In [29]:
accuracy_score(y_test_midas, rf_preds)

0.6709129511677282

In [30]:
f1_score(y_test_midas, rf_preds, average='weighted')

0.5604248121389984

### Entity

In [31]:
n_bins = len(daily_labels_map['target_entity2id'])
class_weights, _ = np.histogram(y_train_entity, bins=n_bins, density=True)
class_weights

array([0.15592457, 0.15761328, 0.23754574, 0.08893892, 0.10357444,
       0.09513088, 0.05797917, 0.04503237, 0.02589361, 0.03321137,
       0.02983394, 0.00168871, 0.00394033, 0.00900647, 0.00844357,
       0.0005629 , 0.00394033, 0.0005629 ])

#### Catboost

In [32]:
cb_train = Pool(X_train, label=y_train_entity)
cb_eval = Pool(X_test, label=y_test_entity)

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
    'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

0:	learn: 0.4610467	test: 0.4937825	best: 0.4937825 (0)	total: 51.2ms	remaining: 717ms
1:	learn: 0.4859751	test: 0.5281193	best: 0.5281193 (1)	total: 95.4ms	remaining: 620ms
2:	learn: 0.4782904	test: 0.4862280	best: 0.5281193 (1)	total: 135ms	remaining: 541ms
3:	learn: 0.4946227	test: 0.5057156	best: 0.5281193 (1)	total: 180ms	remaining: 494ms
4:	learn: 0.5054960	test: 0.5464646	best: 0.5464646 (4)	total: 221ms	remaining: 442ms
5:	learn: 0.4946294	test: 0.5008956	best: 0.5464646 (4)	total: 264ms	remaining: 396ms
6:	learn: 0.4940024	test: 0.5033186	best: 0.5464646 (4)	total: 307ms	remaining: 351ms
7:	learn: 0.4761318	test: 0.5084501	best: 0.5464646 (4)	total: 353ms	remaining: 309ms
8:	learn: 0.4952564	test: 0.5081645	best: 0.5464646 (4)	total: 394ms	remaining: 263ms
9:	learn: 0.4958833	test: 0.5081645	best: 0.5464646 (4)	total: 439ms	remaining: 219ms
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.5464646322
bestIteration = 4

Shrink model to first 5 iterations.
class

In [33]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (471, 1)


In [34]:
Counter(cb_pred.squeeze())

Counter({2: 391, 1: 42, 0: 38})

In [35]:
accuracy_score(y_test_entity, cb_pred.squeeze())

0.35668789808917195

In [36]:
model.score(cb_eval)

0.35668789808917195

In [37]:
f1_score(y_test_entity, cb_pred.squeeze(), average='weighted')

0.25978650845685963

#### LogReg

In [38]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, y_train_entity)

LogisticRegression(max_iter=500, random_state=42)

In [39]:
logreg_pred = lg.predict(X_test)

In [40]:
Counter(logreg_pred)

Counter({3: 35,
         1: 69,
         2: 144,
         0: 77,
         5: 42,
         4: 42,
         6: 23,
         7: 18,
         9: 5,
         8: 5,
         10: 10,
         12: 1})

In [41]:
accuracy_score(y_test_entity, logreg_pred)

0.5668789808917197

In [42]:
f1_score(y_test_entity, logreg_pred, average='weighted')

0.5580237049289674

#### RandomForest

In [43]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [44]:
rf.fit(X_train, y_train_entity)

RandomForestClassifier(max_depth=10, random_state=42)

In [45]:
rf_preds = rf.predict(X_test)

In [46]:
Counter(rf_preds)

Counter({2: 303, 1: 63, 0: 44, 4: 21, 5: 15, 6: 11, 10: 5, 7: 6, 8: 3})

In [47]:
accuracy_score(y_test_entity, rf_preds)

0.49469214437367304

In [48]:
f1_score(y_test_entity, rf_preds, average='weighted')

0.45174358189713515

### Concatenation

In [49]:
n_bins = len(daily_labels_map['target_midas_and_entity2id'])
class_weights, _ = np.histogram(y_train_concat, bins=n_bins, density=True)
class_weights

array([0.12488849, 0.01351607, 0.08920607, 0.18327792, 0.00162193,
       0.01730057, 0.0189225 , 0.00162193, 0.03676371, 0.00108129,
       0.00702836, 0.00432514, 0.01243478, 0.00054064, 0.02378828,
       0.00540643, 0.00270321, 0.0491985 , 0.02054443, 0.0416295 ,
       0.01621928, 0.00919093, 0.00540643, 0.0189225 , 0.00648771,
       0.00162193, 0.00919093, 0.03838564, 0.00865029, 0.01081286,
       0.02973536, 0.00540643, 0.00270321, 0.00270321, 0.00108129,
       0.00432514, 0.01621928, 0.00486579, 0.01946314, 0.00216257,
       0.00919093, 0.0037845 , 0.00270321, 0.00162193, 0.00108129,
       0.0037845 , 0.00162193, 0.00162193, 0.00162193, 0.00108129,
       0.00324386, 0.0037845 , 0.00324386, 0.00648771, 0.0037845 ,
       0.00054064, 0.00648771, 0.00540643, 0.00486579, 0.        ,
       0.00270321, 0.00054064, 0.00648771, 0.00919093, 0.00108129,
       0.00054064, 0.00540643, 0.00270321, 0.00216257, 0.00108129,
       0.00054064, 0.00054064, 0.00054064, 0.00054064, 0.00594

#### Catboost

In [50]:
cb_train = Pool(X_train, label=y_train_concat)
cb_eval = Pool(X_test, label=y_test_concat)

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    # 'early_stopping_rounds': 5,
    # 'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

0:	learn: 0.1866029	total: 397ms	remaining: 5.56s
1:	learn: 0.1818182	total: 829ms	remaining: 5.39s
2:	learn: 0.1802233	total: 1.25s	remaining: 5.02s
3:	learn: 0.2020202	total: 1.68s	remaining: 4.62s
4:	learn: 0.1993620	total: 2.1s	remaining: 4.21s
5:	learn: 0.2126528	total: 2.53s	remaining: 3.8s
6:	learn: 0.2025518	total: 2.96s	remaining: 3.38s
7:	learn: 0.2014886	total: 3.39s	remaining: 2.96s
8:	learn: 0.2036151	total: 3.81s	remaining: 2.54s
9:	learn: 0.2349814	total: 4.24s	remaining: 2.12s
10:	learn: 0.2349814	total: 4.64s	remaining: 1.69s
11:	learn: 0.2408293	total: 5.04s	remaining: 1.26s
12:	learn: 0.2402977	total: 5.44s	remaining: 837ms
13:	learn: 0.2402977	total: 5.94s	remaining: 424ms
14:	learn: 0.2365763	total: 6.48s	remaining: 0us
class =  (471, 1)


In [51]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (471, 1)


In [52]:
Counter(cb_pred.squeeze())

Counter({3: 416, 0: 44, 2: 11})

In [53]:
accuracy_score(y_test_concat, cb_pred.squeeze())

0.2781316348195329

In [54]:
model.score(cb_eval)

0.2781316348195329

In [55]:
f1_score(y_test_concat, cb_pred.squeeze(), average='weighted')

0.17032043526799548

#### LogReg

In [56]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, y_train_concat)

LogisticRegression(max_iter=500, random_state=42)

In [57]:
logreg_pred = lg.predict(X_test)

In [58]:
Counter(logreg_pred)

Counter({3: 154,
         0: 97,
         2: 53,
         23: 3,
         17: 35,
         27: 19,
         14: 11,
         20: 6,
         19: 20,
         8: 10,
         6: 4,
         38: 10,
         18: 3,
         28: 3,
         56: 2,
         78: 1,
         5: 6,
         1: 5,
         12: 3,
         67: 1,
         21: 2,
         77: 1,
         30: 11,
         66: 1,
         74: 2,
         98: 1,
         10: 2,
         15: 1,
         40: 1,
         26: 1,
         58: 1,
         29: 1})

In [59]:
accuracy_score(y_test_concat, logreg_pred)

0.45010615711252655

In [60]:
f1_score(y_test_concat, logreg_pred, average='weighted')

0.3940860475574395

#### RandomForest

In [61]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [62]:
rf.fit(X_train, y_train_concat)

RandomForestClassifier(max_depth=10, random_state=42)

In [63]:
rf_preds = rf.predict(X_test)

In [64]:
Counter(rf_preds)

Counter({3: 349,
         0: 60,
         2: 40,
         17: 12,
         38: 4,
         19: 1,
         66: 1,
         98: 1,
         15: 1,
         40: 1,
         30: 1})

In [65]:
accuracy_score(y_test_concat, rf_preds)

0.37154989384288745

In [66]:
f1_score(y_test_concat, rf_preds, average='weighted')

0.26456882329679

## Topical Chat

In [198]:
topical_tfidf = pickle.load(open("models/topical_tfidf_3_08_300.pkl", 'rb'))
topical_tfidf

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x00000290DA2CF790>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x00000290DA2CF790>)

In [199]:
topical_dataset = SkillDataset(
    topical, topical_labels_map, 
    tokenizer=tokenizer,
    tfidf_model=topical_tfidf
)

In [200]:
topical_loader = DataLoader(
    topical_dataset, batch_size=len(topical_dataset), 
    shuffle=False, collate_fn=collate_fn)

In [201]:
for X, y in topical_loader:
    break

In [202]:
X.shape, y.shape

(torch.Size([10117, 1005]), torch.Size([10117, 3]))

In [203]:
X, y = X.numpy(), y.numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [204]:
y_train_midas, y_train_entity, y_train_concat = y_train[:,0], y_train[:,1], y_train[:,2]
y_test_midas, y_test_entity, y_test_concat = y_test[:,0], y_test[:,1], y_test[:,2]

### Midas

In [207]:
n_bins = len(topical_labels_map['target_midas2id'])
class_weights, _ = np.histogram(y_train_midas, bins=n_bins, density=True)
class_weights

array([3.86187652e-01, 3.27958318e-02, 4.91937477e-01, 9.63795873e-02,
       7.63005066e-03, 1.68664278e-02, 1.19135879e-02, 1.20474484e-02,
       2.30240125e-02, 2.94493183e-03, 9.37023765e-04, 4.01581614e-04,
       2.67721076e-04])

#### Catboost

In [208]:
cb_train = Pool(X_train, label=y_train_midas)
cb_eval = Pool(X_test, label=y_test_midas)

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
    'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

0:	learn: 0.6012048	test: 0.6155530	best: 0.6155530 (0)	total: 161ms	remaining: 2.25s
1:	learn: 0.6009832	test: 0.6161942	best: 0.6161942 (1)	total: 291ms	remaining: 1.89s
2:	learn: 0.6009832	test: 0.6161942	best: 0.6161942 (1)	total: 427ms	remaining: 1.71s
3:	learn: 0.6009832	test: 0.6161942	best: 0.6161942 (1)	total: 564ms	remaining: 1.55s
4:	learn: 0.6009832	test: 0.6161942	best: 0.6161942 (1)	total: 691ms	remaining: 1.38s
5:	learn: 0.6009832	test: 0.6161942	best: 0.6161942 (1)	total: 845ms	remaining: 1.27s
6:	learn: 0.6009832	test: 0.6161942	best: 0.6161942 (1)	total: 978ms	remaining: 1.12s
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.6161941837
bestIteration = 1

Shrink model to first 2 iterations.
class =  (2024, 1)


In [209]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (2024, 1)


In [210]:
Counter(cb_pred.squeeze())

Counter({2: 2024})

In [211]:
accuracy_score(y_test_midas, cb_pred.squeeze())

0.47480237154150196

In [212]:
model.score(cb_eval)

0.47480237154150196

In [213]:
f1_score(y_test_midas, cb_pred.squeeze(), average='weighted')

0.30571864593057513

#### LogReg

In [214]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, y_train_midas)

LogisticRegression(max_iter=500, random_state=42)

In [215]:
logreg_pred = lg.predict(X_test)

In [216]:
Counter(logreg_pred)

Counter({0: 777, 2: 1160, 3: 53, 1: 27, 8: 4, 7: 1, 5: 2})

In [217]:
accuracy_score(y_test_midas, logreg_pred)

0.4881422924901186

In [218]:
f1_score(y_test_midas, logreg_pred, average='weighted')

0.45664922988596107

#### RandomForest

In [219]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [220]:
rf.fit(X_train, y_train_midas)

RandomForestClassifier(max_depth=10, random_state=42)

In [221]:
rf_preds = rf.predict(X_test)

In [222]:
Counter(rf_preds)

Counter({2: 1675, 0: 349})

In [223]:
accuracy_score(y_test_midas, rf_preds)

0.5177865612648221

In [224]:
f1_score(y_test_midas, rf_preds, average='weighted')

0.4339993924562257

### Entity

In [225]:
n_bins = len(topical_labels_map['target_entity2id'])
class_weights, _ = np.histogram(y_train_entity, bins=n_bins, density=True)
class_weights

array([0.17363934, 0.27340066, 0.11549942, 0.06815507, 0.05983078,
       0.09598939, 0.00429221, 0.05931052, 0.04162141, 0.02419245,
       0.02510292, 0.0165185 , 0.04773456, 0.00650335, 0.01027529,
       0.01053542, 0.0003902 , 0.01560803, 0.0011706 , 0.00286147])

#### Catboost

In [226]:
cb_train = Pool(X_train, label=y_train_entity)
cb_eval = Pool(X_test, label=y_test_entity)

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
    'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

0:	learn: 0.5439662	test: 0.5318617	best: 0.5318617 (0)	total: 228ms	remaining: 3.19s
1:	learn: 0.5438802	test: 0.5318617	best: 0.5318617 (0)	total: 432ms	remaining: 2.81s
2:	learn: 0.5438802	test: 0.5318617	best: 0.5318617 (0)	total: 636ms	remaining: 2.54s
3:	learn: 0.5438802	test: 0.5318617	best: 0.5318617 (0)	total: 828ms	remaining: 2.28s
4:	learn: 0.5438802	test: 0.5318617	best: 0.5318617 (0)	total: 1.03s	remaining: 2.06s
5:	learn: 0.5438802	test: 0.5318617	best: 0.5318617 (0)	total: 1.22s	remaining: 1.82s
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.5318617065
bestIteration = 0

Shrink model to first 1 iterations.
class =  (2024, 1)


In [227]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (2024, 1)


In [228]:
Counter(cb_pred.squeeze())

Counter({1: 1839, 0: 184, 5: 1})

In [229]:
accuracy_score(y_test_entity, cb_pred.squeeze())

0.2885375494071146

In [230]:
model.score(cb_eval)

0.2885375494071146

In [231]:
f1_score(y_test_entity, cb_pred.squeeze(), average='weighted')

0.17329069484306936

#### LogReg

In [232]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, y_train_entity)

LogisticRegression(max_iter=500, random_state=42)

In [233]:
logreg_pred = lg.predict(X_test)

In [234]:
Counter(logreg_pred)

Counter({12: 84,
         1: 720,
         5: 190,
         4: 90,
         0: 355,
         2: 252,
         7: 101,
         10: 30,
         3: 87,
         9: 22,
         11: 12,
         8: 56,
         6: 5,
         15: 11,
         17: 8,
         13: 1})

In [235]:
accuracy_score(y_test_entity, logreg_pred)

0.43478260869565216

In [236]:
f1_score(y_test_entity, logreg_pred, average='weighted')

0.4183170730559262

#### RandomForest

In [237]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [238]:
rf.fit(X_train, y_train_entity)

RandomForestClassifier(max_depth=10, random_state=42)

In [239]:
rf_preds = rf.predict(X_test)

In [240]:
Counter(rf_preds)

Counter({7: 79, 1: 1619, 0: 123, 4: 35, 2: 65, 5: 56, 12: 20, 8: 11, 3: 16})

In [241]:
accuracy_score(y_test_entity, rf_preds)

0.3601778656126482

In [242]:
f1_score(y_test_entity, rf_preds, average='weighted')

0.2934047444061142

### Concatenation

In [243]:
n_bins = len(topical_labels_map['target_midas_and_entity2id'])
class_weights, _ = np.histogram(y_train_concat, bins=n_bins, density=True)
class_weights

array([5.50765560e-02, 3.60546304e-03, 7.69579869e-02, 8.11850815e-02,
       5.10981141e-02, 3.68005882e-02, 1.31785890e-01, 1.18109996e-02,
       2.39949781e-02, 2.30003676e-02, 4.32655564e-02, 9.94610493e-04,
       2.76004412e-02, 1.80273152e-02, 2.08868203e-02, 6.09198927e-03,
       2.17571045e-02, 1.56651153e-02, 1.72813573e-02, 1.61624205e-03,
       4.42601669e-02, 1.36758943e-03, 4.10276828e-03, 1.74056836e-03,
       6.21631558e-04, 3.85411566e-03, 1.49191574e-02, 1.11893680e-03,
       6.09198927e-03, 3.12059042e-02, 9.94610493e-04, 1.54164626e-02,
       2.48652623e-03, 5.09737877e-03, 7.83255763e-03, 4.35142090e-03,
       2.52382412e-02, 2.48652623e-03, 2.73517885e-03, 6.21631558e-03,
       3.10815779e-03, 3.23248410e-03, 1.24326312e-04, 3.72978935e-04,
       3.72978935e-03, 3.23248410e-03, 6.34064189e-03, 2.85950517e-03,
       5.59468402e-03, 6.46496820e-03, 6.96227345e-03, 9.94610493e-04,
       4.35142090e-03, 1.24326312e-03, 4.47574722e-03, 7.45957869e-04,
      

#### Catboost

In [251]:
cb_train = Pool(X_train, label=y_train_concat)
cb_eval = Pool(X_test, label=y_test_concat)

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    # 'early_stopping_rounds': 5,
    # 'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

0:	learn: 0.1497591	total: 2.76s	remaining: 38.6s
1:	learn: 0.1495119	total: 5.65s	remaining: 36.7s
2:	learn: 0.1497591	total: 8.45s	remaining: 33.8s
3:	learn: 0.1495119	total: 11.1s	remaining: 30.6s
4:	learn: 0.1354257	total: 13.8s	remaining: 27.6s
5:	learn: 0.1485234	total: 16.9s	remaining: 25.4s
6:	learn: 0.1479056	total: 20.2s	remaining: 23.1s
7:	learn: 0.1479056	total: 23.5s	remaining: 20.6s
8:	learn: 0.1397504	total: 27.2s	remaining: 18.1s
9:	learn: 0.1398740	total: 30.9s	remaining: 15.5s
10:	learn: 0.1365377	total: 34.2s	remaining: 12.5s
11:	learn: 0.1365377	total: 37.5s	remaining: 9.38s
12:	learn: 0.1323366	total: 40.7s	remaining: 6.27s
13:	learn: 0.1309774	total: 44s	remaining: 3.14s
14:	learn: 0.1309774	total: 47.3s	remaining: 0us
class =  (2024, 1)


In [252]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (2024, 1)


In [253]:
Counter(cb_pred.squeeze())

Counter({6: 2024})

In [254]:
accuracy_score(y_test_concat, cb_pred.squeeze())

0.13537549407114624

In [255]:
model.score(cb_eval)

0.13537549407114624

In [256]:
f1_score(y_test_concat, cb_pred.squeeze(), average='weighted')

0.03228275489599136

#### LogReg

In [257]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, y_train_concat)

LogisticRegression(max_iter=500, random_state=42)

In [258]:
logreg_pred = lg.predict(X_test)

In [259]:
Counter(logreg_pred)

Counter({12: 66,
         3: 218,
         10: 123,
         36: 53,
         6: 481,
         8: 53,
         29: 66,
         0: 121,
         2: 219,
         16: 22,
         4: 145,
         20: 111,
         5: 74,
         7: 9,
         39: 2,
         9: 21,
         17: 23,
         14: 32,
         49: 2,
         13: 34,
         34: 1,
         59: 14,
         18: 36,
         31: 27,
         28: 3,
         26: 28,
         46: 6,
         100: 3,
         67: 3,
         50: 4,
         22: 3,
         66: 1,
         64: 2,
         44: 4,
         33: 3,
         93: 3,
         54: 2,
         35: 1,
         48: 1,
         45: 2,
         121: 1,
         25: 1})

In [260]:
accuracy_score(y_test_concat, logreg_pred)

0.22529644268774704

In [261]:
f1_score(y_test_concat, logreg_pred, average='weighted')

0.1946221711091106

#### RandomForest

In [262]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [263]:
rf.fit(X_train, y_train_concat)

RandomForestClassifier(max_depth=10, random_state=42)

In [264]:
rf_preds = rf.predict(X_test)

In [265]:
Counter(rf_preds)

Counter({12: 73,
         6: 1585,
         36: 28,
         2: 97,
         8: 29,
         20: 19,
         4: 34,
         3: 83,
         10: 39,
         5: 21,
         0: 6,
         44: 1,
         13: 7,
         14: 1,
         18: 1})

In [266]:
accuracy_score(y_test_concat, rf_preds)

0.19812252964426877

In [267]:
f1_score(y_test_concat, rf_preds, average='weighted')

0.13548179096806773