In [1]:
import json
import pickle
import statistics

from collections import Counter

import torch
import numpy as np

from catboost import CatBoostClassifier, Pool
from spacy.lang.en import English
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import DataLoader
from torch import nn
from tqdm.notebook import tqdm

from utils.preprocessing import spacy_tokenize, dummy_fn
# from utils.autoskill_torch import SkillDataset, collate_fn 
from utils.base_torch_utils import train_single_model, evaluate_single_model, BaseModel

In [2]:
Midas2ID = {
    "appreciation": 0, "command": 1, "comment": 2,"complaint": 3,
    "dev_command": 4, "neg_answer": 5, "open_question_factual": 6,
    "open_question_opinion": 7, "opinion": 8, "other_answers": 9,
    "pos_answer": 10, "statement": 11, "yes_no_question": 12,
}

ID2Midas = list(Midas2ID.keys())

Entity2ID = {'misc': 0, 'product': 1, 'food': 2, 'location': 3, 'business': 4,
             'event': 5, 'work_of_art': 6, 'org': 7, 'occupation': 8, 'fac': 9,
             'academic_discipline': 10, 'law': 11, 'film': 12, 'person': 13,
             'language': 14, 'type_of_sport': 15, 'nation': 16, 'literary_work': 17,
             'norp': 18, 'music_genre': 19, 'sports_event': 20, 'song': 21,
             'animal': 22, 'sports_venue': 23, 'sports_season': 24,
             'chemical_element': 25, 'political_party': 26, 'sport_team': 27,
             'national': 28, 'championship': 29, 'association_football_club': 30,
             'sports_league': 31}

EntityTargets2ID = {'product': 0, 'food': 1, 'location': 2, 'business': 3,
                    'event': 4, 'work_of_art': 5, 'org': 6, 'occupation': 7,
                    'fac': 8, 'academic_discipline': 9, 'law': 10, 'person': 11,
                    'language': 12, 'type_of_sport': 13, 'nation': 14,
                    'norp': 15, 'music_genre': 16, 'sports_event': 17,
                    'animal': 18, 'sports_venue': 19, 'sports_season': 20,
                    'chemical_element': 21, 'political_party': 22,
                    'sport_team': 23, 'national': 24, 'championship': 25,
                    'association_football_club': 26, 'sports_league': 27}

from itertools import product
midas_entity2id = list(product(list(Midas2ID.keys()), list(EntityTargets2ID.keys())))
midas_entity2id = {f'{labels[0]}_{labels[1]}': i for i, labels in enumerate(midas_entity2id)}

labels_map = { 
    'midas2id': Midas2ID,
    'entities2id': Entity2ID,
    'target_entity2id': EntityTargets2ID,
    'target_midas2id': Midas2ID,
    'target_midas_and_entity2id': midas_entity2id
}

In [3]:
print(list(midas_entity2id.keys())[list(midas_entity2id.values()).index(308)])

statement_product


In [4]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
with open('data/single_entity_daily_dataset_v3.json', 'r', encoding="utf8") as f:
    daily = json.load(f)

In [6]:
with open('data/single_entity_topical_dataset_v3.json', 'r', encoding="utf8") as f:
    topical = json.load(f)

In [7]:
tokenizer = English().tokenizer

In [8]:
import numpy as np
import torch


class SkillDataset(torch.utils.data.Dataset):
    
    """ customized Dataset class from torch """
    
    def __init__(self, data: list, vars2id: dict, tokenizer, tfidf_model):
        self.data = data
        self.vars2id = vars2id
        self.tokenizer = tokenizer
        self.tfidf_model = tfidf_model
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        """ shape each sample into a proper """
        sample = self.data[index]
        
        x_tfidf = self.__vectorize(sample['previous_text'])
        x_midas = self.__norm_midas(sample['midas_vectors'])
        x_entities = self.__ohencode(sample['previous_entities'])
        x_i = self.__concat_vecs(x_tfidf, x_midas, x_entities)
        
        y_midas = self.data[index]['predict']['midas']
        y_entity = self.data[index]['predict']['entities'][0]['label']
        y_i = self.__encode_labels(y_midas, y_entity)
        
        return x_i, y_i
        
    
    def __norm_midas(self, midas_vectors: list) -> np.array:
        """ 
        takes midas vectors of all sentences in the utterance
        averages them and then applies softmax
        """
        vecs = np.zeros((len(midas_vectors), 13))
        
        for i, vec in enumerate(midas_vectors):
            # get max probability per each midas labels
            vecs[i] = np.max(np.array(vec), axis=0)

        # return normalized
        return vecs
    
    def __tokenize(self, texts: list) -> list:
        """ transform list of strings into a list of list of tokens using spaCy """
        return [[token.lower_ for token in self.tokenizer(" ".join(ut))] for ut in texts]
    
    def __vectorize(self, texts: list) -> np.array:
        """ 
        Using tfidf, vectorize each utterance in the sample
        
        return matrix N_utterances * vocab_size of tfidf_model 
        
        input:
        texts: list - list of strings
        
        output:
        matrix - np.array
        """
        texts = self.__tokenize(texts)
        matrix = self.tfidf_model.transform(texts)
        return matrix.todense()
    
    def __ohencode(self, entities) -> torch.Tensor:
        """ one-hot encoding of entities per each sample """
        entities = [[ent['label'] for sent in ut for ent in sent] for ut in entities]
        ohe_vec = np.zeros((len(entities), len(self.vars2id['entities2id'])))
        
        for i, ut in enumerate(entities):
            for ent in set(ut):
                ohe_vec[i][self.vars2id['entities2id'][ent]] = 1
        
        return ohe_vec
    
    def __concat_vecs(self, tfidf_vec, midas_vec, ohe_vec) -> np.array:
        """ 
        Takes tfidf, midas and one-hot encoded entities vectors and
        transforms them into (1, vec_size). 
        
        Vec_size comes from:
        1. [tfidf utterance(i-2)]
        2. [midas proba distribution utterance(i-2)]
        3. [entity type one-hot utterance(i-2)]
        4. [tfidf (i-1)]
        5. [midas (i-1)][entity (i-1)]
        6. [tfidf (i)] 
        7. [midas (i)]
        8. [entity (i)]

        vec_size = n_utterances * (tfidf.shape[1] + midas.shape[1] + ohe.shape[1])
        """
        assert tfidf_vec.shape[0] == midas_vec.shape[0] == ohe_vec.shape[0]

        n_ut = tfidf_vec.shape[0]
        ut_vec_size = tfidf_vec.shape[1] + midas_vec.shape[1] + ohe_vec.shape[1]

        vecs = np.zeros((n_ut, ut_vec_size))

        vecs[:,:tfidf_vec.shape[1]] = tfidf_vec
        vecs[:,tfidf_vec.shape[1]:tfidf_vec.shape[1]+midas_vec.shape[1]] = midas_vec
        vecs[:,tfidf_vec.shape[1]+midas_vec.shape[1]:] = ohe_vec

        # concat utterance vectors into a sample vector
        return vecs.reshape(-1)
    
    def __encode_labels(self, midas_label: str, entity_label: str) -> tuple:
        """ 
        Returns idx of midas label, entity label and their concatenation.
        
        the first two will be used when a separate classifier is applied
        
        while their concatenation is used with a universal classifier
        
        output:
        [target_midas_id: int, target_entity_id: int], concatenation_id: int
        """
        midas_id = self.vars2id['target_midas2id'][midas_label]
        entity_id = self.vars2id['target_entity2id'][entity_label]
        concat_id = self.vars2id['target_midas_and_entity2id'][f"{midas_label}_{entity_label}"]
        
        return [midas_id, entity_id, concat_id]
    
    
def collate_fn(batch) -> tuple:
    """ a custom collate function to shape a batch properly """
    
    batch_size = len(batch)
    # create empty Tensors to concatenate vectorized utterances and labels
    X_batch = torch.zeros(batch_size, batch[0][0].shape[0])
    y_batch = torch.zeros(batch_size, 3).type(torch.long)
    
    for i, sample in enumerate(batch):
        X_batch[i] = torch.Tensor(batch[i][0]).type(torch.float32)
        y_batch[i] = torch.Tensor(batch[i][1])
        
    return X_batch, y_batch

# TFIDF + Linear_ReLu_Linear

## Daily Dataset

In [44]:
daily_train, daily_test = train_test_split(daily, test_size=0.2, random_state=42)
len(daily_train), len(daily_test)

(3115, 779)

In [23]:
daily_tfidf = pickle.load(open("models/daily_tfidf_3_08_300.pkl", 'rb'))
daily_tfidf

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x0000020EC41E14C0>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x0000020EC41E14C0>)

In [46]:
daily_train_dataset = SkillDataset(
    daily_train, labels_map, 
    tokenizer=tokenizer,
    tfidf_model=daily_tfidf
)

len(daily_train_dataset)

3115

In [47]:
print(daily_train_dataset[42][0].shape, '\n') # x_vec
print(daily_train_dataset[42][1], '\n') # y labels: ([midas_id, entity_id], midas_and_entity_id)

(1035,) 

[12, 0, 336] 



In [49]:
daily_test_dataset = SkillDataset(
    daily_test, labels_map, 
    tokenizer=tokenizer,
    tfidf_model=daily_tfidf
)

len(daily_test_dataset)

779

In [50]:
daily_train_loader = DataLoader(
    daily_train_dataset, batch_size=32, 
    shuffle=True, collate_fn=collate_fn)

daily_test_loader = DataLoader(
    daily_test_dataset, batch_size=32, 
    shuffle=False, collate_fn=collate_fn)

In [52]:
for x, y in daily_train_loader:
    break

x.shape, y.shape

(torch.Size([32, 1035]), torch.Size([32, 3]))

In [53]:
progress_bar = tqdm(total=len(daily_train_loader.dataset), desc='Testing')

for batch in daily_train_loader:
    progress_bar.update(batch[0].size(0))
    
progress_bar.close()

Testing:   0%|          | 0/3115 [00:00<?, ?it/s]

In [54]:
progress_bar = tqdm(total=len(daily_test_loader.dataset), desc='Testing')

for batch in daily_test_loader:
    progress_bar.update(batch[0].size(0))
    
progress_bar.close()

Testing:   0%|          | 0/779 [00:00<?, ?it/s]

### Midas

In [56]:
model = BaseModel(
    input_size=batch[0].shape[1],
    hidden_size=512,
    n_classes=len(labels_map['target_midas2id']),
    batch_size=daily_train_loader.batch_size
)

model.to(DEVICE)

BaseModel(
  (linear_in): Linear(in_features=1035, out_features=512, bias=True)
  (relu): ReLU()
  (clf): Linear(in_features=512, out_features=13, bias=True)
)

In [57]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [58]:
# train loop
NUM_EPOCHS = 10  # Задайте количество эпох

TRAIN_SIZE = int(np.ceil(len(daily_train_loader.dataset) / daily_train_loader.batch_size))
TEST_SIZE = int(np.ceil(len(daily_test_loader.dataset) / daily_test_loader.batch_size))

train_losses = list()
test_losses = list()

test_f1 = list()
test_acc = list()

In [59]:
for e in range(1, NUM_EPOCHS+1):
    
    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=daily_train_loader, 
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3., label_type='midas')
    
    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=daily_test_loader, 
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion, label_type='midas')
    
    message = f'Epoch: {e}\n'
    message += f'Train loss - {np.mean(epoch_train_losses):.4f} | Test loss - {np.mean(epoch_test_losses):.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

Train epoch 1:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 1:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 1
Train loss - 1.7462 | Test loss - 1.5095
TEST: f1 weighted - 0.3900 | accuracy - 0.4557



Train epoch 2:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 2:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 2
Train loss - 1.3846 | Test loss - 1.4631
TEST: f1 weighted - 0.4069 | accuracy - 0.4531



Train epoch 3:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 3:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 3
Train loss - 1.1630 | Test loss - 1.4804
TEST: f1 weighted - 0.4091 | accuracy - 0.4621



Train epoch 4:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 4:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 4
Train loss - 0.9598 | Test loss - 1.5107
TEST: f1 weighted - 0.4456 | accuracy - 0.4814



Train epoch 5:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 5:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 5
Train loss - 0.7563 | Test loss - 1.5750
TEST: f1 weighted - 0.4731 | accuracy - 0.4942



Train epoch 6:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 6:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 6
Train loss - 0.5817 | Test loss - 1.6446
TEST: f1 weighted - 0.4912 | accuracy - 0.5019



Train epoch 7:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 7:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 7
Train loss - 0.4274 | Test loss - 1.7210
TEST: f1 weighted - 0.5018 | accuracy - 0.5109



Train epoch 8:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 8:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 8
Train loss - 0.3047 | Test loss - 1.8189
TEST: f1 weighted - 0.5006 | accuracy - 0.5071



Train epoch 9:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 9:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 9
Train loss - 0.2121 | Test loss - 1.9273
TEST: f1 weighted - 0.5080 | accuracy - 0.5160



Train epoch 10:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 10:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 10
Train loss - 0.1423 | Test loss - 2.0273
TEST: f1 weighted - 0.5070 | accuracy - 0.5122



### Entity

In [60]:
model = BaseModel(
    input_size=batch[0].shape[1],
    hidden_size=512,
    n_classes=len(labels_map['target_entity2id']),
    batch_size=daily_train_loader.batch_size
)

model.to(DEVICE)

BaseModel(
  (linear_in): Linear(in_features=1035, out_features=512, bias=True)
  (relu): ReLU()
  (clf): Linear(in_features=512, out_features=28, bias=True)
)

In [61]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [62]:
# train loop
NUM_EPOCHS = 10  # Задайте количество эпох

TRAIN_SIZE = int(np.ceil(len(daily_train_loader.dataset) / daily_train_loader.batch_size))
TEST_SIZE = int(np.ceil(len(daily_test_loader.dataset) / daily_test_loader.batch_size))

train_losses = list()
test_losses = list()

test_f1 = list()
test_acc = list()

In [63]:
for e in range(1, NUM_EPOCHS+1):
    
    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=daily_train_loader, 
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3., label_type='entity')
    
    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=daily_test_loader, 
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion, label_type='entity')
    
    message = f'Epoch: {e}\n'
    message += f'Train loss - {np.mean(epoch_train_losses):.4f} | Test loss - {np.mean(epoch_test_losses):.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

Train epoch 1:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 1:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 1
Train loss - 2.2947 | Test loss - 1.9740
TEST: f1 weighted - 0.3226 | accuracy - 0.4403



Train epoch 2:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 2:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 2
Train loss - 1.7177 | Test loss - 1.8124
TEST: f1 weighted - 0.3972 | accuracy - 0.4801



Train epoch 3:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 3:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 3
Train loss - 1.4090 | Test loss - 1.7533
TEST: f1 weighted - 0.4624 | accuracy - 0.5045



Train epoch 4:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 4:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 4
Train loss - 1.1312 | Test loss - 1.7070
TEST: f1 weighted - 0.4788 | accuracy - 0.5186



Train epoch 5:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 5:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 5
Train loss - 0.8829 | Test loss - 1.7543
TEST: f1 weighted - 0.4905 | accuracy - 0.5109



Train epoch 6:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 6:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 6
Train loss - 0.6799 | Test loss - 1.8124
TEST: f1 weighted - 0.4862 | accuracy - 0.5135



Train epoch 7:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 7:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 7
Train loss - 0.5113 | Test loss - 1.8851
TEST: f1 weighted - 0.5079 | accuracy - 0.5250



Train epoch 8:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 8:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 8
Train loss - 0.3730 | Test loss - 1.9726
TEST: f1 weighted - 0.5000 | accuracy - 0.5186



Train epoch 9:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 9:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 9
Train loss - 0.2753 | Test loss - 2.0850
TEST: f1 weighted - 0.4936 | accuracy - 0.5173



Train epoch 10:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 10:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 10
Train loss - 0.1951 | Test loss - 2.1631
TEST: f1 weighted - 0.4846 | accuracy - 0.4994



### Concatenation

In [64]:
model = BaseModel(
    input_size=batch[0].shape[1],
    hidden_size=512,
    n_classes=len(labels_map['target_midas_and_entity2id']),
    batch_size=daily_train_loader.batch_size
)

model.to(DEVICE)

BaseModel(
  (linear_in): Linear(in_features=1035, out_features=512, bias=True)
  (relu): ReLU()
  (clf): Linear(in_features=512, out_features=364, bias=True)
)

In [65]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [66]:
# train loop
NUM_EPOCHS = 10  # Задайте количество эпох

TRAIN_SIZE = int(np.ceil(len(daily_train_loader.dataset) / daily_train_loader.batch_size))
TEST_SIZE = int(np.ceil(len(daily_test_loader.dataset) / daily_test_loader.batch_size))

train_losses = list()
test_losses = list()

test_f1 = list()
test_acc = list()

In [67]:
for e in range(1, NUM_EPOCHS+1):
    
    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=daily_train_loader, 
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3., label_type='concatenation')
    
    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=daily_test_loader, 
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion, label_type='concatenation')
    
    message = f'Epoch: {e}\n'
    message += f'Train loss - {np.mean(epoch_train_losses):.4f} | Test loss - {np.mean(epoch_test_losses):.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

Train epoch 1:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 1:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 1
Train loss - 4.3210 | Test loss - 3.7844
TEST: f1 weighted - 0.0699 | accuracy - 0.1772



Train epoch 2:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 2:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 2
Train loss - 3.3991 | Test loss - 3.5015
TEST: f1 weighted - 0.0920 | accuracy - 0.1938



Train epoch 3:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 3:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 3
Train loss - 2.9141 | Test loss - 3.3518
TEST: f1 weighted - 0.1545 | accuracy - 0.2169



Train epoch 4:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 4:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 4
Train loss - 2.4540 | Test loss - 3.2736
TEST: f1 weighted - 0.1881 | accuracy - 0.2465



Train epoch 5:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 5:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 5
Train loss - 1.9958 | Test loss - 3.2268
TEST: f1 weighted - 0.2111 | accuracy - 0.2516



Train epoch 6:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 6:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 6
Train loss - 1.5759 | Test loss - 3.2322
TEST: f1 weighted - 0.2443 | accuracy - 0.2760



Train epoch 7:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 7:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 7
Train loss - 1.2058 | Test loss - 3.2802
TEST: f1 weighted - 0.2697 | accuracy - 0.3017



Train epoch 8:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 8:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 8
Train loss - 0.8861 | Test loss - 3.3080
TEST: f1 weighted - 0.2881 | accuracy - 0.3119



Train epoch 9:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 9:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 9
Train loss - 0.6457 | Test loss - 3.4075
TEST: f1 weighted - 0.2933 | accuracy - 0.3158



Train epoch 10:   0%|          | 0/98 [00:00<?, ?it/s]

Val epoch 10:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 10
Train loss - 0.4614 | Test loss - 3.4966
TEST: f1 weighted - 0.2990 | accuracy - 0.3209



## Topical Chat

In [69]:
topical_train, topical_test = train_test_split(topical, test_size=0.2, random_state=42)
len(topical_train), len(topical_test)

(6240, 1560)

In [70]:
tokenizer = English().tokenizer
topical_tfidf = pickle.load(open("models/topical_tfidf_3_08_300.pkl", 'rb'))
topical_tfidf

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x0000020EC41E14C0>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x0000020EC41E14C0>)

In [72]:
topical_train_dataset = SkillDataset(
    topical_train, labels_map, 
    tokenizer=tokenizer,
    tfidf_model=topical_tfidf
)

len(topical_train_dataset)

6240

In [73]:
print(topical_train_dataset[42][0].shape, '\n') # x_vec
print(topical_train_dataset[42][1], '\n') # y labels: ([midas_id, entity_id], midas_and_entity_id)

(1035,) 

[11, 2, 310] 



In [74]:
topical_test_dataset = SkillDataset(
    topical_test, labels_map, 
    tokenizer=tokenizer,
    tfidf_model=topical_tfidf
)

len(topical_test_dataset)

1560

In [75]:
topical_train_loader = DataLoader(
    topical_train_dataset, batch_size=32, 
    shuffle=True, collate_fn=collate_fn)

topical_test_loader = DataLoader(
    topical_test_dataset, batch_size=32, 
    shuffle=False, collate_fn=collate_fn)

In [76]:
for x, y in topical_train_loader:
    break

x.shape, y.shape

(torch.Size([32, 1035]), torch.Size([32, 3]))

In [77]:
progress_bar = tqdm(total=len(topical_train_loader.dataset), desc='Testing')

for batch in topical_train_loader:
    progress_bar.update(batch[0].size(0))
    
progress_bar.close()

Testing:   0%|          | 0/6240 [00:00<?, ?it/s]

In [78]:
progress_bar = tqdm(total=len(topical_test_loader.dataset), desc='Testing')

for batch in topical_test_loader:
    progress_bar.update(batch[0].size(0))
    
progress_bar.close()

Testing:   0%|          | 0/1560 [00:00<?, ?it/s]

### MIDAS

In [80]:
model = BaseModel(
    input_size=batch[0].shape[1],
    hidden_size=512,
    n_classes=len(labels_map['target_midas2id']),
    batch_size=topical_train_loader.batch_size
)

model.to(DEVICE)

BaseModel(
  (linear_in): Linear(in_features=1035, out_features=512, bias=True)
  (relu): ReLU()
  (clf): Linear(in_features=512, out_features=13, bias=True)
)

In [81]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [83]:
# train loop
NUM_EPOCHS = 10

TRAIN_SIZE = int(np.ceil(len(topical_train_loader.dataset) / topical_train_loader.batch_size))
TEST_SIZE = int(np.ceil(len(topical_test_loader.dataset) / topical_test_loader.batch_size))

train_losses = list()
test_losses = list()

test_f1 = list()
test_acc = list()

In [84]:
for e in range(1, NUM_EPOCHS+1):
    
    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=topical_train_loader, 
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3., label_type='midas')
    
    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=topical_test_loader, 
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion, label_type='midas')
    
    message = f'Epoch: {e}\n'
    message += f'Train loss - {np.mean(epoch_train_losses):.4f} | Test loss - {np.mean(epoch_test_losses):.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

Train epoch 1:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 1:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 1
Train loss - 1.4306 | Test loss - 1.3691
TEST: f1 weighted - 0.4127 | accuracy - 0.4647



Train epoch 2:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 2:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 2
Train loss - 1.2616 | Test loss - 1.3555
TEST: f1 weighted - 0.4006 | accuracy - 0.4506



Train epoch 3:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 3:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 3
Train loss - 1.1586 | Test loss - 1.3765
TEST: f1 weighted - 0.4185 | accuracy - 0.4603



Train epoch 4:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 4:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 4
Train loss - 1.0394 | Test loss - 1.4053
TEST: f1 weighted - 0.4189 | accuracy - 0.4596



Train epoch 5:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 5:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 5
Train loss - 0.8868 | Test loss - 1.5015
TEST: f1 weighted - 0.4097 | accuracy - 0.4519



Train epoch 6:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 6:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 6
Train loss - 0.6938 | Test loss - 1.5611
TEST: f1 weighted - 0.4098 | accuracy - 0.4417



Train epoch 7:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 7:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 7
Train loss - 0.4905 | Test loss - 1.6563
TEST: f1 weighted - 0.4000 | accuracy - 0.4199



Train epoch 8:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 8:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 8
Train loss - 0.3187 | Test loss - 1.8041
TEST: f1 weighted - 0.4056 | accuracy - 0.4301



Train epoch 9:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 9:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 9
Train loss - 0.1952 | Test loss - 1.9315
TEST: f1 weighted - 0.4095 | accuracy - 0.4372



Train epoch 10:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 10:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 10
Train loss - 0.1161 | Test loss - 2.0090
TEST: f1 weighted - 0.4017 | accuracy - 0.4199



### Entity

In [85]:
model = BaseModel(
    input_size=batch[0].shape[1],
    hidden_size=512,
    n_classes=len(labels_map['target_entity2id']),
    batch_size=topical_train_loader.batch_size
)

model.to(DEVICE)

BaseModel(
  (linear_in): Linear(in_features=1035, out_features=512, bias=True)
  (relu): ReLU()
  (clf): Linear(in_features=512, out_features=28, bias=True)
)

In [86]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [88]:
# train loop
NUM_EPOCHS = 10  # Задайте количество эпох

TRAIN_SIZE = int(np.ceil(len(topical_train_loader.dataset) / topical_train_loader.batch_size))
TEST_SIZE = int(np.ceil(len(topical_test_loader.dataset) / topical_test_loader.batch_size))

train_losses = list()
test_losses = list()

test_f1 = list()
test_acc = list()

In [89]:
for e in range(1, NUM_EPOCHS+1):
    
    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=topical_train_loader, 
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3., label_type='entity')
    
    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=topical_test_loader, 
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion, label_type='entity')
    
    message = f'Epoch: {e}\n'
    message += f'Train loss - {np.mean(epoch_train_losses):.4f} | Test loss - {np.mean(epoch_test_losses):.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

Train epoch 1:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 1:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 1
Train loss - 2.4103 | Test loss - 2.1734
TEST: f1 weighted - 0.2955 | accuracy - 0.3692



Train epoch 2:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 2:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 2
Train loss - 1.9962 | Test loss - 2.0809
TEST: f1 weighted - 0.3462 | accuracy - 0.3833



Train epoch 3:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 3:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 3
Train loss - 1.7571 | Test loss - 2.0842
TEST: f1 weighted - 0.3541 | accuracy - 0.3853



Train epoch 4:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 4:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 4
Train loss - 1.5391 | Test loss - 2.1518
TEST: f1 weighted - 0.3437 | accuracy - 0.3744



Train epoch 5:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 5:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 5
Train loss - 1.3208 | Test loss - 2.2852
TEST: f1 weighted - 0.3383 | accuracy - 0.3577



Train epoch 6:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 6:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 6
Train loss - 1.1084 | Test loss - 2.3595
TEST: f1 weighted - 0.3299 | accuracy - 0.3532



Train epoch 7:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 7:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 7
Train loss - 0.8956 | Test loss - 2.4658
TEST: f1 weighted - 0.3309 | accuracy - 0.3487



Train epoch 8:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 8:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 8
Train loss - 0.7059 | Test loss - 2.5846
TEST: f1 weighted - 0.3267 | accuracy - 0.3404



Train epoch 9:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 9:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 9
Train loss - 0.5334 | Test loss - 2.7520
TEST: f1 weighted - 0.3145 | accuracy - 0.3231



Train epoch 10:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 10:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 10
Train loss - 0.3888 | Test loss - 2.8923
TEST: f1 weighted - 0.3246 | accuracy - 0.3429



### Concatenation

In [90]:
model = BaseModel(
    input_size=batch[0].shape[1],
    hidden_size=512,
    n_classes=len(labels_map['target_midas_and_entity2id']),
    batch_size=topical_train_loader.batch_size
)

model.to(DEVICE)

BaseModel(
  (linear_in): Linear(in_features=1035, out_features=512, bias=True)
  (relu): ReLU()
  (clf): Linear(in_features=512, out_features=364, bias=True)
)

In [91]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [92]:
# train loop
NUM_EPOCHS = 10  # Задайте количество эпох

TRAIN_SIZE = int(np.ceil(len(topical_train_loader.dataset) / topical_train_loader.batch_size))
TEST_SIZE = int(np.ceil(len(topical_test_loader.dataset) / topical_test_loader.batch_size))

train_losses = list()
test_losses = list()

test_f1 = list()
test_acc = list()

In [93]:
for e in range(1, NUM_EPOCHS+1):
    
    epoch_train_losses = train_single_model(
        model=model, device=DEVICE, dataloader=topical_train_loader, 
        n_batches=TRAIN_SIZE, epoch=e,
        loss_fn=criterion, optimizer=optimizer,
        clip=3., label_type='concatenation')
    
    epoch_test_losses, epoch_f1, epoch_acc = evaluate_single_model(
        model=model, device=DEVICE, dataloader=topical_test_loader, 
        n_batches=TEST_SIZE, epoch=e, loss_fn=criterion, label_type='concatenation')
    
    message = f'Epoch: {e}\n'
    message += f'Train loss - {np.mean(epoch_train_losses):.4f} | Test loss - {np.mean(epoch_test_losses):.4f}\n'
    message += f'TEST: f1 weighted - {epoch_f1:.4f} | accuracy - {epoch_acc:.4f}\n'

    print(message)

Train epoch 1:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 1:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 1
Train loss - 4.0227 | Test loss - 3.6634
TEST: f1 weighted - 0.0835 | accuracy - 0.1545



Train epoch 2:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 2:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 2
Train loss - 3.3871 | Test loss - 3.5370
TEST: f1 weighted - 0.1391 | accuracy - 0.1846



Train epoch 3:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 3:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 3
Train loss - 3.0217 | Test loss - 3.5035
TEST: f1 weighted - 0.1368 | accuracy - 0.1846



Train epoch 4:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 4:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 4
Train loss - 2.6613 | Test loss - 3.5438
TEST: f1 weighted - 0.1429 | accuracy - 0.1801



Train epoch 5:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 5:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 5
Train loss - 2.2836 | Test loss - 3.6031
TEST: f1 weighted - 0.1532 | accuracy - 0.1904



Train epoch 6:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 6:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 6
Train loss - 1.8879 | Test loss - 3.7537
TEST: f1 weighted - 0.1446 | accuracy - 0.1744



Train epoch 7:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 7:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 7
Train loss - 1.5001 | Test loss - 3.9175
TEST: f1 weighted - 0.1457 | accuracy - 0.1750



Train epoch 8:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 8:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 8
Train loss - 1.1509 | Test loss - 4.0875
TEST: f1 weighted - 0.1399 | accuracy - 0.1609



Train epoch 9:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 9:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 9
Train loss - 0.8594 | Test loss - 4.2629
TEST: f1 weighted - 0.1412 | accuracy - 0.1667



Train epoch 10:   0%|          | 0/195 [00:00<?, ?it/s]

Val epoch 10:   0%|          | 0/49 [00:00<?, ?it/s]

Epoch: 10
Train loss - 0.6292 | Test loss - 4.4815
TEST: f1 weighted - 0.1321 | accuracy - 0.1538



# TFIDF + Catboost / LinearRegression / RandomForest

## DailyDialog

In [9]:
daily_tfidf = pickle.load(open("models/daily_tfidf_3_08_300.pkl", 'rb'))
daily_tfidf

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x000002153191F550>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x000002153191F550>)

In [10]:
daily_dataset = SkillDataset(
    daily, labels_map, 
    tokenizer=tokenizer,
    tfidf_model=daily_tfidf
)

In [11]:
daily_loader = DataLoader(
    daily_dataset, batch_size=len(daily_dataset), 
    shuffle=False, collate_fn=collate_fn)

In [12]:
for X, y in daily_loader:
    break

In [13]:
X.shape, y.shape

(torch.Size([3894, 1035]), torch.Size([3894, 3]))

In [14]:
X, y = X.numpy(), y.numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
y_train_midas, y_train_entity, y_train_concat = y_train[:,0], y_train[:,1], y_train[:,2]
y_test_midas, y_test_entity, y_test_concat = y_test[:,0], y_test[:,1], y_test[:,2]

### Midas

In [16]:
n_bins = len(labels_map['target_midas2id'])
class_weights, _ = np.histogram(y_train_midas, bins=n_bins, density=True)
class_weights

array([6.67736758e-02, 1.85903984e-02, 7.96731359e-03, 1.21406683e-02,
       1.36582519e-02, 7.43615935e-02, 0.00000000e+00, 5.95651539e-02,
       2.64059536e-01, 3.79395885e-04, 1.17612724e-02, 4.95491026e-01,
       1.57069896e-01])

#### Catboost

In [59]:
cb_train = Pool(X_train, label=y_train_midas)
cb_eval = Pool(X_test, label=y_test_midas)

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, eval_set=cb_eval, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

Learning rate set to 0.5
0:	learn: 0.4375602	test: 0.4133504	best: 0.4133504 (0)	total: 432ms	remaining: 6.04s
1:	learn: 0.4439807	test: 0.4223363	best: 0.4223363 (1)	total: 844ms	remaining: 5.49s
2:	learn: 0.4520064	test: 0.4146341	best: 0.4223363 (1)	total: 1.32s	remaining: 5.29s
3:	learn: 0.4577849	test: 0.4133504	best: 0.4223363 (1)	total: 1.78s	remaining: 4.89s
4:	learn: 0.4648475	test: 0.4159178	best: 0.4223363 (1)	total: 2.22s	remaining: 4.45s
5:	learn: 0.4789727	test: 0.4287548	best: 0.4287548 (5)	total: 2.65s	remaining: 3.97s
6:	learn: 0.4853933	test: 0.4326059	best: 0.4326059 (6)	total: 3.06s	remaining: 3.5s
7:	learn: 0.4837881	test: 0.4287548	best: 0.4326059 (6)	total: 3.48s	remaining: 3.05s
8:	learn: 0.4860353	test: 0.4184852	best: 0.4326059 (6)	total: 3.91s	remaining: 2.61s
9:	learn: 0.4886035	test: 0.4274711	best: 0.4326059 (6)	total: 4.38s	remaining: 2.19s
10:	learn: 0.4991974	test: 0.4261874	best: 0.4326059 (6)	total: 4.81s	remaining: 1.75s
11:	learn: 0.5036918	test: 0.

In [60]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (779, 1)


In [61]:
Counter(cb_pred.squeeze())

Counter({11: 532, 8: 162, 12: 85})

In [62]:
accuracy_score(y_test_midas, cb_pred.squeeze())

0.4326059050064185

In [63]:
model.score(cb_eval)

0.4326059050064185

In [64]:
f1_score(y_test_midas, cb_pred.squeeze(), average='weighted')

0.36064206475430116

#### LogReg

In [23]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, y_train_midas)

LogisticRegression(max_iter=500, random_state=42)

In [24]:
logreg_pred = lg.predict(X_test)

In [25]:
Counter(logreg_pred)

Counter({8: 184,
         11: 425,
         12: 96,
         7: 21,
         6: 32,
         1: 10,
         2: 4,
         10: 3,
         5: 3,
         4: 1})

In [26]:
accuracy_score(y_test_midas, logreg_pred)

0.4762516046213094

In [27]:
f1_score(y_test_midas, logreg_pred, average='weighted')

0.45340793576035904

#### RandomForest

In [28]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [29]:
rf.fit(X_train, y_train_midas)

RandomForestClassifier(max_depth=10, random_state=42)

In [30]:
rf_preds = rf.predict(X_test)

In [31]:
Counter(rf_preds)

Counter({11: 680, 8: 50, 7: 3, 12: 45, 2: 1})

In [32]:
accuracy_score(y_test_midas, rf_preds)

0.4672657252888318

In [33]:
f1_score(y_test_midas, rf_preds, average='weighted')

0.3626381153058806

### Entity

In [34]:
n_bins = len(labels_map['target_entity2id'])
class_weights, _ = np.histogram(y_train_entity, bins=n_bins, density=True)
class_weights

array([0.3974597 , 0.17117733, 0.16336102, 0.07464582, 0.05236932,
       0.        , 0.05627748, 0.06331216, 0.0371275 , 0.03790914,
       0.0246214 , 0.        , 0.04064485, 0.02305813, 0.03165608,
       0.01211529, 0.        , 0.00547142, 0.01133366, 0.00273571,
       0.00390816, 0.00195408, 0.        , 0.00156326, 0.00117245,
       0.00078163, 0.00078163, 0.00195408])

#### Catboost

In [65]:
cb_train = Pool(X_train, label=y_train_entity)
cb_eval = Pool(X_test, label=y_test_entity)

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    'early_stopping_rounds': 5,
    # 'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

Learning rate set to 0.5
0:	learn: 0.4086677	total: 883ms	remaining: 12.4s
1:	learn: 0.4208668	total: 1.85s	remaining: 12.1s
2:	learn: 0.4279294	total: 2.73s	remaining: 10.9s
3:	learn: 0.4430177	total: 3.61s	remaining: 9.93s
4:	learn: 0.4420546	total: 4.48s	remaining: 8.96s
5:	learn: 0.4459069	total: 5.38s	remaining: 8.06s
6:	learn: 0.4484751	total: 6.28s	remaining: 7.18s
7:	learn: 0.4516854	total: 7.17s	remaining: 6.28s
8:	learn: 0.4565008	total: 8.05s	remaining: 5.37s
9:	learn: 0.4574639	total: 8.92s	remaining: 4.46s
10:	learn: 0.4577849	total: 9.79s	remaining: 3.56s
11:	learn: 0.4568218	total: 10.7s	remaining: 2.67s
12:	learn: 0.4674157	total: 11.6s	remaining: 1.78s
13:	learn: 0.4674157	total: 12.5s	remaining: 890ms
14:	learn: 0.4738363	total: 13.4s	remaining: 0us
class =  (779, 1)


In [66]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (779, 1)


In [67]:
Counter(cb_pred.squeeze())

Counter({2: 85,
         0: 544,
         3: 30,
         1: 84,
         6: 12,
         12: 13,
         5: 3,
         9: 1,
         10: 3,
         7: 2,
         4: 2})

In [68]:
accuracy_score(y_test_entity, cb_pred.squeeze())

0.4287548138639281

In [69]:
model.score(cb_eval)

0.4287548138639281

In [70]:
f1_score(y_test_entity, cb_pred.squeeze(), average='weighted')

0.34967552857158113

#### LogReg

In [42]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, y_train_entity)

LogisticRegression(max_iter=500, random_state=42)

In [43]:
logreg_pred = lg.predict(X_test)

In [44]:
Counter(logreg_pred)

Counter({2: 129,
         0: 361,
         3: 35,
         1: 115,
         4: 25,
         11: 8,
         6: 34,
         8: 8,
         5: 22,
         10: 8,
         12: 18,
         7: 6,
         16: 1,
         13: 4,
         9: 5})

In [45]:
accuracy_score(y_test_entity, logreg_pred)

0.503209242618742

In [46]:
f1_score(y_test_entity, logreg_pred, average='weighted')

0.4683790779119247

#### RandomForest

In [47]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [48]:
rf.fit(X_train, y_train_entity)

RandomForestClassifier(max_depth=10, random_state=42)

In [49]:
rf_preds = rf.predict(X_test)

In [50]:
Counter(rf_preds)

Counter({2: 39, 0: 641, 1: 89, 12: 9, 3: 1})

In [51]:
accuracy_score(y_test_entity, rf_preds)

0.4249037227214377

In [52]:
f1_score(y_test_entity, rf_preds, average='weighted')

0.3201967718051575

### Concatenation

In [53]:
n_bins = len(labels_map['target_midas_and_entity2id'])
class_weights, _ = np.histogram(y_train_concat, bins=n_bins, density=True)
class_weights

array([0.02471231, 0.01270919, 0.00494246, 0.0031773 , 0.00141213,
       0.00494246, 0.0021182 , 0.00141213, 0.0010591 , 0.00176517,
       0.0021182 , 0.        , 0.00141213, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00035303, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00564853, 0.00282426, 0.00282426, 0.        , 0.00035303,
       0.0010591 , 0.00035303, 0.00070607, 0.00247123, 0.        ,
       0.00035303, 0.        , 0.        , 0.00035303, 0.        ,
       0.00035303, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00353033, 0.00035303, 0.        , 0.        ,
       0.00070607, 0.        , 0.00070607, 0.00035303, 0.        ,
       0.00070607, 0.        , 0.        , 0.        , 0.     

#### Catboost

In [71]:
cb_train = Pool(X_train, label=y_train_concat)
cb_eval = Pool(X_test, label=y_test_concat)

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    # 'early_stopping_rounds': 5,
    # 'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

Learning rate set to 0.5
0:	learn: 0.1666132	total: 5.93s	remaining: 1m 23s
1:	learn: 0.1235955	total: 12s	remaining: 1m 18s
2:	learn: 0.1441413	total: 18.8s	remaining: 1m 15s
3:	learn: 0.1566613	total: 25.3s	remaining: 1m 9s
4:	learn: 0.1617978	total: 31.7s	remaining: 1m 3s
5:	learn: 0.1778491	total: 38.1s	remaining: 57.1s
6:	learn: 0.1865169	total: 44.4s	remaining: 50.8s
7:	learn: 0.1939005	total: 50.8s	remaining: 44.5s
8:	learn: 0.2105939	total: 57.5s	remaining: 38.3s
9:	learn: 0.2218299	total: 1m 4s	remaining: 32.2s
10:	learn: 0.2288925	total: 1m 10s	remaining: 25.8s
11:	learn: 0.2333868	total: 1m 17s	remaining: 19.4s
12:	learn: 0.2382022	total: 1m 24s	remaining: 12.9s
13:	learn: 0.2439807	total: 1m 30s	remaining: 6.47s
14:	learn: 0.2510433	total: 1m 37s	remaining: 0us
class =  (779, 1)


In [72]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (779, 1)


In [73]:
Counter(cb_pred.squeeze())

Counter({308: 275,
         338: 16,
         319: 1,
         28: 10,
         310: 112,
         314: 14,
         336: 60,
         224: 68,
         339: 2,
         309: 25,
         225: 59,
         311: 12,
         168: 11,
         316: 13,
         348: 3,
         228: 5,
         340: 1,
         226: 12,
         337: 16,
         318: 9,
         196: 11,
         315: 4,
         312: 3,
         231: 3,
         170: 14,
         227: 1,
         197: 6,
         29: 1,
         342: 2,
         202: 2,
         230: 2,
         313: 2,
         320: 1,
         237: 1,
         346: 2})

In [74]:
accuracy_score(y_test_concat, cb_pred.squeeze())

0.1386392811296534

In [75]:
model.score(cb_eval)

0.1386392811296534

In [76]:
f1_score(y_test_concat, cb_pred.squeeze(), average='weighted')

0.09746762093343321

#### LogReg

In [77]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, y_train_concat)

LogisticRegression(max_iter=500, random_state=42)

In [78]:
logreg_pred = lg.predict(X_test)

In [79]:
Counter(logreg_pred)

Counter({310: 79,
         308: 247,
         224: 57,
         28: 16,
         311: 22,
         168: 7,
         204: 1,
         225: 43,
         337: 9,
         233: 3,
         336: 42,
         199: 1,
         29: 5,
         231: 2,
         309: 36,
         319: 9,
         178: 3,
         316: 7,
         229: 4,
         228: 9,
         312: 8,
         313: 10,
         236: 10,
         314: 21,
         226: 24,
         197: 11,
         315: 2,
         232: 2,
         196: 6,
         324: 1,
         174: 1,
         170: 10,
         338: 7,
         237: 3,
         227: 5,
         348: 2,
         171: 4,
         33: 1,
         342: 4,
         181: 1,
         344: 1,
         320: 7,
         339: 3,
         198: 3,
         30: 1,
         230: 7,
         341: 1,
         59: 1,
         346: 1,
         63: 1,
         112: 2,
         318: 3,
         180: 1,
         200: 1,
         280: 1,
         340: 2,
         172: 1,
         202: 1,
     

In [80]:
accuracy_score(y_test_concat, logreg_pred)

0.2862644415917843

In [81]:
f1_score(y_test_concat, logreg_pred, average='weighted')

0.26027189200954154

#### RandomForest

In [82]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [83]:
rf.fit(X_train, y_train_concat)

RandomForestClassifier(max_depth=10, random_state=42)

In [84]:
rf_preds = rf.predict(X_test)

In [85]:
Counter(rf_preds)

Counter({308: 663,
         319: 2,
         310: 31,
         225: 59,
         197: 2,
         309: 11,
         224: 3,
         336: 2,
         63: 1,
         112: 1,
         337: 1,
         339: 1,
         226: 1,
         281: 1})

In [86]:
accuracy_score(y_test_concat, rf_preds)

0.20025673940949937

In [87]:
f1_score(y_test_concat, rf_preds, average='weighted')

0.11367263555855998

## Topical Chat

In [88]:
topical_tfidf = pickle.load(open("models/topical_tfidf_3_08_300.pkl", 'rb'))
topical_tfidf

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x000002153191F550>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x000002153191F550>)

In [89]:
topical_dataset = SkillDataset(
    topical, labels_map, 
    tokenizer=tokenizer,
    tfidf_model=topical_tfidf
)

In [90]:
topical_loader = DataLoader(
    topical_dataset, batch_size=len(topical_dataset), 
    shuffle=False, collate_fn=collate_fn)

In [91]:
for X, y in topical_loader:
    break

In [92]:
X.shape, y.shape

(torch.Size([7800, 1035]), torch.Size([7800, 3]))

In [93]:
X, y = X.numpy(), y.numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [94]:
y_train_midas, y_train_entity, y_train_concat = y_train[:,0], y_train[:,1], y_train[:,2]
y_test_midas, y_test_entity, y_test_concat = y_test[:,0], y_test[:,1], y_test[:,2]

### Midas

In [96]:
n_bins = len(labels_map['target_midas2id'])
class_weights, _ = np.histogram(y_train_midas, bins=n_bins, density=True)
class_weights

array([0.00069444, 0.01944444, 0.0578125 , 0.00173611, 0.00138889,
       0.00798611, 0.01614583, 0.01111111, 0.47395833, 0.00052083,
       0.02083333, 0.39739583, 0.07430556])

#### Catboost

In [103]:
cb_train = Pool(X_train, label=y_train_midas)
cb_eval = Pool(X_test, label=y_test_midas)

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
    'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

Learning rate set to 0.5
0:	learn: 0.4381410	test: 0.4352564	best: 0.4352564 (0)	total: 947ms	remaining: 13.3s
1:	learn: 0.4536859	test: 0.4448718	best: 0.4448718 (1)	total: 1.98s	remaining: 12.9s
2:	learn: 0.4791667	test: 0.4544872	best: 0.4544872 (2)	total: 3.03s	remaining: 12.1s
3:	learn: 0.4886218	test: 0.4685897	best: 0.4685897 (3)	total: 4.09s	remaining: 11.2s
4:	learn: 0.4972756	test: 0.4698718	best: 0.4698718 (4)	total: 5.13s	remaining: 10.3s
5:	learn: 0.5040064	test: 0.4762821	best: 0.4762821 (5)	total: 6.17s	remaining: 9.25s
6:	learn: 0.5054487	test: 0.4762821	best: 0.4762821 (5)	total: 7.3s	remaining: 8.34s
7:	learn: 0.5068910	test: 0.4807692	best: 0.4807692 (7)	total: 8.42s	remaining: 7.36s
8:	learn: 0.5134615	test: 0.4788462	best: 0.4807692 (7)	total: 9.48s	remaining: 6.32s
9:	learn: 0.5173077	test: 0.4730769	best: 0.4807692 (7)	total: 10.7s	remaining: 5.36s
10:	learn: 0.5213141	test: 0.4762821	best: 0.4807692 (7)	total: 11.9s	remaining: 4.34s
11:	learn: 0.5241987	test: 0.

In [104]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (1560, 1)


In [105]:
Counter(cb_pred.squeeze())

Counter({8: 1124, 11: 428, 10: 1, 12: 5, 2: 2})

In [106]:
accuracy_score(y_test_midas, cb_pred.squeeze())

0.48525641025641025

In [107]:
model.score(cb_eval)

0.48525641025641025

In [108]:
f1_score(y_test_midas, cb_pred.squeeze(), average='weighted')

0.4194700797177774

#### LogReg

In [109]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, y_train_midas)

LogisticRegression(max_iter=500, random_state=42)

In [110]:
logreg_pred = lg.predict(X_test)

In [111]:
Counter(logreg_pred)

Counter({11: 614, 8: 871, 12: 42, 2: 25, 1: 1, 10: 6, 6: 1})

In [112]:
accuracy_score(y_test_midas, logreg_pred)

0.4512820512820513

In [113]:
f1_score(y_test_midas, logreg_pred, average='weighted')

0.4146449010196535

#### RandomForest

In [114]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [115]:
rf.fit(X_train, y_train_midas)

RandomForestClassifier(max_depth=10, random_state=42)

In [116]:
rf_preds = rf.predict(X_test)

In [117]:
Counter(rf_preds)

Counter({8: 1207, 11: 353})

In [118]:
accuracy_score(y_test_midas, rf_preds)

0.49166666666666664

In [119]:
f1_score(y_test_midas, rf_preds, average='weighted')

0.4144003559248664

### Entity

In [121]:
n_bins = len(labels_map['target_entity2id'])
class_weights, _ = np.histogram(y_train_entity, bins=n_bins, density=True)
class_weights

array([2.61585945e-01, 1.99430199e-02, 1.39767331e-01, 6.96343780e-02,
       2.74216524e-02, 1.04202279e-01, 6.88034188e-02, 5.84995252e-02,
       1.32953466e-02, 1.99430199e-02, 1.87796771e-02, 6.91358025e-02,
       1.09686610e-02, 5.05223172e-02, 6.64767331e-03, 1.91120608e-02,
       1.86134853e-02, 4.65337132e-03, 1.81149098e-02, 2.16049383e-03,
       3.32383666e-03, 1.66191833e-04, 6.81386515e-03, 1.91120608e-02,
       4.98575499e-04, 3.49002849e-03, 3.32383666e-04, 1.49572650e-03])

#### Catboost

In [124]:
cb_train = Pool(X_train, label=y_train_entity)
cb_eval = Pool(X_test, label=y_test_entity)

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
    'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

Learning rate set to 0.5
0:	learn: 0.2541667	test: 0.2698718	best: 0.2698718 (0)	total: 2.36s	remaining: 33s
1:	learn: 0.1535256	test: 0.1416667	best: 0.2698718 (0)	total: 4.72s	remaining: 30.7s
2:	learn: 0.2591346	test: 0.2634615	best: 0.2698718 (0)	total: 7.08s	remaining: 28.3s
3:	learn: 0.2860577	test: 0.3012821	best: 0.3012821 (3)	total: 9.49s	remaining: 26.1s
4:	learn: 0.3064103	test: 0.3038462	best: 0.3038462 (4)	total: 12s	remaining: 24s
5:	learn: 0.3142628	test: 0.3038462	best: 0.3038462 (4)	total: 14.8s	remaining: 22.2s
6:	learn: 0.3379808	test: 0.3326923	best: 0.3326923 (6)	total: 17.5s	remaining: 20s
7:	learn: 0.3415064	test: 0.3352564	best: 0.3352564 (7)	total: 20.2s	remaining: 17.7s
8:	learn: 0.3461538	test: 0.3352564	best: 0.3352564 (7)	total: 22.9s	remaining: 15.3s
9:	learn: 0.3504808	test: 0.3384615	best: 0.3384615 (9)	total: 25.7s	remaining: 12.8s
10:	learn: 0.3540064	test: 0.3391026	best: 0.3391026 (10)	total: 28.4s	remaining: 10.3s
11:	learn: 0.3584936	test: 0.341666

In [125]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (1560, 1)


In [126]:
Counter(cb_pred.squeeze())

Counter({0: 744,
         11: 98,
         2: 269,
         7: 73,
         3: 64,
         6: 31,
         12: 11,
         5: 168,
         23: 8,
         13: 64,
         1: 18,
         18: 10,
         4: 2})

In [127]:
accuracy_score(y_test_entity, cb_pred.squeeze())

0.34615384615384615

In [128]:
model.score(cb_eval)

0.34615384615384615

In [129]:
f1_score(y_test_entity, cb_pred.squeeze(), average='weighted')

0.2964713469004042

#### LogReg

In [130]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, y_train_entity)

LogisticRegression(max_iter=500, random_state=42)

In [131]:
logreg_pred = lg.predict(X_test)

In [132]:
Counter(logreg_pred)

Counter({18: 11,
         16: 20,
         2: 234,
         0: 605,
         5: 183,
         10: 10,
         12: 16,
         11: 100,
         4: 35,
         3: 77,
         7: 78,
         6: 75,
         15: 11,
         13: 63,
         20: 1,
         1: 17,
         8: 5,
         9: 5,
         23: 9,
         22: 5})

In [133]:
accuracy_score(y_test_entity, logreg_pred)

0.3641025641025641

In [134]:
f1_score(y_test_entity, logreg_pred, average='weighted')

0.33884176833264446

#### RandomForest

In [135]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [136]:
rf.fit(X_train, y_train_entity)

RandomForestClassifier(max_depth=10, random_state=42)

In [137]:
rf_preds = rf.predict(X_test)

In [138]:
Counter(rf_preds)

Counter({0: 1349, 2: 113, 7: 20, 5: 38, 13: 36, 18: 2, 11: 2})

In [139]:
accuracy_score(y_test_entity, rf_preds)

0.3192307692307692

In [140]:
f1_score(y_test_entity, rf_preds, average='weighted')

0.2079391210083521

### Concatenation

In [141]:
n_bins = len(labels_map['target_midas_and_entity2id'])
class_weights, _ = np.histogram(y_train_concat, bins=n_bins, density=True)
class_weights

array([0.0003214 , 0.        , 0.        , 0.0001607 , 0.        ,
       0.0001607 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00482094, 0.00064279,
       0.0003214 , 0.00144628, 0.0003214 , 0.00080349, 0.00144628,
       0.00305326, 0.        , 0.00224977, 0.0003214 , 0.00112489,
       0.0001607 , 0.00096419, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.0001607 , 0.        , 0.0001607 , 0.        ,
       0.        , 0.01060606, 0.00096419, 0.00530303, 0.00176768,
       0.00385675, 0.00883838, 0.00401745, 0.0078742 , 0.0001607 ,
       0.00128558, 0.00192837, 0.00128558, 0.        , 0.00112489,
       0.0003214 , 0.00208907, 0.0003214 , 0.0001607 , 0.00016

#### Catboost

In [143]:
cb_train = Pool(X_train, label=y_train_concat)
cb_eval = Pool(X_test, label=y_test_concat)

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    'early_stopping_rounds': 5,
    # 'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.5
0:	learn: 0.1174679	total: 18.9s	remaining: 4m 25s
1:	learn: 0.0673077	total: 40.7s	remaining: 4m 24s
2:	learn: 0.1102564	total: 1m	remaining: 4m 1s
3:	learn: 0.1081731	total: 1m 20s	remaining: 3m 42s
4:	learn: 0.0883013	total: 1m 40s	remaining: 3m 21s
5:	learn: 0.0943910	total: 2m	remaining: 3m 1s
6:	learn: 0.1211538	total: 2m 21s	remaining: 2m 42s
7:	learn: 0.1350962	total: 2m 47s	remaining: 2m 26s
8:	learn: 0.1383013	total: 3m 14s	remaining: 2m 9s
9:	learn: 0.1469551	total: 3m 38s	remaining: 1m 49s
10:	learn: 0.1559295	total: 3m 59s	remaining: 1m 26s
11:	learn: 0.1636218	total: 4m 19s	remaining: 1m 4s
12:	learn: 0.1669872	total: 4m 40s	remaining: 43.2s
13:	learn: 0.1737179	total: 5m 1s	remaining: 21.6s
14:	learn: 0.1820513	total: 5m 26s	remaining: 0us
class =  (1560, 1)


In [144]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (1560, 1)


In [145]:
Counter(cb_pred.squeeze())

Counter({229: 202,
         63: 5,
         226: 107,
         230: 21,
         224: 265,
         313: 91,
         308: 338,
         235: 90,
         310: 175,
         227: 34,
         56: 9,
         240: 7,
         311: 21,
         228: 3,
         336: 24,
         319: 25,
         349: 2,
         280: 10,
         326: 8,
         233: 2,
         236: 7,
         237: 53,
         239: 2,
         309: 1,
         338: 7,
         170: 1,
         247: 12,
         314: 15,
         293: 2,
         321: 1,
         225: 5,
         339: 3,
         61: 4,
         331: 2,
         318: 1,
         312: 3,
         315: 2})

In [146]:
accuracy_score(y_test_concat, cb_pred.squeeze())

0.12628205128205128

In [147]:
model.score(cb_eval)

0.12628205128205128

In [148]:
f1_score(y_test_concat, cb_pred.squeeze(), average='weighted')

0.09697429010490341

#### LogReg

In [149]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, y_train_concat)

LogisticRegression(max_iter=500, random_state=42)

In [150]:
logreg_pred = lg.predict(X_test)

In [151]:
Counter(logreg_pred)

Counter({242: 3,
         308: 278,
         226: 97,
         230: 51,
         313: 43,
         224: 272,
         311: 41,
         336: 30,
         235: 58,
         227: 46,
         231: 29,
         310: 166,
         314: 30,
         236: 15,
         229: 156,
         240: 11,
         232: 5,
         319: 29,
         323: 6,
         237: 58,
         315: 37,
         324: 3,
         280: 2,
         331: 3,
         225: 14,
         239: 6,
         287: 2,
         234: 5,
         312: 10,
         247: 7,
         228: 13,
         326: 6,
         56: 3,
         58: 1,
         63: 1,
         321: 3,
         233: 3,
         330: 1,
         318: 2,
         202: 1,
         246: 4,
         320: 1,
         349: 1,
         316: 1,
         338: 2,
         28: 1,
         293: 2,
         61: 1})

In [152]:
accuracy_score(y_test_concat, logreg_pred)

0.16923076923076924

In [153]:
f1_score(y_test_concat, logreg_pred, average='weighted')

0.14168762662600887

#### RandomForest

In [154]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [155]:
rf.fit(X_train, y_train_concat)

RandomForestClassifier(max_depth=10, random_state=42)

In [156]:
rf_preds = rf.predict(X_test)

In [157]:
Counter(rf_preds)

Counter({308: 706,
         226: 31,
         224: 481,
         310: 126,
         229: 128,
         237: 67,
         315: 7,
         311: 3,
         287: 2,
         247: 2,
         231: 2,
         319: 2,
         314: 1,
         235: 1,
         236: 1})

In [158]:
accuracy_score(y_test_concat, rf_preds)

0.16602564102564102

In [159]:
f1_score(y_test_concat, rf_preds, average='weighted')

0.09939974955657623