In [1]:
import json
import pickle
import os
from collections import Counter

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from tqdm.notebook import tqdm

from utils.preprocessing import LabelEncoder
# from utils.tensorflow_utils import SkillDataset

In [2]:
os.environ['TFHUB_CACHE_DIR'] = './models/tf_cache'
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
encoder = hub.load(module_url)
print ("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [3]:
Midas2ID = {
    "appreciation": 0, "command": 1, "comment": 2,"complaint": 3,
    "dev_command": 4, "neg_answer": 5, "open_question_factual": 6,
    "open_question_opinion": 7, "opinion": 8, "other_answers": 9,
    "pos_answer": 10, "statement": 11, "yes_no_question": 12,
}

ID2Midas = list(Midas2ID.keys())

Entity2ID = {'misc': 0, 'product': 1, 'food': 2, 'location': 3, 'business': 4,
             'event': 5, 'work_of_art': 6, 'org': 7, 'occupation': 8, 'fac': 9,
             'academic_discipline': 10, 'law': 11, 'film': 12, 'person': 13,
             'language': 14, 'type_of_sport': 15, 'nation': 16, 'literary_work': 17,
             'norp': 18, 'music_genre': 19, 'sports_event': 20, 'song': 21,
             'animal': 22, 'sports_venue': 23, 'sports_season': 24,
             'chemical_element': 25, 'political_party': 26, 'sport_team': 27,
             'national': 28, 'championship': 29, 'association_football_club': 30,
             'sports_league': 31}

EntityTargets2ID = {'product': 0, 'food': 1, 'location': 2, 'business': 3,
                    'event': 4, 'work_of_art': 5, 'org': 6, 'occupation': 7,
                    'fac': 8, 'academic_discipline': 9, 'law': 10, 'person': 11,
                    'language': 12, 'type_of_sport': 13, 'nation': 14,
                    'norp': 15, 'music_genre': 16, 'sports_event': 17,
                    'animal': 18, 'sports_venue': 19, 'sports_season': 20,
                    'chemical_element': 21, 'political_party': 22,
                    'sport_team': 23, 'national': 24, 'championship': 25,
                    'association_football_club': 26, 'sports_league': 27}

from itertools import product
midas_entity2id = list(product(list(Midas2ID.keys()), list(EntityTargets2ID.keys())))
midas_entity2id = {f'{labels[0]}_{labels[1]}': i for i, labels in enumerate(midas_entity2id)}

labels_map = { 
    'midas2id': Midas2ID,
    'entity2id': Entity2ID,
    'target_entity2id': EntityTargets2ID,
    'target_midas2id': Midas2ID,
    'target_midas_and_entity2id': midas_entity2id
}

In [4]:
import math
import numpy as np
import tensorflow as tf

from tensorflow.keras.utils import Sequence

class SkillDataset(Sequence):
    
    """ customized Dataset class from torch """
    
    def __init__(
        self, data: list, vars2id: dict, 
        text_vectorizer, label_encoder,
        n_previous: int=3, embed_dim: int = 512, 
        batch_size: int = 32, shuffle: bool = False):
        
        self.data = data
        self.indexes = np.arange(len(self.data))
        self.vars2id = vars2id
        self.vectorizer = text_vectorizer
        self.label_encoder = label_encoder
        self.n_previous = n_previous
        self.utterance_dim = (
            embed_dim + 
            len(vars2id['midas2id']) + 
            len(vars2id['entity2id']))
        
        self.batch_size = batch_size
        self.shuffle=shuffle
        
    def __len__(self):
        """
        Denotes the number of batches per epoch
        A common practice is to set this value to [num_samples / batch size⌋
        so that the model sees the training samples at most once per epoch.
        """
        return int(np.ceil(len(self.data) / self.batch_size))

    
    def on_epoch_end(self):
        """
        Updates indexes after each epoch
        Shuffling the order so that batches between epochs do not look alike.
        It can make a model more robust.
        """
        if self.shuffle:
            np.random.shuffle(self.indexes)
    
    def __getitem__(self, idx: int):
        """ get batch_id and return its vectorized representation """
        indexes = self.indexes[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch = [self.data[index] for index in indexes]
        
        x_batch, y_batch = self.__extract_features(batch)
        
        return x_batch, y_batch
    
    
    def __extract_features(self, batch) -> tuple:
        """ 
        transaforms raw data into vectorized features and encoded labels
        and collate them into batches
        """
        x_batch = np.zeros([len(batch), self.n_previous*self.utterance_dim])
        y_batch = list()
        
        for i, sample in enumerate(batch):
            embedding = self.__embed(sample['previous_text'])
            x_midas = self.__norm_midas(sample['midas_vectors'])
            x_entities = self.__oh_encode(sample['previous_entities'])
            x_batch[i,:] = self.__concat_vecs(embedding, x_midas, x_entities)
            y_batch.append(
                (sample['predict']['midas'], sample['predict']['entities'][0]['label'])
            )
            
        y_batch = self.label_encoder.to_categorical(y_batch)
            
        return x_batch, y_batch
            

    def __embed(self, utterances: list) -> np.ndarray:
        """ 
        vectorizes a list of N previous utterances using a provided encoder
        
        USE returns Tensors but item assignment is performed downstream,
        thus it is converted to numpy as EagerTensor objects 
        do not support item assignment
        
        input: List[str]
        output: numpy array (len(utterance), embed_dim)
        """
        return self.vectorizer([" ".join(ut) for ut in utterances]).numpy()
    
    def __norm_midas(self, midas_vectors: list) -> np.ndarray:
        """ 
        takes midas vectors of all sentences in the utterance
        and returns a vector with max values per midas label
        """
        vecs = np.zeros((len(midas_vectors), 13))
        
        for i, vec in enumerate(midas_vectors):
            # get max probability per each midas labels
            vecs[i] = np.max(np.array(vec), axis=0)

        # return normalized
        return vecs

    def __oh_encode(self, entities) -> np.ndarray:
        """ 
        one-hot encoding of entities per each sample 
        
        TODO: replace with sklearn MultiLabelBinarizer
        """
        entities = [[ent['label'] for sent in ut for ent in sent] for ut in entities]
        ohe_vec = np.zeros((len(entities), len(self.vars2id['entity2id'])))
        
        for i, ut in enumerate(entities):
            for ent in set(ut):
                ohe_vec[i][self.vars2id['entity2id'][ent]] = 1
                
        return ohe_vec
    
      
    def __concat_vecs(self, embedding: tf.Tensor,
                      midas_vec: np.array, 
                      ohe_vec: np.array) -> tf.Tensor:
        """ 
        concatenates text embeddings with midas vectors 
        and one-hot encoded entities
        
        The output vector will be (n_utterances, self.vector_dim)
        Vector dim comes from:
        1. [tfidf utterance(i-2)]
        2. [midas proba distribution utterance(i-2)]
        3. [entity type one-hot utterance(i-2)]
        4. [tfidf (i-1)]
        5. [midas (i-1)][entity (i-1)]
        6. [tfidf (i)] 
        7. [midas (i)]
        8. [entity (i)]
        """
        assert embedding.shape[0] == midas_vec.shape[0] == ohe_vec.shape[0]
        
        vecs = np.zeros((self.n_previous, self.utterance_dim))
        
        vecs[:,:embedding.shape[1]] = embedding
        vecs[:,embedding.shape[1]:embedding.shape[1]+midas_vec.shape[1]] = midas_vec
        vecs[:,embedding.shape[1]+midas_vec.shape[1]:] = ohe_vec

        # returned one utterance vectors shaped from its sentences
        return tf.reshape(vecs, [-1])

In [5]:
class BaseModel(nn.Module):
    def __init__(
        self, input_size: int, hidden_size: int, n_classes: int, batch_size: int = 32):
        super().__init__()
        
        # parameters
        self.input_size = input_size
        self.output_size = n_classes
        self.batch_size = batch_size

        # layers
        self.linear_in = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        
        # classifiers
        self.clf = nn.Linear(hidden_size, n_classes)
        

    def forward(self, x: torch.Tensor):
        x = self.linear_in(x)
        x = self.relu(x)
        pred = self.clf(x)
        return pred

NameError: name 'nn' is not defined

# Daily

In [13]:
with open('data/single_entity_daily_dataset_v3.json', 'r', encoding="utf8") as f:
    daily = json.load(f)

In [14]:
PARAMS = {
    'embed_dim': 512,
    'n_previous': 3
}

## Midas

In [15]:
daily_dataset = SkillDataset(
    data=daily, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_midas2id']),
        'midas'),
    shuffle=False, batch_size=len(daily), **PARAMS)

In [16]:
for X, y in daily_dataset:
    break

In [26]:
X.shape, y.shape

((3894, 1671), (3894, 13))

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3115, 1671), (779, 1671), (3115, 13), (779, 13))

In [29]:
X[0,525:540]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [31]:
n_bins = len(labels_map['target_midas2id'])
class_weights, _ = np.histogram(np.argmax(y_train, axis=-1), bins=n_bins, density=True)
class_weights

array([6.67736758e-02, 1.85903984e-02, 7.96731359e-03, 1.21406683e-02,
       1.36582519e-02, 7.43615935e-02, 0.00000000e+00, 5.95651539e-02,
       2.64059536e-01, 3.79395885e-04, 1.17612724e-02, 4.95491026e-01,
       1.57069896e-01])

### Catboost

In [32]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [33]:
model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    'early_stopping_rounds': 5,
    # 'eval_set': cb_eval, 
    
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

Learning rate set to 0.5
0:	learn: 0.4340289	total: 2.76s	remaining: 38.6s
1:	learn: 0.4430177	total: 5.3s	remaining: 34.5s
2:	learn: 0.4520064	total: 7.82s	remaining: 31.3s
3:	learn: 0.4799358	total: 10.3s	remaining: 28.4s
4:	learn: 0.4927769	total: 13s	remaining: 26s
5:	learn: 0.5014446	total: 16.3s	remaining: 24.5s
6:	learn: 0.5133226	total: 19.7s	remaining: 22.5s
7:	learn: 0.5242376	total: 22.8s	remaining: 19.9s
8:	learn: 0.5354735	total: 25.8s	remaining: 17.2s
9:	learn: 0.5473515	total: 28.7s	remaining: 14.4s
10:	learn: 0.5579454	total: 31.7s	remaining: 11.5s
11:	learn: 0.5704655	total: 34.7s	remaining: 8.68s
12:	learn: 0.5874799	total: 37.8s	remaining: 5.82s
13:	learn: 0.5987159	total: 40.9s	remaining: 2.92s
14:	learn: 0.6025682	total: 43.8s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2382d8be6d0>

In [34]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (779, 1)


In [35]:
Counter(cb_pred.squeeze())

Counter({11: 523, 8: 160, 12: 84, 1: 2, 6: 5, 7: 4, 2: 1})

In [36]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred.squeeze())

0.4544287548138639

In [37]:
model.score(cb_eval)

0.4544287548138639

In [38]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.38912798332323656

### LogisticRegression

In [39]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [40]:
logreg_pred = lg.predict(X_test)

In [41]:
Counter(logreg_pred)

Counter({11: 430,
         12: 115,
         8: 159,
         7: 19,
         6: 32,
         10: 3,
         2: 3,
         1: 13,
         5: 4,
         4: 1})

In [42]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.4980744544287548

In [43]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.47392818568634326

### RandomForeset

In [44]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [45]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [46]:
rf_preds = rf.predict(X_test)

In [47]:
Counter(rf_preds)

Counter({11: 688, 8: 56, 12: 25, 1: 2, 6: 3, 7: 3, 2: 1, 4: 1})

In [48]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.5096277278562259

In [49]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.4186111684338658

### Tensorflow

In [None]:
daily_dataset = SkillDataset(
    data=daily, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_entity2id']),
        'entity'),
    shuffle=False, batch_size=32, **PARAMS)

## Entities

In [50]:
daily_dataset = SkillDataset(
    data=daily, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_entity2id']),
        'entity'),
    shuffle=False, batch_size=len(daily), **PARAMS)

In [51]:
for X, y in daily_dataset:
    break

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3115, 1671), (779, 1671), (3115, 28), (779, 28))

In [54]:
X[0,525:540]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [56]:
n_bins = len(labels_map['target_entity2id'])
class_weights, _ = np.histogram(np.argmax(y_train, axis=-1), bins=n_bins, density=True)
class_weights

array([0.3974597 , 0.17117733, 0.16336102, 0.07464582, 0.05236932,
       0.        , 0.05627748, 0.06331216, 0.0371275 , 0.03790914,
       0.0246214 , 0.        , 0.04064485, 0.02305813, 0.03165608,
       0.01211529, 0.        , 0.00547142, 0.01133366, 0.00273571,
       0.00390816, 0.00195408, 0.        , 0.00156326, 0.00117245,
       0.00078163, 0.00078163, 0.00195408])

### Catboost

In [58]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    'early_stopping_rounds': 5,
    # 'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.5
0:	learn: 0.4102729	total: 5.79s	remaining: 1m 21s
1:	learn: 0.4093098	total: 11.3s	remaining: 1m 13s
2:	learn: 0.4388443	total: 17.3s	remaining: 1m 9s
3:	learn: 0.4465490	total: 23.5s	remaining: 1m 4s
4:	learn: 0.4529695	total: 30.2s	remaining: 1m
5:	learn: 0.4651685	total: 36.7s	remaining: 55.1s
6:	learn: 0.4699839	total: 42.7s	remaining: 48.8s
7:	learn: 0.4889246	total: 49.9s	remaining: 43.7s
8:	learn: 0.5030498	total: 56.7s	remaining: 37.8s
9:	learn: 0.5126806	total: 1m 3s	remaining: 31.9s
10:	learn: 0.5258427	total: 1m 10s	remaining: 25.7s
11:	learn: 0.5399679	total: 1m 16s	remaining: 19.2s
12:	learn: 0.5585875	total: 1m 23s	remaining: 12.8s
13:	learn: 0.5701445	total: 1m 29s	remaining: 6.41s
14:	learn: 0.5861958	total: 1m 36s	remaining: 0us
class =  (779, 1)


In [59]:
Counter(cb_pred.squeeze())

Counter({2: 166,
         0: 380,
         3: 22,
         1: 131,
         6: 27,
         5: 8,
         12: 12,
         4: 11,
         10: 15,
         9: 1,
         11: 2,
         7: 2,
         8: 2})

In [60]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred)

0.4197689345314506

In [61]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.35701463669236133

### LogisticRegression

In [62]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [63]:
logreg_pred = lg.predict(X_test)

In [64]:
Counter(logreg_pred)

Counter({2: 132,
         0: 336,
         10: 10,
         1: 120,
         4: 29,
         9: 9,
         3: 34,
         6: 39,
         7: 13,
         11: 9,
         8: 7,
         5: 19,
         12: 16,
         13: 4,
         16: 2})

In [65]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.540436456996149

In [66]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.5114560467649589

### RandomForest

In [67]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [68]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [69]:
rf_preds = rf.predict(X_test)

In [70]:
Counter(rf_preds)

Counter({2: 55, 0: 589, 1: 115, 8: 1, 12: 9, 3: 3, 10: 2, 6: 3, 7: 1, 4: 1})

In [71]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.46598202824133506

In [72]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.37133969884685114

## Concatenation

In [73]:
daily_dataset = SkillDataset(
    data=daily, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_midas_and_entity2id']),
        'concatenation'),
    shuffle=False, batch_size=len(daily), **PARAMS)

In [74]:
for X, y in daily_dataset:
    break

In [75]:
X.shape, y.shape

((3894, 1671), (3894, 364))

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3115, 1671), (779, 1671), (3115, 364), (779, 364))

In [78]:
X[0,525:540]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [80]:
n_bins = len(labels_map['target_midas_and_entity2id'])
class_weights, _ = np.histogram(np.argmax(y_train, axis=-1), bins=n_bins, density=True)
class_weights

array([0.02471231, 0.01270919, 0.00494246, 0.0031773 , 0.00141213,
       0.00494246, 0.0021182 , 0.00141213, 0.0010591 , 0.00176517,
       0.0021182 , 0.        , 0.00141213, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00035303, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00564853, 0.00282426, 0.00282426, 0.        , 0.00035303,
       0.0010591 , 0.00035303, 0.00070607, 0.00247123, 0.        ,
       0.00035303, 0.        , 0.        , 0.00035303, 0.        ,
       0.00035303, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00353033, 0.00035303, 0.        , 0.        ,
       0.00070607, 0.        , 0.00070607, 0.00035303, 0.        ,
       0.00070607, 0.        , 0.        , 0.        , 0.     

### Catboost

In [81]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [82]:
model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    'early_stopping_rounds': 5,
    # 'eval_set': cb_eval, 
    
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

Learning rate set to 0.5
0:	learn: 0.1733547	total: 40.2s	remaining: 9m 22s
1:	learn: 0.1601926	total: 1m 22s	remaining: 8m 58s
2:	learn: 0.1656501	total: 2m 9s	remaining: 8m 36s
3:	learn: 0.1858748	total: 2m 50s	remaining: 7m 50s
4:	learn: 0.2025682	total: 3m 32s	remaining: 7m 5s
5:	learn: 0.2237560	total: 4m 14s	remaining: 6m 21s
6:	learn: 0.2369181	total: 4m 58s	remaining: 5m 40s
7:	learn: 0.2571429	total: 5m 40s	remaining: 4m 58s
8:	learn: 0.2792937	total: 6m 23s	remaining: 4m 15s
9:	learn: 0.2991974	total: 7m 7s	remaining: 3m 33s
10:	learn: 0.3174960	total: 7m 51s	remaining: 2m 51s
11:	learn: 0.3428571	total: 8m 35s	remaining: 2m 8s
12:	learn: 0.3707865	total: 9m 18s	remaining: 1m 25s
13:	learn: 0.3955056	total: 10m 3s	remaining: 43.1s
14:	learn: 0.4157303	total: 10m 45s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2382e4bb2b0>

In [83]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (779, 1)


In [84]:
Counter(cb_pred.squeeze())

Counter({225: 83,
         308: 314,
         311: 18,
         313: 7,
         309: 38,
         310: 101,
         224: 61,
         314: 12,
         28: 11,
         318: 4,
         336: 42,
         316: 4,
         229: 4,
         196: 6,
         197: 5,
         226: 12,
         230: 10,
         315: 2,
         337: 13,
         237: 2,
         169: 1,
         227: 2,
         346: 1,
         338: 3,
         168: 7,
         312: 3,
         342: 1,
         63: 1,
         112: 1,
         228: 3,
         200: 1,
         317: 1,
         339: 1,
         29: 1,
         281: 1,
         319: 2})

In [85]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred.squeeze())

0.1643132220795892

In [86]:
model.score(cb_eval)

0.1643132220795892

In [87]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.1241032189680418

### LogisticRegression

In [88]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [89]:
logreg_pred = lg.predict(X_test)

In [90]:
Counter(logreg_pred)

Counter({310: 98,
         308: 244,
         336: 44,
         168: 11,
         225: 47,
         204: 1,
         338: 12,
         309: 39,
         316: 6,
         224: 43,
         233: 2,
         199: 1,
         311: 20,
         342: 6,
         315: 3,
         226: 23,
         314: 14,
         337: 11,
         170: 9,
         29: 5,
         319: 11,
         229: 10,
         228: 8,
         313: 9,
         236: 10,
         227: 10,
         197: 6,
         232: 1,
         324: 1,
         174: 2,
         196: 3,
         28: 14,
         237: 3,
         312: 6,
         230: 8,
         346: 2,
         339: 4,
         320: 3,
         198: 2,
         30: 1,
         231: 3,
         341: 1,
         59: 1,
         318: 4,
         63: 1,
         112: 1,
         178: 2,
         317: 1,
         180: 1,
         200: 1,
         140: 1,
         172: 1,
         114: 1,
         56: 1,
         348: 1,
         340: 1,
         281: 2,
         144: 1,
  

In [91]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.3042362002567394

In [92]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.27114958499533515

### RandomForest

In [93]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [94]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [95]:
rf_preds = rf.predict(X_test)

In [96]:
Counter(rf_preds)

Counter({310: 55,
         308: 589,
         225: 70,
         309: 20,
         224: 9,
         337: 3,
         29: 1,
         319: 5,
         236: 2,
         348: 1,
         311: 1,
         197: 1,
         168: 1,
         336: 5,
         141: 1,
         63: 1,
         112: 1,
         318: 2,
         28: 2,
         200: 1,
         339: 1,
         196: 1,
         226: 2,
         228: 1,
         314: 1,
         281: 2})

In [97]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.25288831835686776

In [98]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.18235982874868165

# Topical

In [99]:
with open('data/single_entity_topical_dataset_v3.json', 'r', encoding="utf8") as f:
    topical = json.load(f)

In [100]:
PARAMS = {
    'embed_dim': 512,
    'n_previous': 3
}

### Midas

In [101]:
topical_dataset = SkillDataset(
    data=topical, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_midas2id']),
        'midas'),
    shuffle=False, batch_size=len(topical), **PARAMS)

In [102]:
for X, y in topical_dataset:
    break

In [103]:
X.shape, y.shape

((7800, 1671), (7800, 13))

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [105]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6240, 1671), (1560, 1671), (6240, 13), (1560, 13))

In [106]:
X[0,525:540]

array([0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])

In [107]:
n_bins = len(labels_map['target_midas2id'])
class_weights, _ = np.histogram(np.argmax(y_train, axis=-1), bins=n_bins, density=True)
class_weights

array([0.00069444, 0.01944444, 0.0578125 , 0.00173611, 0.00138889,
       0.00798611, 0.01614583, 0.01111111, 0.47395833, 0.00052083,
       0.02083333, 0.39739583, 0.07430556])

### Catboost

In [108]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [109]:
model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
    'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

Learning rate set to 0.5
0:	learn: 0.5793843	test: 0.5610284	best: 0.5610284 (0)	total: 3.85s	remaining: 53.9s
1:	learn: 0.5763291	test: 0.5630017	best: 0.5630017 (1)	total: 7.65s	remaining: 49.7s
2:	learn: 0.6032970	test: 0.5780767	best: 0.5780767 (2)	total: 11.5s	remaining: 45.8s
3:	learn: 0.6231744	test: 0.5932358	best: 0.5932358 (3)	total: 15.3s	remaining: 42s
4:	learn: 0.6320444	test: 0.5926341	best: 0.5932358 (3)	total: 19.1s	remaining: 38.1s
5:	learn: 0.6383419	test: 0.6047587	best: 0.6047587 (5)	total: 22.8s	remaining: 34.2s
6:	learn: 0.6440721	test: 0.6016661	best: 0.6047587 (5)	total: 26.5s	remaining: 30.3s
7:	learn: 0.6507774	test: 0.6002557	best: 0.6047587 (5)	total: 30.7s	remaining: 26.8s
8:	learn: 0.6604211	test: 0.6019637	best: 0.6047587 (5)	total: 34.3s	remaining: 22.9s
9:	learn: 0.6621167	test: 0.6035100	best: 0.6047587 (5)	total: 38.1s	remaining: 19s
10:	learn: 0.6708949	test: 0.6070362	best: 0.6070362 (10)	total: 41.8s	remaining: 15.2s
11:	learn: 0.6766186	test: 0.60

<catboost.core.CatBoostClassifier at 0x2382e4bbac0>

In [110]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (1560, 1)


In [111]:
Counter(cb_pred.squeeze())

Counter({8: 1133, 11: 427})

In [112]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred.squeeze())

0.483974358974359

In [113]:
model.score(cb_eval)

0.483974358974359

In [114]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.41507885888945495

### LogisticRegression

In [115]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [116]:
logreg_pred = lg.predict(X_test)

In [117]:
Counter(logreg_pred)

Counter({8: 885, 11: 614, 2: 19, 5: 1, 12: 29, 10: 8, 7: 1, 1: 2, 6: 1})

In [118]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.4666666666666667

In [119]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.4266231486809739

### RandomForeset

In [120]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [121]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [122]:
rf_preds = rf.predict(X_test)

In [123]:
Counter(rf_preds)

Counter({8: 1233, 11: 327})

In [124]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.48846153846153845

In [125]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.4082918499445696

## Entities

In [137]:
topical_dataset = SkillDataset(
    data=topical, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_entity2id']),
        'entity'),
    shuffle=False, batch_size=len(topical), **PARAMS)

In [138]:
for X, y in topical_dataset:
    break

In [139]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [140]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6240, 1671), (1560, 1671), (6240, 28), (1560, 28))

In [141]:
X[0,525:540]

array([0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])

In [142]:
n_bins = len(labels_map['target_entity2id'])
class_weights, _ = np.histogram(np.argmax(y_train, axis=-1), bins=n_bins, density=True)
class_weights

array([2.61585945e-01, 1.99430199e-02, 1.39767331e-01, 6.96343780e-02,
       2.74216524e-02, 1.04202279e-01, 6.88034188e-02, 5.84995252e-02,
       1.32953466e-02, 1.99430199e-02, 1.87796771e-02, 6.91358025e-02,
       1.09686610e-02, 5.05223172e-02, 6.64767331e-03, 1.91120608e-02,
       1.86134853e-02, 4.65337132e-03, 1.81149098e-02, 2.16049383e-03,
       3.32383666e-03, 1.66191833e-04, 6.81386515e-03, 1.91120608e-02,
       4.98575499e-04, 3.49002849e-03, 3.32383666e-04, 1.49572650e-03])

### Catboost

In [143]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
    'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

Learning rate set to 0.5
0:	learn: 0.2884615	test: 0.2967949	best: 0.2967949 (0)	total: 7.79s	remaining: 1m 49s
1:	learn: 0.2910256	test: 0.2967949	best: 0.2967949 (0)	total: 16.4s	remaining: 1m 46s
2:	learn: 0.3322115	test: 0.3294872	best: 0.3294872 (2)	total: 25.3s	remaining: 1m 41s
3:	learn: 0.3495192	test: 0.3378205	best: 0.3378205 (3)	total: 33.9s	remaining: 1m 33s
4:	learn: 0.3533654	test: 0.3397436	best: 0.3397436 (4)	total: 42.6s	remaining: 1m 25s
5:	learn: 0.3639423	test: 0.3493590	best: 0.3493590 (5)	total: 51.5s	remaining: 1m 17s
6:	learn: 0.3671474	test: 0.3500000	best: 0.3500000 (6)	total: 1m 1s	remaining: 1m 9s
7:	learn: 0.3759615	test: 0.3487179	best: 0.3500000 (6)	total: 1m 10s	remaining: 1m 1s
8:	learn: 0.3793269	test: 0.3532051	best: 0.3532051 (8)	total: 1m 18s	remaining: 52.3s
9:	learn: 0.3899038	test: 0.3544872	best: 0.3544872 (9)	total: 1m 26s	remaining: 43.4s
10:	learn: 0.3940705	test: 0.3519231	best: 0.3544872 (9)	total: 1m 34s	remaining: 34.5s
11:	learn: 0.40368

In [144]:
Counter(cb_pred.squeeze())

Counter({18: 12,
         11: 126,
         0: 702,
         2: 281,
         13: 84,
         7: 51,
         5: 183,
         6: 27,
         3: 57,
         4: 6,
         23: 22,
         16: 9})

In [145]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred)

0.3647435897435897

In [146]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.3115672105128657

### LogisticRegression

In [147]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [148]:
logreg_pred = lg.predict(X_test)

In [149]:
Counter(logreg_pred)

Counter({18: 19,
         11: 123,
         2: 216,
         0: 556,
         4: 25,
         5: 178,
         3: 72,
         16: 18,
         7: 91,
         6: 93,
         8: 7,
         13: 78,
         15: 13,
         1: 14,
         12: 14,
         23: 19,
         10: 8,
         9: 11,
         22: 5})

In [150]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.40064102564102566

In [151]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.3788650515375682

### RandomForest

In [152]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [153]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [154]:
rf_preds = rf.predict(X_test)

In [155]:
Counter(rf_preds)

Counter({18: 20,
         11: 21,
         2: 153,
         0: 1104,
         7: 44,
         5: 124,
         13: 83,
         3: 1,
         23: 8,
         6: 2})

In [156]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.3685897435897436

In [157]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.2771521450831389

## Concatenation

In [158]:
topical_dataset = SkillDataset(
    data=topical, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_midas_and_entity2id']),
        'concatenation'),
    shuffle=False, batch_size=len(topical), **PARAMS)

In [159]:
for X, y in topical_dataset:
    break

In [160]:
X.shape, y.shape

((7800, 1671), (7800, 364))

In [161]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [162]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6240, 1671), (1560, 1671), (6240, 364), (1560, 364))

In [163]:
X[0,525:540]

array([0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])

In [164]:
n_bins = len(labels_map['target_midas_and_entity2id'])
class_weights, _ = np.histogram(np.argmax(y_train, axis=-1), bins=n_bins, density=True)
class_weights

array([0.0003214 , 0.        , 0.        , 0.0001607 , 0.        ,
       0.0001607 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00482094, 0.00064279,
       0.0003214 , 0.00144628, 0.0003214 , 0.00080349, 0.00144628,
       0.00305326, 0.        , 0.00224977, 0.0003214 , 0.00112489,
       0.0001607 , 0.00096419, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.0001607 , 0.        , 0.0001607 , 0.        ,
       0.        , 0.01060606, 0.00096419, 0.00530303, 0.00176768,
       0.00385675, 0.00883838, 0.00401745, 0.0078742 , 0.0001607 ,
       0.00128558, 0.00192837, 0.00128558, 0.        , 0.00112489,
       0.0003214 , 0.00208907, 0.0003214 , 0.0001607 , 0.00016

### Catboost

In [165]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [166]:
model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    'early_stopping_rounds': 5,
    # 'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

Learning rate set to 0.5
0:	learn: 0.1245192	total: 1m 4s	remaining: 15m 7s
1:	learn: 0.1189103	total: 2m 10s	remaining: 14m 5s
2:	learn: 0.1253205	total: 3m 12s	remaining: 12m 48s
3:	learn: 0.1331731	total: 4m 13s	remaining: 11m 37s
4:	learn: 0.1485577	total: 5m 18s	remaining: 10m 37s
5:	learn: 0.1621795	total: 6m 22s	remaining: 9m 33s
6:	learn: 0.1790064	total: 7m 25s	remaining: 8m 29s
7:	learn: 0.1939103	total: 8m 30s	remaining: 7m 26s
8:	learn: 0.2022436	total: 9m 33s	remaining: 6m 22s
9:	learn: 0.2142628	total: 10m 38s	remaining: 5m 19s
10:	learn: 0.2285256	total: 11m 40s	remaining: 4m 14s
11:	learn: 0.2354167	total: 12m 45s	remaining: 3m 11s
12:	learn: 0.2493590	total: 13m 48s	remaining: 2m 7s
13:	learn: 0.2647436	total: 14m 48s	remaining: 1m 3s
14:	learn: 0.2759615	total: 15m 53s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2382d8d5400>

In [167]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (1560, 1)


In [168]:
Counter(cb_pred.squeeze())

Counter({310: 244,
         235: 117,
         224: 245,
         308: 294,
         229: 132,
         228: 4,
         226: 132,
         319: 58,
         227: 30,
         315: 42,
         336: 16,
         230: 34,
         237: 48,
         311: 19,
         313: 41,
         233: 5,
         314: 13,
         56: 3,
         240: 14,
         247: 9,
         231: 20,
         61: 1,
         236: 2,
         326: 10,
         242: 7,
         60: 1,
         331: 4,
         239: 2,
         321: 3,
         280: 5,
         225: 2,
         63: 2,
         338: 1})

In [169]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred.squeeze())

0.1391025641025641

In [170]:
model.score(cb_eval)

0.1391025641025641

In [171]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.10974946365446388

### LogisticRegression

In [172]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [173]:
logreg_pred = lg.predict(X_test)

In [174]:
Counter(logreg_pred)

Counter({242: 6,
         235: 78,
         225: 12,
         308: 274,
         224: 246,
         310: 139,
         226: 102,
         230: 69,
         240: 14,
         229: 146,
         313: 48,
         227: 45,
         315: 35,
         232: 4,
         314: 24,
         319: 38,
         321: 3,
         237: 65,
         231: 33,
         311: 36,
         336: 20,
         323: 5,
         280: 2,
         236: 14,
         331: 5,
         239: 7,
         228: 15,
         318: 5,
         287: 2,
         326: 9,
         234: 2,
         247: 10,
         58: 2,
         61: 5,
         324: 4,
         56: 4,
         312: 4,
         293: 2,
         63: 2,
         339: 2,
         233: 2,
         330: 1,
         342: 1,
         320: 2,
         347: 3,
         309: 1,
         202: 1,
         349: 2,
         338: 1,
         170: 1,
         246: 3,
         316: 2,
         66: 1,
         28: 1})

In [175]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.20256410256410257

In [176]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.17200188697629604

### RandomForest

In [177]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [178]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [179]:
rf_preds = rf.predict(X_test)

In [180]:
Counter(rf_preds)

Counter({326: 13,
         229: 185,
         310: 166,
         224: 411,
         308: 592,
         226: 37,
         235: 33,
         315: 8,
         237: 78,
         287: 4,
         242: 5,
         231: 8,
         247: 10,
         314: 2,
         319: 1,
         236: 1,
         230: 3,
         331: 2,
         313: 1})

In [181]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.16666666666666666

In [182]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.10530037101421616