In [1]:
import json
import pickle
import os
from collections import Counter

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from tqdm.notebook import tqdm

from utils.preprocessing import LabelEncoder
from utils.tensorflow_utils import SkillDataset

In [2]:
os.environ['TFHUB_CACHE_DIR'] = './models/tf_cache'
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
encoder = hub.load(module_url)
print ("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


# Daily

In [3]:
with open('data/daily_labels.json', 'r', encoding="utf8") as f:
    daily_labels_map = json.load(f)

with open('data/daily_dataset.json', 'r', encoding="utf8") as f:
    daily = json.load(f)

In [4]:
PARAMS = {
    'embed_dim': 512,
    'n_previous': 3
}

### Midas

In [5]:
daily_dataset = SkillDataset(
    data=daily, vars2id=daily_labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(daily_labels_map['target_midas2id']),
        'midas'),
    shuffle=False, batch_size=len(daily), **PARAMS)

In [6]:
for X, y in daily_dataset:
    break

In [7]:
X.shape, y.shape

((2352, 1635), (2352, 12))

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1881, 1635), (471, 1635), (1881, 12), (471, 12))

In [10]:
X[0,525:540]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [11]:
n_bins = len(daily_labels_map['target_midas2id'])
class_weights, _ = np.histogram(np.argmax(y_train, axis=-1), bins=n_bins, density=True)
class_weights

array([0.72599681, 0.08867624, 0.05677831, 0.2169059 , 0.00574163,
       0.        , 0.01339713, 0.0414673 , 0.00574163, 0.03189793,
       0.00829346, 0.00510367])

### Catboost

In [12]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [13]:
model_params = {
    'task_type': 'CPU',
    'iterations': 10,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    'early_stopping_rounds': 5,
    # 'eval_set': cb_eval, 
    
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

0:	learn: 0.6049973	total: 391ms	remaining: 3.52s
1:	learn: 0.6049973	total: 583ms	remaining: 2.33s
2:	learn: 0.6049973	total: 797ms	remaining: 1.86s
3:	learn: 0.6049973	total: 985ms	remaining: 1.48s
4:	learn: 0.6049973	total: 1.18s	remaining: 1.18s
5:	learn: 0.6049973	total: 1.38s	remaining: 920ms
6:	learn: 0.6049973	total: 1.59s	remaining: 681ms
7:	learn: 0.6049973	total: 1.79s	remaining: 447ms
8:	learn: 0.6049973	total: 2s	remaining: 222ms
9:	learn: 0.6049973	total: 2.23s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x20215805d30>

In [14]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (471, 1)


In [15]:
Counter(cb_pred.squeeze())

Counter({0: 471})

In [17]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred.squeeze())

0.643312101910828

In [18]:
model.score(cb_eval)

0.643312101910828

In [16]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.5036784673875475

### LogisticRegression

In [19]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [20]:
logreg_pred = lg.predict(X_test)

In [21]:
Counter(logreg_pred)

Counter({0: 358, 3: 70, 1: 26, 2: 7, 8: 1, 6: 7, 9: 2})

In [23]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.6857749469214437

In [22]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.6498555648589889

### RandomForeset

In [24]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [25]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [26]:
rf_preds = rf.predict(X_test)

In [29]:
Counter(rf_preds)

Counter({0: 442, 1: 6, 3: 18, 6: 2, 9: 1, 2: 2})

In [28]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.7048832271762208

In [27]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.6250694931881164

## Entities

In [30]:
daily_dataset = SkillDataset(
    data=daily, vars2id=daily_labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(daily_labels_map['target_entity2id']),
        'entity'),
    shuffle=False, batch_size=len(daily), **PARAMS)

In [31]:
for X, y in daily_dataset:
    break

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1881, 1635), (471, 1635), (1881, 18), (471, 18))

In [34]:
X[0,525:540]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [35]:
n_bins = len(daily_labels_map['target_entity2id'])
class_weights, _ = np.histogram(np.argmax(y_train, axis=-1), bins=n_bins, density=True)
class_weights

array([0.15592457, 0.15761328, 0.23754574, 0.08893892, 0.10357444,
       0.09513088, 0.05797917, 0.04503237, 0.02589361, 0.03321137,
       0.02983394, 0.00168871, 0.00394033, 0.00900647, 0.00844357,
       0.0005629 , 0.00394033, 0.0005629 ])

### Catboost

In [36]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

model_params = {
    'task_type': 'CPU',
    'iterations': 100,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
    'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

0:	learn: 0.4549025	test: 0.5314769	best: 0.5314769 (0)	total: 438ms	remaining: 43.4s
1:	learn: 0.4959505	test: 0.5276088	best: 0.5314769 (0)	total: 830ms	remaining: 40.7s
2:	learn: 0.4984471	test: 0.5568488	best: 0.5568488 (2)	total: 1.3s	remaining: 42.1s
3:	learn: 0.5241481	test: 0.5425446	best: 0.5568488 (2)	total: 1.74s	remaining: 41.8s
4:	learn: 0.5485906	test: 0.5546248	best: 0.5568488 (2)	total: 2.19s	remaining: 41.6s
5:	learn: 0.5570344	test: 0.5558017	best: 0.5568488 (2)	total: 2.59s	remaining: 40.5s
6:	learn: 0.5619471	test: 0.5470963	best: 0.5568488 (2)	total: 3.02s	remaining: 40.1s
7:	learn: 0.5654446	test: 0.5593323	best: 0.5593323 (7)	total: 3.44s	remaining: 39.5s
8:	learn: 0.5604133	test: 0.5484376	best: 0.5593323 (7)	total: 3.83s	remaining: 38.8s
9:	learn: 0.5597796	test: 0.5484722	best: 0.5593323 (7)	total: 4.27s	remaining: 38.4s
10:	learn: 0.5587809	test: 0.5495539	best: 0.5593323 (7)	total: 4.65s	remaining: 37.7s
11:	learn: 0.5650572	test: 0.5568228	best: 0.5593323 (

In [37]:
Counter(cb_pred.squeeze())

Counter({2: 300, 1: 95, 0: 76})

In [38]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred)

0.3821656050955414

In [39]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.271446939206632

### LogisticRegression

In [40]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [41]:
logreg_pred = lg.predict(X_test)

In [42]:
Counter(logreg_pred)

Counter({2: 134,
         0: 76,
         1: 75,
         4: 48,
         3: 34,
         6: 19,
         5: 46,
         7: 15,
         9: 4,
         8: 8,
         10: 10,
         13: 1,
         12: 1})

In [44]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.5966029723991507

In [43]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.58746736576402

### RandomForest

In [45]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [46]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [47]:
rf_preds = rf.predict(X_test)

In [50]:
Counter(rf_preds)

Counter({2: 193,
         4: 40,
         1: 89,
         0: 76,
         7: 8,
         5: 34,
         3: 11,
         10: 3,
         6: 14,
         12: 1,
         9: 1,
         8: 1})

In [49]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.6050955414012739

In [48]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.5706488233926231

## Concatenation

In [51]:
daily_dataset = SkillDataset(
    data=daily, vars2id=daily_labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(daily_labels_map['target_midas_and_entity2id']),
        'concatenation'),
    shuffle=False, batch_size=len(daily), **PARAMS)

In [52]:
for X, y in daily_dataset:
    break

In [53]:
X.shape, y.shape

((2352, 1635), (2352, 120))

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1881, 1635), (471, 1635), (1881, 120), (471, 120))

In [56]:
X[0,525:540]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [57]:
n_bins = len(daily_labels_map['target_midas_and_entity2id'])
class_weights, _ = np.histogram(np.argmax(y_train, axis=-1), bins=n_bins, density=True)
class_weights

array([0.12488849, 0.01351607, 0.08920607, 0.18327792, 0.00162193,
       0.01730057, 0.0189225 , 0.00162193, 0.03676371, 0.00108129,
       0.00702836, 0.00432514, 0.01243478, 0.00054064, 0.02378828,
       0.00540643, 0.00270321, 0.0491985 , 0.02054443, 0.0416295 ,
       0.01621928, 0.00919093, 0.00540643, 0.0189225 , 0.00648771,
       0.00162193, 0.00919093, 0.03838564, 0.00865029, 0.01081286,
       0.02973536, 0.00540643, 0.00270321, 0.00270321, 0.00108129,
       0.00432514, 0.01621928, 0.00486579, 0.01946314, 0.00216257,
       0.00919093, 0.0037845 , 0.00270321, 0.00162193, 0.00108129,
       0.0037845 , 0.00162193, 0.00162193, 0.00162193, 0.00108129,
       0.00324386, 0.0037845 , 0.00324386, 0.00648771, 0.0037845 ,
       0.00054064, 0.00648771, 0.00540643, 0.00486579, 0.        ,
       0.00270321, 0.00054064, 0.00648771, 0.00919093, 0.00108129,
       0.00054064, 0.00540643, 0.00270321, 0.00216257, 0.00108129,
       0.00054064, 0.00054064, 0.00054064, 0.00054064, 0.00594

### Catboost

In [58]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [59]:
model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    'early_stopping_rounds': 5,
    # 'eval_set': cb_eval, 
    
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

0:	learn: 0.2413610	total: 3.77s	remaining: 52.7s
1:	learn: 0.2456140	total: 7.73s	remaining: 50.2s
2:	learn: 0.2551834	total: 11.6s	remaining: 46.5s
3:	learn: 0.2567783	total: 15.7s	remaining: 43.2s
4:	learn: 0.2557150	total: 19.7s	remaining: 39.4s
5:	learn: 0.2503987	total: 23.5s	remaining: 35.3s
6:	learn: 0.2498671	total: 27.4s	remaining: 31.3s
7:	learn: 0.2509304	total: 31.3s	remaining: 27.4s
8:	learn: 0.2440191	total: 35s	remaining: 23.3s
9:	learn: 0.2424242	total: 38.7s	remaining: 19.3s
10:	learn: 0.2445508	total: 42.3s	remaining: 15.4s
11:	learn: 0.2541201	total: 45.9s	remaining: 11.5s
12:	learn: 0.2541201	total: 49.5s	remaining: 7.61s
13:	learn: 0.2535885	total: 53.1s	remaining: 3.79s
14:	learn: 0.2525253	total: 56.7s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x20215832310>

In [60]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (471, 1)


In [61]:
Counter(cb_pred.squeeze())

Counter({0: 203, 2: 12, 3: 256})

In [62]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred.squeeze())

0.2823779193205945

In [63]:
model.score(cb_eval)

0.2823779193205945

In [64]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.16494663628421588

### LogisticRegression

In [65]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [66]:
logreg_pred = lg.predict(X_test)

In [67]:
Counter(logreg_pred)

Counter({3: 158,
         0: 84,
         2: 49,
         19: 25,
         27: 21,
         14: 8,
         20: 4,
         1: 6,
         23: 8,
         17: 40,
         8: 16,
         22: 1,
         38: 9,
         18: 5,
         28: 3,
         56: 2,
         78: 1,
         84: 1,
         5: 3,
         67: 1,
         21: 3,
         57: 1,
         6: 2,
         66: 1,
         30: 8,
         74: 3,
         98: 1,
         10: 1,
         12: 1,
         15: 2,
         40: 1,
         26: 1,
         58: 1})

In [69]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.45222929936305734

In [68]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.4020569694836455

### RandomForest

In [70]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [71]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [72]:
rf_preds = rf.predict(X_test)

In [73]:
Counter(rf_preds)

Counter({0: 105,
         2: 54,
         3: 249,
         14: 3,
         20: 2,
         36: 1,
         19: 5,
         17: 21,
         27: 5,
         52: 1,
         38: 5,
         28: 1,
         5: 1,
         1: 3,
         67: 1,
         21: 1,
         18: 1,
         56: 1,
         77: 1,
         33: 1,
         66: 1,
         98: 1,
         10: 1,
         8: 1,
         12: 1,
         15: 1,
         40: 1,
         45: 1,
         30: 1})

In [75]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.4692144373673036

In [74]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.3841952680581085

# Topical

In [76]:
with open('data/topical_labels.json', 'r', encoding="utf8") as f:
    topical_labels_map = json.load(f)

with open('data/topical_dataset.json', 'r', encoding="utf8") as f:
    topical = json.load(f)

In [77]:
PARAMS = {
    'embed_dim': 512,
    'n_previous': 3
}

### Midas

In [78]:
topical_dataset = SkillDataset(
    data=topical, vars2id=topical_labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(topical_labels_map['target_midas2id']),
        'midas'),
    shuffle=False, batch_size=len(topical), **PARAMS)

In [79]:
for X, y in topical_dataset:
    break

In [80]:
X.shape, y.shape

((10117, 1641), (10117, 13))

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [82]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8093, 1641), (2024, 1641), (8093, 13), (2024, 13))

In [83]:
X[0,525:540]

array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [84]:
n_bins = len(topical_labels_map['target_midas2id'])
class_weights, _ = np.histogram(np.argmax(y_train, axis=-1), bins=n_bins, density=True)
class_weights

array([3.86187652e-01, 3.27958318e-02, 4.91937477e-01, 9.63795873e-02,
       7.63005066e-03, 1.68664278e-02, 1.19135879e-02, 1.20474484e-02,
       2.30240125e-02, 2.94493183e-03, 9.37023765e-04, 4.01581614e-04,
       2.67721076e-04])

### Catboost

In [85]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [86]:
model_params = {
    'task_type': 'CPU',
    'iterations': 10,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
    'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

0:	learn: 0.6019261	test: 0.6125747	best: 0.6125747 (0)	total: 427ms	remaining: 3.84s
1:	learn: 0.6012399	test: 0.6155530	best: 0.6155530 (1)	total: 863ms	remaining: 3.45s
2:	learn: 0.6009832	test: 0.6161942	best: 0.6161942 (2)	total: 1.28s	remaining: 2.98s
3:	learn: 0.6009832	test: 0.6161942	best: 0.6161942 (2)	total: 1.65s	remaining: 2.48s
4:	learn: 0.6009832	test: 0.6161942	best: 0.6161942 (2)	total: 2.06s	remaining: 2.06s
5:	learn: 0.6009832	test: 0.6161942	best: 0.6161942 (2)	total: 2.52s	remaining: 1.68s
6:	learn: 0.6009832	test: 0.6161942	best: 0.6161942 (2)	total: 2.95s	remaining: 1.26s
7:	learn: 0.6009832	test: 0.6161942	best: 0.6161942 (2)	total: 3.44s	remaining: 861ms
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.6161941837
bestIteration = 2

Shrink model to first 3 iterations.


<catboost.core.CatBoostClassifier at 0x20215805c70>

In [87]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (2024, 1)


In [88]:
Counter(cb_pred.squeeze())

Counter({2: 2024})

In [89]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred.squeeze())

0.47480237154150196

In [90]:
model.score(cb_eval)

0.47480237154150196

In [91]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.30571864593057513

### LogisticRegression

In [92]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [93]:
logreg_pred = lg.predict(X_test)

In [94]:
Counter(logreg_pred)

Counter({0: 758, 2: 1171, 3: 66, 1: 26, 8: 3})

In [95]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.4782608695652174

In [96]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.4477238510100303

### RandomForeset

In [97]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [98]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [99]:
rf_preds = rf.predict(X_test)

In [101]:
Counter(rf_preds)

Counter({2: 1597, 0: 425, 1: 2})

In [100]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.5177865612648221

In [102]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.44280225928025957

## Entities

In [103]:
topical_dataset = SkillDataset(
    data=topical, vars2id=topical_labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(topical_labels_map['target_entity2id']),
        'entity'),
    shuffle=False, batch_size=len(topical), **PARAMS)

In [104]:
for X, y in topical_dataset:
    break

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [106]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8093, 1641), (2024, 1641), (8093, 20), (2024, 20))

In [107]:
X[0,525:540]

array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [108]:
n_bins = len(topical_labels_map['target_entity2id'])
class_weights, _ = np.histogram(np.argmax(y_train, axis=-1), bins=n_bins, density=True)
class_weights

array([0.17363934, 0.27340066, 0.11549942, 0.06815507, 0.05983078,
       0.09598939, 0.00429221, 0.05931052, 0.04162141, 0.02419245,
       0.02510292, 0.0165185 , 0.04773456, 0.00650335, 0.01027529,
       0.01053542, 0.0003902 , 0.01560803, 0.0011706 , 0.00286147])

### Catboost

In [109]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
    'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

0:	learn: 0.5439720	test: 0.5414431	best: 0.5414431 (0)	total: 824ms	remaining: 11.5s
1:	learn: 0.5480854	test: 0.5433233	best: 0.5433233 (1)	total: 1.5s	remaining: 9.76s
2:	learn: 0.5443439	test: 0.5336498	best: 0.5433233 (1)	total: 2.16s	remaining: 8.63s
3:	learn: 0.5466529	test: 0.5421011	best: 0.5433233 (1)	total: 2.87s	remaining: 7.88s
4:	learn: 0.5499386	test: 0.5517228	best: 0.5517228 (4)	total: 3.55s	remaining: 7.09s
5:	learn: 0.5473180	test: 0.5451900	best: 0.5517228 (4)	total: 4.2s	remaining: 6.29s
6:	learn: 0.5474339	test: 0.5427189	best: 0.5517228 (4)	total: 5.02s	remaining: 5.74s
7:	learn: 0.5420474	test: 0.5370015	best: 0.5517228 (4)	total: 5.74s	remaining: 5.02s
8:	learn: 0.5435165	test: 0.5377114	best: 0.5517228 (4)	total: 6.57s	remaining: 4.38s
9:	learn: 0.5423816	test: 0.5357660	best: 0.5517228 (4)	total: 7.3s	remaining: 3.65s
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.5517227504
bestIteration = 4

Shrink model to first 5 iterations.
class =  (

In [110]:
Counter(cb_pred.squeeze())

Counter({1: 1732, 0: 292})

In [111]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred)

0.3038537549407115

In [112]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.18394759551622514

### LogisticRegression

In [113]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [114]:
logreg_pred = lg.predict(X_test)

In [115]:
Counter(logreg_pred)

Counter({12: 98,
         1: 659,
         5: 201,
         4: 112,
         8: 67,
         0: 358,
         2: 226,
         7: 108,
         3: 97,
         9: 22,
         17: 15,
         11: 13,
         10: 21,
         14: 5,
         15: 15,
         6: 5,
         13: 2})

In [117]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.4491106719367589

In [116]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.43455429861165723

### RandomForest

In [118]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [119]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [120]:
rf_preds = rf.predict(X_test)

In [123]:
Counter(rf_preds)

Counter({12: 62, 1: 1165, 0: 318, 4: 56, 5: 109, 2: 202, 7: 81, 8: 11, 3: 20})

In [122]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.4426877470355731

In [121]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.39009887868759885

## Concatenation

In [124]:
topical_dataset = SkillDataset(
    data=topical, vars2id=topical_labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(topical_labels_map['target_midas_and_entity2id']),
        'concatenation'),
    shuffle=False, batch_size=len(topical), **PARAMS)

In [125]:
for X, y in topical_dataset:
    break

In [126]:
X.shape, y.shape

((10117, 1641), (10117, 163))

In [127]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [128]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8093, 1641), (2024, 1641), (8093, 163), (2024, 163))

In [129]:
X[0,525:540]

array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [130]:
n_bins = len(topical_labels_map['target_midas_and_entity2id'])
class_weights, _ = np.histogram(np.argmax(y_train, axis=-1), bins=n_bins, density=True)
class_weights

array([5.50765560e-02, 3.60546304e-03, 7.69579869e-02, 8.11850815e-02,
       5.10981141e-02, 3.68005882e-02, 1.31785890e-01, 1.18109996e-02,
       2.39949781e-02, 2.30003676e-02, 4.32655564e-02, 9.94610493e-04,
       2.76004412e-02, 1.80273152e-02, 2.08868203e-02, 6.09198927e-03,
       2.17571045e-02, 1.56651153e-02, 1.72813573e-02, 1.61624205e-03,
       4.42601669e-02, 1.36758943e-03, 4.10276828e-03, 1.74056836e-03,
       6.21631558e-04, 3.85411566e-03, 1.49191574e-02, 1.11893680e-03,
       6.09198927e-03, 3.12059042e-02, 9.94610493e-04, 1.54164626e-02,
       2.48652623e-03, 5.09737877e-03, 7.83255763e-03, 4.35142090e-03,
       2.52382412e-02, 2.48652623e-03, 2.73517885e-03, 6.21631558e-03,
       3.10815779e-03, 3.23248410e-03, 1.24326312e-04, 3.72978935e-04,
       3.72978935e-03, 3.23248410e-03, 6.34064189e-03, 2.85950517e-03,
       5.59468402e-03, 6.46496820e-03, 6.96227345e-03, 9.94610493e-04,
       4.35142090e-03, 1.24326312e-03, 4.47574722e-03, 7.45957869e-04,
      

### Catboost

In [131]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [132]:
model_params = {
    'task_type': 'CPU',
    'iterations': 15,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    'early_stopping_rounds': 5,
    # 'eval_set': cb_eval, 
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

0:	learn: 0.1401211	total: 9.1s	remaining: 2m 7s
1:	learn: 0.1333251	total: 18.1s	remaining: 1m 57s
2:	learn: 0.1313481	total: 27.1s	remaining: 1m 48s
3:	learn: 0.1325837	total: 36.7s	remaining: 1m 41s
4:	learn: 0.1311010	total: 46.1s	remaining: 1m 32s
5:	learn: 0.1309774	total: 55.8s	remaining: 1m 23s
6:	learn: 0.1309774	total: 1m 4s	remaining: 1m 14s
7:	learn: 0.1309774	total: 1m 14s	remaining: 1m 5s
8:	learn: 0.1315952	total: 1m 23s	remaining: 55.8s
9:	learn: 0.1309774	total: 1m 33s	remaining: 46.5s
10:	learn: 0.1309774	total: 1m 41s	remaining: 36.9s
11:	learn: 0.1309774	total: 1m 50s	remaining: 27.5s
12:	learn: 0.1309774	total: 1m 58s	remaining: 18.3s
13:	learn: 0.1309774	total: 2m 7s	remaining: 9.08s
14:	learn: 0.1346843	total: 2m 15s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x20237d43b20>

In [133]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (2024, 1)


In [134]:
Counter(cb_pred.squeeze())

Counter({6: 1999, 2: 25})

In [135]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred.squeeze())

0.13982213438735178

In [136]:
model.score(cb_eval)

0.13982213438735178

In [137]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.04052507600327386

### LogisticRegression

In [138]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [139]:
logreg_pred = lg.predict(X_test)

In [140]:
Counter(logreg_pred)

Counter({31: 24,
         3: 205,
         10: 136,
         36: 57,
         6: 455,
         14: 47,
         29: 58,
         0: 128,
         2: 218,
         20: 99,
         12: 77,
         5: 79,
         9: 15,
         4: 140,
         17: 21,
         56: 2,
         8: 50,
         46: 4,
         18: 29,
         59: 19,
         13: 37,
         112: 1,
         28: 4,
         15: 1,
         52: 1,
         54: 5,
         26: 28,
         16: 21,
         25: 3,
         100: 3,
         93: 9,
         7: 11,
         44: 3,
         22: 3,
         64: 3,
         49: 1,
         45: 4,
         50: 1,
         66: 2,
         67: 5,
         33: 3,
         39: 1,
         35: 1,
         48: 2,
         19: 2,
         1: 1,
         76: 1,
         61: 1,
         37: 1,
         34: 1,
         121: 1})

In [141]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.23221343873517786

In [142]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.20415437113378418

### RandomForest

In [143]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [144]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [145]:
rf_preds = rf.predict(X_test)

In [146]:
Counter(rf_preds)

Counter({36: 63,
         6: 1127,
         2: 257,
         8: 38,
         4: 158,
         10: 100,
         20: 65,
         12: 76,
         3: 76,
         5: 19,
         0: 16,
         13: 4,
         14: 13,
         29: 4,
         31: 3,
         22: 2,
         100: 1,
         18: 2})

In [147]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.2366600790513834

In [148]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.16820439359168493