In [1]:
import json
import pickle
import os
from collections import Counter

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from tqdm.notebook import tqdm

from utils.preprocessing import LabelEncoder
from utils.tensorflow_utils import SkillDataset

In [2]:
os.environ['TFHUB_CACHE_DIR'] = './models/tf_cache'
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
encoder = hub.load(module_url)
print ("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [3]:
with open('data/labels.json', 'r', encoding="utf8") as f:
    labels_map = json.load(f)

with open('data/dataset.json', 'r', encoding="utf8") as f:
    data = json.load(f)

In [4]:
PARAMS = {
    'embed_dim': 512,
    'n_previous': 3
}

## Midas

In [5]:
dataset = SkillDataset(
    data=data, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_midas2id']),
        'midas'),
    shuffle=False, batch_size=len(data), **PARAMS)

In [6]:
for X, y in dataset:
    break

In [7]:
X.shape, y.shape

((12055, 1659), (12055, 12))

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9644, 1659), (2411, 1659), (9644, 12), (2411, 12))

In [10]:
X[0,525:540]

array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [14]:
class_weights, _ = np.histogram(np.argmax(y_test, axis=-1), bins=12, density=True)
class_weights

array([0.54997926, 0.12343426, 0.43102447, 0.02438822, 0.0184156 ,
       0.        , 0.01045209, 0.015927  , 0.02040647, 0.00149316,
       0.00348403, 0.00099544])

### Catboost

In [15]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [16]:
model_params = {
    'task_type': 'CPU',
    'iterations': 100,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, eval_set=cb_eval, **fit_params)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.5908422	test: 0.5993200	best: 0.5993200 (0)	total: 641ms	remaining: 1m 3s
1:	learn: 0.5906719	test: 0.5990124	best: 0.5993200 (0)	total: 1.05s	remaining: 51.7s
2:	learn: 0.5906719	test: 0.5990124	best: 0.5993200 (0)	total: 1.48s	remaining: 48s
3:	learn: 0.5906719	test: 0.5990124	best: 0.5993200 (0)	total: 1.89s	remaining: 45.3s
4:	learn: 0.5906719	test: 0.5990124	best: 0.5993200 (0)	total: 2.25s	remaining: 42.8s
5:	learn: 0.5906719	test: 0.5990124	best: 0.5993200 (0)	total: 2.65s	remaining: 41.6s
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.5993199638
bestIteration = 0

Shrink model to first 1 iterations.


<catboost.core.CatBoostClassifier at 0x2b2bca44d90>

In [17]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (2411, 1)


In [18]:
Counter(cb_pred.squeeze())

Counter({0: 2407, 2: 4})

In [19]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.28979546709017007

In [21]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred.squeeze())

0.45873081708834507

In [22]:
model.score(cb_eval)

0.45873081708834507

### LogisticRegression

In [46]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [47]:
logreg_pred = lg.predict(X_test)

In [48]:
Counter(logreg_pred)

Counter({2: 894, 1: 79, 0: 1431, 6: 4, 7: 1, 9: 1, 3: 1})

In [49]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.4429244218180792

In [50]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.4807133969307341

### RandomForeset

In [69]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [70]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [71]:
rf_preds = rf.predict(X_test)

In [72]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.4109948947237741

In [73]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.49108253836582333

In [74]:
Counter(rf_preds)

Counter({2: 469, 0: 1942})

## Entities

In [75]:
dataset = SkillDataset(
    data=data, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_entity2id']),
        'entity'),
    shuffle=False, batch_size=len(data), **PARAMS)

In [76]:
for X, y in dataset:
    break

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9644, 1659), (2411, 1659), (9644, 25), (2411, 25))

In [79]:
X[0,525:540]

array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [80]:
class_weights, _ = np.histogram(np.argmax(y_test, axis=-1), bins=25, density=True)
class_weights

array([0.03715609, 0.08597746, 0.06221485, 0.23200954, 0.12010922,
       0.00734481, 0.05746233, 0.09461842, 0.1326386 , 0.00734481,
       0.05141366, 0.00518457, 0.01684985, 0.02592285, 0.00259229,
       0.01771395, 0.01252938, 0.03499585, 0.00475252, 0.        ,
       0.01296143, 0.01252938, 0.        , 0.00691276, 0.00043205])

### Catboost

In [81]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

model_params = {
    'task_type': 'CPU',
    'iterations': 100,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, eval_set=cb_eval, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

0:	learn: 0.4861228	test: 0.4924351	best: 0.4924351 (0)	total: 934ms	remaining: 1m 32s
1:	learn: 0.4817112	test: 0.4882911	best: 0.4924351 (0)	total: 1.7s	remaining: 1m 23s
2:	learn: 0.4819379	test: 0.4884737	best: 0.4924351 (0)	total: 2.54s	remaining: 1m 22s
3:	learn: 0.4809126	test: 0.4834971	best: 0.4924351 (0)	total: 3.36s	remaining: 1m 20s
4:	learn: 0.4817724	test: 0.4868164	best: 0.4924351 (0)	total: 4.08s	remaining: 1m 17s
5:	learn: 0.4816185	test: 0.4880446	best: 0.4924351 (0)	total: 4.89s	remaining: 1m 16s
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.4924350775
bestIteration = 0

Shrink model to first 1 iterations.
class =  (2411, 1)


In [82]:
Counter(cb_pred.squeeze())

Counter({3: 2141, 4: 107, 8: 163})

In [83]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.15219628899385088

In [84]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred)

0.2600580671920365

### LogisticRegression

In [85]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [86]:
logreg_pred = lg.predict(X_test)

In [87]:
Counter(logreg_pred)

Counter({3: 727,
         6: 99,
         8: 319,
         4: 308,
         1: 208,
         9: 18,
         17: 89,
         7: 252,
         2: 98,
         10: 116,
         21: 17,
         12: 8,
         23: 4,
         5: 13,
         13: 29,
         0: 51,
         20: 24,
         11: 3,
         18: 9,
         15: 13,
         16: 6})

In [88]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.3951676488785533

In [89]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.41144753214433843

### RandomForest

In [90]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [91]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [92]:
rf_preds = rf.predict(X_test)

In [93]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.32260967440778654

In [94]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.3919535462463708

In [95]:
Counter(rf_preds)

Counter({3: 1231, 4: 385, 8: 340, 7: 144, 1: 140, 17: 60, 10: 103, 6: 8})

## Concatenation

In [96]:
dataset = SkillDataset(
    data=data, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_midas_and_entity2id']),
        'concatenation'),
    shuffle=False, batch_size=len(data), **PARAMS)

In [97]:
for X, y in dataset:
    break

In [98]:
X.shape, y.shape

((12055, 1659), (12055, 194))

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [100]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9644, 1659), (2411, 1659), (9644, 194), (2411, 194))

In [101]:
X[0,525:540]

array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [102]:
class_weights, _ = np.histogram(np.argmax(y_test, axis=-1), bins=194, density=True)
class_weights

array([0.01718253, 0.00754355, 0.03729867, 0.07459733, 0.02388791,
       0.04861399, 0.        , 0.10519062, 0.0331078 , 0.0331078 ,
       0.04190861, 0.05909114, 0.04651856, 0.01173441, 0.00502903,
       0.02472608, 0.0393941 , 0.00754355, 0.00335269, 0.05112851,
       0.03394598, 0.01592527, 0.00251452, 0.01341076, 0.01718253,
       0.00125726, 0.00209543, 0.00502903, 0.00377178, 0.00963898,
       0.00125726, 0.00335269, 0.00796264, 0.00712446, 0.00167634,
       0.00754355, 0.02095431, 0.00377178, 0.00460995, 0.01047715,
       0.00167634, 0.00251452, 0.00125726, 0.        , 0.00041909,
       0.00670538, 0.00041909, 0.00754355, 0.00796264, 0.00209543,
       0.00167634, 0.        , 0.00125726, 0.0121535 , 0.00209543,
       0.        , 0.00670538, 0.00167634, 0.00251452, 0.00251452,
       0.        , 0.00083817, 0.00586721, 0.00041909, 0.00921989,
       0.00167634, 0.00125726, 0.00041909, 0.00209543, 0.        ,
       0.00041909, 0.00041909, 0.00041909, 0.00125726, 0.00125

### Catboost

In [103]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [104]:
model_params = {
    'task_type': 'CPU',
    'iterations': 5,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    'early_stopping_rounds': 5,
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

0:	learn: 0.1134384	total: 11.9s	remaining: 47.6s
1:	learn: 0.1119867	total: 25s	remaining: 37.5s
2:	learn: 0.1106387	total: 38.9s	remaining: 25.9s
3:	learn: 0.1112609	total: 51.8s	remaining: 12.9s
4:	learn: 0.1252592	total: 1m 6s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2b32b2dbb20>

In [105]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (2411, 1)


In [106]:
Counter(cb_pred.squeeze())

Counter({7: 2293, 11: 113, 19: 4, 5: 1})

In [107]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.037171793026822476

In [109]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred.squeeze())

0.11737868104520946

In [110]:
model.score(cb_eval)

0.11737868104520946

### LogisticRegression

In [111]:
lg = LogisticRegression(random_state=42, max_iter=500)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=500, random_state=42)

In [112]:
logreg_pred = lg.predict(X_test)

In [113]:
Counter(logreg_pred)

Counter({19: 152,
         7: 491,
         12: 80,
         9: 99,
         24: 25,
         8: 99,
         20: 91,
         5: 175,
         2: 106,
         36: 56,
         168: 2,
         10: 155,
         3: 192,
         80: 7,
         11: 192,
         15: 80,
         47: 24,
         14: 15,
         4: 26,
         28: 5,
         99: 19,
         17: 9,
         58: 2,
         27: 5,
         16: 100,
         13: 30,
         48: 6,
         21: 18,
         22: 1,
         39: 1,
         0: 21,
         53: 31,
         29: 12,
         64: 6,
         45: 8,
         32: 2,
         50: 2,
         35: 3,
         92: 3,
         33: 6,
         122: 2,
         71: 2,
         18: 3,
         1: 7,
         41: 3,
         38: 3,
         23: 24,
         49: 1,
         31: 2,
         37: 1,
         94: 1,
         62: 1,
         55: 1,
         57: 1,
         152: 1,
         56: 1})

In [114]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.1819080875639939

In [115]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.20821236001659063

### RandomForest

In [116]:
rf = RandomForestClassifier(max_depth=10, random_state=42)

In [117]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=10, random_state=42)

In [118]:
rf_preds = rf.predict(X_test)

In [119]:
Counter(rf_preds)

Counter({7: 1210,
         19: 169,
         9: 24,
         11: 298,
         8: 80,
         10: 118,
         5: 205,
         36: 61,
         15: 101,
         3: 66,
         16: 37,
         20: 18,
         2: 16,
         12: 2,
         71: 1,
         13: 3,
         53: 1,
         47: 1})

In [120]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.1189267491104623

In [121]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.19079220240564082