In [47]:
import json
import pickle
import os
from collections import Counter

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from tqdm.notebook import tqdm

from utils.preprocessing import LabelEncoder
from utils.tensorflow_utils import SkillDataset

In [6]:
os.environ['TFHUB_CACHE_DIR'] = './models/tf_cache'
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
encoder = hub.load(module_url)
print ("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [7]:
with open('data/labels.json', 'r') as f:
    labels_map = json.load(f)

with open('data/dataset.json', 'r') as f:
    data = json.load(f)

In [8]:
PARAMS = {
    'embed_dim': 512,
    'n_previous': 3
}

## Midas

In [68]:
dataset = SkillDataset(
    data=data, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_midas2id']),
        'midas'),
    shuffle=False, batch_size=len(data), **PARAMS)

In [69]:
for X, y in dataset:
    break

In [70]:
X.shape, y.shape

((10565, 1659), (10565, 13))

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [72]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8452, 1659), (2113, 1659), (8452, 13), (2113, 13))

In [73]:
X[0,525:540]

array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [74]:
class_weights, _ = np.histogram(np.argmax(y_test, axis=-1), bins=13, density=True)
class_weights

array([0.54252893, 0.15492837, 0.36019447, 0.03467711, 0.02293164,
       0.02908403, 0.        , 0.01006755, 0.01733855, 0.00279654,
       0.0067117 , 0.        , 0.00055931])

### Catboost

In [75]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [76]:
model_params = {
    'task_type': 'CPU',
    'iterations': 100,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, eval_set=cb_eval, **fit_params)

0:	learn: 0.6514772	test: 0.6527831	best: 0.6527831 (0)	total: 376ms	remaining: 37.2s
1:	learn: 0.6514222	test: 0.6527831	best: 0.6527831 (0)	total: 706ms	remaining: 34.6s
2:	learn: 0.6514222	test: 0.6527831	best: 0.6527831 (0)	total: 1.04s	remaining: 33.7s
3:	learn: 0.6514222	test: 0.6527831	best: 0.6527831 (0)	total: 1.39s	remaining: 33.4s
4:	learn: 0.6514222	test: 0.6527831	best: 0.6527831 (0)	total: 1.73s	remaining: 32.8s
5:	learn: 0.6514222	test: 0.6527831	best: 0.6527831 (0)	total: 2.12s	remaining: 33.3s
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.6527830795
bestIteration = 0

Shrink model to first 1 iterations.


<catboost.core.CatBoostClassifier at 0x1f7a39f1af0>

In [77]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (2113, 1)


In [80]:
Counter(cb_pred.squeeze())

Counter({0: 2113})

In [78]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.28886867036750546

In [79]:
accuracy_score(np.argmax(y_test, axis=-1), preds_class.squeeze())

0.45906294368196876

In [97]:
model.score(cb_eval)

0.45906294368196876

### LogisticRegression

In [81]:
lg = LogisticRegression(random_state=42, max_iter=1000)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=1000, random_state=42)

In [82]:
logreg_pred = lg.predict(X_test)

In [83]:
Counter(logreg_pred)

Counter({0: 1316, 1: 159, 4: 14, 2: 615, 3: 9})

In [84]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.4247766676298006

In [85]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.4637955513487932

### RandomForeset

In [86]:
rf = RandomForestClassifier(max_depth=4, random_state=42)

In [87]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=4, random_state=42)

In [88]:
rf_preds = rf.predict(X_test)

In [89]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.30172021809033683

In [90]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.4633222905821107

In [91]:
Counter(rf_preds)

Counter({0: 2090, 2: 23})

## Entities

In [98]:
dataset = SkillDataset(
    data=data, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_entity2id']),
        'entity'),
    shuffle=False, batch_size=len(data), **PARAMS)

In [99]:
for X, y in dataset:
    break

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [101]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8452, 1659), (2113, 1659), (8452, 25), (2113, 25))

In [102]:
X[0,525:540]

array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [103]:
class_weights, _ = np.histogram(np.argmax(y_test, axis=-1), bins=25, density=True)
class_weights

array([0.17962397, 0.04517489, 0.01129372, 0.27481392, 0.00860474,
       0.10594588, 0.05485523, 0.03119219, 0.        , 0.03388117,
       0.09411436, 0.10971045, 0.01828508, 0.00107559, 0.01882287,
       0.00322678, 0.        , 0.05055285, 0.04571269, 0.01021813,
       0.00537796, 0.00215119, 0.0145205 , 0.01183152, 0.00537796])

### Catboost

In [104]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

model_params = {
    'task_type': 'CPU',
    'iterations': 100,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'use_best_model': True,
    'class_weights': class_weights
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, eval_set=cb_eval, **fit_params)

cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

0:	learn: 0.5727428	test: 0.5863033	best: 0.5863033 (0)	total: 732ms	remaining: 1m 12s
1:	learn: 0.5801527	test: 0.5958832	best: 0.5958832 (1)	total: 1.37s	remaining: 1m 7s
2:	learn: 0.5727074	test: 0.5852013	best: 0.5958832 (1)	total: 1.98s	remaining: 1m 4s
3:	learn: 0.5965445	test: 0.6065693	best: 0.6065693 (3)	total: 2.6s	remaining: 1m 2s
4:	learn: 0.5974321	test: 0.6092500	best: 0.6092500 (4)	total: 3.23s	remaining: 1m 1s
5:	learn: 0.5967630	test: 0.6086450	best: 0.6092500 (4)	total: 3.84s	remaining: 1m
6:	learn: 0.5924798	test: 0.6056730	best: 0.6092500 (4)	total: 4.49s	remaining: 59.7s
7:	learn: 0.5753675	test: 0.5866027	best: 0.6092500 (4)	total: 5.12s	remaining: 58.8s
8:	learn: 0.5962978	test: 0.6079646	best: 0.6092500 (4)	total: 5.74s	remaining: 58.1s
9:	learn: 0.5801923	test: 0.5965127	best: 0.6092500 (4)	total: 6.35s	remaining: 57.1s
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.6092499774
bestIteration = 4

Shrink model to first 5 iterations.
class =  (

In [107]:
Counter(cb_pred.squeeze())

Counter({3: 1768, 0: 203, 5: 142})

In [105]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.21241815248745655

In [106]:
accuracy_score(np.argmax(y_test, axis=-1), cb_pred)

0.31803123521060106

### LogisticRegression

In [108]:
lg = LogisticRegression(random_state=42, max_iter=1000)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=1000, random_state=42)

In [109]:
logreg_pred = lg.predict(X_test)

In [115]:
Counter(logreg_pred)

Counter({4: 15,
         3: 686,
         0: 360,
         5: 202,
         8: 19,
         10: 217,
         7: 48,
         9: 161,
         6: 116,
         2: 13,
         16: 73,
         15: 95,
         19: 3,
         1: 45,
         22: 4,
         21: 12,
         11: 16,
         20: 3,
         13: 12,
         17: 7,
         18: 6})

In [116]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.4155409488647077

In [117]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.4335068622811169

### RandomForest

In [118]:
rf = RandomForestClassifier(max_depth=5, random_state=42)

In [119]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=5, random_state=42)

In [120]:
rf_preds = rf.predict(X_test)

In [121]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.28376075471050893

In [122]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.36393752957879794

In [123]:
Counter(rf_preds)

Counter({3: 1572, 10: 100, 0: 295, 5: 100, 16: 11, 9: 11, 15: 24})

## Concatenation

In [131]:
dataset = SkillDataset(
    data=data, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_midas_and_entity2id']),
        'concatenation'),
    shuffle=False, batch_size=len(data), **PARAMS)

In [132]:
for X, y in dataset:
    break

In [133]:
X.shape, y.shape

((10565, 1659), (10565, 199))

In [134]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [135]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8452, 1659), (2113, 1659), (8452, 199), (2113, 199))

In [136]:
X[0,525:540]

array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [137]:
class_weights, _ = np.histogram(np.argmax(y_test, axis=-1), bins=199, density=True)
class_weights

array([0.07848241, 0.00142695, 0.00903737, 0.03281992, 0.06563983,
       0.0009513 , 0.03710078, 0.0261608 , 0.00332956, 0.00903737,
       0.01950169, 0.11986405, 0.00237825, 0.        , 0.01046432,
       0.03281992, 0.03852773, 0.04090598, 0.00570781, 0.01522083,
       0.03995468, 0.04090598, 0.01189127, 0.00428086, 0.00047565,
       0.00856172, 0.0009513 , 0.00665911, 0.01331823, 0.02853906,
       0.00332956, 0.0009513 , 0.00142695, 0.00237825, 0.00808607,
       0.00951302, 0.01855039, 0.0223556 , 0.00237825, 0.00285391,
       0.0252095 , 0.00570781, 0.0009513 , 0.00570781, 0.0009513 ,
       0.00713476, 0.0019026 , 0.00332956, 0.01236693, 0.        ,
       0.01236693, 0.00285391, 0.00142695, 0.00475651, 0.        ,
       0.0019026 , 0.00285391, 0.00998867, 0.00856172, 0.00237825,
       0.0009513 , 0.00047565, 0.00237825, 0.00285391, 0.00047565,
       0.0009513 , 0.00047565, 0.01426953, 0.0009513 , 0.00380521,
       0.00047565, 0.00285391, 0.0019026 , 0.00142695, 0.00237

### Catboost

In [141]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [144]:
model_params = {
    'task_type': 'CPU',
    'iterations': 5,
    'learning_rate': 0.001,
    'depth': 3,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    # 'use_best_model': True,
    # 'class_weights': class_weights
}

fit_params = {
    # 'use_best_model': True,
    'early_stopping_rounds': 5,
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, **fit_params)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.1261240	total: 10.1s	remaining: 40.3s
1:	learn: 0.1476574	total: 21.7s	remaining: 32.6s
2:	learn: 0.1482489	total: 32.7s	remaining: 21.8s
3:	learn: 0.1473024	total: 43.5s	remaining: 10.9s
4:	learn: 0.1471841	total: 54.9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1f7a35a12e0>

In [145]:
cb_pred = model.predict(cb_eval)
print("class = ", cb_pred.shape)

class =  (2113, 1)


In [146]:
Counter(cb_pred.squeeze())

Counter({11: 1658, 0: 455})

In [147]:
f1_score(np.argmax(y_test, axis=-1), cb_pred.squeeze(), average='weighted')

0.050506459358042266

In [148]:
accuracy_score(np.argmax(y_test, axis=-1), preds_class.squeeze())

0.07808802650260294

In [149]:
model.score(cb_eval)

0.150970184571699

### LogisticRegression

In [151]:
lg = LogisticRegression(random_state=42, max_iter=1000)
lg.fit(X_train, np.argmax(y_train, axis=-1))

LogisticRegression(max_iter=1000, random_state=42)

In [152]:
logreg_pred = lg.predict(X_test)

In [153]:
Counter(logreg_pred)

Counter({11: 484,
         0: 233,
         15: 101,
         22: 14,
         3: 51,
         17: 98,
         4: 139,
         9: 21,
         16: 102,
         7: 94,
         18: 8,
         2: 14,
         21: 97,
         48: 18,
         6: 81,
         37: 54,
         67: 26,
         36: 82,
         28: 9,
         44: 4,
         20: 124,
         108: 3,
         40: 55,
         29: 47,
         35: 6,
         41: 4,
         57: 8,
         10: 49,
         107: 3,
         58: 5,
         80: 1,
         34: 5,
         33: 2,
         71: 7,
         53: 1,
         114: 1,
         50: 8,
         69: 4,
         161: 1,
         19: 5,
         52: 5,
         23: 4,
         81: 5,
         45: 8,
         38: 1,
         42: 1,
         39: 4,
         14: 2,
         8: 7,
         25: 5,
         59: 1,
         109: 1})

In [154]:
f1_score(np.argmax(y_test, axis=-1), logreg_pred, average='weighted')

0.19195402111020096

In [155]:
accuracy_score(np.argmax(y_test, axis=-1), logreg_pred)

0.22716516800757217

### RandomForest

In [156]:
rf = RandomForestClassifier(max_depth=4, random_state=42)

In [157]:
rf.fit(X_train, np.argmax(y_train, axis=-1))

RandomForestClassifier(max_depth=4, random_state=42)

In [158]:
rf_preds = rf.predict(X_test)

In [161]:
Counter(rf_preds)

Counter({11: 1763, 0: 253, 15: 21, 6: 36, 20: 24, 37: 11, 36: 3, 16: 2})

In [159]:
f1_score(np.argmax(y_test, axis=-1), rf_preds, average='weighted')

0.08049459516034534

In [160]:
accuracy_score(np.argmax(y_test, axis=-1), rf_preds)

0.16469474680548982