In [1]:
import json
import pickle

import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

from utils.preprocessing import LabelEncoder
from utils.tensorflow_utils import SkillDataset

In [2]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
encoder = hub.load(module_url)
print ("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [3]:
with open('data/labels.json', 'r') as f:
    labels_map = json.load(f)

with open('data/dataset.json', 'r') as f:
    data = json.load(f)

In [4]:
PARAMS = {
    'embed_dim': 512,
    'n_previous': 3
}

In [25]:
dataset = SkillDataset(
    data=data, vars2id=labels_map,
    text_vectorizer=encoder,
    label_encoder=LabelEncoder(
        list(labels_map['target_midas2id']),
        'midas'),
    shuffle=True, batch_size=len(data), **PARAMS)

In [26]:
for X, y in dataset:
    break

In [27]:
X.shape, y.shape

((10565, 1659), (10565, 13))

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8452, 1659), (2113, 1659), (8452, 13), (2113, 13))

## Catboost

In [30]:
from catboost import (
    CatBoostClassifier, Pool, sum_models
)

In [31]:
cb_train = Pool(np.float32(X_train), label=np.argmax(y_train, axis=-1).tolist())
cb_eval = Pool(np.float32(X_test), label=np.argmax(y_test, axis=-1).tolist())

In [56]:
model_params = {
    'task_type': 'CPU',
    'iterations': 100,
    'learning_rate': 0.001,
    'depth': 5,
    'verbose': True,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy'
    
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5,
}

model = CatBoostClassifier(**model_params)
model.fit(cb_train, eval_set=cb_eval, **fit_params)

0:	learn: 0.4609560	test: 0.4600095	best: 0.4600095 (0)	total: 1.58s	remaining: 2m 36s
1:	learn: 0.4598912	test: 0.4585897	best: 0.4600095 (0)	total: 3.01s	remaining: 2m 27s
2:	learn: 0.4597728	test: 0.4590629	best: 0.4600095 (0)	total: 4.29s	remaining: 2m 18s
3:	learn: 0.4596545	test: 0.4590629	best: 0.4600095 (0)	total: 5.65s	remaining: 2m 15s
4:	learn: 0.4596545	test: 0.4590629	best: 0.4600095 (0)	total: 7.1s	remaining: 2m 14s
5:	learn: 0.4595362	test: 0.4590629	best: 0.4600095 (0)	total: 8.5s	remaining: 2m 13s
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.4600094652
bestIteration = 0

Shrink model to first 1 iterations.


<catboost.core.CatBoostClassifier at 0x1d7aae21130>

In [58]:
preds_class = model.predict(cb_eval)
preds_proba = model.predict_proba(cb_eval)
print("class = ", preds_class.shape)
print("proba = ", preds_proba.shape)

class =  (2113, 1)
proba =  (2113, 13)


In [59]:
for i in preds_class[1000:1100]:
    print(i)

[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
