<a href="https://colab.research.google.com/github/deniskapel/autoskill/blob/main/catboost_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%bash
pip install catboost
pip install ipywidgets
jupyter nbextension enable --py widgetsnbextension

Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
Installing collected packages: catboost
Successfully installed catboost-1.0.4


Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [3]:
import json
import pickle
from collections import Counter

import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score, accuracy_score
from joblib import dump, load
# models
from catboost import CatBoostClassifier, Pool
# for multilabel classification
# metrics
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report
from tensorflow.keras.utils import to_categorical

In [4]:
Midas2ID = {
    "appreciation": 0, "command": 1, "comment": 2,"complaint": 3,
    "dev_command": 4, "neg_answer": 5, "open_question_factual": 6,
    "open_question_opinion": 7, "opinion": 8, "other_answers": 9,
    "pos_answer": 10, "statement": 11, "yes_no_question": 12,
}

ID2Midas = list(Midas2ID.keys())

Entity2ID = {
    'person': 0, 'location': 1, 'videoname': 2, 'organization': 3,
    'device': 4, 'sport': 5, 'duration': 6, 'number': 7, 'genre': 8,
    'sportteam': 9, 'position': 10, 'event': 11, 'softwareapplication': 12,
    'vehicle': 13, 'party': 14, 'year': 15, 'date': 16, 'gamename': 17,
    'songname': 18, 'bookname': 19}

ID2Entity = list(Entity2ID.keys())

In [5]:
with open('data/vectorized_train.npy', 'rb') as f:
    X_train = np.load(f)
    y_midas_train = np.load(f)
    y_entity_train = np.load(f)

with open('data/vectorized_val.npy', 'rb') as f:
    X_val = np.load(f)
    y_midas_val = np.load(f)
    y_entity_val = np.load(f)

In [6]:
X_train.shape, y_midas_train.shape, y_entity_train.shape

((179286, 1641), (179286,), (179286, 20))

In [7]:
X_val.shape, y_midas_val.shape, y_entity_val.shape

((39089, 1641), (39089,), (39089, 20))

# Catboost

## SymmetricTree

### Midas

In [8]:
X_midas_train = Pool(np.float32(X_train), label=y_midas_train)
X_midas_val = Pool(np.float32(X_val), label=y_midas_val)

In [34]:
model_params = {
    'verbose': True,
    'random_seed': 42,
    'use_best_model': True,
    'devices':'0:1'
}

fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 5   
}

In [37]:
midas_clf = CatBoostClassifier(
    grow_policy='SymmetricTree', 
    loss_function='MultiClass', 
    eval_metric='Accuracy', 
    task_type='GPU', **model_params)

In [38]:
    
midas_clf.fit(
    X_midas_train, eval_set=X_midas_val, **fit_params)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.177599
0:	learn: 0.3845476	test: 0.3845839	best: 0.3845839 (0)	total: 694ms	remaining: 11m 32s
1:	learn: 0.3852448	test: 0.3853258	best: 0.3853258 (1)	total: 1.46s	remaining: 12m 8s
2:	learn: 0.3854066	test: 0.3855049	best: 0.3855049 (2)	total: 2.21s	remaining: 12m 13s
3:	learn: 0.3863492	test: 0.3864514	best: 0.3864514 (3)	total: 2.92s	remaining: 12m 8s
4:	learn: 0.3870297	test: 0.3871166	best: 0.3871166 (4)	total: 3.74s	remaining: 12m 24s
5:	learn: 0.3880615	test: 0.3882934	best: 0.3882934 (5)	total: 4.54s	remaining: 12m 32s
6:	learn: 0.3882010	test: 0.3882166	best: 0.3882934 (5)	total: 5.28s	remaining: 12m 28s
7:	learn: 0.3888703	test: 0.3886771	best: 0.3886771 (7)	total: 6.08s	remaining: 12m 34s
8:	learn: 0.3892050	test: 0.3889841	best: 0.3889841 (8)	total: 6.79s	remaining: 12m 27s
9:	learn: 0.3896567	test: 0.3895469	best: 0.3895469 (9)	total: 7.52s	remaining: 12m 24s
10:	learn: 0.3901866	test: 0.3899307	best: 0.3899307 (10)	total: 8.38s	remaining: 12m 33s
11

<catboost.core.CatBoostClassifier at 0x7f8f959e94d0>

In [39]:
midas_preds = midas_clf.predict(X_midas_val).squeeze()

In [40]:
Counter(midas_preds)

Counter({2: 272, 5: 44, 6: 5, 8: 22774, 10: 788, 11: 15205, 12: 1})

In [41]:
print(
    classification_report(y_midas_val, midas_preds, target_names=ID2Midas)
)

                       precision    recall  f1-score   support

         appreciation       0.00      0.00      0.00      1032
              command       0.00      0.00      0.00       615
              comment       0.29      0.02      0.04      4035
            complaint       0.00      0.00      0.00       826
          dev_command       0.00      0.00      0.00        54
           neg_answer       0.25      0.01      0.02      1204
open_question_factual       0.20      0.00      0.00       809
open_question_opinion       0.00      0.00      0.00       497
              opinion       0.41      0.75      0.53     12549
        other_answers       0.00      0.00      0.00       309
           pos_answer       0.41      0.07      0.12      4639
            statement       0.37      0.52      0.44     10875
      yes_no_question       0.00      0.00      0.00      1645

             accuracy                           0.40     39089
            macro avg       0.15      0.11      0.09 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Entity

In [21]:
X_entity_train = Pool(np.float32(X_train), label=y_entity_train)
X_entity_val = Pool(np.float32(X_val), label=y_entity_val)

In [32]:
entity_clf = CatBoostClassifier(
    grow_policy='SymmetricTree', 
    loss_function='MultiLogloss', 
    eval_metric='HammingLoss',
    **model_params)

In [33]:
entity_clf.fit(X_entity_train, eval_set=X_entity_val, task_type='GPU', **fit_params)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.045811


CatBoostError: ignored

In [None]:
entity_preds = entity_clf.predict(X_entity_val).squeeze()

## Depthwise

### Midas

In [42]:
midas_clf = CatBoostClassifier(
    grow_policy='Depthwise', 
    loss_function='MultiClass', 
    eval_metric='Accuracy', 
    task_type='GPU', **model_params)

In [43]:
midas_clf.fit(
    X_midas_train, eval_set=X_midas_val, **fit_params)

Learning rate set to 0.177599
0:	learn: 0.3875707	test: 0.3844560	best: 0.3844560 (0)	total: 780ms	remaining: 12m 59s
1:	learn: 0.3894504	test: 0.3862724	best: 0.3862724 (1)	total: 1.52s	remaining: 12m 39s
2:	learn: 0.3907500	test: 0.3876538	best: 0.3876538 (2)	total: 2.24s	remaining: 12m 25s
3:	learn: 0.3916145	test: 0.3888050	best: 0.3888050 (3)	total: 2.94s	remaining: 12m 10s
4:	learn: 0.3928528	test: 0.3887283	best: 0.3888050 (3)	total: 3.64s	remaining: 12m 5s
5:	learn: 0.3945093	test: 0.3898539	best: 0.3898539 (5)	total: 4.48s	remaining: 12m 22s
6:	learn: 0.3957810	test: 0.3911842	best: 0.3911842 (6)	total: 5.2s	remaining: 12m 17s
7:	learn: 0.3967348	test: 0.3920796	best: 0.3920796 (7)	total: 5.95s	remaining: 12m 17s
8:	learn: 0.3976384	test: 0.3920540	best: 0.3920796 (7)	total: 6.63s	remaining: 12m 9s
9:	learn: 0.3992615	test: 0.3924889	best: 0.3924889 (9)	total: 7.42s	remaining: 12m 14s
10:	learn: 0.3999978	test: 0.3933332	best: 0.3933332 (10)	total: 8.17s	remaining: 12m 14s
11:

<catboost.core.CatBoostClassifier at 0x7f8f958c6890>

In [44]:
midas_preds = midas_clf.predict(X_midas_val).squeeze()

In [45]:
Counter(midas_preds)

Counter({2: 368, 5: 144, 6: 68, 7: 6, 8: 22826, 10: 1164, 11: 14473, 12: 40})

In [46]:
print(
    classification_report(y_midas_val, midas_preds, target_names=ID2Midas)
)

                       precision    recall  f1-score   support

         appreciation       0.00      0.00      0.00      1032
              command       0.00      0.00      0.00       615
              comment       0.30      0.03      0.05      4035
            complaint       0.00      0.00      0.00       826
          dev_command       0.00      0.00      0.00        54
           neg_answer       0.20      0.02      0.04      1204
open_question_factual       0.24      0.02      0.04       809
open_question_opinion       0.17      0.00      0.00       497
              opinion       0.41      0.75      0.53     12549
        other_answers       0.00      0.00      0.00       309
           pos_answer       0.37      0.09      0.15      4639
            statement       0.38      0.51      0.44     10875
      yes_no_question       0.30      0.01      0.01      1645

             accuracy                           0.40     39089
            macro avg       0.18      0.11      0.10 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Entity

## Lossguide

In [47]:
midas_clf = CatBoostClassifier(
    grow_policy='Lossguide', 
    loss_function='MultiClass', 
    eval_metric='Accuracy', 
    task_type='GPU', **model_params)

In [48]:
midas_clf.fit(
    X_midas_train, eval_set=X_midas_val, **fit_params)

Learning rate set to 0.177599
0:	learn: 0.3852615	test: 0.3836629	best: 0.3836629 (0)	total: 726ms	remaining: 12m 5s
1:	learn: 0.3873643	test: 0.3842769	best: 0.3842769 (1)	total: 1.41s	remaining: 11m 43s
2:	learn: 0.3898743	test: 0.3874747	best: 0.3874747 (2)	total: 2.07s	remaining: 11m 27s
3:	learn: 0.3902926	test: 0.3877817	best: 0.3877817 (3)	total: 2.72s	remaining: 11m 17s
4:	learn: 0.3912297	test: 0.3884469	best: 0.3884469 (4)	total: 3.36s	remaining: 11m 8s
5:	learn: 0.3923675	test: 0.3895981	best: 0.3895981 (5)	total: 4.15s	remaining: 11m 27s
6:	learn: 0.3933659	test: 0.3904423	best: 0.3904423 (6)	total: 4.91s	remaining: 11m 36s
7:	learn: 0.3942249	test: 0.3919517	best: 0.3919517 (7)	total: 5.59s	remaining: 11m 33s
8:	learn: 0.3945986	test: 0.3919005	best: 0.3919517 (7)	total: 6.25s	remaining: 11m 28s
9:	learn: 0.3951731	test: 0.3922587	best: 0.3922587 (9)	total: 6.98s	remaining: 11m 31s
10:	learn: 0.3958480	test: 0.3926168	best: 0.3926168 (10)	total: 7.66s	remaining: 11m 28s
11

<catboost.core.CatBoostClassifier at 0x7f8f958eea90>

In [49]:
midas_preds = midas_clf.predict(X_midas_val).squeeze()

In [50]:
Counter(midas_preds)

Counter({2: 590, 5: 184, 6: 94, 7: 2, 8: 22746, 10: 1205, 11: 14212, 12: 56})

In [51]:
print(
    classification_report(y_midas_val, midas_preds, target_names=ID2Midas)
)

                       precision    recall  f1-score   support

         appreciation       0.00      0.00      0.00      1032
              command       0.00      0.00      0.00       615
              comment       0.30      0.04      0.08      4035
            complaint       0.00      0.00      0.00       826
          dev_command       0.00      0.00      0.00        54
           neg_answer       0.23      0.04      0.06      1204
open_question_factual       0.21      0.02      0.04       809
open_question_opinion       0.00      0.00      0.00       497
              opinion       0.41      0.75      0.53     12549
        other_answers       0.00      0.00      0.00       309
           pos_answer       0.36      0.09      0.15      4639
            statement       0.39      0.51      0.44     10875
      yes_no_question       0.32      0.01      0.02      1645

             accuracy                           0.40     39089
            macro avg       0.17      0.11      0.10 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
