In [1]:
import torch
import pandas as pd
import catboost as cb
from sentence_transformers import SentenceTransformer
import numpy as np
import gensim.downloader as api
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, hamming_loss
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV

from preprocessing.utils import is_sentence_in_boundaries
from datasets_utils import get_luxury_data, get_tech_data, get_retail_data, get_big_basket_data
from preprocess import preprocess, with_category_features
from utils import (
    tfidf_vectorize,
    w2v_vectorize,
    display_metrics,
    accuracy_ml_score,
)

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

[nltk_data] Downloading package stopwords to /home/stepan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/stepan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/stepan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Display the device string that was selected.
device

'cuda'

In [3]:
def prepare_dataset(raw_data, max_tokens):
    """Run the shared preparation steps on one freshly loaded dataset.

    Steps, in order:
      1. apply ``preprocess`` to every value of the ``description`` column;
      2. pass the frame through ``with_category_features``;
      3. keep only the rows for which
         ``is_sentence_in_boundaries(description, max_tokens=max_tokens)``
         is truthy.

    Args:
        raw_data: DataFrame with a ``description`` column, as returned by one
            of the ``get_*_data`` loaders.
        max_tokens: value forwarded to ``is_sentence_in_boundaries``.

    Returns:
        The prepared, filtered DataFrame. ``raw_data`` itself is not
        reassigned column-by-column: ``assign`` builds a new frame.
    """
    prepared = raw_data.assign(description=raw_data['description'].apply(preprocess))
    prepared = with_category_features(prepared)
    in_boundaries = prepared['description'].apply(
        lambda text: is_sentence_in_boundaries(text, max_tokens=max_tokens)
    )
    return prepared[in_boundaries]


# Each dataset keeps its own max_tokens limit.
big_basket_data = prepare_dataset(get_big_basket_data(), max_tokens=200)
retail_data = prepare_dataset(get_retail_data(), max_tokens=250)
luxury_data = prepare_dataset(get_luxury_data(), max_tokens=100)
tech_data = prepare_dataset(get_tech_data(), max_tokens=200)

# Built only after preparation so the list refers to the final frames rather
# than to the raw frames the names pointed at right after loading.
datasets = [big_basket_data, retail_data, luxury_data, tech_data]
dataset_names = ['Big basket', 'Retail', 'Luxury', 'Tech']

  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 

In [15]:
# Hyperparameter candidates handed to the grid search as param_grid: one list
# of options per CatBoost parameter.
grid = dict(
    iterations=[30, 60],
    loss_function=['MultiLogloss', 'MultiCrossEntropy'],
    allow_const_label=[True],
    random_state=[13],
)

In [4]:
# Hard-coded CatBoost parameters, unpacked into every CatBoostClassifier
# built in this notebook.
model_cv_best_params = dict(
    allow_const_label=True,
    iterations=60,
    loss_function='MultiLogloss',
    random_state=13,
)

## Big Basket

In [5]:
# Hold out 20% of the Big Basket rows. The description text is the only
# input; every other column of the frame is used as a target.
big_basket_X_train, big_basket_X_test, big_basket_y_train, big_basket_y_test = train_test_split(
    big_basket_data['description'],
    big_basket_data.drop(columns=['description']),
    test_size=0.2,
    random_state=13,
)

In [6]:
# Show the sizes of the train and test description splits.
big_basket_X_train.shape, big_basket_X_test.shape

((21156,), (5289,))

### TF-IDF

In [7]:
# TF-IDF features for both Big Basket splits, produced by one helper call.
big_basket_X_train_tfidf, big_basket_X_test_tfidf = tfidf_vectorize(train_data=big_basket_X_train, test_data=big_basket_X_test)

In [8]:
# 3-fold grid search over `grid`; candidates are ranked, and the winner is
# refit, using the accuracy_ml_score scorer.
model_cv_tfidf = GridSearchCV(
    cb.CatBoostClassifier(),
    grid,
    scoring=dict(accuracy_ml=make_scorer(accuracy_ml_score)),
    refit='accuracy_ml',
    cv=3,
)

# The TF-IDF matrix is densified before fitting; labels go in as a NumPy array.
model_cv_tfidf.fit(
    np.asarray(big_basket_X_train_tfidf.todense()),
    big_basket_y_train.to_numpy(),
)

Learning rate set to 0.5
0:	learn: 0.1401207	total: 8.05s	remaining: 3m 53s
1:	learn: 0.0891762	total: 16.2s	remaining: 3m 46s
2:	learn: 0.0741244	total: 24.2s	remaining: 3m 37s
3:	learn: 0.0695090	total: 32.1s	remaining: 3m 28s
4:	learn: 0.0658343	total: 40.1s	remaining: 3m 20s
5:	learn: 0.0627601	total: 48.3s	remaining: 3m 13s
6:	learn: 0.0604227	total: 56.3s	remaining: 3m 5s
7:	learn: 0.0584453	total: 1m 4s	remaining: 2m 56s
8:	learn: 0.0567185	total: 1m 12s	remaining: 2m 48s
9:	learn: 0.0551176	total: 1m 20s	remaining: 2m 41s
10:	learn: 0.0536832	total: 1m 29s	remaining: 2m 33s
11:	learn: 0.0521418	total: 1m 37s	remaining: 2m 25s
12:	learn: 0.0510145	total: 1m 45s	remaining: 2m 17s
13:	learn: 0.0500962	total: 1m 53s	remaining: 2m 9s
14:	learn: 0.0490589	total: 2m 1s	remaining: 2m 1s
15:	learn: 0.0476010	total: 2m 9s	remaining: 1m 53s
16:	learn: 0.0469024	total: 2m 17s	remaining: 1m 45s
17:	learn: 0.0460880	total: 2m 25s	remaining: 1m 37s
18:	learn: 0.0453121	total: 2m 33s	remaining

In [10]:
# Show the parameter combination the grid search selected.
model_cv_tfidf.best_params_

{'allow_const_label': True,
 'iterations': 60,
 'loss_function': 'MultiLogloss',
 'random_state': 13}

In [12]:
# Use the grid-search winner as the shared parameter set for the models below.
# NOTE(review): this rebinds the hard-coded model_cv_best_params dict defined
# in an earlier cell and needs the grid-search cell to have been run first.
model_cv_best_params = model_cv_tfidf.best_params_

model_cv_best_params

{'allow_const_label': True,
 'iterations': 60,
 'loss_function': 'MultiLogloss',
 'random_state': 13}

In [13]:
# Fit one CatBoost model with the shared parameters on the densified
# Big Basket TF-IDF training matrix.
model_best_tfitd = cb.CatBoostClassifier(**model_cv_best_params)
model_best_tfitd.fit(
    np.asarray(big_basket_X_train_tfidf.todense()),
    big_basket_y_train.to_numpy(),
)

Learning rate set to 0.5
0:	learn: 0.1404611	total: 11.1s	remaining: 10m 52s
1:	learn: 0.0830849	total: 21.9s	remaining: 10m 35s
2:	learn: 0.0719081	total: 32.7s	remaining: 10m 22s
3:	learn: 0.0659580	total: 43.6s	remaining: 10m 10s
4:	learn: 0.0633937	total: 54.4s	remaining: 9m 57s
5:	learn: 0.0604480	total: 1m 5s	remaining: 9m 46s
6:	learn: 0.0584051	total: 1m 16s	remaining: 9m 35s
7:	learn: 0.0567368	total: 1m 26s	remaining: 9m 24s
8:	learn: 0.0548341	total: 1m 37s	remaining: 9m 13s
9:	learn: 0.0535631	total: 1m 48s	remaining: 9m 3s
10:	learn: 0.0521903	total: 1m 59s	remaining: 8m 53s
11:	learn: 0.0512407	total: 2m 10s	remaining: 8m 42s
12:	learn: 0.0502450	total: 2m 21s	remaining: 8m 31s
13:	learn: 0.0490685	total: 2m 32s	remaining: 8m 20s
14:	learn: 0.0477598	total: 2m 43s	remaining: 8m 9s
15:	learn: 0.0467759	total: 2m 54s	remaining: 7m 59s
16:	learn: 0.0456943	total: 3m 5s	remaining: 7m 49s
17:	learn: 0.0449730	total: 3m 16s	remaining: 7m 38s
18:	learn: 0.0440384	total: 3m 27s	r

<catboost.core.CatBoostClassifier at 0x7f8b1422b910>

In [16]:
# Predict labels for the held-out TF-IDF features.
# NOTE(review): the test matrix is passed as returned by tfidf_vectorize,
# whereas training used a densified copy — confirm the mismatch is intended.
big_basket_y_pred_tfidf = model_best_tfitd.predict(big_basket_X_test_tfidf)

In [18]:
# Report the evaluation metrics for the TF-IDF model on the test split.
display_metrics(big_basket_y_test, big_basket_y_pred_tfidf)

Accuracy (subset): 0.4286254490451881
Accuracy (ML): 0.9894737990313713
Precision (macro): 0.7531470229838161
Precision (micro): 0.8845868381989115
Recall (macro): 0.3685328410748345
Recall (micro): 0.5955196535642905
Hamming loss: 0.010526200968628649


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Word2Vec

In [7]:
# Vectorize the train split first, then the test split, with the same helper.
big_basket_X_train_w2v, big_basket_X_test_w2v = map(
    w2v_vectorize,
    (big_basket_X_train, big_basket_X_test),
)

KeyboardInterrupt: 

In [None]:
# Fit CatBoost on the Word2Vec features; the per-description vectors are
# collected into one NumPy array first.
model_best_w2v = cb.CatBoostClassifier(**model_cv_best_params)
model_best_w2v.fit(
    np.array(list(big_basket_X_train_w2v)),
    big_basket_y_train.to_numpy(),
)

In [24]:
# Predict on the test Word2Vec vectors, collected into one NumPy array.
big_basket_y_pred_w2v = model_best_w2v.predict(
    np.array(list(big_basket_X_test_w2v))
)

In [25]:
# Report the evaluation metrics for the Word2Vec model on the test split.
display_metrics(big_basket_y_test, big_basket_y_pred_w2v)

Accuracy (subset): 0.47003214218188694
Accuracy (ML): 0.9897701324956005
Precision (macro): 0.7489442813514712
Precision (micro): 0.8395232521017346
Recall (macro): 0.4820822846172004
Recall (micro): 0.6569786808794137
Hamming loss: 0.010229867504399552


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### BERT Embeddings

In [8]:
# Load the 'paraphrase-MiniLM-L6-v2' sentence-transformer checkpoint, then
# move it to the selected device.
big_basket_sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
big_basket_sbert_model = big_basket_sbert_model.to(device)

In [9]:
# Encode the train descriptions first, then the test descriptions; each split
# is handed to the model as a plain Python list.
big_basket_X_train_emb, big_basket_X_test_emb = (
    big_basket_sbert_model.encode(texts.values.tolist())
    for texts in (big_basket_X_train, big_basket_X_test)
)

In [10]:
# Show the shapes of the train and test embedding matrices.
big_basket_X_train_emb.shape, big_basket_X_test_emb.shape

((21156, 384), (5289, 384))

In [11]:
# Fit CatBoost with the shared parameters on the sentence embeddings.
model_best_emb = cb.CatBoostClassifier(**model_cv_best_params)
model_best_emb.fit(
    X=big_basket_X_train_emb,
    y=big_basket_y_train.to_numpy(),
)

Learning rate set to 0.5
0:	learn: 0.1336219	total: 2.92s	remaining: 2m 52s
1:	learn: 0.0774499	total: 5.9s	remaining: 2m 51s
2:	learn: 0.0644110	total: 9s	remaining: 2m 51s
3:	learn: 0.0579162	total: 12.2s	remaining: 2m 50s
4:	learn: 0.0536503	total: 15.3s	remaining: 2m 48s
5:	learn: 0.0499719	total: 18.5s	remaining: 2m 46s
6:	learn: 0.0469873	total: 21.6s	remaining: 2m 43s
7:	learn: 0.0438107	total: 24.8s	remaining: 2m 41s
8:	learn: 0.0413989	total: 28s	remaining: 2m 38s
9:	learn: 0.0395394	total: 31.1s	remaining: 2m 35s
10:	learn: 0.0376752	total: 34.2s	remaining: 2m 32s
11:	learn: 0.0361719	total: 37.3s	remaining: 2m 29s
12:	learn: 0.0346674	total: 40.4s	remaining: 2m 26s
13:	learn: 0.0335408	total: 43.4s	remaining: 2m 22s
14:	learn: 0.0322807	total: 46.5s	remaining: 2m 19s
15:	learn: 0.0310677	total: 49.5s	remaining: 2m 16s
16:	learn: 0.0299616	total: 52.6s	remaining: 2m 13s
17:	learn: 0.0289950	total: 55.6s	remaining: 2m 9s
18:	learn: 0.0279890	total: 58.7s	remaining: 2m 6s
19:	l

<catboost.core.CatBoostClassifier at 0x7f2a5683ef80>

In [12]:
# Predict labels for the held-out sentence embeddings.
big_basket_y_pred_emb = model_best_emb.predict(big_basket_X_test_emb)

In [13]:
# Report the evaluation metrics for the embedding-based model on the test split.
display_metrics(big_basket_y_test, big_basket_y_pred_emb)

Accuracy (subset): 0.4505577613915674
Accuracy (ML): 0.9893283592943265
Precision (macro): 0.7358690200420686
Precision (micro): 0.8424458826154876
Recall (macro): 0.4516851021226931
Recall (micro): 0.6287475016655563
Hamming loss: 0.010671640705673604


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Built-in

In [66]:
# CatBoost's own text-feature handling: the description strings go into the
# Pools as a single text column named 'description'.
big_basket_X_train_df = pd.DataFrame(big_basket_X_train)
big_basket_X_test_df = pd.DataFrame(big_basket_X_test)

big_basket_train_pool = cb.Pool(
    data=big_basket_X_train_df[['description']],
    label=big_basket_y_train,
    text_features=['description'],
    feature_names=['description'],
)

model_best_default = cb.CatBoostClassifier(**model_cv_best_params)
model_best_default.fit(big_basket_train_pool)

# The test pool is built with the test labels attached, like the train pool.
big_basket_test_pool = cb.Pool(
    data=big_basket_X_test_df[['description']],
    label=big_basket_y_test,
    text_features=['description'],
    feature_names=['description'],
)

big_basket_y_pred_default = model_best_default.predict(big_basket_test_pool)

display_metrics(big_basket_y_test, big_basket_y_pred_default)

Accuracy (subset): 0.5882019285309132
Accuracy (ML): 0.9925789374172811
Precision (macro): 0.7035416421137527
Precision (micro): 0.868514041286963
Recall (macro): 0.5288159435947005
Recall (micro): 0.7778147901399067
Hamming loss: 0.007421062582718851


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Retail

In [18]:
# Hold out 20% of the Retail rows. The description text is the only input;
# every other column of the frame is used as a target.
retail_X_train, retail_X_test, retail_y_train, retail_y_test = train_test_split(
    retail_data['description'],
    retail_data.drop(columns=['description']),
    test_size=0.2,
    random_state=13,
)

# Sizes of the two description splits.
retail_X_train.shape, retail_X_test.shape

((14816,), (3705,))

### TF-IDF

In [20]:
# TF-IDF features for both Retail splits.
retail_X_train_tfidf, retail_X_test_tfidf = tfidf_vectorize(train_data=retail_X_train, test_data=retail_X_test)

# Fit on a densified copy of the training matrix.
retail_model_best_tfitd = cb.CatBoostClassifier(**model_cv_best_params)
retail_model_best_tfitd.fit(
    np.asarray(retail_X_train_tfidf.todense()),
    retail_y_train.to_numpy(),
)

# The test matrix is passed exactly as tfidf_vectorize returned it.
retail_y_pred_tfidf = retail_model_best_tfitd.predict(retail_X_test_tfidf)

display_metrics(retail_y_test, retail_y_pred_tfidf)

Accuracy (subset): 0.4987854251012146
Accuracy (ML): 0.9464178841753212
Precision (macro): 0.9242072294053362
Precision (micro): 0.9301772589710333
Recall (macro): 0.5011711622506411
Recall (micro): 0.503510414228879
Hamming loss: 0.053582115824678755


### Word2Vec

In [21]:
# Vectorize the train split first, then the test split.
retail_X_train_w2v, retail_X_test_w2v = map(
    w2v_vectorize,
    (retail_X_train, retail_X_test),
)

# The per-description vectors are collected into one NumPy array for CatBoost.
retail_model_best_w2v = cb.CatBoostClassifier(**model_cv_best_params)
retail_model_best_w2v.fit(
    np.array(list(retail_X_train_w2v)),
    retail_y_train.to_numpy(),
)

retail_y_pred_w2v = retail_model_best_w2v.predict(
    np.array(list(retail_X_test_w2v))
)

display_metrics(retail_y_test, retail_y_pred_w2v)

Learning rate set to 0.429805
0:	learn: 0.3985042	total: 474ms	remaining: 28s
1:	learn: 0.2914482	total: 946ms	remaining: 27.4s
2:	learn: 0.2543476	total: 1.39s	remaining: 26.3s
3:	learn: 0.2314262	total: 1.86s	remaining: 26.1s
4:	learn: 0.2171303	total: 2.33s	remaining: 25.6s
5:	learn: 0.2031062	total: 2.81s	remaining: 25.3s
6:	learn: 0.1922414	total: 3.27s	remaining: 24.8s
7:	learn: 0.1835722	total: 3.76s	remaining: 24.4s
8:	learn: 0.1755590	total: 4.25s	remaining: 24.1s
9:	learn: 0.1695620	total: 4.74s	remaining: 23.7s
10:	learn: 0.1633491	total: 5.21s	remaining: 23.2s
11:	learn: 0.1582286	total: 5.67s	remaining: 22.7s
12:	learn: 0.1535531	total: 6.12s	remaining: 22.1s
13:	learn: 0.1500142	total: 6.58s	remaining: 21.6s
14:	learn: 0.1455632	total: 7.06s	remaining: 21.2s
15:	learn: 0.1420536	total: 7.52s	remaining: 20.7s
16:	learn: 0.1388489	total: 7.97s	remaining: 20.2s
17:	learn: 0.1357579	total: 8.41s	remaining: 19.6s
18:	learn: 0.1330707	total: 8.86s	remaining: 19.1s
19:	learn: 0.

### BERT Embeddings

In [22]:
# Load the 'paraphrase-MiniLM-L6-v2' checkpoint and move it to the selected device.
retail_sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
retail_sbert_model = retail_sbert_model.to(device)

# Encode the train descriptions first, then the test descriptions.
retail_X_train_emb, retail_X_test_emb = (
    retail_sbert_model.encode(texts.values.tolist())
    for texts in (retail_X_train, retail_X_test)
)

retail_model_best_emb = cb.CatBoostClassifier(**model_cv_best_params)
retail_model_best_emb.fit(
    X=retail_X_train_emb,
    y=retail_y_train.to_numpy(),
)

retail_y_pred_emb = retail_model_best_emb.predict(retail_X_test_emb)

display_metrics(retail_y_test, retail_y_pred_emb)

Learning rate set to 0.429805
0:	learn: 0.3966546	total: 579ms	remaining: 34.2s
1:	learn: 0.3007033	total: 1.16s	remaining: 33.8s
2:	learn: 0.2605688	total: 1.74s	remaining: 33s
3:	learn: 0.2384301	total: 2.31s	remaining: 32.3s
4:	learn: 0.2205258	total: 2.88s	remaining: 31.7s
5:	learn: 0.2089324	total: 3.48s	remaining: 31.4s
6:	learn: 0.2001929	total: 4.06s	remaining: 30.8s
7:	learn: 0.1924963	total: 4.64s	remaining: 30.2s
8:	learn: 0.1852166	total: 5.23s	remaining: 29.6s
9:	learn: 0.1797602	total: 5.8s	remaining: 29s
10:	learn: 0.1739523	total: 6.38s	remaining: 28.4s
11:	learn: 0.1694453	total: 6.95s	remaining: 27.8s
12:	learn: 0.1644780	total: 7.53s	remaining: 27.2s
13:	learn: 0.1610195	total: 8.09s	remaining: 26.6s
14:	learn: 0.1564400	total: 8.69s	remaining: 26.1s
15:	learn: 0.1529514	total: 9.29s	remaining: 25.5s
16:	learn: 0.1497986	total: 9.88s	remaining: 25s
17:	learn: 0.1463341	total: 10.4s	remaining: 24.4s
18:	learn: 0.1433056	total: 11s	remaining: 23.8s
19:	learn: 0.1407201

### Built-in

In [71]:
# CatBoost's own text-feature handling: the description strings go into the
# Pools as a single text column named 'description'.
retail_X_train_df = pd.DataFrame(retail_X_train)
retail_X_test_df = pd.DataFrame(retail_X_test)

retail_train_pool = cb.Pool(
    data=retail_X_train_df[['description']],
    label=retail_y_train,
    text_features=['description'],
    feature_names=['description'],
)

retail_model_best_default = cb.CatBoostClassifier(**model_cv_best_params)
retail_model_best_default.fit(retail_train_pool)

# The test pool is built with the test labels attached, like the train pool.
retail_test_pool = cb.Pool(
    data=retail_X_test_df[['description']],
    label=retail_y_test,
    text_features=['description'],
    feature_names=['description'],
)

retail_y_pred_default = retail_model_best_default.predict(retail_test_pool)

display_metrics(retail_y_test, retail_y_pred_default)

Learning rate set to 0.429805
0:	learn: 0.4404770	total: 869ms	remaining: 51.3s
1:	learn: 0.3098716	total: 1.73s	remaining: 50s
2:	learn: 0.2378745	total: 2.58s	remaining: 49.1s
3:	learn: 0.1938092	total: 3.46s	remaining: 48.4s
4:	learn: 0.1661539	total: 4.31s	remaining: 47.4s
5:	learn: 0.1467184	total: 5.19s	remaining: 46.7s
6:	learn: 0.1336935	total: 6.1s	remaining: 46.2s
7:	learn: 0.1238376	total: 6.97s	remaining: 45.3s
8:	learn: 0.1169806	total: 7.8s	remaining: 44.2s
9:	learn: 0.1133102	total: 8.67s	remaining: 43.3s
10:	learn: 0.1098108	total: 9.52s	remaining: 42.4s
11:	learn: 0.1066429	total: 10.4s	remaining: 41.5s
12:	learn: 0.1048519	total: 11.2s	remaining: 40.4s
13:	learn: 0.1035351	total: 12s	remaining: 39.5s
14:	learn: 0.1032434	total: 12.8s	remaining: 38.5s
15:	learn: 0.1014160	total: 13.7s	remaining: 37.7s
16:	learn: 0.1005154	total: 14.6s	remaining: 36.9s
17:	learn: 0.0998628	total: 15.4s	remaining: 36s
18:	learn: 0.0995456	total: 16.3s	remaining: 35.1s
19:	learn: 0.098919

## Luxury

In [23]:
# Hold out 20% of the Luxury rows. The description text is the only input;
# every other column of the frame is used as a target.
luxury_X_train, luxury_X_test, luxury_y_train, luxury_y_test = train_test_split(
    luxury_data['description'],
    luxury_data.drop(columns=['description']),
    test_size=0.2,
    random_state=13,
)

# Sizes of the two description splits.
luxury_X_train.shape, luxury_X_test.shape

((3788,), (947,))

### TF-IDF

In [24]:
# TF-IDF features for both Luxury splits.
luxury_X_train_tfidf, luxury_X_test_tfidf = tfidf_vectorize(train_data=luxury_X_train, test_data=luxury_X_test)

# Fit on a densified copy of the training matrix.
luxury_model_best_tfitd = cb.CatBoostClassifier(**model_cv_best_params)
luxury_model_best_tfitd.fit(
    np.asarray(luxury_X_train_tfidf.todense()),
    luxury_y_train.to_numpy(),
)

# The test matrix is passed exactly as tfidf_vectorize returned it.
luxury_y_pred_tfidf = luxury_model_best_tfitd.predict(luxury_X_test_tfidf)

display_metrics(luxury_y_test, luxury_y_pred_tfidf)

Learning rate set to 0.240077
0:	learn: 0.3786785	total: 870ms	remaining: 51.3s
1:	learn: 0.2436383	total: 1.75s	remaining: 50.8s
2:	learn: 0.1885156	total: 2.63s	remaining: 49.9s
3:	learn: 0.1498638	total: 3.51s	remaining: 49.1s
4:	learn: 0.1255189	total: 4.39s	remaining: 48.3s
5:	learn: 0.1131854	total: 5.29s	remaining: 47.6s
6:	learn: 0.1040369	total: 6.16s	remaining: 46.7s
7:	learn: 0.0984529	total: 7.04s	remaining: 45.7s
8:	learn: 0.0904144	total: 7.93s	remaining: 44.9s
9:	learn: 0.0861143	total: 8.81s	remaining: 44.1s
10:	learn: 0.0824958	total: 9.7s	remaining: 43.2s
11:	learn: 0.0783477	total: 10.6s	remaining: 42.5s
12:	learn: 0.0753595	total: 11.5s	remaining: 41.6s
13:	learn: 0.0713350	total: 12.4s	remaining: 40.8s
14:	learn: 0.0681128	total: 13.3s	remaining: 39.9s
15:	learn: 0.0652634	total: 14.2s	remaining: 38.9s
16:	learn: 0.0629493	total: 15s	remaining: 38s
17:	learn: 0.0606157	total: 15.9s	remaining: 37.1s
18:	learn: 0.0588079	total: 16.8s	remaining: 36.2s
19:	learn: 0.056

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Word2Vec

In [25]:
# Vectorize the train split first, then the test split.
luxury_X_train_w2v, luxury_X_test_w2v = map(
    w2v_vectorize,
    (luxury_X_train, luxury_X_test),
)

# The per-description vectors are collected into one NumPy array for CatBoost.
luxury_model_best_w2v = cb.CatBoostClassifier(**model_cv_best_params)
luxury_model_best_w2v.fit(
    np.array(list(luxury_X_train_w2v)),
    luxury_y_train.to_numpy(),
)

luxury_y_pred_w2v = luxury_model_best_w2v.predict(
    np.array(list(luxury_X_test_w2v))
)

display_metrics(luxury_y_test, luxury_y_pred_w2v)

Learning rate set to 0.240077
0:	learn: 0.3677428	total: 878ms	remaining: 51.8s
1:	learn: 0.2322048	total: 1.73s	remaining: 50.2s
2:	learn: 0.1713550	total: 2.54s	remaining: 48.3s
3:	learn: 0.1398114	total: 3.35s	remaining: 46.9s
4:	learn: 0.1210572	total: 4.14s	remaining: 45.5s
5:	learn: 0.1092914	total: 4.91s	remaining: 44.2s
6:	learn: 0.0997392	total: 5.69s	remaining: 43.1s
7:	learn: 0.0922292	total: 6.46s	remaining: 42s
8:	learn: 0.0865533	total: 7.24s	remaining: 41s
9:	learn: 0.0811955	total: 8.01s	remaining: 40s
10:	learn: 0.0766602	total: 8.78s	remaining: 39.1s
11:	learn: 0.0726020	total: 9.54s	remaining: 38.2s
12:	learn: 0.0689621	total: 10.3s	remaining: 37.2s
13:	learn: 0.0655298	total: 11.1s	remaining: 36.3s
14:	learn: 0.0621653	total: 11.8s	remaining: 35.5s
15:	learn: 0.0597847	total: 12.6s	remaining: 34.6s
16:	learn: 0.0568781	total: 13.3s	remaining: 33.7s
17:	learn: 0.0542782	total: 14.1s	remaining: 32.9s
18:	learn: 0.0520844	total: 14.9s	remaining: 32s
19:	learn: 0.050292

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### BERT Embeddings

In [26]:
# Load the 'paraphrase-MiniLM-L6-v2' checkpoint and move it to the selected device.
luxury_sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
luxury_sbert_model = luxury_sbert_model.to(device)

# Encode the train descriptions first, then the test descriptions.
luxury_X_train_emb, luxury_X_test_emb = (
    luxury_sbert_model.encode(texts.values.tolist())
    for texts in (luxury_X_train, luxury_X_test)
)

luxury_model_best_emb = cb.CatBoostClassifier(**model_cv_best_params)
luxury_model_best_emb.fit(
    X=luxury_X_train_emb,
    y=luxury_y_train.to_numpy(),
)

luxury_y_pred_emb = luxury_model_best_emb.predict(luxury_X_test_emb)

display_metrics(luxury_y_test, luxury_y_pred_emb)

Learning rate set to 0.240077
0:	learn: 0.3624748	total: 961ms	remaining: 56.7s
1:	learn: 0.2268740	total: 1.97s	remaining: 57s
2:	learn: 0.1661277	total: 2.99s	remaining: 56.8s
3:	learn: 0.1378021	total: 3.95s	remaining: 55.3s
4:	learn: 0.1184673	total: 4.93s	remaining: 54.2s
5:	learn: 0.1076637	total: 5.9s	remaining: 53.1s
6:	learn: 0.0991338	total: 6.89s	remaining: 52.1s
7:	learn: 0.0934767	total: 7.86s	remaining: 51.1s
8:	learn: 0.0879128	total: 8.83s	remaining: 50s
9:	learn: 0.0829271	total: 9.84s	remaining: 49.2s
10:	learn: 0.0786427	total: 10.9s	remaining: 48.4s
11:	learn: 0.0738978	total: 11.9s	remaining: 47.6s
12:	learn: 0.0704475	total: 12.9s	remaining: 46.6s
13:	learn: 0.0674536	total: 13.9s	remaining: 45.6s
14:	learn: 0.0640751	total: 14.8s	remaining: 44.5s
15:	learn: 0.0610275	total: 15.8s	remaining: 43.5s
16:	learn: 0.0582345	total: 16.8s	remaining: 42.6s
17:	learn: 0.0557003	total: 17.8s	remaining: 41.5s
18:	learn: 0.0531847	total: 18.8s	remaining: 40.5s
19:	learn: 0.050

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [72]:
# CatBoost's own text-feature handling: the description strings go into the
# Pools as a single text column named 'description'.
luxury_X_train_df = pd.DataFrame(luxury_X_train)
luxury_X_test_df = pd.DataFrame(luxury_X_test)

luxury_train_pool = cb.Pool(
    data=luxury_X_train_df[['description']],
    label=luxury_y_train,
    text_features=['description'],
    feature_names=['description'],
)

luxury_model_best_default = cb.CatBoostClassifier(**model_cv_best_params)
luxury_model_best_default.fit(luxury_train_pool)

# The test pool is built with the test labels attached, like the train pool.
luxury_test_pool = cb.Pool(
    data=luxury_X_test_df[['description']],
    label=luxury_y_test,
    text_features=['description'],
    feature_names=['description'],
)

luxury_y_pred_default = luxury_model_best_default.predict(luxury_test_pool)

display_metrics(luxury_y_test, luxury_y_pred_default)

Learning rate set to 0.240077
0:	learn: 0.5152438	total: 1.56s	remaining: 1m 32s
1:	learn: 0.4033848	total: 3.1s	remaining: 1m 29s
2:	learn: 0.3291920	total: 4.62s	remaining: 1m 27s
3:	learn: 0.2724818	total: 6.11s	remaining: 1m 25s
4:	learn: 0.2350121	total: 7.59s	remaining: 1m 23s
5:	learn: 0.2038896	total: 9.07s	remaining: 1m 21s
6:	learn: 0.1768296	total: 10.6s	remaining: 1m 19s
7:	learn: 0.1573058	total: 12s	remaining: 1m 18s
8:	learn: 0.1426765	total: 13.5s	remaining: 1m 16s
9:	learn: 0.1314057	total: 15s	remaining: 1m 15s
10:	learn: 0.1207989	total: 16.5s	remaining: 1m 13s
11:	learn: 0.1107227	total: 18s	remaining: 1m 11s
12:	learn: 0.1029716	total: 19.4s	remaining: 1m 10s
13:	learn: 0.0976334	total: 20.9s	remaining: 1m 8s
14:	learn: 0.0923597	total: 22.4s	remaining: 1m 7s
15:	learn: 0.0878512	total: 23.9s	remaining: 1m 5s
16:	learn: 0.0839027	total: 25.5s	remaining: 1m 4s
17:	learn: 0.0804654	total: 26.9s	remaining: 1m 2s
18:	learn: 0.0776747	total: 28.4s	remaining: 1m 1s
19:	l

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Tech

In [27]:
# Hold out 20% of the Tech rows. The description text is the only input;
# every other column of the frame is used as a target.
tech_X_train, tech_X_test, tech_y_train, tech_y_test = train_test_split(
    tech_data['description'],
    tech_data.drop(columns=['description']),
    test_size=0.2,
    random_state=13,
)

# Sizes of the two description splits.
tech_X_train.shape, tech_X_test.shape

((18148,), (4538,))

### TF-IDF

In [28]:
# TF-IDF features for both Tech splits.
tech_X_train_tfidf, tech_X_test_tfidf = tfidf_vectorize(train_data=tech_X_train, test_data=tech_X_test)

# Fit on a densified copy of the training matrix.
tech_model_best_tfitd = cb.CatBoostClassifier(**model_cv_best_params)
tech_model_best_tfitd.fit(
    np.asarray(tech_X_train_tfidf.todense()),
    tech_y_train.to_numpy(),
)

# The test matrix is passed exactly as tfidf_vectorize returned it.
tech_y_pred_tfidf = tech_model_best_tfitd.predict(tech_X_test_tfidf)

display_metrics(tech_y_test, tech_y_pred_tfidf)

Learning rate set to 0.468694
0:	learn: 0.0786642	total: 22.6s	remaining: 22m 12s
1:	learn: 0.0375616	total: 45.8s	remaining: 22m 7s
2:	learn: 0.0313114	total: 1m 8s	remaining: 21m 48s
3:	learn: 0.0299287	total: 1m 31s	remaining: 21m 15s
4:	learn: 0.0291313	total: 1m 53s	remaining: 20m 47s
5:	learn: 0.0283481	total: 2m 16s	remaining: 20m 32s
6:	learn: 0.0279486	total: 2m 39s	remaining: 20m 4s
7:	learn: 0.0276531	total: 3m 1s	remaining: 19m 40s
8:	learn: 0.0270235	total: 3m 24s	remaining: 19m 19s
9:	learn: 0.0267939	total: 3m 47s	remaining: 18m 58s
10:	learn: 0.0265232	total: 4m 10s	remaining: 18m 36s
11:	learn: 0.0261580	total: 4m 33s	remaining: 18m 14s
12:	learn: 0.0259507	total: 4m 56s	remaining: 17m 53s
13:	learn: 0.0256498	total: 5m 19s	remaining: 17m 30s
14:	learn: 0.0253899	total: 5m 42s	remaining: 17m 7s
15:	learn: 0.0251350	total: 6m 5s	remaining: 16m 44s
16:	learn: 0.0247627	total: 6m 27s	remaining: 16m 21s
17:	learn: 0.0245022	total: 6m 51s	remaining: 15m 59s
18:	learn: 0.024

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision (macro): 0.5800204278958203
Precision (micro): 0.9093113482056256


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall (macro): 0.16148175760401037
Recall (micro): 0.14436402833384662
Hamming loss: 0.004990147770198837


### Word2Vec

In [29]:
# Vectorize the train split first, then the test split.
tech_X_train_w2v, tech_X_test_w2v = map(
    w2v_vectorize,
    (tech_X_train, tech_X_test),
)

# The per-description vectors are collected into one NumPy array for CatBoost.
tech_model_best_w2v = cb.CatBoostClassifier(**model_cv_best_params)
tech_model_best_w2v.fit(
    np.array(list(tech_X_train_w2v)),
    tech_y_train.to_numpy(),
)

tech_y_pred_w2v = tech_model_best_w2v.predict(
    np.array(list(tech_X_test_w2v))
)

display_metrics(tech_y_test, tech_y_pred_w2v)

Learning rate set to 0.468694
0:	learn: 0.0797304	total: 12s	remaining: 11m 47s
1:	learn: 0.0362090	total: 23.5s	remaining: 11m 22s
2:	learn: 0.0283680	total: 35.5s	remaining: 11m 13s
3:	learn: 0.0249821	total: 47.6s	remaining: 11m 6s
4:	learn: 0.0227606	total: 1m	remaining: 11m 3s
5:	learn: 0.0208176	total: 1m 12s	remaining: 10m 49s
6:	learn: 0.0190150	total: 1m 24s	remaining: 10m 40s
7:	learn: 0.0174177	total: 1m 36s	remaining: 10m 28s
8:	learn: 0.0162239	total: 1m 48s	remaining: 10m 14s
9:	learn: 0.0151811	total: 2m	remaining: 10m
10:	learn: 0.0141849	total: 2m 12s	remaining: 9m 48s
11:	learn: 0.0130925	total: 2m 24s	remaining: 9m 37s
12:	learn: 0.0124720	total: 2m 36s	remaining: 9m 25s
13:	learn: 0.0117676	total: 2m 48s	remaining: 9m 13s
14:	learn: 0.0110258	total: 3m 1s	remaining: 9m 3s
15:	learn: 0.0104544	total: 3m 13s	remaining: 8m 51s
16:	learn: 0.0099140	total: 3m 26s	remaining: 8m 41s
17:	learn: 0.0093786	total: 3m 38s	remaining: 8m 30s
18:	learn: 0.0088983	total: 3m 50s	rem

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision (macro): 0.9233314530371447
Precision (micro): 0.9887296094908552


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall (macro): 0.8295964478358093
Recall (micro): 0.7700184785956268
Hamming loss: 0.0013694201978218226


### BERT Embeddings

In [30]:
# Load the 'paraphrase-MiniLM-L6-v2' checkpoint and move it to the selected device.
tech_sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
tech_sbert_model = tech_sbert_model.to(device)

# Encode the train descriptions first, then the test descriptions.
tech_X_train_emb, tech_X_test_emb = (
    tech_sbert_model.encode(texts.values.tolist())
    for texts in (tech_X_train, tech_X_test)
)

tech_model_best_emb = cb.CatBoostClassifier(**model_cv_best_params)
tech_model_best_emb.fit(
    X=tech_X_train_emb,
    y=tech_y_train.to_numpy(),
)

tech_y_pred_emb = tech_model_best_emb.predict(tech_X_test_emb)

display_metrics(tech_y_test, tech_y_pred_emb)

Learning rate set to 0.468694
0:	learn: 0.0823899	total: 14.6s	remaining: 14m 23s
1:	learn: 0.0359516	total: 29.3s	remaining: 14m 10s
2:	learn: 0.0279928	total: 43.9s	remaining: 13m 53s
3:	learn: 0.0245515	total: 58.9s	remaining: 13m 45s
4:	learn: 0.0223066	total: 1m 14s	remaining: 13m 35s
5:	learn: 0.0204026	total: 1m 29s	remaining: 13m 29s
6:	learn: 0.0186798	total: 1m 44s	remaining: 13m 12s
7:	learn: 0.0172138	total: 1m 59s	remaining: 12m 57s
8:	learn: 0.0159135	total: 2m 15s	remaining: 12m 46s
9:	learn: 0.0147969	total: 2m 30s	remaining: 12m 30s
10:	learn: 0.0137312	total: 2m 44s	remaining: 12m 14s
11:	learn: 0.0128838	total: 2m 59s	remaining: 11m 59s
12:	learn: 0.0119936	total: 3m 14s	remaining: 11m 44s
13:	learn: 0.0113412	total: 3m 30s	remaining: 11m 33s
14:	learn: 0.0107671	total: 3m 46s	remaining: 11m 18s
15:	learn: 0.0101947	total: 4m 1s	remaining: 11m 3s
16:	learn: 0.0096728	total: 4m 16s	remaining: 10m 49s
17:	learn: 0.0093033	total: 4m 31s	remaining: 10m 34s
18:	learn: 0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision (macro): 0.9235904318110776
Precision (micro): 0.9879144716454912


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall (macro): 0.8208206192710897
Recall (micro): 0.7363720357252849
Hamming loss: 0.0015637268366614234


In [73]:
# CatBoost's own text-feature handling: the description strings go into the
# Pools as a single text column named 'description'.
tech_X_train_df = pd.DataFrame(tech_X_train)
tech_X_test_df = pd.DataFrame(tech_X_test)

tech_train_pool = cb.Pool(
    data=tech_X_train_df[['description']],
    label=tech_y_train,
    text_features=['description'],
    feature_names=['description'],
)

tech_model_best_default = cb.CatBoostClassifier(**model_cv_best_params)
tech_model_best_default.fit(tech_train_pool)

# The test pool is built with the test labels attached, like the train pool.
tech_test_pool = cb.Pool(
    data=tech_X_test_df[['description']],
    label=tech_y_test,
    text_features=['description'],
    feature_names=['description'],
)

tech_y_pred_default = tech_model_best_default.predict(tech_test_pool)

display_metrics(tech_y_test, tech_y_pred_default)

Learning rate set to 0.468694
0:	learn: 0.3396812	total: 38.2s	remaining: 37m 35s
1:	learn: 0.1993526	total: 1m 17s	remaining: 37m 30s
2:	learn: 0.1267675	total: 1m 57s	remaining: 37m 7s
3:	learn: 0.0860042	total: 2m 39s	remaining: 37m 15s
4:	learn: 0.0624199	total: 3m 21s	remaining: 36m 51s
5:	learn: 0.0484620	total: 4m 3s	remaining: 36m 27s
6:	learn: 0.0403460	total: 4m 44s	remaining: 35m 52s
7:	learn: 0.0352396	total: 5m 29s	remaining: 35m 39s
8:	learn: 0.0320482	total: 6m 9s	remaining: 34m 54s
9:	learn: 0.0299567	total: 6m 51s	remaining: 34m 15s
10:	learn: 0.0283949	total: 7m 34s	remaining: 33m 45s
11:	learn: 0.0273644	total: 8m 20s	remaining: 33m 22s
12:	learn: 0.0265332	total: 9m 4s	remaining: 32m 47s
13:	learn: 0.0259426	total: 9m 48s	remaining: 32m 12s
14:	learn: 0.0254179	total: 10m 28s	remaining: 31m 24s
15:	learn: 0.0250376	total: 11m 8s	remaining: 30m 38s
16:	learn: 0.0246920	total: 11m 51s	remaining: 29m 58s
17:	learn: 0.0243725	total: 12m 33s	remaining: 29m 17s
18:	learn:

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision (macro): 0.3272404456520776
Precision (micro): 0.8659432387312187
Recall (macro): 0.1393918603086439


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall (micro): 0.3993686479827533
Hamming loss: 0.00379957800130892
