In [1]:
import torch
import pandas as pd
import catboost as cb
from sentence_transformers import SentenceTransformer
import numpy as np
import gensim.downloader as api
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, hamming_loss
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split

from preprocessing.utils import is_sentence_in_boundaries
from datasets_utils import get_luxury_data, get_tech_data, get_retail_data, get_big_basket_data
from preprocess import preprocess, with_category_features
from utils import (
    tfidf_vectorize,
    w2v_vectorize,
    display_metrics,
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

[nltk_data] Downloading package stopwords to /home/stepan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/stepan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/stepan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
device

'cuda'

In [3]:
big_basket_data = get_big_basket_data()
luxury_data = get_luxury_data()
tech_data = get_tech_data()
retail_data = get_retail_data()

datasets = [big_basket_data]
dataset_names = ['Big basket']
datasets = [big_basket_data, retail_data, luxury_data, tech_data]
dataset_names = ['Big basket', 'Retail', 'Luxury', 'Tech']

# Get datasets with description column preprocessed
big_basket_data['description'] = big_basket_data['description'].apply(preprocess)
tech_data['description'] = tech_data['description'].apply(preprocess)
luxury_data['description'] = luxury_data['description'].apply(preprocess)
retail_data['description'] = retail_data['description'].apply(preprocess)

# Preprocess categories
big_basket_data = with_category_features(big_basket_data)
tech_data = with_category_features(tech_data)
luxury_data = with_category_features(luxury_data)
retail_data = with_category_features(retail_data)

big_basket_data = big_basket_data[big_basket_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=200))]
retail_data = retail_data[retail_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=250))]
luxury_data = luxury_data[luxury_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=100))]
tech_data = tech_data[tech_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=200))]

  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 

In [14]:
grid = {
    'iterations': [100, 500],
    'depth': [4, 6],
}

## Big Basket

In [4]:
big_basket_X_train, big_basket_X_test, big_basket_y_train, big_basket_y_test = train_test_split(
    big_basket_data['description'],
    big_basket_data[[column for column in big_basket_data.columns if column != 'description']],
    test_size=0.2,
    random_state=13
)

In [5]:
big_basket_X_train.shape, big_basket_X_test.shape

((21156,), (5289,))

### TF-IDF

In [6]:
big_basket_X_train_tfidf, big_basket_X_test_tfidf = tfidf_vectorize(
    train_data=big_basket_X_train,
    test_data=big_basket_X_test,
)

In [None]:
model_tfidf = cb.CatBoostClassifier(
    loss_function='MultiLogloss',
    allow_const_label=True,
    class_names=big_basket_y_train.columns.values,
    random_seed=13,
)

big_basket_tfidf_pool = cb.Pool(big_basket_X_train_tfidf, big_basket_y_train)

model_tfidf = model_tfidf.grid_search(
    grid,
    big_basket_tfidf_pool,
    cv=3,
    verbose=True,
)

model_tfidf.fit(big_basket_tfidf_pool, verbose=50)

0:	learn: 0.6262737	test: 0.6262855	best: 0.6262855 (0)	total: 3.63s	remaining: 5m 59s
1:	learn: 0.5666747	test: 0.5667050	best: 0.5667050 (1)	total: 7.13s	remaining: 5m 49s
2:	learn: 0.5137871	test: 0.5138311	best: 0.5138311 (2)	total: 10.7s	remaining: 5m 44s
3:	learn: 0.4654789	test: 0.4655167	best: 0.4655167 (3)	total: 14.2s	remaining: 5m 40s
4:	learn: 0.4242027	test: 0.4242568	best: 0.4242568 (4)	total: 17.7s	remaining: 5m 36s
5:	learn: 0.3875503	test: 0.3876219	best: 0.3876219 (5)	total: 21.3s	remaining: 5m 33s
6:	learn: 0.3553496	test: 0.3554219	best: 0.3554219 (6)	total: 24.8s	remaining: 5m 29s
7:	learn: 0.3267192	test: 0.3268088	best: 0.3268088 (7)	total: 28.3s	remaining: 5m 25s
8:	learn: 0.3014947	test: 0.3016151	best: 0.3016151 (8)	total: 31.9s	remaining: 5m 22s
9:	learn: 0.2777233	test: 0.2778453	best: 0.2778453 (9)	total: 35.4s	remaining: 5m 18s
10:	learn: 0.2576310	test: 0.2577339	best: 0.2577339 (10)	total: 39s	remaining: 5m 15s
11:	learn: 0.2390418	test: 0.2391748	best: 

In [6]:
# model = cb.CatBoostClassifier(
#     iterations=100,
#     loss_function='MultiLogloss',
#     class_names=big_basket_y_train.columns.values,
#     allow_const_label=True,
#     random_seed=13,
# )

Learning rate set to 0.318232
0:	learn: 0.2331716	total: 11.4s	remaining: 18m 52s
50:	learn: 0.0357299	total: 9m 56s	remaining: 9m 32s
99:	learn: 0.0270354	total: 19m 25s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f3e8650bd30>

In [8]:
big_basket_y_pred_tfidf = model.predict(big_basket_X_test_tfidf)

In [24]:
print(f"Accuracy: {accuracy_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf)}")
print(f"Precision (macro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf, average='macro')}")
print(f"Precision (micro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf, average='micro')}")
print(f"Recall (macro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf, average='macro')}")
print(f"Recall (micro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf, average='micro')}")
print(f"Hamming loss: {hamming_loss(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf)}")

Accuracy: 0.4165451895043732
Precision (macro): 0.7520990845943957
Precision (micro): 0.8988735919899875
Recall (macro): 0.34214281747008646
Recall (micro): 0.5756191392161577
Hamming loss: 0.01069291040592061


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
model_default = cb.CatBoostClassifier(
    iterations=100,
    loss_function='MultiLogloss',
    class_names=big_basket_y_train.columns.values,
    allow_const_label=True,
    random_seed=13,
)

big_basket_train_pool = cb.Pool(pd.DataFrame(big_basket_X_train.values, columns=['description']), big_basket_y_train.to_numpy(), text_features=['description'], feature_names=['description'])

model_default.fit(big_basket_train_pool, verbose=50)

Learning rate set to 0.318232
0:	learn: 0.4466780	total: 6.18s	remaining: 10m 11s
50:	learn: 0.0282392	total: 6m 5s	remaining: 5m 51s
99:	learn: 0.0229303	total: 11m 31s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f3e5947d540>

In [27]:
big_basket_y_pred_default = model_default.predict(pd.DataFrame(big_basket_X_test.values, columns=['description']))

In [29]:
print(f"Accuracy: {accuracy_score(big_basket_y_test.to_numpy(), big_basket_y_pred_default)}")
print(f"Precision (macro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_default, average='macro')}")
print(f"Precision (micro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_default, average='micro')}")
print(f"Recall (macro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_default, average='macro')}")
print(f"Recall (micro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_default, average='micro')}")
print(f"Hamming loss: {hamming_loss(big_basket_y_test.to_numpy(), big_basket_y_pred_default)}")

Accuracy: 0.6065962099125365
Precision (macro): 0.7169155947747387
Precision (micro): 0.8803348050036792
Recall (macro): 0.5097136742134956
Recall (micro): 0.7670914482648072
Hamming loss: 0.007370977237048665


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Word2Vec

In [33]:
wv = api.load('word2vec-google-news-300')

In [35]:
def to_w2v_embedding(sentence):
    embeddings = []

    for word in sentence.split():
        if word in wv:
            embeddings.append(wv[word])
    embeddings = np.array(embeddings)
    return np.mean(embeddings, axis=0)

In [36]:
big_basket_X_train_w2v = big_basket_X_train.apply(to_w2v_embedding)
big_basket_X_test_w2v = big_basket_X_test.apply(to_w2v_embedding)

In [None]:
model_w2v = cb.CatBoostClassifier()

In [None]:
big_basket_w2v_pool = cb.Pool(np.array([x for x in big_basket_X_train_w2v]), big_basket_y_train)

model_w2v = cb.grid_search(
    grid,
    big_basket_w2v_pool,
    cv=3,
    verbose=True,
)

In [42]:
model_w2v = cb.CatBoostClassifier(
    iterations=100,
    loss_function='MultiLogloss',
    class_names=big_basket_y_train.columns.values,
    allow_const_label=True,
    random_seed=13,
)

model_w2v.fit(big_basket_w2v_pool, verbose=10)

Learning rate set to 0.318232
0:	learn: 0.2148468	total: 2.42s	remaining: 3m 59s
10:	learn: 0.0420730	total: 27.7s	remaining: 3m 43s
20:	learn: 0.0309566	total: 52.1s	remaining: 3m 16s
30:	learn: 0.0249542	total: 1m 16s	remaining: 2m 50s
40:	learn: 0.0207562	total: 1m 40s	remaining: 2m 24s
50:	learn: 0.0177137	total: 2m 4s	remaining: 1m 59s
60:	learn: 0.0153369	total: 2m 29s	remaining: 1m 35s
70:	learn: 0.0136097	total: 2m 53s	remaining: 1m 10s
80:	learn: 0.0120163	total: 3m 16s	remaining: 46.2s
90:	learn: 0.0107490	total: 3m 40s	remaining: 21.9s
99:	learn: 0.0097063	total: 4m 2s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f3d2eec73a0>

In [43]:
big_basket_y_pred_w2v = model_w2v.predict(np.array([x for x in big_basket_X_test_w2v]))

In [44]:
print(f"Accuracy: {accuracy_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v)}")
print(f"Precision (macro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v, average='macro')}")
print(f"Precision (micro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v, average='micro')}")
print(f"Recall (macro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v, average='macro')}")
print(f"Recall (micro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v, average='micro')}")
print(f"Hamming loss: {hamming_loss(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v)}")

Accuracy: 0.5134839650145773
Precision (macro): 0.7595887235355991
Precision (micro): 0.8722147372784744
Recall (macro): 0.4784695681525681
Recall (micro): 0.6745211188587
Hamming loss: 0.009275482170890335


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### BERT embeddings

In [6]:
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)

In [7]:
big_basket_X_train_emb = sbert_model.encode(big_basket_X_train.values.tolist())
big_basket_X_test_emb = sbert_model.encode(big_basket_X_test.values.tolist())

In [8]:
big_basket_X_train_emb.shape, big_basket_X_test_emb.shape

((21156, 384), (5289, 384))

In [9]:
model_emb = cb.CatBoostClassifier(
    iterations=100,
    loss_function='MultiLogloss',
    class_names=big_basket_y_train.columns.values,
    allow_const_label=True,
    random_seed=13,
)

In [10]:
big_basket_emb_pool = cb.Pool(big_basket_X_train_emb, big_basket_y_train)

In [12]:
model_emb.fit(big_basket_emb_pool, verbose=10)

Learning rate set to 0.313252
0:	learn: 0.2270946	total: 3.1s	remaining: 5m 7s
10:	learn: 0.0446137	total: 34.1s	remaining: 4m 35s
20:	learn: 0.0324338	total: 1m 4s	remaining: 4m 2s
30:	learn: 0.0257146	total: 1m 35s	remaining: 3m 32s
40:	learn: 0.0214635	total: 2m 6s	remaining: 3m 2s
50:	learn: 0.0183347	total: 2m 37s	remaining: 2m 30s
60:	learn: 0.0162096	total: 3m 7s	remaining: 1m 59s
70:	learn: 0.0143579	total: 3m 37s	remaining: 1m 28s
80:	learn: 0.0128956	total: 4m 6s	remaining: 57.9s
90:	learn: 0.0117018	total: 4m 36s	remaining: 27.3s
99:	learn: 0.0107482	total: 5m 2s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f1bc01cba60>

In [13]:
big_basket_y_pred_emb = model_emb.predict(big_basket_X_test_emb)

In [14]:
print(f"Accuracy: {accuracy_score(big_basket_y_test.to_numpy(), big_basket_y_pred_emb)}")
print(f"Precision (macro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_emb, average='macro')}")
print(f"Precision (micro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_emb, average='micro')}")
print(f"Recall (macro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_emb, average='macro')}")
print(f"Recall (micro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_emb, average='micro')}")
print(f"Hamming loss: {hamming_loss(big_basket_y_test.to_numpy(), big_basket_y_pred_emb)}")

Accuracy: 0.476460578559274
Precision (macro): 0.7453591312205193
Precision (micro): 0.8659898477157361
Recall (macro): 0.459348057321541
Recall (micro): 0.6393237841439041
Hamming loss: 0.010033523859388863


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Retail

## Luxury

## Tech