In [1]:
import torch
import pandas as pd
import catboost as cb
from sentence_transformers import SentenceTransformer
import numpy as np
import gensim.downloader as api
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.ensemble import RakelO
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, hamming_loss
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from preprocessing.utils import is_sentence_in_boundaries
from datasets_utils import get_luxury_data, get_tech_data, get_retail_data, get_big_basket_data
from preprocess import preprocess, with_category_features
from preprocess import preprocess, with_category_features
from utils import (
    tfidf_vectorize,
    w2v_vectorize,
    display_metrics,
    accuracy_ml_score,
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

[nltk_data] Downloading package stopwords to /home/stepan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/stepan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/stepan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
device

'cuda'

In [3]:
big_basket_data = get_big_basket_data()
luxury_data = get_luxury_data()
tech_data = get_tech_data()
retail_data = get_retail_data()

# datasets = [big_basket_data]
# dataset_names = ['Big basket']
datasets = [big_basket_data, retail_data, luxury_data, tech_data]
dataset_names = ['Big basket', 'Retail', 'Luxury', 'Tech']

# Get datasets with description column preprocessed
big_basket_data['description'] = big_basket_data['description'].apply(preprocess)
tech_data['description'] = tech_data['description'].apply(preprocess)
luxury_data['description'] = luxury_data['description'].apply(preprocess)
retail_data['description'] = retail_data['description'].apply(preprocess)

# Preprocess categories
big_basket_data = with_category_features(big_basket_data)
tech_data = with_category_features(tech_data)
luxury_data = with_category_features(luxury_data)
retail_data = with_category_features(retail_data)

big_basket_data = big_basket_data[big_basket_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=200))]
retail_data = retail_data[retail_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=250))]
luxury_data = luxury_data[luxury_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=100))]
tech_data = tech_data[tech_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=200))]

  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 

## Big Basket

In [4]:
big_basket_X_train, big_basket_X_test, big_basket_y_train, big_basket_y_test = train_test_split(
    big_basket_data['description'],
    big_basket_data[[column for column in big_basket_data.columns if column != 'description']],
    test_size=0.2,
    random_state=13
)

In [5]:
big_basket_X_train.shape, big_basket_X_test.shape

((21156,), (5289,))

### TF-IDF

In [8]:
big_basket_X_train_tfidf, big_basket_X_test_tfidf = tfidf_vectorize(
    train_data=big_basket_X_train,
    test_data=big_basket_X_test,
)

In [33]:
big_basket_tfidf_model = GridSearchCV(
    RakelO(
        base_classifier=MultinomialNB(alpha=0.7),
        base_classifier_require_dense=[True, True],
        labelset_size=big_basket_y_train.to_numpy().shape[1],
        model_count=6,
    ),
    param_grid=[
        {
            'base_classifier': [MultinomialNB()],
            'base_classifier__alpha': [0.7],
        },
        {
            'base_classifier': [DecisionTreeClassifier()],
            'base_classifier__criterion': ['log_loss'],
        },
    ],
)

big_basket_tfidf_model.fit(big_basket_X_train_tfidf, big_basket_y_train.to_numpy())

In [35]:
big_basket_tfidf_model.best_params_, big_basket_tfidf_model.best_score_

({'base_classifier': DecisionTreeClassifier(),
  'base_classifier__criterion': 'log_loss'},
 0.5784179601545707)

In [9]:
big_basket_tfidf_best_model = RakelO(
    base_classifier=DecisionTreeClassifier(criterion='log_loss'),
    base_classifier_require_dense=[True, True],
    labelset_size=big_basket_y_train.to_numpy().shape[1],
    model_count=6,
)

big_basket_tfidf_best_model.fit(big_basket_X_train_tfidf, big_basket_y_train.to_numpy())

In [10]:
big_basket_y_pred_tfidf = big_basket_tfidf_best_model.predict(big_basket_X_test_tfidf)

In [13]:
display_metrics(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf)

Accuracy (subset): 0.598789941387786
Accuracy (ML): 0.9871794871794871
Precision (macro): 0.5813395382934666
Precision (micro): 0.7145454545454546
Recall (macro): 0.5143403141253783
Recall (micro): 0.6872918054630246
Hamming loss: 0.01282051282051282


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Word2Vec

In [14]:
big_basket_X_train_w2v = w2v_vectorize(big_basket_X_train)
big_basket_X_test_w2v = w2v_vectorize(big_basket_X_test)

In [42]:
big_basket_w2v_model = GridSearchCV(
    RakelO(
        base_classifier=MultinomialNB(alpha=0.7),
        base_classifier_require_dense=[True, True],
        labelset_size=big_basket_y_train.to_numpy().shape[1],
        model_count=6,
    ),
    param_grid=[
        {
            'base_classifier': [MultinomialNB()],
            'base_classifier__alpha': [0.7],
        },
        {
            'base_classifier': [DecisionTreeClassifier()],
            'base_classifier__criterion': ['log_loss'],
        },
    ],
)

big_basket_w2v_model.fit(big_basket_X_train_w2v, big_basket_y_train.to_numpy())

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/stepan/HSEPythonCourse/thesis/hse_thesis_final/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/stepan/HSEPythonCourse/thesis/hse_thesis_final/lib/python3.10/site-packages/skmultilearn/ensemble/rakelo.py", line 121, in fit
    return self.classifier.fit(X, y)
  File "/home/stepan/HSEPythonCourse/thesis/hse_thesis_final/lib/python3.10/site-packages/skmultilearn/problem_transform/br.py", line 161, in fit
    classifier.fit(self._ensure_input_format(
  File "/

In [43]:
big_basket_w2v_model.best_params_, big_basket_w2v_model.best_score_

({'base_classifier': DecisionTreeClassifier(),
  'base_classifier__criterion': 'log_loss'},
 0.4820854401239568)

In [16]:
big_basket_w2v_best_model = RakelO(
    base_classifier=DecisionTreeClassifier(criterion='log_loss'),
    base_classifier_require_dense=[True, True],
    labelset_size=big_basket_y_train.to_numpy().shape[1],
    model_count=6,
)

big_basket_w2v_best_model.fit(np.array([x for x in big_basket_X_train_w2v]), big_basket_y_train.to_numpy())

In [18]:
big_basket_y_pred_w2v = big_basket_w2v_best_model.predict(np.array([x for x in big_basket_X_test_w2v]))

In [20]:
display_metrics(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v)

Accuracy (subset): 0.497447532614861
Accuracy (ML): 0.985908707477057
Precision (macro): 0.5567118935981195
Precision (micro): 0.7037427012539486
Recall (macro): 0.4384078727457376
Recall (micro): 0.6122584943371085
Hamming loss: 0.014091292522943118


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### BERT embeddings

In [21]:
big_basket_sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)

In [22]:
big_basket_X_train_emb = big_basket_sbert_model.encode(big_basket_X_train.values.tolist())
big_basket_X_test_emb = big_basket_sbert_model.encode(big_basket_X_test.values.tolist())

In [23]:
big_basket_X_train_emb.shape, big_basket_X_test_emb.shape

((21156, 384), (5289, 384))

In [49]:
big_basket_emb_model = GridSearchCV(
    RakelO(
        base_classifier=MultinomialNB(alpha=0.7),
        base_classifier_require_dense=[True, True],
        labelset_size=big_basket_y_train.to_numpy().shape[1],
        model_count=6,
    ),
    param_grid=[
        {
            'base_classifier': [MultinomialNB()],
            'base_classifier__alpha': [0.7],
        },
        {
            'base_classifier': [DecisionTreeClassifier()],
            'base_classifier__criterion': ['log_loss'],
        },
    ],
)

big_basket_emb_model.fit(big_basket_X_train_emb, big_basket_y_train.to_numpy())

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/stepan/HSEPythonCourse/thesis/hse_thesis_final/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/stepan/HSEPythonCourse/thesis/hse_thesis_final/lib/python3.10/site-packages/skmultilearn/ensemble/rakelo.py", line 121, in fit
    return self.classifier.fit(X, y)
  File "/home/stepan/HSEPythonCourse/thesis/hse_thesis_final/lib/python3.10/site-packages/skmultilearn/problem_transform/br.py", line 161, in fit
    classifier.fit(self._ensure_input_format(
  File "/

In [50]:
big_basket_emb_model.best_params_, big_basket_emb_model.best_score_

({'base_classifier': DecisionTreeClassifier(),
  'base_classifier__criterion': 'log_loss'},
 0.4463510393847911)

In [24]:
big_basket_emb_best_model = RakelO(
    base_classifier=DecisionTreeClassifier(criterion='log_loss'),
    base_classifier_require_dense=[True, True],
    labelset_size=big_basket_y_train.to_numpy().shape[1],
    model_count=6,
)

big_basket_emb_best_model.fit(big_basket_X_train_emb, big_basket_y_train.to_numpy())

In [25]:
big_basket_y_pred_emb = big_basket_emb_best_model.predict(big_basket_X_test_emb)

In [27]:
display_metrics(big_basket_y_test.to_numpy(), big_basket_y_pred_emb)

Accuracy (subset): 0.4639818491208168
Accuracy (ML): 0.9848851753276029
Precision (macro): 0.5464503809467235
Precision (micro): 0.6848478783026422
Recall (macro): 0.39600513375125074
Recall (micro): 0.5698700866089274
Hamming loss: 0.015114824672396993


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Retail

In [30]:
retail_X_train, retail_X_test, retail_y_train, retail_y_test = train_test_split(
    retail_data['description'],
    retail_data[[column for column in retail_data.columns if column != 'description']],
    test_size=0.2,
    random_state=13
)

### TF-IDF

In [31]:
retail_X_train_tfidf, retail_X_test_tfidf = tfidf_vectorize(
    train_data=retail_X_train,
    test_data=retail_X_test,
)

retail_tfidf_best_model = RakelO(
    base_classifier=DecisionTreeClassifier(criterion='log_loss'),
    base_classifier_require_dense=[True, True],
    labelset_size=retail_y_train.to_numpy().shape[1],
    model_count=6,
)

retail_tfidf_best_model.fit(retail_X_train_tfidf, retail_y_train.to_numpy())

retail_y_pred_tfidf = retail_tfidf_best_model.predict(retail_X_test_tfidf)

display_metrics(retail_y_test.to_numpy(), retail_y_pred_tfidf)

Accuracy (subset): 0.6240215924426451
Accuracy (ML): 0.9263744645895674
Precision (macro): 0.6361174174732991
Precision (micro): 0.6360479041916167
Recall (macro): 0.6211240809655628
Recall (micro): 0.6214603323192137
Hamming loss: 0.07362553541043243


### Word2Vec

In [32]:
retail_X_train_w2v = w2v_vectorize(retail_X_train)
retail_X_test_w2v = w2v_vectorize(retail_X_test)

retail_w2v_best_model = RakelO(
    base_classifier=DecisionTreeClassifier(criterion='log_loss'),
    base_classifier_require_dense=[True, True],
    labelset_size=retail_y_train.to_numpy().shape[1],
    model_count=6,
)

retail_w2v_best_model.fit(np.array([x for x in retail_X_train_w2v]), retail_y_train.to_numpy())

retail_y_pred_w2v = retail_w2v_best_model.predict(np.array([x for x in retail_X_test_w2v]))

display_metrics(retail_y_test.to_numpy(), retail_y_pred_w2v)

Accuracy (subset): 0.5600539811066126
Accuracy (ML): 0.9195329460775686
Precision (macro): 0.606971085338987
Precision (micro): 0.6077305778798316
Recall (macro): 0.5576893496454014
Recall (micro): 0.5574537795459864
Hamming loss: 0.0804670539224315


### BERT Embeddings

In [33]:
retail_sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)

retail_X_train_emb = retail_sbert_model.encode(retail_X_train.values.tolist())
retail_X_test_emb = retail_sbert_model.encode(retail_X_test.values.tolist())

retail_emb_best_model = RakelO(
    base_classifier=DecisionTreeClassifier(criterion='log_loss'),
    base_classifier_require_dense=[True, True],
    labelset_size=retail_y_train.to_numpy().shape[1],
    model_count=6,
)

retail_emb_best_model.fit(retail_X_train_emb, retail_y_train.to_numpy())

retail_y_pred_emb = retail_emb_best_model.predict(retail_X_test_emb)

display_metrics(retail_y_test.to_numpy(), retail_y_pred_emb)

Accuracy (subset): 0.5246963562753036
Accuracy (ML): 0.9131960335621664
Precision (macro): 0.5719989188254623
Precision (micro): 0.5734748689090676
Recall (macro): 0.5237189310070938
Recall (micro): 0.5246899134097823
Hamming loss: 0.08680396643783371


## Luxury

In [36]:
luxury_X_train, luxury_X_test, luxury_y_train, luxury_y_test = train_test_split(
    luxury_data['description'],
    luxury_data[[column for column in luxury_data.columns if column != 'description']],
    test_size=0.2,
    random_state=13
)

### TF-IDF

In [37]:
luxury_X_train_tfidf, luxury_X_test_tfidf = tfidf_vectorize(
    train_data=luxury_X_train,
    test_data=luxury_X_test,
)

luxury_tfidf_best_model = RakelO(
    base_classifier=DecisionTreeClassifier(criterion='log_loss'),
    base_classifier_require_dense=[True, True],
    labelset_size=luxury_y_train.to_numpy().shape[1],
    model_count=6,
)

luxury_tfidf_best_model.fit(luxury_X_train_tfidf, luxury_y_train.to_numpy())

luxury_y_pred_tfidf = luxury_tfidf_best_model.predict(luxury_X_test_tfidf)

display_metrics(luxury_y_test.to_numpy(), luxury_y_pred_tfidf)

Accuracy (subset): 0.7444561774023232
Accuracy (ML): 0.9848794625806017
Precision (macro): 0.7053314392027467
Precision (micro): 0.8247480403135499
Recall (macro): 0.6781776732461658
Recall (micro): 0.8036006546644845
Hamming loss: 0.015120537419398323


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Word2Vec

In [38]:
luxury_X_train_w2v = w2v_vectorize(luxury_X_train)
luxury_X_test_w2v = w2v_vectorize(luxury_X_test)

luxury_w2v_best_model = RakelO(
    base_classifier=DecisionTreeClassifier(criterion='log_loss'),
    base_classifier_require_dense=[True, True],
    labelset_size=luxury_y_train.to_numpy().shape[1],
    model_count=6,
)

luxury_w2v_best_model.fit(np.array([x for x in luxury_X_train_w2v]), luxury_y_train.to_numpy())

luxury_y_pred_w2v = luxury_w2v_best_model.predict(np.array([x for x in luxury_X_test_w2v]))

display_metrics(luxury_y_test.to_numpy(), luxury_y_pred_w2v)

Accuracy (subset): 0.4424498416050686
Accuracy (ML): 0.9686130894875195
Precision (macro): 0.5260292243564947
Precision (micro): 0.6453333333333333
Recall (macro): 0.38276432356001566
Recall (micro): 0.5280960174577196
Hamming loss: 0.031386910512480624


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### BERT Embeddings

In [39]:
luxury_sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)

luxury_X_train_emb = luxury_sbert_model.encode(luxury_X_train.values.tolist())
luxury_X_test_emb = luxury_sbert_model.encode(luxury_X_test.values.tolist())

luxury_emb_best_model = RakelO(
    base_classifier=DecisionTreeClassifier(criterion='log_loss'),
    base_classifier_require_dense=[True, True],
    labelset_size=luxury_y_train.to_numpy().shape[1],
    model_count=6,
)

luxury_emb_best_model.fit(luxury_X_train_emb, luxury_y_train.to_numpy())

luxury_y_pred_emb = luxury_emb_best_model.predict(luxury_X_test_emb)

display_metrics(luxury_y_test.to_numpy(), luxury_y_pred_emb)

Accuracy (subset): 0.40865892291446676
Accuracy (ML): 0.9663438855063021
Precision (macro): 0.4693622430488629
Precision (micro): 0.6124916051040967
Recall (macro): 0.36869767547484106
Recall (micro): 0.49754500818330605
Hamming loss: 0.0336561144936979


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Tech

In [4]:
tech_X_train, tech_X_test, tech_y_train, tech_y_test = train_test_split(
    tech_data['description'],
    tech_data[[column for column in tech_data.columns if column != 'description']],
    test_size=0.2,
    random_state=13
)

### TF-IDF

In [46]:
tech_X_train_tfidf, tech_X_test_tfidf = tfidf_vectorize(
    train_data=tech_X_train,
    test_data=tech_X_test,
)

tech_tfidf_best_model = RakelO(
    base_classifier=DecisionTreeClassifier(criterion='log_loss'),
    base_classifier_require_dense=[True, True],
    labelset_size=tech_y_train.to_numpy().shape[1],
    model_count=6,
)

tech_tfidf_best_model.fit(tech_X_train_tfidf, tech_y_train.to_numpy())

tech_y_pred_tfidf = tech_tfidf_best_model.predict(tech_X_test_tfidf)

display_metrics(tech_y_test.to_numpy(), tech_y_pred_tfidf)

Accuracy (subset): 0.8891582194799471
Accuracy (ML): 0.9990646784975856
Precision (macro): 0.8805824161317456


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision (micro): 0.9366864856178692
Recall (macro): 0.8513876420426756
Recall (micro): 0.8975977825685247
Hamming loss: 0.00093532150241426


### Word2Vec

In [47]:
tech_X_train_w2v = w2v_vectorize(tech_X_train)
tech_X_test_w2v = w2v_vectorize(tech_X_test)

tech_w2v_best_model = RakelO(
    base_classifier=DecisionTreeClassifier(criterion='log_loss'),
    base_classifier_require_dense=[True, True],
    labelset_size=tech_y_train.to_numpy().shape[1],
    model_count=6,
)

tech_w2v_best_model.fit(np.array([x for x in tech_X_train_w2v]), tech_y_train.to_numpy())

tech_y_pred_w2v = tech_w2v_best_model.predict(np.array([x for x in tech_X_test_w2v]))

display_metrics(tech_y_test.to_numpy(), tech_y_pred_w2v)

Accuracy (subset): 0.8884971353018951
Accuracy (ML): 0.9993667369997818
Precision (macro): 0.9228556768932341
Precision (micro): 0.993170565135735


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall (macro): 0.8503068599451714
Recall (micro): 0.8957499230058515
Hamming loss: 0.0006332630002181534


### BERT Embeddings

In [None]:
tech_sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)

tech_X_train_emb = tech_sbert_model.encode(tech_X_train.values.tolist())
tech_X_test_emb = tech_sbert_model.encode(tech_X_test.values.tolist())

tech_emb_best_model = RakelO(
    base_classifier=DecisionTreeClassifier(criterion='log_loss'),
    base_classifier_require_dense=[True, True],
    labelset_size=tech_y_train.to_numpy().shape[1],
    model_count=6,
)

tech_emb_best_model.fit(tech_X_train_emb, tech_y_train.to_numpy())

tech_y_pred_emb = tech_emb_best_model.predict(tech_X_test_emb)

display_metrics(tech_y_test.to_numpy(), tech_y_pred_emb)