In [1]:
import torch
import pandas as pd
import catboost as cb
from sentence_transformers import SentenceTransformer
import numpy as np
import gensim.downloader as api
from skmultilearn.problem_transform import ClassifierChain
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, hamming_loss
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from preprocessing.utils import is_sentence_in_boundaries
from datasets_utils import get_luxury_data, get_tech_data, get_retail_data, get_big_basket_data
from preprocess import preprocess, with_category_features
from utils import (
    tfidf_vectorize,
    w2v_vectorize,
    display_metrics,
    accuracy_ml_score,
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

[nltk_data] Downloading package stopwords to /home/stepan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/stepan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/stepan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
device

'cuda'

In [3]:
big_basket_data = get_big_basket_data()
luxury_data = get_luxury_data()
tech_data = get_tech_data()
retail_data = get_retail_data()

# datasets = [big_basket_data]
# dataset_names = ['Big basket']
datasets = [big_basket_data, retail_data, luxury_data, tech_data]
dataset_names = ['Big basket', 'Retail', 'Luxury', 'Tech']

# Get datasets with description column preprocessed
big_basket_data['description'] = big_basket_data['description'].apply(preprocess)
tech_data['description'] = tech_data['description'].apply(preprocess)
luxury_data['description'] = luxury_data['description'].apply(preprocess)
retail_data['description'] = retail_data['description'].apply(preprocess)

# Preprocess categories
big_basket_data = with_category_features(big_basket_data)
tech_data = with_category_features(tech_data)
luxury_data = with_category_features(luxury_data)
retail_data = with_category_features(retail_data)

big_basket_data = big_basket_data[big_basket_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=200))]
retail_data = retail_data[retail_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=250))]
luxury_data = luxury_data[luxury_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=100))]
tech_data = tech_data[tech_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=200))]

  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 

## Big Basket

In [4]:
big_basket_X_train, big_basket_X_test, big_basket_y_train, big_basket_y_test = train_test_split(
    big_basket_data['description'],
    big_basket_data[[column for column in big_basket_data.columns if column != 'description']],
    test_size=0.2,
    random_state=13
)

In [5]:
big_basket_X_train.shape, big_basket_X_test.shape

((21156,), (5289,))

### TF-IDF

In [6]:
big_basket_X_train_tfidf, big_basket_X_test_tfidf = tfidf_vectorize(
    train_data=big_basket_X_train,
    test_data=big_basket_X_test,
)

In [11]:
big_basket_tfidf_model = GridSearchCV(
    ClassifierChain(classifier=DecisionTreeClassifier(criterion='log_loss')),
    param_grid=[
        {
            'classifier': [MultinomialNB()],
            'classifier__alpha': [0.7],
        },
        {
            'classifier': [DecisionTreeClassifier()],
            'classifier__criterion': ['log_loss'],
        },
    ],
)

big_basket_tfidf_model.fit(big_basket_X_train_tfidf, big_basket_y_train.to_numpy())

In [7]:
big_basket_tfidf_model = ClassifierChain(
    classifier=DecisionTreeClassifier(criterion='log_loss'),
)

big_basket_tfidf_model.fit(big_basket_X_train_tfidf, big_basket_y_train.to_numpy())

In [12]:
big_basket_tfidf_model.best_params_, big_basket_tfidf_model.best_score_

({'classifier': DecisionTreeClassifier(), 'classifier__criterion': 'log_loss'},
 0.619729925712593)

In [8]:
big_basket_y_pred_tfidf = big_basket_tfidf_model.predict(big_basket_X_test_tfidf)

In [9]:
display_metrics(big_basket_y_test.to_numpy(), np.array(big_basket_y_pred_tfidf.todense()))

Accuracy (subset): 0.6337681981470977
Accuracy (ML): 0.989693776633652
Precision (macro): 0.6556083658642693
Precision (micro): 0.7616178291374329
Recall (macro): 0.6395108905566944


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall (micro): 0.768404397068621
Hamming loss: 0.010306223366348153


### Word2Vec

In [13]:
big_basket_X_train_w2v = w2v_vectorize(big_basket_X_train)
big_basket_X_test_w2v = w2v_vectorize(big_basket_X_test)

In [19]:
big_basket_w2v_model = GridSearchCV(
    ClassifierChain(classifier=DecisionTreeClassifier(criterion='log_loss')),
    param_grid=[
        {
            'classifier': [MultinomialNB()],
            'classifier__alpha': [0.7],
        },
        {
            'classifier': [DecisionTreeClassifier()],
            'classifier__criterion': ['log_loss'],
        },
    ],
    n_jobs=8,
)

big_basket_w2v_model.fit(big_basket_X_train_w2v, big_basket_y_train.to_numpy())

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/stepan/HSEPythonCourse/thesis/hse_thesis_final/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/stepan/HSEPythonCourse/thesis/hse_thesis_final/lib/python3.10/site-packages/skmultilearn/problem_transform/cc.py", line 154, in fit
    self.classifiers_[label] = self.classifier.fit(self._ensure_input_format(
  File "/home/stepan/HSEPythonCourse/thesis/hse_thesis_final/lib/python3.10/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(est

In [20]:
big_basket_w2v_model.best_params_, big_basket_w2v_model.best_score_

({'classifier': DecisionTreeClassifier(), 'classifier__criterion': 'log_loss'},
 0.46265871577996415)

In [19]:
big_basket_w2v_model = ClassifierChain(
    classifier=DecisionTreeClassifier(criterion='log_loss'),
)

big_basket_w2v_model.fit(np.array([x for x in big_basket_X_train_w2v]), big_basket_y_train.to_numpy())

In [21]:
big_basket_y_pred_w2v = big_basket_w2v_model.predict(np.array([x for x in big_basket_X_test_w2v]))

In [22]:
display_metrics(big_basket_y_test.to_numpy(), np.array(big_basket_y_pred_w2v.todense()))

Accuracy (subset): 0.4758933635847986
Accuracy (ML): 0.9835234957895196
Precision (macro): 0.45482268410855237
Precision (micro): 0.6179793285794407
Recall (macro): 0.4749428744371764


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall (micro): 0.6423217854763491
Hamming loss: 0.016476504210480386


### BERT embeddings

In [25]:
big_basket_sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)

In [26]:
big_basket_X_train_emb = big_basket_sbert_model.encode(big_basket_X_train.values.tolist())
big_basket_X_test_emb = big_basket_sbert_model.encode(big_basket_X_test.values.tolist())

In [27]:
big_basket_X_train_emb.shape, big_basket_X_test_emb.shape

((21156, 384), (5289, 384))

In [36]:
big_basket_emb_model = GridSearchCV(
    ClassifierChain(classifier=DecisionTreeClassifier(criterion='log_loss')),
    param_grid=[
        {
            'classifier': [MultinomialNB()],
            'classifier__alpha': [0.7],
        },
        {
            'classifier': [DecisionTreeClassifier()],
            'classifier__criterion': ['log_loss'],
        },
    ],
    n_jobs=8,
)

big_basket_emb_model.fit(big_basket_X_train_emb, big_basket_y_train.to_numpy())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [28]:
big_basket_emb_model = ClassifierChain(
    classifier=DecisionTreeClassifier(criterion='log_loss'),
)

big_basket_emb_model.fit(big_basket_X_train_emb, big_basket_y_train.to_numpy())

In [29]:
big_basket_y_pred_emb = big_basket_emb_model.predict(big_basket_X_test_emb)

In [30]:
display_metrics(big_basket_y_test.to_numpy(), np.array(big_basket_y_pred_emb.todense()))

Accuracy (subset): 0.44583096993760635
Accuracy (ML): 0.9819945605538346
Precision (macro): 0.43733991249492
Precision (micro): 0.5819825436408977
Recall (macro): 0.4535442029948647
Recall (micro): 0.6219187208527648
Hamming loss: 0.01800543944616548


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Retail

In [4]:
retail_X_train, retail_X_test, retail_y_train, retail_y_test = train_test_split(
    retail_data['description'],
    retail_data[[column for column in retail_data.columns if column != 'description']],
    test_size=0.2,
    random_state=13
)

### TF-IDF

In [4]:
retail_X_train_tfidf, retail_X_test_tfidf = tfidf_vectorize(
    train_data=retail_X_train,
    test_data=retail_X_test,
)

retail_tfidf_model = ClassifierChain(
    classifier=DecisionTreeClassifier(criterion='log_loss'),
)

retail_tfidf_model.fit(retail_X_train_tfidf, retail_y_train.to_numpy())

retail_y_pred_tfidf = retail_tfidf_model.predict(retail_X_test_tfidf)

display_metrics(retail_y_test.to_numpy(), np.array(retail_y_pred_tfidf.todense()))

Accuracy (subset): 0.6723346828609986
Accuracy (ML): 0.9371472158657513
Precision (macro): 0.6950372602354618
Precision (micro): 0.684393063583815
Recall (macro): 0.6911678123015544
Recall (micro): 0.6927217411654575
Hamming loss: 0.06285278413424866


### Word2Vec

In [5]:
retail_X_train_w2v = w2v_vectorize(retail_X_train)
retail_X_test_w2v = w2v_vectorize(retail_X_test)

retail_w2v_model = ClassifierChain(
    classifier=DecisionTreeClassifier(criterion='log_loss'),
)

retail_w2v_model.fit(np.array([x for x in retail_X_train_w2v]), retail_y_train.to_numpy())

retail_y_pred_w2v = retail_w2v_model.predict(np.array([x for x in retail_X_test_w2v]))

display_metrics(retail_y_test.to_numpy(), np.array(retail_y_pred_w2v.todense()))

Accuracy (subset): 0.5479082321187584
Accuracy (ML): 0.9110954644135422
Precision (macro): 0.5705298040709857
Precision (micro): 0.5521169138190415
Recall (macro): 0.6026049699109812
Recall (micro): 0.6012169435993447
Hamming loss: 0.08890453558645778


### BERT Embeddings

In [5]:
retail_sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)

retail_X_train_emb = retail_sbert_model.encode(retail_X_train.values.tolist())
retail_X_test_emb = retail_sbert_model.encode(retail_X_test.values.tolist())

retail_emb_model = ClassifierChain(
    classifier=DecisionTreeClassifier(criterion='log_loss'),
)

retail_emb_model.fit(retail_X_train_emb, retail_y_train.to_numpy())

retail_y_pred_emb = retail_emb_model.predict(retail_X_test_emb)

display_metrics(retail_y_test.to_numpy(), np.array(retail_y_pred_emb.todense()))

Accuracy (subset): 0.5292847503373819
Accuracy (ML): 0.9072581118347707
Precision (macro): 0.5456093994288039
Precision (micro): 0.5348811977866985
Recall (macro): 0.577693782310587
Recall (micro): 0.5768780716124503
Hamming loss: 0.09274188816522913


## Luxury

In [6]:
luxury_X_train, luxury_X_test, luxury_y_train, luxury_y_test = train_test_split(
    luxury_data['description'],
    luxury_data[[column for column in luxury_data.columns if column != 'description']],
    test_size=0.2,
    random_state=13
)

### TF-IDF

In [7]:
luxury_X_train_tfidf, luxury_X_test_tfidf = tfidf_vectorize(
    train_data=luxury_X_train,
    test_data=luxury_X_test,
)

luxury_tfidf_model = ClassifierChain(
    classifier=DecisionTreeClassifier(criterion='log_loss'),
)

luxury_tfidf_model.fit(luxury_X_train_tfidf, luxury_y_train.to_numpy())

luxury_y_pred_tfidf = luxury_tfidf_model.predict(luxury_X_test_tfidf)

display_metrics(luxury_y_test.to_numpy(), np.array(luxury_y_pred_tfidf.todense()))

Accuracy (subset): 0.7243928194297783
Accuracy (ML): 0.9879125570109415
Precision (macro): 0.7757753598458432
Precision (micro): 0.8547945205479452
Recall (macro): 0.7802783956375158
Recall (micro): 0.851063829787234
Hamming loss: 0.012087442989058393


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Word2Vec

In [8]:
luxury_X_train_w2v = w2v_vectorize(luxury_X_train)
luxury_X_test_w2v = w2v_vectorize(luxury_X_test)

luxury_w2v_model = ClassifierChain(
    classifier=DecisionTreeClassifier(criterion='log_loss'),
)

luxury_w2v_model.fit(np.array([x for x in luxury_X_train_w2v]), luxury_y_train.to_numpy())

luxury_y_pred_w2v = luxury_w2v_model.predict(np.array([x for x in luxury_X_test_w2v]))

display_metrics(luxury_y_test.to_numpy(), np.array(luxury_y_pred_w2v.todense()))

Accuracy (subset): 0.3738120380147835
Accuracy (ML): 0.9638724752297286
Precision (macro): 0.43833249213533154
Precision (micro): 0.5605815831987075
Recall (macro): 0.4547841145816238
Recall (micro): 0.5679214402618658
Hamming loss: 0.03612752477027118


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### BERT Embeddings

In [9]:
luxury_sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)

luxury_X_train_emb = luxury_sbert_model.encode(luxury_X_train.values.tolist())
luxury_X_test_emb = luxury_sbert_model.encode(luxury_X_test.values.tolist())

luxury_emb_model = ClassifierChain(
    classifier=DecisionTreeClassifier(criterion='log_loss'),
)

luxury_emb_model.fit(luxury_X_train_emb, luxury_y_train.to_numpy())

luxury_y_pred_emb = luxury_emb_model.predict(luxury_X_test_emb)

display_metrics(luxury_y_test.to_numpy(), np.array(luxury_y_pred_emb.todense()))

Accuracy (subset): 0.3706441393875396
Accuracy (ML): 0.962726639556045
Precision (macro): 0.43058084288554965
Precision (micro): 0.5475409836065573
Recall (macro): 0.4189528190184782
Recall (micro): 0.546644844517185
Hamming loss: 0.037273360443955156


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Tech

In [10]:
tech_X_train, tech_X_test, tech_y_train, tech_y_test = train_test_split(
    tech_data['description'],
    tech_data[[column for column in tech_data.columns if column != 'description']],
    test_size=0.2,
    random_state=13
)

### TF-IDF

In [11]:
tech_X_train_tfidf, tech_X_test_tfidf = tfidf_vectorize(
    train_data=tech_X_train,
    test_data=tech_X_test,
)

tech_tfidf_model = ClassifierChain(
    classifier=DecisionTreeClassifier(criterion='log_loss'),
)

tech_tfidf_model.fit(tech_X_train_tfidf, tech_y_train.to_numpy())

tech_y_pred_tfidf = tech_tfidf_model.predict(tech_X_test_tfidf)

display_metrics(tech_y_test.to_numpy(), np.array(tech_y_pred_tfidf.todense()))

Accuracy (subset): 0.8913618334067871
Accuracy (ML): 0.9991194376412587


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision (macro): 0.8751858191848155
Precision (micro): 0.9248068006182381


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall (macro): 0.8696999252314739
Recall (micro): 0.9213889744379428
Hamming loss: 0.0008805623587412816


### Word2Vec

In [12]:
tech_X_train_w2v = w2v_vectorize(tech_X_train)
tech_X_test_w2v = w2v_vectorize(tech_X_test)

tech_w2v_model = ClassifierChain(
    classifier=DecisionTreeClassifier(criterion='log_loss'),
)

tech_w2v_model.fit(np.array([x for x in tech_X_train_w2v]), tech_y_train.to_numpy())

tech_y_pred_w2v = tech_w2v_model.predict(np.array([x for x in tech_X_test_w2v]))

display_metrics(tech_y_test.to_numpy(), np.array(tech_y_pred_w2v.todense()))

Accuracy (subset): 0.8884971353018951
Accuracy (ML): 0.9987069776397218


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision (macro): 0.8191524466191092
Precision (micro): 0.8734224201930215


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall (macro): 0.8538772066767027
Recall (micro): 0.9058361564521097
Hamming loss: 0.0012930223602780704


### BERT Embeddings

In [13]:
tech_sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)

tech_X_train_emb = tech_sbert_model.encode(tech_X_train.values.tolist())
tech_X_test_emb = tech_sbert_model.encode(tech_X_test.values.tolist())

tech_emb_model = ClassifierChain(
    classifier=DecisionTreeClassifier(criterion='log_loss'),
)

tech_emb_model.fit(tech_X_train_emb, tech_y_train.to_numpy())

tech_y_pred_emb = tech_emb_model.predict(tech_X_test_emb)

display_metrics(tech_y_test.to_numpy(), np.array(tech_y_pred_emb.todense()))

Accuracy (subset): 0.8889378580872631
Accuracy (ML): 0.9987529046634477


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision (macro): 0.8144362520560097
Precision (micro): 0.8802753666566896


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall (macro): 0.8544029577460736
Recall (micro): 0.9057591623036649
Hamming loss: 0.0012470953365523467
