In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from skmultilearn.adapt import MLkNN
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, hamming_loss
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import gensim.downloader as api

from preprocessing.utils import is_sentence_in_boundaries
from datasets_utils import get_luxury_data, get_tech_data, get_retail_data, get_big_basket_data
from preprocess import preprocess, with_category_features

device = 'cuda' if torch.cuda.is_available() else 'cpu'

[nltk_data] Downloading package stopwords to /home/stepan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/stepan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/stepan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
device

'cuda'

In [3]:
luxury_data = get_luxury_data()
tech_data = get_tech_data()
retail_data = get_retail_data()
big_basket_data = get_big_basket_data()

datasets = [big_basket_data, retail_data, luxury_data, tech_data]
dataset_names = ['Big basket', 'Retail', 'Luxury', 'Tech']

# Get datasets with description column preprocessed
tech_data['description'] = tech_data['description'].apply(preprocess)
luxury_data['description'] = luxury_data['description'].apply(preprocess)
retail_data['description'] = retail_data['description'].apply(preprocess)
big_basket_data['description'] = big_basket_data['description'].apply(preprocess)

# Preprocess categories
tech_data = with_category_features(tech_data)
luxury_data = with_category_features(luxury_data)
retail_data = with_category_features(retail_data)
big_basket_data = with_category_features(big_basket_data)

big_basket_data = big_basket_data[big_basket_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=200))]
retail_data = retail_data[retail_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=250))]
luxury_data = luxury_data[luxury_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=100))]
tech_data = tech_data[tech_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=200))]

  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 

In [4]:
big_basket_X_train, big_basket_X_test, big_basket_y_train, big_basket_y_test = train_test_split(
    big_basket_data['description'],
    big_basket_data[[column for column in big_basket_data.columns if column != 'description']],
    test_size=0.2,
    random_state=13,
)

### TF-IDF

In [5]:
def tfidf_vectorize(train_data, test_data):
    tfidf = TfidfVectorizer()
    tfidf_train_data = tfidf.fit_transform(train_data)
    tfidf_test_data = tfidf.transform(test_data)
    return tfidf_train_data, tfidf_test_data

In [6]:
big_basket_X_train_tfidf, big_basket_X_test_tfidf = tfidf_vectorize(
    train_data=big_basket_X_train,
    test_data=big_basket_X_test,
)

In [7]:
ml_knn_tfidf_grid_cv = GridSearchCV(
    MLkNN(),
    param_grid={
        'k': range(1,3),
        's': [0.5, 0.7, 1.0]
    },
    scoring={
        'accuracy': make_scorer(accuracy_score),
        'micro_precision': make_scorer(precision_score, average='micro'),
        'macro_precision': make_scorer(precision_score, average='macro'),
        'micro_recall': make_scorer(recall_score, average='micro'),
        'macro_recall': make_scorer(recall_score, average='macro'),
        'hamming_loss': make_scorer(hamming_loss),
    },
    refit='hamming_loss',
    verbose=3,
)

ml_knn_tfidf_grid_cv.fit(big_basket_X_train_tfidf, big_basket_y_train.to_numpy())

Fitting 5 folds for each of 6 candidates, totalling 30 fits


KeyboardInterrupt: 

In [None]:
ml_knn_tfidf_grid_cv.best_params_

In [8]:
ml_knn_tfidf_best = MLkNN(**ml_knn_tfidf_grid_cv.best_params_)
ml_knn_tfidf_best.fit(big_basket_X_train_tfidf, big_basket_y_train.to_numpy())

In [9]:
big_basket_y_pred_tfidf = ml_knn_tfidf_best.predict(big_basket_X_test_tfidf)

In [10]:
print(f"Accuracy: {accuracy_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf)}")
print(f"Precision (macro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf, average='macro')}")
print(f"Precision (micro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf, average='micro')}")
print(f"Recall (macro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf, average='macro')}")
print(f"Recall (micro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf, average='micro')}")
print(f"Hamming loss: {hamming_loss(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf)}")

Accuracy: 0.7816222348269994
Precision (macro): 0.7376699073085848
Precision (micro): 0.8514760914760915
Recall (macro): 0.7288412123297471
Recall (micro): 0.8526815456362425
Hamming loss: 0.006462978314935207


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Word2Vec

In [None]:
api.info()['models'].keys()

In [None]:
wv = api.load('word2vec-google-news-300')

In [None]:
def to_w2v_embedding(sentence):
    embeddings = []

    for word in sentence.split():
        if word in wv:
            embeddings.append(wv[word])
    embeddings = np.array(embeddings)
    return np.mean(embeddings, axis=0)

In [None]:
big_basket_X_train_w2v = big_basket_X_train.apply(to_w2v_embedding)
big_basket_X_test_w2v = big_basket_X_test.apply(to_w2v_embedding)

In [None]:
ml_knn_w2v_grid_cv = GridSearchCV(
    MLkNN(),
    param_grid={
        'k': range(1,3),
        's': [0.5, 0.7, 1.0]
    },
    scoring={
        'accuracy': make_scorer(accuracy_score),
        'micro_precision': make_scorer(precision_score, average='micro'),
        'macro_precision': make_scorer(precision_score, average='macro'),
        'micro_recall': make_scorer(recall_score, average='micro'),
        'macro_recall': make_scorer(recall_score, average='macro'),
        'hamming_loss': make_scorer(hamming_loss),
    },
    refit='hamming_loss',
    verbose=3,
)

ml_knn_w2v_grid_cv.fit(np.array([x for x in big_basket_X_train_w2v]), big_basket_y_train.to_numpy())

In [None]:
ml_knn_w2v_grid_cv.best_params_

In [None]:
ml_knn_w2v_best = MLkNN(**ml_knn_w2v_grid_cv.best_params_)
ml_knn_w2v_best.fit(np.array([x for x in big_basket_X_train_w2v]), big_basket_y_train.to_numpy())

In [None]:
big_basket_y_pred_w2v = ml_knn_w2v_best.predict(np.array([x for x in big_basket_X_test_w2v]))

In [None]:
print(f"Accuracy: {accuracy_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v)}")
print(f"Precision (macro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v, average='macro')}")
print(f"Precision (micro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v, average='micro')}")
print(f"Recall (macro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v, average='macro')}")
print(f"Recall (micro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v, average='micro')}")
print(f"Hamming loss: {hamming_loss(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v)}")

### BERT embeddings

In [5]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True).to(device)

In [6]:
class MLDataset(torch.utils.data.Dataset):
    def __init__(self, df, max_len, tokenizer, target_cols):
        super().__init__()
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.target_cols = target_cols


    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.df['description'][index]
        tokenized_text = self.tokenizer.tokenize(text)
        tokens_count = len(tokenized_text)
        tokenized_text = tokenized_text + ([''] * (self.max_len - len(tokenized_text)))
        segments_ids = ([1] * tokens_count) + ([0] * (len(tokenized_text) - tokens_count))
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        
        tokens_tensor = torch.tensor(indexed_tokens).to(device)
        segments_tensors = torch.tensor(segments_ids).to(device)

        return { 'token_ids': tokens_tensor, 'segment_ids': segments_tensors }


big_basket_bert_train_dataset = MLDataset(
    pd.concat([
        pd.DataFrame(big_basket_X_train.values, columns=['description']).reset_index().drop(['index'], axis=1),
        big_basket_y_train.reset_index().drop(['index'], axis=1),
    ], axis=1),
    512,
    bert_tokenizer,
    big_basket_y_train.columns.values
)
big_basket_bert_test_dataset = MLDataset(
    pd.concat([
        pd.DataFrame(big_basket_X_test.values, columns=['description']).reset_index().drop(['index'], axis=1),
        big_basket_y_test.reset_index().drop(['index'], axis=1),
    ], axis=1),
    512,
    bert_tokenizer,
    big_basket_y_train.columns.values
)

big_basket_bert_train_loader = torch.utils.data.DataLoader(
    big_basket_bert_train_dataset,
    batch_size=4,
    # num_workers=4,
    shuffle=True,
    # pin_memory=True
)
big_basket_bert_test_loader = torch.utils.data.DataLoader(
    big_basket_bert_test_dataset,
    batch_size=4,
    # num_workers=4,
    shuffle=False,
    # pin_memory=True
)

In [7]:
# test = next(iter(big_basket_bert_train_loader))

# test

In [8]:
# test_output = bert_model(test['token_ids'], test['segment_ids'])

# test['token_ids'].shape

In [9]:
# test_output.hidden_states[11].mean(axis=0).shape

In [10]:
# torch.stack(test_output.hidden_states, dim=0).shape

In [11]:
# token_embeddings = torch.stack(test_output.hidden_states, dim=0)

In [12]:
# token_embeddings.shape

In [13]:
# new_token_embeddings = torch.squeeze(token_embeddings, dim=1)

In [14]:
# new_token_embeddings.shape

In [15]:
# new_token_embeddings = new_token_embeddings.permute(1, 2, 0, 3)

In [16]:
# new_token_embeddings.shape

In [17]:
# torch.zeros((200, 768)).unsqueeze(0).shape

In [18]:
# torch.cat((torch.randn(200, 768).unsqueeze(0), torch.randn(200, 768).unsqueeze(0))).shape

In [None]:
bert_model.eval()

bert_train_embeddings = torch.zeros((512, 768)).unsqueeze(0).to(device)
bert_test_embeddings = torch.zeros((512, 768)).unsqueeze(0).to(device)

with torch.no_grad():
    for batch_idx, data in enumerate(big_basket_bert_train_loader):
        if ((batch_idx + 1) % 300) == 0:
            print(f"Batch: {batch_idx + 1} | {bert_train_embeddings.shape}")

        output = bert_model(data['token_ids'], data['segment_ids'])
        sentence_vector = output.hidden_states[11].mean(axis=0).unsqueeze(0).to(device)
        bert_train_embeddings = torch.cat((bert_train_embeddings, sentence_vector), axis=0).to(device)


    for batch_idx, data in enumerate(big_basket_bert_test_loader):
        if ((batch_idx + 1) % 100) == 0:
            print(f"Batch: {batch_idx + 1}")

        output = bert_model(data['token_ids'], data['segment_ids'])
        sentence_vector = output.hidden_states[11].mean(axis=0).unsqueeze(0).to(device)
        bert_test_embeddings = torch.cat((bert_test_embeddings, sentence_vector), axis=0).to(device)

In [13]:
# token_embeddings = torch.stack(output[2], dim=0)
# torch.squeeze(token_embeddings, dim=1).shape

In [9]:
torch.mean(torch.rand((8, 200, 768)), axis=1).shape

torch.Size([8, 768])

In [98]:
torch.cat(
    (
    torch.rand((510, 768)),
    torch.rand((510, 768))
    ),
    axis=0
).shape

torch.Size([1020, 768])

In [68]:
big_basket_y_train.to_numpy().shape

(21156, 104)

In [130]:
test = [big_basket_bert_train_dataset[i].cpu().detach().numpy().ravel() for i in range(len(big_basket_bert_train_dataset))]

In [141]:
test_2 = [big_basket_bert_test_dataset[i].cpu().detach().numpy().ravel() for i in range(len(big_basket_bert_test_dataset))]

In [138]:
np.array(test)[:,1:-1].shape

(21156, 510)

In [139]:
ml_knn_emb_grid_cv = GridSearchCV(
    MLkNN(),
    param_grid={
        'k': range(1,3),
        's': [0.5, 0.7, 1.0]
    },
    scoring={
        'accuracy': make_scorer(accuracy_score),
        'micro_precision': make_scorer(precision_score, average='micro'),
        'macro_precision': make_scorer(precision_score, average='macro'),
        'micro_recall': make_scorer(recall_score, average='micro'),
        'macro_recall': make_scorer(recall_score, average='macro'),
        'hamming_loss': make_scorer(hamming_loss),
    },
    refit='hamming_loss',
    verbose=3,
)

ml_knn_emb_grid_cv.fit(np.array(test)[:,1:-1], big_basket_y_train.to_numpy())

Fitting 5 folds for each of 6 candidates, totalling 30 fits


KeyboardInterrupt: 

In [9]:
ml_knn_emb_grid_cv.best_params_

{'k': 1, 's': 0.5}

In [10]:
        # ml_knn_emb_best = MLkNN(**ml_knn_emb_grid_cv.best_params_)
ml_knn_emb_best = MLkNN(k=1, s=0.5)
# ml_knn_emb_best.fit(np.array(test)[:,1:-1], big_basket_y_train.to_numpy())
ml_knn_emb_best.fit(bert_train_embeddings.cpu().detach(), big_basket_y_train.to_numpy())

In [11]:
# big_basket_y_pred_emb = ml_knn_emb_best.predict(np.array(test_2)[:,1:-1])
big_basket_y_pred_emb = ml_knn_emb_best.predict(bert_test_embeddings.cpu().detach())

In [12]:
np.asarray(big_basket_y_pred_emb.todense())[3]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [13]:
big_basket_y_test.to_numpy()[3]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [17]:
print(f"Accuracy: {accuracy_score(big_basket_y_test.to_numpy(), np.asarray(big_basket_y_pred_emb.todense()))}")
print(f"Precision (macro): {precision_score(big_basket_y_test.to_numpy(), np.asarray(big_basket_y_pred_emb.todense()), average='macro')}")
print(f"Precision (micro): {precision_score(big_basket_y_test.to_numpy(), np.asarray(big_basket_y_pred_emb.todense()), average='micro')}")
print(f"Recall (macro): {recall_score(big_basket_y_test.to_numpy(), np.asarray(big_basket_y_pred_emb.todense()), average='macro')}")
print(f"Recall (micro): {recall_score(big_basket_y_test.to_numpy(), np.asarray(big_basket_y_pred_emb.todense()), average='micro')}")
print(f"Hamming loss: {hamming_loss(big_basket_y_test.to_numpy(), np.asarray(big_basket_y_pred_emb.todense()))}")

Accuracy: 0.02836074872376631
Precision (macro): 0.024138113687213546
Precision (micro): 0.09752827817343947
Recall (macro): 0.02431161118197804
Recall (micro): 0.09693537641572285
Hamming loss: 0.039295998952833894


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
