In [40]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from skmultilearn.adapt import MLkNN
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, hamming_loss
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import gensim.downloader as api

from preprocessing.utils import is_sentence_in_boundaries
from datasets_utils import get_luxury_data, get_tech_data, get_retail_data, get_big_basket_data
from preprocess import preprocess, with_category_features

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [41]:
device

'cuda'

In [42]:
# luxury_data = get_luxury_data()
# tech_data = get_tech_data()
# retail_data = get_retail_data()
big_basket_data = get_big_basket_data()

datasets = [big_basket_data]
dataset_names = ['Big basket']
# datasets = [big_basket_data, retail_data, luxury_data, tech_data]
# dataset_names = ['Big basket', 'Retail', 'Luxury', 'Tech']

# Get datasets with description column preprocessed
# tech_data['description'] = tech_data['description'].apply(preprocess)
# luxury_data['description'] = luxury_data['description'].apply(preprocess)
# retail_data['description'] = retail_data['description'].apply(preprocess)
big_basket_data['description'] = big_basket_data['description'].apply(preprocess)

# Preprocess categories
# tech_data = with_category_features(tech_data)
# luxury_data = with_category_features(luxury_data)
# retail_data = with_category_features(retail_data)
big_basket_data = with_category_features(big_basket_data)

big_basket_data = big_basket_data[big_basket_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=200))]
# retail_data = retail_data[retail_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=250))]
# luxury_data = luxury_data[luxury_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=100))]
# tech_data = tech_data[tech_data['description'].apply(lambda x: is_sentence_in_boundaries(x, max_tokens=200))]

  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)
  data[category] = data['category'].apply(lambda x: 1 if category in x else 0)


In [25]:
big_basket_X_train, big_basket_X_test, big_basket_y_train, big_basket_y_test = train_test_split(
    big_basket_data['description'],
    big_basket_data[[column for column in big_basket_data.columns if column != 'description']],
    test_size=0.2,
    random_state=13,
)

### TF-IDF

In [5]:
def tfidf_vectorize(train_data, test_data):
    tfidf = TfidfVectorizer()
    tfidf_train_data = tfidf.fit_transform(train_data)
    tfidf_test_data = tfidf.transform(test_data)
    return tfidf_train_data, tfidf_test_data

In [6]:
big_basket_X_train_tfidf, big_basket_X_test_tfidf = tfidf_vectorize(
    train_data=big_basket_X_train,
    test_data=big_basket_X_test,
)

In [7]:
ml_knn_tfidf_grid_cv = GridSearchCV(
    MLkNN(),
    param_grid={
        'k': range(1,3),
        's': [0.5, 0.7, 1.0]
    },
    scoring={
        'accuracy': make_scorer(accuracy_score),
        'micro_precision': make_scorer(precision_score, average='micro'),
        'macro_precision': make_scorer(precision_score, average='macro'),
        'micro_recall': make_scorer(recall_score, average='micro'),
        'macro_recall': make_scorer(recall_score, average='macro'),
        'hamming_loss': make_scorer(hamming_loss),
    },
    refit='hamming_loss',
    verbose=3,
)

ml_knn_tfidf_grid_cv.fit(big_basket_X_train_tfidf, big_basket_y_train.to_numpy())

Fitting 5 folds for each of 6 candidates, totalling 30 fits


KeyboardInterrupt: 

In [None]:
ml_knn_tfidf_grid_cv.best_params_

In [8]:
ml_knn_tfidf_best = MLkNN(**ml_knn_tfidf_grid_cv.best_params_)
ml_knn_tfidf_best.fit(big_basket_X_train_tfidf, big_basket_y_train.to_numpy())

In [9]:
big_basket_y_pred_tfidf = ml_knn_tfidf_best.predict(big_basket_X_test_tfidf)

In [10]:
print(f"Accuracy: {accuracy_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf)}")
print(f"Precision (macro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf, average='macro')}")
print(f"Precision (micro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf, average='micro')}")
print(f"Recall (macro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf, average='macro')}")
print(f"Recall (micro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf, average='micro')}")
print(f"Hamming loss: {hamming_loss(big_basket_y_test.to_numpy(), big_basket_y_pred_tfidf)}")

Accuracy: 0.7816222348269994
Precision (macro): 0.7376699073085848
Precision (micro): 0.8514760914760915
Recall (macro): 0.7288412123297471
Recall (micro): 0.8526815456362425
Hamming loss: 0.006462978314935207


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Word2Vec

In [None]:
api.info()['models'].keys()

In [None]:
wv = api.load('word2vec-google-news-300')

In [None]:
def to_w2v_embedding(sentence):
    embeddings = []

    for word in sentence.split():
        if word in wv:
            embeddings.append(wv[word])
    embeddings = np.array(embeddings)
    return np.mean(embeddings, axis=0)

In [None]:
big_basket_X_train_w2v = big_basket_X_train.apply(to_w2v_embedding)
big_basket_X_test_w2v = big_basket_X_test.apply(to_w2v_embedding)

In [None]:
ml_knn_w2v_grid_cv = GridSearchCV(
    MLkNN(),
    param_grid={
        'k': range(1,3),
        's': [0.5, 0.7, 1.0]
    },
    scoring={
        'accuracy': make_scorer(accuracy_score),
        'micro_precision': make_scorer(precision_score, average='micro'),
        'macro_precision': make_scorer(precision_score, average='macro'),
        'micro_recall': make_scorer(recall_score, average='micro'),
        'macro_recall': make_scorer(recall_score, average='macro'),
        'hamming_loss': make_scorer(hamming_loss),
    },
    refit='hamming_loss',
    verbose=3,
)

ml_knn_w2v_grid_cv.fit(np.array([x for x in big_basket_X_train_w2v]), big_basket_y_train.to_numpy())

In [None]:
ml_knn_w2v_grid_cv.best_params_

In [None]:
ml_knn_w2v_best = MLkNN(**ml_knn_w2v_grid_cv.best_params_)
ml_knn_w2v_best.fit(np.array([x for x in big_basket_X_train_w2v]), big_basket_y_train.to_numpy())

In [None]:
big_basket_y_pred_w2v = ml_knn_w2v_best.predict(np.array([x for x in big_basket_X_test_w2v]))

In [None]:
print(f"Accuracy: {accuracy_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v)}")
print(f"Precision (macro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v, average='macro')}")
print(f"Precision (micro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v, average='micro')}")
print(f"Recall (macro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v, average='macro')}")
print(f"Recall (micro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v, average='micro')}")
print(f"Hamming loss: {hamming_loss(big_basket_y_test.to_numpy(), big_basket_y_pred_w2v)}")

### BERT embeddings

In [28]:
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True).to(device)

In [6]:
class MLDataset(torch.utils.data.Dataset):
    def __init__(self, df, max_len, tokenizer, target_cols):
        super().__init__()
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.target_cols = target_cols


    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.df['description'][index]
        tokenized_text = self.tokenizer.tokenize("[CLS] " + text + " [SEP]")
        tokens_count = len(tokenized_text)
        tokenized_text = tokenized_text + ([''] * (self.max_len - len(tokenized_text)))
        segments_ids = ([1] * tokens_count) + ([0] * (len(tokenized_text) - tokens_count))
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        
        tokens_tensor = torch.tensor(indexed_tokens)
        segments_tensors = torch.tensor(segments_ids)

        return { 'token_ids': tokens_tensor, 'segment_ids': segments_tensors }


big_basket_bert_train_dataset = MLDataset(
    pd.concat([
        pd.DataFrame(big_basket_X_train.values, columns=['description']).reset_index().drop(['index'], axis=1),
        big_basket_y_train.reset_index().drop(['index'], axis=1),
    ], axis=1),
    200,
    bert_tokenizer,
    big_basket_y_train.columns.values
)
big_basket_bert_test_dataset = MLDataset(
    pd.concat([
        pd.DataFrame(big_basket_X_test.values, columns=['description']).reset_index().drop(['index'], axis=1),
        big_basket_y_test.reset_index().drop(['index'], axis=1),
    ], axis=1),
    200,
    bert_tokenizer,
    big_basket_y_train.columns.values
)

big_basket_bert_train_loader = torch.utils.data.DataLoader(
    big_basket_bert_train_dataset,
    batch_size=1,
    # num_workers=4,
    shuffle=True,
    # pin_memory=True
)
big_basket_bert_test_loader = torch.utils.data.DataLoader(
    big_basket_bert_test_dataset,
    batch_size=1,
    # num_workers=4,
    shuffle=False,
    # pin_memory=True
)

In [7]:
bert_model.eval()

# bert_train_embeddings = []
# bert_test_embeddings = []
bert_train_embeddings = torch.zeros((768)).unsqueeze(0).to(device)
bert_test_embeddings = torch.zeros((768)).unsqueeze(0).to(device)

with torch.no_grad():
    for batch_idx, data in enumerate(big_basket_bert_train_loader):
        if ((batch_idx + 1) % 500) == 0:
            print(f"Batch: {batch_idx + 1}")

        # if batch_idx <= (21000 - 1):
        #     continue

        output = bert_model(data['token_ids'].to(device), data['segment_ids'].to(device))
        sentence_vector = torch.squeeze(torch.stack(output[2], dim=0), dim=1).permute(1, 0, 2)
        sentence_vector = torch.sum(sentence_vector[:,-4:], dim=1)
        sentence_vector = torch.mean(sentence_vector, dim=0).unsqueeze(0)
    #     stacked_hs = torch.stack(output[2], dim=0)
    #     token_embeddings = torch.squeeze(stacked_hs, dim=1)
    #     token_embeddings = token_embeddings.permute(1,0,2)
    #     vec = torch.sum(token_embeddings[:,-4:], dim=1)
    #     # sentence_vector = vec[:200].unsqueeze(0).to(device)
    #     sentence_vector = vec[:200].detach().cpu()
    #     # sentence_vector = output[0][0].mean(axis=0).unsqueeze(0).to(device)
        bert_train_embeddings = torch.cat((bert_train_embeddings, sentence_vector), axis=0).to(device)
    #     bert_train_embeddings.append(sentence_vector)

        # if batch_idx == (21000 - 1):
        #     torch.save(bert_train_embeddings[1:], f"big_basket_emb_train_{bert_train_embeddings[1:].shape[0]}.pt")
        #     break


    for batch_idx, data in enumerate(big_basket_bert_test_loader):
        if ((batch_idx + 1) % 500) == 0:
            print(f"Batch: {batch_idx + 1}")

        # if batch_idx <= (5000 - 1):
        #     continue

        output = bert_model(data['token_ids'].to(device), data['segment_ids'].to(device))
        sentence_vector = torch.squeeze(torch.stack(output[2], dim=0), dim=1).permute(1, 0, 2)
        sentence_vector = torch.sum(sentence_vector[:,-4:], dim=1)
        sentence_vector = torch.mean(sentence_vector, dim=0).unsqueeze(0)
        # stacked_hs = torch.stack(output[2], dim=0)
        # token_embeddings = torch.squeeze(stacked_hs, dim=1)
        # token_embeddings = token_embeddings.permute(1,0,2)
        # vec = torch.sum(token_embeddings[:,-4:], dim=1)
        # sentence_vector = vec[:200].unsqueeze(0).to(device)
        # sentence_vector = output[0][0].mean(axis=0).unsqueeze(0).to(device)
        # sentence_vector = vec[:200].detach().cpu()
        # bert_test_embeddings.append(sentence_vector)
        bert_test_embeddings = torch.cat((bert_test_embeddings, sentence_vector), axis=0).to(device)

        # if batch_idx == (5000 - 1):
        #     torch.save(bert_test_embeddings[1:], f"big_basket_emb_test_{bert_test_embeddings[1:].shape[0]}.pt")
        #     break

Batch: 500
Batch: 1000
Batch: 1500
Batch: 2000
Batch: 2500
Batch: 3000
Batch: 3500
Batch: 4000
Batch: 4500
Batch: 5000
Batch: 5500
Batch: 6000
Batch: 6500
Batch: 7000
Batch: 7500
Batch: 8000
Batch: 8500
Batch: 9000
Batch: 9500
Batch: 10000
Batch: 10500
Batch: 11000
Batch: 11500
Batch: 12000
Batch: 12500
Batch: 13000
Batch: 13500
Batch: 14000
Batch: 14500
Batch: 15000
Batch: 15500
Batch: 16000
Batch: 16500
Batch: 17000
Batch: 17500
Batch: 18000
Batch: 18500
Batch: 19000
Batch: 19500
Batch: 20000
Batch: 20500
Batch: 21000
Batch: 500
Batch: 1000
Batch: 1500
Batch: 2000
Batch: 2500
Batch: 3000
Batch: 3500
Batch: 4000
Batch: 4500
Batch: 5000


In [13]:
# bert_train_embeddings[1:].shape

In [9]:
# torch.save(bert_train_embeddings[1:], f"big_basket_lhs_4_sum_200_emb_train_{bert_train_embeddings[1:].shape[0]}.pt")

In [10]:
# torch.save(bert_test_embeddings[1:], f"big_basket_lhs_4_sum_200_emb_test_{bert_test_embeddings[1:].shape[0]}.pt")

In [9]:
# for i in range(1, 7):
#     opt = torch.load(f"./big_basket_pt/big_basket_emb_test_1000_{i}.pt", map_location='cpu')
#     opt = torch.mean(opt, dim=1)
#     torch.save(opt, f"./big_basket_pt/big_basket_opt_emb_test_1000_{i}.pt")

In [139]:
ml_knn_emb_grid_cv = GridSearchCV(
    MLkNN(),
    param_grid={
        'k': range(1,3),
        's': [0.5, 0.7, 1.0]
    },
    scoring={
        'accuracy': make_scorer(accuracy_score),
        'micro_precision': make_scorer(precision_score, average='micro'),
        'macro_precision': make_scorer(precision_score, average='macro'),
        'micro_recall': make_scorer(recall_score, average='micro'),
        'macro_recall': make_scorer(recall_score, average='macro'),
        'hamming_loss': make_scorer(hamming_loss),
    },
    refit='hamming_loss',
    verbose=3,
)

ml_knn_emb_grid_cv.fit(np.array(test)[:,1:-1], big_basket_y_train.to_numpy())

Fitting 5 folds for each of 6 candidates, totalling 30 fits


KeyboardInterrupt: 

In [9]:
ml_knn_emb_grid_cv.best_params_

{'k': 1, 's': 0.5}

In [14]:
# X_train_finally = np.concatenate([
#     torch.load(f"./big_basket_pt/big_basket_opt_emb_train_1000_{i}.pt", map_location='cpu').numpy()
#     for i in range(1, 23)
# ], axis=0)

X_train_finally = torch.load('./big_basket_lhs_4_sum_200_emb_train_21156.pt', map_location='cpu').numpy()
X_test_finally = torch.load('./big_basket_lhs_4_sum_200_emb_test_5289.pt', map_location='cpu').numpy()

In [34]:
X_train_finally = sbert_model.encode(big_basket_X_train.values.tolist())
X_test_finally = sbert_model.encode(big_basket_X_test.values.tolist())

In [35]:
X_train_finally.shape, X_test_finally.shape

((21156, 384), (5289, 384))

In [36]:
ml_knn_emb_best = MLkNN(k=1, s=0.5)

In [37]:
ml_knn_emb_best.fit(
    X_train_finally,
    big_basket_y_train.to_numpy()
)

In [25]:
# X_test_finally = np.concatenate([
#     torch.load(f"./big_basket_pt/big_basket_opt_emb_test_1000_{i}.pt", map_location='cpu').numpy()
#     for i in range(1, 7)
# ], axis=0)

In [38]:
big_basket_y_pred_emb = ml_knn_emb_best.predict(X_test_finally)

In [10]:
        # ml_knn_emb_best = MLkNN(**ml_knn_emb_grid_cv.best_params_)

# ml_knn_emb_best.fit(np.array(test)[:,1:-1], big_basket_y_train.to_numpy())

for i in range(1, 23):
    ml_knn_emb_best = MLkNN(k=1, s=0.5)

    X = torch.load(f"./big_basket_pt/big_basket_emb_train_1000_{i}.pt", map_location='cpu', mmap=True)

    ml_knn_emb_best.fit(
        X.reshape(X.shape[0], -1),
        big_basket_y_train[(i - 1) * X.shape[0]:i * X.shape[0]].to_numpy()
    )



In [11]:
# big_basket_y_pred_emb = ml_knn_emb_best.predict(np.array(test_2)[:,1:-1])
big_basket_y_pred_emb = ml_knn_emb_best.predict(bert_test_embeddings.cpu().detach())

In [32]:
np.asarray(big_basket_y_pred_emb.todense())[6]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [33]:
big_basket_y_test.to_numpy()[6]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [18]:
big_basket_y_test = big_basket_y_test[:X_test_finally.shape[0]]

In [15]:
big_basket_y_pred_emb.shape

(1000, 104)

In [39]:
print(f"Accuracy: {accuracy_score(big_basket_y_test.to_numpy(), big_basket_y_pred_emb)}")
print(f"Precision (macro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_emb, average='macro')}")
print(f"Precision (micro): {precision_score(big_basket_y_test.to_numpy(), big_basket_y_pred_emb, average='micro')}")
print(f"Recall (macro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_emb, average='macro')}")
print(f"Recall (micro): {recall_score(big_basket_y_test.to_numpy(), big_basket_y_pred_emb, average='micro')}")
print(f"Hamming loss: {hamming_loss(big_basket_y_test.to_numpy(), big_basket_y_pred_emb)}")

Accuracy: 0.7653620722253734
Precision (macro): 0.7195929893540998
Precision (micro): 0.8395833333333333
Recall (macro): 0.7125900343493276
Recall (micro): 0.8390239840106596
Hamming loss: 0.007013831318992975


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
