In [143]:
import re
import torch
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, BertTokenizer, BertModel, AutoModel, AutoTokenizer
from preprocess import preprocess
lemmatizer = WordNetLemmatizer()
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, hamming_loss

STOPWORDS = set(stopwords.words('english'))

device = 'cuda' if torch.cuda.is_available() else 'cpu'

[nltk_data] Downloading package stopwords to /home/stepan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/stepan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/stepan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
data = pd.read_csv('./datasets/BigBasket Products.csv')

data.head(3)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."


In [3]:
data.shape

(27555, 10)

In [4]:
data['description'] = data['description'].apply(preprocess)

In [5]:
unique_categories = [splitted.strip() for category in data['category'].unique()
                     for splitted in category.split(',')]

unique_categories

['Beauty & Hygiene',
 'Kitchen',
 'Garden & Pets',
 'Cleaning & Household',
 'Gourmet & World Food',
 'Foodgrains',
 'Oil & Masala',
 'Snacks & Branded Foods',
 'Beverages',
 'Bakery',
 'Cakes & Dairy',
 'Baby Care',
 'Fruits & Vegetables',
 'Eggs',
 'Meat & Fish']

In [6]:
categories_w_ratio = list()

for category in unique_categories:
    total = data[data['category'].str.contains(category)]['category'].count() / data['category'].count()
    categories_w_ratio.append([category, total])
    print(f"{category}: {total}")

categories_w_ratio = sorted(categories_w_ratio, key=lambda x: x[1], reverse=True)

categories_w_ratio

Beauty & Hygiene: 0.2855017238250771
Kitchen: 0.12992197423335147
Garden & Pets: 0.12992197423335147
Cleaning & Household: 0.09707857013246235
Gourmet & World Food: 0.17020504445654147
Foodgrains: 0.09711486118671747
Oil & Masala: 0.09711486118671747
Snacks & Branded Foods: 0.10212302667392488
Beverages: 0.032117583015786606
Bakery: 0.030883687171112322
Cakes & Dairy: 0.030883687171112322
Baby Care: 0.02213754309562693
Fruits & Vegetables: 0.020214117220105243
Eggs: 0.01270186898929414
Meat & Fish: 0.01270186898929414


[['Beauty & Hygiene', 0.2855017238250771],
 ['Gourmet & World Food', 0.17020504445654147],
 ['Kitchen', 0.12992197423335147],
 ['Garden & Pets', 0.12992197423335147],
 ['Snacks & Branded Foods', 0.10212302667392488],
 ['Foodgrains', 0.09711486118671747],
 ['Oil & Masala', 0.09711486118671747],
 ['Cleaning & Household', 0.09707857013246235],
 ['Beverages', 0.032117583015786606],
 ['Bakery', 0.030883687171112322],
 ['Cakes & Dairy', 0.030883687171112322],
 ['Baby Care', 0.02213754309562693],
 ['Fruits & Vegetables', 0.020214117220105243],
 ['Eggs', 0.01270186898929414],
 ['Meat & Fish', 0.01270186898929414]]

In [7]:
for category in unique_categories:
    data[category] = data['category'].apply(lambda x: 1 if category in x else 0)

In [8]:
X_train, X_test = train_test_split(data, shuffle=True)

In [9]:
X_train.shape, X_test.shape

X_train = X_train.reset_index()
X_test = X_test.reset_index()

X_train.shape, X_test.shape

((20666, 26), (6889, 26))

In [None]:
class MLDataset(Dataset):
    def __init__(self, df, max_len, tokenizer, target_cols):
        super().__init__()
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.target_cols = target_cols


    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.df['description'][index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            return_tensors='pt',
            return_attention_mask=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': ids.clone().detach().flatten(),
            'mask': mask.clone().detach().flatten(),
            'token_type_ids': token_type_ids.clone().detach().flatten(),
            'targets': torch.tensor(self.df[self.target_cols].values[index], dtype=torch.float)
        }

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

MAX_LEN = 512
BATCH_SIZE = 4
EPOCHS = 10
LEARNING_RATE = 2e-5
NUM_CLASSES = len(unique_categories)

train_dataset = MLDataset(X_train, MAX_LEN, tokenizer, unique_categories)
test_dataset = MLDataset(X_test, MAX_LEN, tokenizer, unique_categories)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                          num_workers=4, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         num_workers=4, shuffle=False, pin_memory=True)

In [None]:
class MLBERT(torch.nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        self.bert = AutoModel.from_pretrained(
            "bert-base-cased")
        self.fc = torch.nn.Linear(768, n_classes)

    def forward(self, ids, mask, token_type_ids):
        output = self.bert(ids, attention_mask=mask,
                                token_type_ids=token_type_ids)
        return self.fc(output.pooler_output)

model = MLBERT(NUM_CLASSES)
model.load_state_dict(torch.load('thesis_NLP.bin', map_location=torch.device(device)))
model = model.to(device)

In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [None]:
optimizer = torch.optim.AdamW(params=model.parameters(),
                  lr=LEARNING_RATE, weight_decay=1e-6)

In [None]:
def train(epoch):
    model.train()
    for _, data in enumerate(train_loader, 0):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        targets = data['targets'].to(device)

        outputs = model(ids, mask, token_type_ids)
        print(outputs.shape, targets.shape, 'SHAPES')
        loss = loss_fn(outputs, targets)

        if _%500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        # grad descent step
        optimizer.step()

In [None]:
for epoch in range(1):
    train(epoch)

In [None]:
# torch.save(model.state_dict(), "./thesis_NLP.bin")

In [None]:
import numpy as np


def eval_model(validation_loader, model, optimizer):
    losses = []
    hl = []
    macro_precision = []
    micro_precision = []
    macro_recall = []
    micro_recall = []
    correct_predictions = 0
    num_samples = 0
    # set model to eval mode (turn off dropout, fix batch norm)
    model.eval()

    with torch.no_grad():
        for batch_idx, data in enumerate(validation_loader, 0):
            if ((batch_idx + 1) % 100) == 0:
                print(f"Batch: {batch_idx + 1}")
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            # validation accuracy
            # add sigmoid, for the training sigmoid is in BCEWithLogitsLoss
            outputs = torch.sigmoid(outputs).cpu().detach().numpy().round()
            targets = targets.cpu().detach().numpy()
            correct_predictions += np.sum(outputs==targets)
            num_samples += targets.size   # total number of elements in the 2D array

            # Hamming loss
            hl.append(hamming_loss(targets, outputs))

            # Macro / mictor precision
            macro_precision.append(precision_score(targets, outputs, average='macro'))
            micro_precision.append(precision_score(targets, outputs, average='micro'))

            # Macro / mictor recall
            macro_recall.append(recall_score(targets, outputs, average='macro'))
            micro_recall.append(recall_score(targets, outputs, average='micro'))

    return {
        'accuracy': float(correct_predictions)/num_samples,
        'bce_loss': losses,
        'hamming_loss': hl,
        'macro_precision': macro_precision,
        'micro_precision': micro_precision,
        'macro_recall': macro_recall,
        'micro_recall': micro_recall,
    }

In [None]:
recall_score(
    np.array([[1, 1, 1], [1, 0, 0]]),
    np.array([[1, 0, 1], [0, 0, 1]]),
    average='macro'
)

In [None]:
eval_result = eval_model(test_loader, model, optimizer)

In [None]:
print(f"Hamming loss: {np.array(eval_result['hamming_loss']).mean()}")
print(f"BCE loss: {np.array(eval_result['bce_loss']).mean()}")
print(f"Precision (macro): {np.array(eval_result['macro_precision']).mean()}")
print(f"Precision (micro): {np.array(eval_result['micro_precision']).mean()}")
print(f"Recall (macro): {np.array(eval_result['macro_recall']).mean()}")
print(f"Recall (micro): {np.array(eval_result['micro_recall']).mean()}")

In [None]:
it = iter(test_loader)
# next(it)
# next(it)
# next(it)
test_item = next(it)
input_ids = test_item['ids'].to(device)
attention_mask = test_item['mask'].to(device)
token_type_ids = test_item['token_type_ids'].to(device)
targets = test_item['targets'].cpu().detach().numpy()
output = model(input_ids, attention_mask, token_type_ids)
 # add sigmoid, for the training sigmoid is in BCEWithLogitsLoss
output = torch.sigmoid(output).cpu().detach().numpy().round()
# thresholding at 0.5
# output = output.flatten().round().numpy()
# np.sum(output==targets)

In [None]:
output.reshape(BATCH_SIZE, -1)

In [None]:
targets

In [None]:
for index, k in enumerate(targets):
    if (output[index] == k).all():
        print('Correct')
    else:
        print('Incorrect')

Sync notes:
- classical ML + Deep learning comparison
- we should get more rigid results based on data
- think about disbalance of data
- compare other NLP models
- try 3 folds CV
----------------------------------------------------
- check out lectures (metrics):
    - metric should work with classificaation
    - encounter for class disbalance
- experiment with models:
    - try other NLP models
    - try ML
    - try CNN or any other experimental reasons
- data processing:
    - display data
    - matplotlib
----------------------------------------------------
Different approaches to handle thesis:
- find out different methods
- or find out other datasets and experiment on abstract task by means of solving the same problem on different data
----------------------------------------------------
(1) finalize evaluation approach
 - present all metrics and results by tables (check sent thesis in TG)

(2) research for data
and

(2) check the thesis results from theoretical parts - inspired by other works check the experiments to carry out on our dataset
----------------------------------------------------
The main body (3-5 chapters: overview, stages, main theoretical results, implementation and / or
experiment, analysis of the results);


In [10]:
y_train = X_train[unique_categories]

y_test = X_test[unique_categories]

y_train.shape, y_test.shape

((20666, 15), (6889, 15))

In [11]:
X_train.drop(unique_categories, axis=1, inplace=True)
X_test.drop(unique_categories, axis=1, inplace=True)

In [12]:
X_train.head(1)

Unnamed: 0,level_0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,3180,3181,Verimicelli - Whole Grain Rice,Gourmet & World Food,"Pasta, Soup & Noodles",Mama,249.0,249.0,Jasmine & Sushi Rice,4.5,"mama whole grain rice vermicelli rich fibre, v..."


In [None]:
from catboost import CatBoostClassifier, Pool

cat_boost_clf = CatBoostClassifier(
    loss_function='MultiLogloss',
    # eval_metric='HammingLoss',
    iterations=500,
    class_names=unique_categories,
    task_type="GPU",
    devices='0',
)
train_pool = Pool(X_train[['description']], y_train, text_features=['description'], feature_names=['description'])
test_pool = Pool(X_test[['description']], y_test, text_features=['description'], feature_names=['description'])

cat_boost_clf.fit(train_pool, eval_set=test_pool, metric_period=10, plot=True, verbose=50)

In [None]:
cat_boost_pred = cat_boost_clf.predict(X_test[['description']])

In [133]:


print(classification_report(y_test, cat_boost_pred))

NameError: name 'cat_boost_pred' is not defined

In [None]:
print(accuracy_score(y_test, cat_boost_pred))

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(X_train[['description']])
X_train_tfidf = vectorizer.transform(X_train['description'])
X_test_tfidf = vectorizer.transform(X_test['description'])

In [None]:
X_train_tfidf.shape, X_test_tfidf.shape

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

muloc_forest = RandomForestClassifier(random_state=1)

muloc_clf = MultiOutputClassifier(muloc_forest)
muloc_clf.fit(X_train_tfidf, y_train)

In [None]:
muloc_pred = muloc_clf.predict(X_test_tfidf)

In [None]:
accuracy_score(y_test, muloc_pred)

### Word2Vec

In [13]:
from gensim.models import KeyedVectors
import numpy as np

In [14]:
w2v_model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

In [32]:
X_train['description'].str.split().apply(lambda x: len(x)).max()

524

In [None]:
W2V_MAX_SIZE = 550

In [124]:
def convert_to_word2vec(text):
    tokens = text.split()
    result = []
    for token in tokens:
        try:
            vector = w2v_model[token]
            result.append(vector)
        except:
            result.append(w2v_model['UNK'])
    return np.array([np.array(result, dtype=np.float32).mean()])

X_train_w2v = X_train['description'].apply(convert_to_word2vec)

In [128]:
# X_train_w2v = X_train_w2v.apply(lambda x: x.mean(axis=0))

X_train_w2v.values.tolist()

[array([-0.00751416], dtype=float32),
 array([-0.0061201], dtype=float32),
 array([-0.00773362], dtype=float32),
 array([-0.00676016], dtype=float32),
 array([-0.0070821], dtype=float32),
 array([-0.00690758], dtype=float32),
 array([-0.00263829], dtype=float32),
 array([-0.00535984], dtype=float32),
 array([-0.00554817], dtype=float32),
 array([-0.00191036], dtype=float32),
 array([-0.00719467], dtype=float32),
 array([-0.00532602], dtype=float32),
 array([-0.00492159], dtype=float32),
 array([-0.00295631], dtype=float32),
 array([-0.00561252], dtype=float32),
 array([-0.00985911], dtype=float32),
 array([-0.00341957], dtype=float32),
 array([-0.00571357], dtype=float32),
 array([-0.00348441], dtype=float32),
 array([-0.00318675], dtype=float32),
 array([-0.00363474], dtype=float32),
 array([-0.00481035], dtype=float32),
 array([-0.00352548], dtype=float32),
 array([-0.00715165], dtype=float32),
 array([-0.00610775], dtype=float32),
 array([-0.00737234], dtype=float32),
 array([-0.003

In [129]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

muloc_forest_w2v = RandomForestClassifier(random_state=1)
muloc_clf_w2v = MultiOutputClassifier(muloc_forest_w2v)
muloc_clf_w2v.fit(X_train_w2v.values.tolist(), y_train)

In [130]:
X_test_w2v = X_test['description'].apply(convert_to_word2vec)
X_test_w2v = X_test_w2v.values.tolist()

In [131]:
muloc_pred_w2v = muloc_clf_w2v.predict(X_test_w2v)

In [144]:
print(f"Accuracy: {accuracy_score(y_test, muloc_pred_w2v)}")
print(f"Precision (macro): {precision_score(y_test, muloc_pred_w2v, average='macro')}")
print(f"Precision (micro): {precision_score(y_test, muloc_pred_w2v, average='micro')}")
print(f"Recall (macro): {recall_score(y_test, muloc_pred_w2v, average='macro')}")
print(f"Recall (micro): {recall_score(y_test, muloc_pred_w2v, average='micro')}")
print(f"Hamming loss: {hamming_loss(y_test, muloc_pred_w2v)}")

Accuracy: 0.38641312236899406
Precision (macro): 0.3459148767360777
Precision (micro): 0.4264687382060637
Recall (macro): 0.31289475134688755
Recall (micro): 0.3852710535288101
Hamming loss: 0.09646296027483428


In [167]:
y_test.to_numpy()[2], muloc_pred_w2v[2]

(array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [178]:
hamming_loss(y_test.to_numpy()[2], muloc_pred_w2v[2], sample_weight=(0.5, *([0.03571429] * 14)))

0.4999999700000016