In [26]:
# This cell assumes the notebook is running on Google Colab: it downloads the
# project sources and installs the packages that the imports below need.
# Uncomment the commands to use it. Feel free to skip it if you are running
# the notebook from the repository folder, or locally on a machine with a GPU.

# ! wget https://github.com/cntgfy90/hse_mlc_final/archive/refs/heads/main.zip
# ! unzip main.zip -d main
# ! mv main/hse_mlc_final-main/requirements.txt requirements.txt
# ! cp -a main/hse_mlc_final-main/src/. ./
# ! rm -rf main
# ! rm main.zip
# ! pip install --upgrade pip
# ! pip install neptune
# ! pip install torch
# ! pip install transformers
# ! pip install scikit-learn
# ! pip install numpy
# ! pip install pandas

In [2]:
import torch
import neptune
import pandas as pd
import nltk
# Download the NLTK resources before the project modules below are imported.
# NOTE(review): presumably `preprocess` needs these resources — confirm, and
# keep this ordering if it reads them at import time.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from train import train
from preprocess import preprocess
from evaluate import eval_model
from bert import MLBERT
from dataset import MLDataset
from tokenizer import tokenizer
from loss_fn import loss_fn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [3]:
# Load the product table from the CSV file shipped in the dataset folder.
data = pd.read_csv('./dataset/data.csv')

# Preview the first three rows to check the columns.
data.head(3)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."


In [4]:
# (number of rows, number of columns) of the loaded table.
data.shape

(27555, 10)

In [5]:
# Run every product description through the project's `preprocess` helper.
# NOTE(review): the column is overwritten in place, so re-running this cell
# feeds already-processed text through `preprocess` again — confirm that is harmless.
data['description'] = data['description'].apply(preprocess)

In [6]:
# A raw `category` value can bundle several labels separated by commas
# (e.g. "Kitchen, Garden & Pets"), so split every distinct value into its
# individual labels.
# dict.fromkeys() removes duplicates while keeping first-seen order, so a label
# that occurs in more than one raw value still produces exactly one class
# (and therefore one target column and a correct `num_classes`).
unique_categories = list(dict.fromkeys(
    label.strip()
    for raw_category in data['category'].unique()
    for label in raw_category.split(',')
))

unique_categories

['Beauty & Hygiene',
 'Kitchen',
 'Garden & Pets',
 'Cleaning & Household',
 'Gourmet & World Food',
 'Foodgrains',
 'Oil & Masala',
 'Snacks & Branded Foods',
 'Beverages',
 'Bakery',
 'Cakes & Dairy',
 'Baby Care',
 'Fruits & Vegetables',
 'Eggs',
 'Meat & Fish']

In [7]:
# Build one 0/1 indicator column per label.
# Split each raw `category` string into its set of labels once, then test exact
# membership. A plain substring check (`label in raw_string`) would also fire
# when one label happens to be contained in another label's text.
category_tokens = data['category'].apply(
    lambda raw: {part.strip() for part in raw.split(',')}
)

for category in unique_categories:
    # `apply` runs immediately, so the lambda sees the current loop value.
    data[category] = category_tokens.apply(lambda labels: 1 if category in labels else 0)

In [8]:
# Shuffle and split the table into train and validation parts.
# A fixed `random_state` makes the split identical on every run; without it
# the metrics logged for different runs are computed on different validation rows.
X_train, X_test = train_test_split(data, shuffle=True, random_state=42)

# Renumber the rows from 0 in each part. reset_index() without `drop=True`
# keeps the pre-split row labels as one extra column.
X_train = X_train.reset_index()
X_test = X_test.reset_index()
X_train.shape, X_test.shape

((20666, 26), (6889, 26))

In [9]:
# Start a Neptune run that tracks this experiment.
# The API token is deliberately not written in the notebook: when `api_token`
# is omitted, Neptune reads it from the NEPTUNE_API_TOKEN environment variable,
# so set that variable (for example from a Colab secret) before running this cell.
run = neptune.init_run(
    project="stepangrigorov/test",
)



[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/stepangrigorov/test/e/TES-15


In [10]:
# Every new run starts with the 'staging' tag; the promotion cell further down
# swaps it for 'best' when this run wins the comparison.
run['sys/tags'].add('staging')

In [11]:
# Experiment settings, plus the label set they were derived from.
parameters = dict(
    max_len=512,
    batch_size=8,
    epochs=1,
    learning_rate=2e-5,
    num_classes=len(unique_categories),
    weight_decay=1e-6,
    classes=','.join(unique_categories),
)

# Record the configuration and the size of each split on the Neptune run.
run['config/parameters'] = parameters
run['config/dataset/train_size'] = len(X_train)
run['config/dataset/val_size'] = len(X_test)

In [12]:
# Wrap both splits in the project's dataset class, using the same maximum
# length, tokenizer and label list for each.
train_dataset = MLDataset(X_train, parameters['max_len'], tokenizer, unique_categories)
test_dataset = MLDataset(X_test, parameters['max_len'], tokenizer, unique_categories)

# Settings shared by both loaders. Pinned host memory only speeds up
# host-to-GPU copies, so request it only when the notebook runs on CUDA;
# asking for it on the CPU fallback makes PyTorch emit a warning.
loader_kwargs = dict(
    batch_size=parameters['batch_size'],
    num_workers=4,
    pin_memory=(device == 'cuda'),
)

# Only the training batches are shuffled; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)



In [13]:
# Instantiate the project's model for `num_classes` labels and move it to the
# selected device in one step.
model = MLBERT(n_classes=parameters['num_classes']).to(device)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [14]:
# AdamW over all model parameters, configured from the experiment settings.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=parameters['learning_rate'],
    weight_decay=parameters['weight_decay'],
)

In [15]:
# Call the project's `train` routine once per configured epoch; it receives the
# epoch number, the model, loss, data, optimizer, device and the Neptune run.
num_epochs = parameters['epochs']
for epoch in range(num_epochs):
    train(epoch, model, loss_fn, train_loader, optimizer, device, run)



Epoch: 0, Loss:  0.7057161331176758
Epoch: 0, Loss:  0.11333828419446945
Epoch: 0, Loss:  0.09744659066200256
Epoch: 0, Loss:  0.0813826471567154
Epoch: 0, Loss:  0.07656201720237732
Epoch: 0, Loss:  0.15394698083400726


In [16]:
# Evaluate the trained model on the validation loader; `eval_model` returns two
# values, unpacked here as accuracy and loss.
# NOTE(review): the promotion cell below reads 'val/acc' from the run, so
# `eval_model` presumably logs it there — confirm.
test_acc, test_loss = eval_model(test_loader, model, loss_fn, device, run)

Batch: 1
Batch: 2
Batch: 3
Batch: 4
Batch: 5
Batch: 6
Batch: 7
Batch: 8
Batch: 9
Batch: 10
Batch: 11
Batch: 12
Batch: 13
Batch: 14
Batch: 15
Batch: 16
Batch: 17
Batch: 18
Batch: 19
Batch: 20
Batch: 21
Batch: 22
Batch: 23
Batch: 24
Batch: 25
Batch: 26
Batch: 27
Batch: 28
Batch: 29
Batch: 30
Batch: 31
Batch: 32
Batch: 33
Batch: 34
Batch: 35
Batch: 36
Batch: 37
Batch: 38
Batch: 39
Batch: 40
Batch: 41
Batch: 42
Batch: 43
Batch: 44
Batch: 45
Batch: 46
Batch: 47
Batch: 48
Batch: 49
Batch: 50
Batch: 51
Batch: 52
Batch: 53
Batch: 54
Batch: 55
Batch: 56
Batch: 57
Batch: 58
Batch: 59
Batch: 60
Batch: 61
Batch: 62
Batch: 63
Batch: 64
Batch: 65
Batch: 66
Batch: 67
Batch: 68
Batch: 69
Batch: 70
Batch: 71
Batch: 72
Batch: 73
Batch: 74
Batch: 75
Batch: 76
Batch: 77
Batch: 78
Batch: 79
Batch: 80
Batch: 81
Batch: 82
Batch: 83
Batch: 84
Batch: 85
Batch: 86
Batch: 87
Batch: 88
Batch: 89
Batch: 90
Batch: 91
Batch: 92
Batch: 93
Batch: 94
Batch: 95
Batch: 96
Batch: 97
Batch: 98
Batch: 99
Batch: 100
Batch: 1

In [17]:
# Open the Neptune project in read-only mode so that existing runs can be queried.
# The API token is deliberately not written in the notebook: when `api_token`
# is omitted, Neptune reads it from the NEPTUNE_API_TOKEN environment variable.
project = neptune.init_project(
    project="stepangrigorov/test",
    mode="read-only",
)

[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/stepangrigorov/test/


In [18]:
# Decide whether the current (staging) run should replace the run tagged 'best'.
best_runs = project.fetch_runs_table(tag='best').to_pandas()
# Wait for this run's queued tracking calls before reading its metrics back.
run.wait()

if best_runs.empty:
    # No run carries the 'best' tag yet, so this one is promoted unconditionally.
    promote = True
else:
    # Re-open the current best run to compare validation accuracy. The API
    # token is not written in the notebook; with `api_token` omitted, Neptune
    # reads it from the NEPTUNE_API_TOKEN environment variable.
    best_run = neptune.init_run(
        project='stepangrigorov/test',
        with_id=best_runs.loc[0]['sys/id'],
    )
    best_acc = best_run['val/acc'].fetch_last()
    staging_acc = run['val/acc'].fetch_last()
    promote = staging_acc > best_acc
    if promote:
        # Demote the previous best run back to 'staging'.
        best_run['sys/tags'].remove('best')
        best_run['sys/tags'].add('staging')

if promote:
    # Save and upload the weights first, then mark this run as the best one,
    # so a run tagged 'best' always has its weights attached.
    torch.save(model.state_dict(), './model_weights.bin')
    run['model/weights'].upload('./model_weights.bin')
    run['sys/tags'].add('best')
    run['sys/tags'].remove('staging')

[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/stepangrigorov/test/e/TES-13


In [19]:
# Flush and close the Neptune runs opened by this notebook.
# `best_run` only exists when an earlier 'best' run was found and re-opened by
# the promotion cell; on the very first run it is undefined, so look it up
# defensively instead of raising NameError.
previous_best_run = globals().get('best_run')

run.wait()
if previous_best_run is not None:
    previous_best_run.wait()
    previous_best_run.stop()
run.stop()

[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] All 0 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/stepangrigorov/test/e/TES-13/metadata
[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] All 0 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/stepangrigorov/test/e/TES-15/metadata
