In [1]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


In [2]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [3]:
import copy
from collections import defaultdict
import pathlib

import pandas as pd
import numpy as np
import torch
from torch import nn
import transformers
# from datasets import load_dataset, Features, Value, ClassLabel, LargeList, Sequence
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import emoji

In [4]:
# BACKBONE_NAME = 'cointegrated/rubert-tiny2'
BACKBONE_NAME = "DeepPavlov/rubert-base-cased"
SUBMISSION_NAME = "rubert-base-cased"
NUM_LABELS = 50
BATCH_SIZE = 16
MAX_LEN = 256
RANDOM_STATE = 42
EPOCHS = 100
ROOT_DIR = pathlib.Path().absolute()

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [7]:
df = pd.read_csv('train_augmented_2.csv')
df_test = pd.read_csv('test.csv')

df_test['text'] = df_test['text'].apply(
    lambda x: emoji.demojize(str(x), language='ru').replace(':', ' ').replace('_', ' ')
)

In [8]:
df_train, df_valid = train_test_split(df, random_state=RANDOM_STATE, test_size=0.2, shuffle=True)

In [9]:
df_test

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text
0,1,3135,3.0,{DELIVERY},"Последнее время думаю плохо, сроки доставки да..."
1,3,4655,2.0,"{PRICE,DELIVERY,ASSORTMENT}",Цены намного выше магазинных но радуют акции
2,5,22118,2.0,"{CATALOG_NAVIGATION,ASSORTMENT,DELIVERY}","Доставка за [NUM] минут, заказ даже не начали ..."
3,7,23511,0.0,{DELIVERY},Ужасно долгая доставка
4,8,45,6.0,"{ASSORTMENT,PROMOTIONS}",Добрый вечер! Вы большие молодцы. Меня всё уст...
...,...,...,...,...,...
9010,16992,3523,3.0,"{PRICE,SUPPORT,DELIVERY}",Задержка с доставкой не даете промокод на скид...
9011,16993,24925,6.0,"{PRICE,PRODUCTS_QUALITY,ASSORTMENT}",Очень удобный формат сервиса и очень маленький...
9012,16994,6327,6.0,"{PAYMENT,ASSORTMENT,DELIVERY}","Сумма заказа почти всегда высокая, что зачасту..."
9013,16997,530,3.0,"{PRODUCTS_QUALITY,SUPPORT,DELIVERY}","Часто, заказываю у вас молочную продукцию, при..."


In [10]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len, target_list=None):
        self.tokenizer = tokenizer
        self.df = df.copy()
        self.text = df['text'].tolist()
        self.targets = self.df[target_list].values if target_list else None
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        inputs = self.tokenizer.encode_plus(
            self.text[index],
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index]) if self.targets is not None else torch.FloatTensor([]),
            'text': self.text[index]
        }

In [11]:
rubert_tiny_tokenizer = transformers.AutoTokenizer.from_pretrained(BACKBONE_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [15]:

train_dataset = CustomDataset(df_train, rubert_tiny_tokenizer, MAX_LEN, [f'trend_id_res{i}' for i in range(50)])
val_dataset = CustomDataset(df_valid, rubert_tiny_tokenizer, MAX_LEN, [f'trend_id_res{i}' for i in range(50)])
test_dataset = CustomDataset(df_test, rubert_tiny_tokenizer, MAX_LEN)

In [16]:
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

valid_data_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [17]:
class ClassificationModel(nn.Module):
    def __init__(self, base_transformer_model):
        super().__init__()
        self.backbone = transformers.AutoModel.from_pretrained(base_transformer_model, return_dict=True)
        clf_in_features = self.backbone.pooler.dense.out_features
        self.dropout = torch.nn.Dropout(0.1)
        self.linear = torch.nn.Linear(clf_in_features, NUM_LABELS)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.backbone(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

In [18]:
model = ClassificationModel(BACKBONE_NAME)

# # Freezing BERT layers: (tested, weaker convergence)
# for param in model.backbone.parameters():
#     param.requires_grad = False

model.to(device)

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ClassificationModel(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [19]:
criterion = torch.nn.BCEWithLogitsLoss()

In [20]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [21]:
def train(training_loader, model, optimizer, criterion):
    losses = []
    correct_predictions = 0
    num_samples = 0
    model.train()
    for batch in training_loader:
        # forward
        outputs = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['token_type_ids'].to(device)
        )
        loss = criterion(outputs, batch['targets'].to(device))
        losses.append(loss.item())
        # training accuracy, apply sigmoid, round (apply thresh 0.5)
        outputs = torch.sigmoid(outputs).cpu().detach().numpy().round()
        targets = batch['targets'].cpu().detach().numpy()
        correct_predictions += np.sum(outputs==targets)
        num_samples += targets.size   # total number of elements in the 2D array

        # backward
        optimizer.zero_grad()
        loss.backward()
        # grad descent step
        optimizer.step()

    # returning: trained model, model accuracy, mean loss
    return float(correct_predictions)/num_samples, np.mean(losses)

In [22]:
def eval(validation_loader, model, optimizer, criterion):
    losses = []
    correct_predictions = 0
    num_samples = 0
    # set model to eval mode (turn off dropout, fix batch norm)
    model.eval()

    with torch.no_grad():
        for batch in validation_loader:
            outputs = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
                batch['token_type_ids'].to(device)
            )

            loss = criterion(outputs, batch['targets'].to(device))
            losses.append(loss.item())

            # validation accuracy
            # add sigmoid, for the training sigmoid is in BCEWithLogitsLoss
            outputs = torch.sigmoid(outputs).cpu().detach().numpy().round()
            targets = batch['targets'].cpu().detach().numpy()
            correct_predictions += np.sum(outputs==targets)
            num_samples += targets.size   # total number of elements in the 2D array

    return float(correct_predictions)/num_samples, np.mean(losses)


In [None]:
history = defaultdict(list)
best_accuracy = 0

for epoch in range(1, EPOCHS+1):
    print(f'Epoch {epoch}/{EPOCHS}')
    train_acc, train_loss = train(train_data_loader, model, optimizer, criterion)
    val_acc, val_loss = eval(valid_data_loader, model, optimizer, criterion)

    print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    # save the best model
    if val_acc > best_accuracy:
        print('Saving best model...')
        torch.save(
            {'model_state_dict': model.state_dict()},
            f'gdrive/My Drive/ecom_tech_dls/{SUBMISSION_NAME}.pt',
        )
        best_accuracy = val_acc

Epoch 1/100
train_loss=0.2619, val_loss=0.1487, train_acc=0.9527, val_acc=0.9662
Saving best model...
Epoch 2/100
train_loss=0.1396, val_loss=0.1372, train_acc=0.9669, val_acc=0.9662
Epoch 3/100
train_loss=0.1338, val_loss=0.1306, train_acc=0.9669, val_acc=0.9662
Epoch 4/100
train_loss=0.1236, val_loss=0.1181, train_acc=0.9677, val_acc=0.9678
Saving best model...
Epoch 5/100
train_loss=0.1112, val_loss=0.1067, train_acc=0.9695, val_acc=0.9698
Saving best model...
Epoch 6/100
train_loss=0.0993, val_loss=0.0966, train_acc=0.9722, val_acc=0.9732
Saving best model...
Epoch 7/100
train_loss=0.0884, val_loss=0.0877, train_acc=0.9753, val_acc=0.9744
Saving best model...
Epoch 8/100
train_loss=0.0782, val_loss=0.0803, train_acc=0.9778, val_acc=0.9752
Saving best model...
Epoch 9/100
train_loss=0.0687, val_loss=0.0734, train_acc=0.9803, val_acc=0.9782
Saving best model...
Epoch 10/100
train_loss=0.0602, val_loss=0.0676, train_acc=0.9834, val_acc=0.9795
Saving best model...
Epoch 11/100
train_lo

In [None]:
plt.plot(history['train_loss'], label='train')
plt.plot(history['val_loss'], label='validation')
plt.title('Training history')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.grid()

In [23]:
trained_model = ClassificationModel(BACKBONE_NAME)

checkpoint = torch.load(f'gdrive/My Drive/ecom_tech_dls/{SUBMISSION_NAME}.pt', map_location=device)
trained_model.load_state_dict(checkpoint['model_state_dict'])
trained_model.to(device)

trained_model.eval()

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  checkpoint = torch.load(f'gdrive/My Drive/ecom_tech_dls/{SUBMISSION_NAME}.pt', map_location=device)


ClassificationModel(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [24]:
preds = None
for batch in test_data_loader:
    out = trained_model(
        batch['input_ids'].to(device),
        batch['attention_mask'].to(device),
        batch['token_type_ids'].to(device)
    )
    if preds is not None:
        preds = np.append(preds, torch.sigmoid(out).cpu().detach().numpy().round(), axis=0)
    else:
        preds = torch.sigmoid(out).cpu().detach().numpy().round()

KeyboardInterrupt: 

In [54]:
def write_submission(pred_test, test_df, name):
    answer = []
    np.apply_along_axis(
        lambda x: answer.append(' '.join(np.where(x == 1)[0].astype(str))),
        axis=1,
        arr=pred_test
    )
    test_df[["index"]].assign(target = answer).to_csv(ROOT_DIR / name, index=False)

In [73]:
write_submission(preds, df_test, f'{SUBMISSION_NAME}.csv')

In [74]:
res = pd.read_csv('rubert_tiny_unfreezed_submission.csv')

In [79]:
res

Unnamed: 0,index,target
0,3135,1 2
1,4655,12
2,22118,2
3,23511,0
4,45,18 20
...,...,...
9010,3523,3 37
9011,24925,22
9012,6327,8
9013,530,15


In [78]:
res[~res['target'].isna()]

Unnamed: 0,index,target
0,3135,1 2
1,4655,12
2,22118,2
3,23511,0
4,45,18 20
...,...,...
9010,3523,3 37
9011,24925,22
9012,6327,8
9013,530,15
