# Info

Получение эмбеддингов из дообученной модели BERT.

# Settings

In [None]:
# --- Files ---
# Project root; the trailing slash is relied upon by the paths built below.
GDRIVE_DIR = r'/content/drive/MyDrive/DS/20230314_ke-intern-test/'

DATASET_DIR = f'{GDRIVE_DIR}dataset/'

# Archives with the tokenized splits (read below via their 'input_ids' arrays).
TRAIN_NPZ = f'{GDRIVE_DIR}tokens_rubert_train.npz'
VAL_NPZ = f'{GDRIVE_DIR}tokens_rubert_val.npz'
TEST_NPZ = f'{GDRIVE_DIR}tokens_rubert_test.npz'

# --- Model ---
# Directory of the fine-tuned BERT checkpoint.
BERT_MODEL_DIR = f'{GDRIVE_DIR}models/fine-tune-bert_0_765561/'

# --- Reproducibility ---
SEED = 1

# Init

## Installation

In [None]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25h

## Imports

In [None]:
import gc

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

import torch

## Definitions

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU,
# and report which device was picked.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print('CPU')

GPU: Tesla T4


In [None]:
#@title  { form-width: "1px", display-mode: "form" }
#@markdown ```python
#@markdown class Dataset(inputs)
#@markdown ```

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset backed by a dict of per-sample indexable fields.

    Parameters
    ----------
    inputs : dict
        Maps a field name to an indexable container of per-sample values.
        It must contain the key ``'input_ids'``: the length of that entry
        is reported as the dataset size.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        """Return the number of samples, taken from the ``'input_ids'`` field."""
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        """Return a dict with the ``idx``-th element of every field."""
        sample = {}
        for field, values in self.inputs.items():
            sample[field] = values[idx]
        return sample

# Main

## Loading data

In [None]:
# Open the tokenized train/val archives. The handles are kept around because
# the 'labels' and 'product_ids' arrays are read from them again when saving.
train_npz, val_npz = np.load(TRAIN_NPZ), np.load(VAL_NPZ)

# Token ids and class labels of each split.
input_ids_train, labels_train = train_npz['input_ids'], train_npz['labels']
input_ids_val, labels_val = val_npz['input_ids'], val_npz['labels']

# Sanity check: show the array shapes.
(input_ids_train.shape, labels_train.shape,
 input_ids_val.shape, labels_val.shape)

((81120, 300), (81120,), (10000, 300), (10000,))

In [None]:
# Create torch datasets.
# The attention mask is 1 wherever the token id is non-zero
# (id 0 is presumably the padding token -- confirm against the tokenizer).
train_inputs = dict(
    input_ids=torch.tensor(input_ids_train.astype(np.int32)),
    attention_mask=torch.tensor(input_ids_train != 0, dtype=torch.uint8),
    # int64 is required here to convert to torch.long dtype
    labels=torch.tensor(labels_train.astype(np.int64)),
)

val_inputs = dict(
    input_ids=torch.tensor(input_ids_val.astype(np.int32)),
    attention_mask=torch.tensor(input_ids_val != 0, dtype=torch.uint8),
    labels=torch.tensor(labels_val.astype(np.int64)),
)

train_ds, val_ds = Dataset(train_inputs), Dataset(val_inputs)

# Example of sample
train_ds[0]

{'input_ids': tensor([   101,  94934,  31091,  46754,  35127,  48675,  43485,    869,  61248,
          33460,  28221,    192,  39362,  31694,  35633,   6301,  54119,  68524,
            814,    106,  79588,  32145,    869,  16337,  54384,   3187,  29697,
           1703,  82941,  31231,   1706,   1766,  36260,   7993,    114,  72792,
            132,  83057,   7471,    851,  19998,   2630,  14269,  24737,  60689,
            869,  16337,  54384,   3187,   2068,  34035,   2748,  27339,    128,
           4427,  11992,   2190,  39843,    851,  89585,  35260,  21953,    132,
            100,  52837,  14444, 112072,   9450,   1469,  10189,  63154,   3521,
          16729,  25377,  38156,    128,   1997,  13231,    875,   3660,   6818,
           7462,  38741,    866,  16729,    132,   7638,  10271,   3998,   5022,
          24856,  89769,    128,   3622,  22571,  45628,   3247,   1516,  45051,
            132,   7638,  56861,    128,  13717,  24935,   1516,  46758,    128,
          27519

## Language model

In [None]:
import transformers
from transformers import BertModel

In [None]:
# Load the fine-tuned checkpoint as a bare BertModel (no task head),
# move it to the selected device and switch it to evaluation mode.
model = BertModel.from_pretrained(BERT_MODEL_DIR)
model = model.to(device).eval()

Some weights of the model checkpoint at /content/drive/MyDrive/DS/20230314_ke-intern-test/models/fine-tune-bert_0_765561/ were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# One row per named parameter: its name, its shape and its element count.
rows = [(name, list(weights.shape)) for name, weights in model.named_parameters()]
layers = pd.DataFrame(rows, columns=['layer_name', 'layer_shape'])
layers = layers.assign(layer_size=layers['layer_shape'].map(np.prod))
layers  # list of the model's layers

Unnamed: 0,layer_name,layer_shape,layer_size
0,embeddings.word_embeddings.weight,"[119547, 768]",91812096
1,embeddings.position_embeddings.weight,"[512, 768]",393216
2,embeddings.token_type_embeddings.weight,"[2, 768]",1536
3,embeddings.LayerNorm.weight,[768],768
4,embeddings.LayerNorm.bias,[768],768
...,...,...,...
194,encoder.layer.11.output.dense.bias,[768],768
195,encoder.layer.11.output.LayerNorm.weight,[768],768
196,encoder.layer.11.output.LayerNorm.bias,[768],768
197,pooler.dense.weight,"[768, 768]",589824


### Train/val embeddings

In [None]:
BATCH_SIZE = 256

# shuffle=False keeps the batches in dataset order, so the embeddings stay
# aligned with the labels / product ids they are saved together with.
train_dl, val_dl = (
    torch.utils.data.DataLoader(split_ds, batch_size=BATCH_SIZE, shuffle=False)
    for split_ds in (train_ds, val_ds)
)

In [None]:
# Embed the training split batch by batch with the fine-tuned encoder.
train_emb = []

# Inference only: no step of the loop needs autograd bookkeeping.
with torch.no_grad():
    for batch in tqdm(train_dl):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        output = model(input_ids=input_ids, attention_mask=attention_mask)

        # Alternatively [CLS] token can be used as `output.last_hidden_state[:, 0, :]`
        emb = output.pooler_output

        # Normalize vectors to unit length. Useful for cosine similarity.
        # F.normalize divides by max(norm, eps), so a zero vector stays zero
        # instead of turning into NaN as with a plain `emb / norm`.
        emb = torch.nn.functional.normalize(emb, p=2, dim=-1)

        # Keep finished batches on the CPU so they do not pile up in GPU memory.
        train_emb.append(emb.cpu())

  0%|          | 0/317 [00:00<?, ?it/s]

In [None]:
# Embed the validation split batch by batch with the fine-tuned encoder.
val_emb = []

# Inference only: no step of the loop needs autograd bookkeeping.
with torch.no_grad():
    for batch in tqdm(val_dl):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        output = model(input_ids=input_ids, attention_mask=attention_mask)

        # Alternatively [CLS] token can be used as `output.last_hidden_state[:, 0, :]`
        emb = output.pooler_output

        # Normalize vectors to unit length. Useful for cosine similarity.
        # F.normalize divides by max(norm, eps), so a zero vector stays zero
        # instead of turning into NaN as with a plain `emb / norm`.
        emb = torch.nn.functional.normalize(emb, p=2, dim=-1)

        # Keep finished batches on the CPU so they do not pile up in GPU memory.
        val_emb.append(emb.cpu())

  0%|          | 0/40 [00:00<?, ?it/s]

In [None]:
# Stack the per-batch embeddings along the sample axis and convert to NumPy.
train_emb_numpy = torch.cat(train_emb, dim=0).cpu().numpy()
val_emb_numpy = torch.cat(val_emb, dim=0).cpu().numpy()

# Sanity check: show the array shapes.
(train_emb_numpy.shape, val_emb_numpy.shape)

((81120, 768), (10000, 768))

In [None]:
# Write one compressed archive per split into the working directory; labels
# and product ids are copied over from the corresponding token archive.
for split_name, split_emb, split_npz in (
    ('train', train_emb_numpy, train_npz),
    ('val', val_emb_numpy, val_npz),
):
    np.savez_compressed(
        f'embeddings_text_{split_name}',
        embeddings=split_emb,
        labels=split_npz['labels'],
        product_ids=split_npz['product_ids'],
    )

In [None]:
# Copy the archives written by np.savez_compressed (it appends `.npz`)
# from the working directory to GDRIVE_DIR.
!cp embeddings_text_train.npz {GDRIVE_DIR}
!cp embeddings_text_val.npz {GDRIVE_DIR}

### Test embeddings

In [None]:
# Open the tokenized test archive; the handle is kept because 'product_ids'
# is read from it again when the embeddings are saved.
test_npz = np.load(TEST_NPZ)

# Token ids of the test split (no labels are read for this split).
input_ids_test = test_npz['input_ids']

# Sanity check: show the array shape.
input_ids_test.shape

(16860, 300)

In [None]:
# Create torch dataset
test_inputs = {'input_ids': torch.tensor(input_ids_test.astype(np.int32)),
               'attention_mask': torch.tensor(input_ids_test != 0, dtype=torch.uint8),}

test_ds = Dataset(test_inputs)

test_ds[3]
# Example of sample

{'input_ids': tensor([   101, 100466,   2630,  78688,    108,  52158,  45174,    108,    128,
          39930,   1714,   2068, 104060,    128,  10178,  15560,  33249, 100466,
           2630,  78688,    108,  52158,  45174,    108,  67869,  64057,  36835,
          32921,    851,  39278,  52009,   1469,  16892,  56318,    128,    845,
          68142,  15625,  39930,   1714,   1997,  13289,  18344,    108,  52158,
          45174,    108,    845,  26689,  60791,    866,   9303,    128,   2306,
           3629,   1761,  54119,  96876,   2603, 114552,   1469,  67271,   5016,
          32488,    888,    132,  34992,  78688,  37711,  42753,   2800,  52785,
           1997,  20245,  10796,  12610,   8369,   2739,  10178,  19901,  66250,
           3474,  66250,   2748,  13516,  12750,    132,  28533,   5345,    124,
           5345,   4414,    132,  46460,    156, 110056,  37254,  39362,    207,
            102,      0,      0,      0,      0,      0,      0,      0,      0,
              0

In [None]:
# Same batch size as for the train/val loaders. shuffle=False keeps the
# batches in dataset order, so the embeddings stay aligned with the
# product ids they are saved together with.
BATCH_SIZE = 256
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Embed the test split batch by batch with the fine-tuned encoder.
test_emb = []

# Inference only: no step of the loop needs autograd bookkeeping.
with torch.no_grad():
    for batch in tqdm(test_dl):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        output = model(input_ids=input_ids, attention_mask=attention_mask)

        # Alternatively [CLS] token can be used as `output.last_hidden_state[:, 0, :]`
        emb = output.pooler_output

        # Normalize vectors to unit length. Useful for cosine similarity.
        # F.normalize divides by max(norm, eps), so a zero vector stays zero
        # instead of turning into NaN as with a plain `emb / norm`.
        emb = torch.nn.functional.normalize(emb, p=2, dim=-1)

        # Keep finished batches on the CPU so they do not pile up in GPU memory.
        test_emb.append(emb.cpu())

  0%|          | 0/66 [00:00<?, ?it/s]

In [None]:
# Stack the per-batch embeddings along the sample axis and convert to NumPy.
test_emb_numpy = torch.cat(test_emb, dim=0).cpu().numpy()

# Sanity check: show the array shape.
test_emb_numpy.shape

(16860, 768)

In [None]:
# Write the test embeddings to a compressed archive in the working directory,
# together with the product ids copied from the token archive.
# No labels are stored for this split.
np.savez_compressed('embeddings_text_test',
                    embeddings=test_emb_numpy,
                    product_ids=test_npz['product_ids'],)

In [None]:
# Copy the archive written by np.savez_compressed (it appends `.npz`)
# from the working directory to GDRIVE_DIR.
!cp embeddings_text_test.npz {GDRIVE_DIR}