# Deep Learning - Task 10

В этой серии заданий вам понадобится сделать эмбеддинги для объектов из некоторой подвыборки датасета IMDB с помощью предобученных моделей из Hugging Face. В этом задании сделайте их с помощью 3 моделей:
- BERT (bert-base-cased),
- RoBERTa,
- DistilBERT,

воспользуйтесь функцией get_embeddings_labels с семинара.

```python
from datasets import load_dataset

dataset = load_dataset("imdb", split="train")

np.random.seed(100)
idx = np.random.randint(len(dataset), size=200)
```


Проверьте перед сдачей, тензор с эмбеддингами должен иметь размер `(200, 768)`.

## Подготовка к заданию

In [1]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Subset, DataLoader

from warnings import filterwarnings

filterwarnings('ignore')


In [2]:
from transformers import AutoTokenizer
# https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import BertModel
# https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import RobertaModel
# https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel
from transformers import DistilBertModel


def get_model(model_name):
    """Load the pretrained tokenizer and encoder for one supported model family.

    Args:
        model_name: One of ``'bert'``, ``'roberta'`` or ``'distilbert'``.

    Returns:
        A ``(tokenizer, model)`` tuple; both are created with
        ``from_pretrained`` from the checkpoint associated with ``model_name``.

    Raises:
        AssertionError: If ``model_name`` is not one of the supported names.
    """
    assert model_name in ('bert', 'roberta', 'distilbert')

    # One entry per family: (checkpoint id on the Hugging Face hub, model class).
    registry = {
        # https://huggingface.co/bert-base-cased
        'bert': ('bert-base-cased', BertModel),
        # https://huggingface.co/roberta-base
        'roberta': ('roberta-base', RobertaModel),
        # https://huggingface.co/distilbert-base-cased
        'distilbert': ('distilbert-base-cased', DistilBertModel),
    }
    checkpoint, model_cls = registry[model_name]

    # The tokenizer is loaded before the model, as in the tuple order returned.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_cls.from_pretrained(checkpoint)
    return tokenizer, model


In [3]:
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader):
    """Collect first-position embeddings and labels for every batch of ``loader``.

    The model is switched to eval mode and the whole function runs under
    ``torch.inference_mode``. Inputs are moved to the device of the model's
    own parameters, so the function does not depend on a notebook-level
    ``device`` variable being defined.

    Args:
        model: Module called with ``attention_mask`` and ``input_ids`` keyword
            arguments whose output can be indexed with ``'last_hidden_state'``.
        loader: Iterable of batches; every batch is indexed with the keys
            ``'labels'``, ``'attention_mask'`` and ``'input_ids'``.

    Returns:
        Tuple ``(embeddings, labels)``: the per-batch embeddings (moved to CPU)
        concatenated along dim 0, and the per-batch labels, each given an
        extra axis at dim 1, concatenated along dim 0 and cast to float32.
    """
    model.eval()

    # Use the device the model's weights live on; a model without parameters
    # leaves the inputs wherever the loader produced them.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    total_embeddings = []
    labels = []

    for batch in tqdm(loader):
        # Labels are not fed to the model; they are only collected.
        labels.append(batch['labels'].unsqueeze(1))

        inputs = {
            key: batch[key] if model_device is None else batch[key].to(model_device)
            for key in ['attention_mask', 'input_ids']
        }

        # Keep only index 0 along dim 1 of the last hidden state
        # (the first token of every sequence).
        embeddings = model(**inputs)['last_hidden_state'][:, 0, :]

        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0), torch.cat(labels, dim=0).to(torch.float32)


In [4]:
# Prefer the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# torch.cuda.get_device_name() raises when CUDA is unavailable, so the GPU
# name is only queried when a CUDA device was actually selected.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(device))


cuda:0
NVIDIA GeForce GTX 1080


In [5]:
def tokenization(example):
    """Tokenize the ``'text'`` field of a batch with the notebook-level ``tokenizer``.

    The tokenizer is called directly (``tokenizer(...)``) instead of through
    the deprecated ``batch_encode_plus`` method; the keyword arguments are
    unchanged.

    Args:
        example: Mapping whose ``'text'`` entry holds the texts to encode.

    Returns:
        The tokenizer's output for those texts, with special tokens added,
        ``token_type_ids`` omitted and truncation enabled.
    """
    return tokenizer(
        example['text'],
        add_special_tokens=True,
        return_token_type_ids=False,
        truncation=True,
    )


## BERT

In [7]:
from datasets import load_dataset

# Seed NumPy's global RNG so the index sample drawn further down
# (np.random.randint) is reproducible.
np.random.seed(100)

# Training split of the IMDB dataset.
dataset = load_dataset("imdb", split="train")

# Rebinds the notebook-level `tokenizer` and `model` to the BERT pair.
tokenizer, model = get_model('bert')


Found cached dataset imdb (C:/Users/Gavlique/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
from transformers import DataCollatorWithPadding


# Collator built around the current (BERT) tokenizer; it is passed to the
# DataLoader below as `collate_fn`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [9]:
# Display the collator's configuration (tokenizer, padding and tensor settings).
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [10]:
# Move the BERT model to the device selected in the preparation section.
model = model.to(device)


In [11]:
def tokenization(example):
    """Tokenize the ``'text'`` field of a batch with the notebook-level ``tokenizer``.

    The tokenizer is called directly (``tokenizer(...)``) instead of through
    the deprecated ``batch_encode_plus`` method; the keyword arguments are
    unchanged.

    Args:
        example: Mapping whose ``'text'`` entry holds the texts to encode.

    Returns:
        The tokenizer's output for those texts, with special tokens added,
        ``token_type_ids`` omitted and truncation enabled.
    """
    return tokenizer(
        example['text'],
        add_special_tokens=True,
        return_token_type_ids=False,
        truncation=True,
    )

# Tokenize the whole split in batches, then expose only the columns needed
# downstream as torch tensors.
dataset = dataset.map(tokenization, batched=True)
dataset.set_format(type="torch", columns=[
                   "input_ids", "attention_mask", "label"])

# 200 indices drawn independently from the global NumPy RNG seeded above
# (the same index may occur more than once).
idx = np.random.randint(len(dataset), size=200).tolist()

# shuffle=False keeps the embeddings in the order of `idx`.
loader = DataLoader(Subset(dataset, idx), batch_size=64,
                    collate_fn=data_collator, pin_memory=True, shuffle=False)


Loading cached processed dataset at C:\Users\Gavlique\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-7b100d74d4f88c2b.arrow


In [12]:
# Run the BERT model over the sampled subset and collect the embeddings
# together with the matching labels.
embeddings, labels = get_embeddings_labels(model, loader)


  0%|          | 0/4 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 4/4 [00:06<00:00,  1.66s/it]


In [13]:
# The task requires a (200, 768) embedding tensor; fail loudly here instead
# of saving a tensor of the wrong size.
assert embeddings.shape == (200, 768), f"unexpected embeddings shape: {tuple(embeddings.shape)}"
embeddings.shape

torch.Size([200, 768])

In [14]:
# Persist the BERT embeddings under a relative path (the labels are not saved).
torch.save(embeddings, 'bert.pt')

## RoBERTa

In [24]:
from datasets import load_dataset

# Re-seed NumPy's global RNG so the index sample drawn further down
# (np.random.randint) is reproducible.
np.random.seed(100)

# Reload the untokenized training split of the IMDB dataset.
dataset = load_dataset("imdb", split="train")

# Rebinds the notebook-level `tokenizer` and `model` to the RoBERTa pair.
tokenizer, model = get_model('roberta')


Found cached dataset imdb (C:/Users/Gavlique/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pool

In [25]:
from transformers import DataCollatorWithPadding


# Collator rebuilt around the current (RoBERTa) tokenizer; it is passed to the
# DataLoader below as `collate_fn`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [26]:
# Display the collator's configuration (tokenizer, padding and tensor settings).
data_collator

DataCollatorWithPadding(tokenizer=RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [27]:
# Move the RoBERTa model to the device selected in the preparation section.
model = model.to(device)


In [28]:
def tokenization(example):
    """Tokenize the ``'text'`` field of a batch with the notebook-level ``tokenizer``.

    The tokenizer is called directly (``tokenizer(...)``) instead of through
    the deprecated ``batch_encode_plus`` method; the keyword arguments are
    unchanged.

    Args:
        example: Mapping whose ``'text'`` entry holds the texts to encode.

    Returns:
        The tokenizer's output for those texts, with special tokens added,
        ``token_type_ids`` omitted and truncation enabled.
    """
    return tokenizer(
        example['text'],
        add_special_tokens=True,
        return_token_type_ids=False,
        truncation=True,
    )

# Tokenize the whole split in batches, then expose only the columns needed
# downstream as torch tensors.
dataset = dataset.map(tokenization, batched=True)
dataset.set_format(type="torch", columns=[
                   "input_ids", "attention_mask", "label"])

# 200 indices drawn independently from the global NumPy RNG seeded above
# (the same index may occur more than once).
idx = np.random.randint(len(dataset), size=200).tolist()

# shuffle=False keeps the embeddings in the order of `idx`.
loader = DataLoader(Subset(dataset, idx), batch_size=64,
                    collate_fn=data_collator, pin_memory=True, shuffle=False)


Loading cached processed dataset at C:\Users\Gavlique\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-f534348dca332b55.arrow


In [29]:
# Run the RoBERTa model over the sampled subset and collect the embeddings
# together with the matching labels.
embeddings, labels = get_embeddings_labels(model, loader)


  0%|          | 0/4 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 4/4 [00:05<00:00,  1.32s/it]


In [30]:
# The task requires a (200, 768) embedding tensor; fail loudly here instead
# of saving a tensor of the wrong size.
assert embeddings.shape == (200, 768), f"unexpected embeddings shape: {tuple(embeddings.shape)}"
embeddings.shape

torch.Size([200, 768])

In [31]:
# Persist the RoBERTa embeddings under a relative path (the labels are not saved).
torch.save(embeddings, 'roberta.pt')

## DistilBERT

In [32]:
from datasets import load_dataset

# Re-seed NumPy's global RNG so the index sample drawn further down
# (np.random.randint) is reproducible.
np.random.seed(100)

# Reload the untokenized training split of the IMDB dataset.
dataset = load_dataset("imdb", split="train")

# Rebinds the notebook-level `tokenizer` and `model` to the DistilBERT pair.
tokenizer, model = get_model('distilbert')


Found cached dataset imdb (C:/Users/Gavlique/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
from transformers import DataCollatorWithPadding


# Collator rebuilt around the current (DistilBERT) tokenizer; it is passed to
# the DataLoader below as `collate_fn`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [34]:
# Display the collator's configuration (tokenizer, padding and tensor settings).
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [35]:
# Move the DistilBERT model to the device selected in the preparation section.
model = model.to(device)


In [36]:
def tokenization(example):
    """Tokenize the ``'text'`` field of a batch with the notebook-level ``tokenizer``.

    The tokenizer is called directly (``tokenizer(...)``) instead of through
    the deprecated ``batch_encode_plus`` method; the keyword arguments are
    unchanged.

    Args:
        example: Mapping whose ``'text'`` entry holds the texts to encode.

    Returns:
        The tokenizer's output for those texts, with special tokens added,
        ``token_type_ids`` omitted and truncation enabled.
    """
    return tokenizer(
        example['text'],
        add_special_tokens=True,
        return_token_type_ids=False,
        truncation=True,
    )

# Tokenize the whole split in batches, then expose only the columns needed
# downstream as torch tensors.
dataset = dataset.map(tokenization, batched=True)
dataset.set_format(type="torch", columns=[
                   "input_ids", "attention_mask", "label"])

# 200 indices drawn independently from the global NumPy RNG seeded above
# (the same index may occur more than once).
idx = np.random.randint(len(dataset), size=200).tolist()

# shuffle=False keeps the embeddings in the order of `idx`.
loader = DataLoader(Subset(dataset, idx), batch_size=64,
                    collate_fn=data_collator, pin_memory=True, shuffle=False)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [37]:
# Run the DistilBERT model over the sampled subset and collect the embeddings
# together with the matching labels.
embeddings, labels = get_embeddings_labels(model, loader)


  0%|          | 0/4 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 4/4 [00:03<00:00,  1.29it/s]


In [38]:
# The task requires a (200, 768) embedding tensor; fail loudly here instead
# of saving a tensor of the wrong size.
assert embeddings.shape == (200, 768), f"unexpected embeddings shape: {tuple(embeddings.shape)}"
embeddings.shape

torch.Size([200, 768])

In [39]:
# Persist the DistilBERT embeddings under a relative path (the labels are not saved).
torch.save(embeddings, 'distilbert.pt')