In [119]:
import datasets
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertModel, BertConfig
from transformers import pipeline

**load dataset**

In [70]:
# Fetch SST-2 from the Hugging Face hub (or the local cache) and show its splits.
dataset = datasets.load_dataset(path='sst2')
dataset

Using custom data configuration default
Reusing dataset sst2 (/tmp/.xdg_cache_vision/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [74]:
# All test labels come back as -1.
# NOTE(review): presumably the benchmark withholds test labels and uses -1 as a
# placeholder rather than this being an error -- confirm against the dataset card.
test_labels = dataset['test']['label']
np.unique(test_labels)

array([-1])

**classification example**

In [75]:
# Pin the checkpoint explicitly instead of relying on the pipeline's implicit
# default, so results do not change if the library's default model changes.
# This is the model the default resolved to when this notebook was run.
# (Alternative tried by the author: model='bert-base-uncased'.)
classifier = pipeline("sentiment-analysis",
                      model='distilbert-base-uncased-finetuned-sst-2-english')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [78]:
# Score the sentiment pipeline on the first N sentences of one split.
N = 150
split = 'validation'  # the test split is unusable here: all of its labels are -1

# Pipeline label strings -> integer class ids as stored in the dataset
M = {'POSITIVE': 1, 'NEGATIVE': 0}

sentences = dataset[split]['sentence'][:N]
gold_labels = np.array(dataset[split]['label'])[:N]

preds = [M[result['label']] for result in classifier(sentences)]
accuracy = np.mean(np.array(preds) == gold_labels)
print('Acc', accuracy.round(3))

Acc 0.907


**feature extraction**

In [122]:
# Extract hidden-state features for the first N sentences of `split`.
feature_extractor = pipeline("feature-extraction",
                             model='distilbert-base-uncased-finetuned-sst-2-english')
feats_list  = feature_extractor(dataset[split]['sentence'][:N])
# Print the per-sentence feature length for the first few sentences.
# The loop variable is bound here, so the cell no longer depends on a stale
# global `feats`, and the slice is safe when fewer than 10 results exist.
# NOTE(review): assumes each result is nested as [1][num_tokens][hidden_size],
# so len(feats[0]) is the token count -- confirm against the pipeline docs.
print([len(feats[0]) for feats in feats_list[:10]])

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[12, 10, 24, 28, 13, 25, 8, 15, 26, 40, 23, 39, 25, 26, 29, 26, 21, 44, 14, 21, 41, 17, 10, 20, 18, 35, 35, 28, 19, 13, 35, 38, 10, 39, 27, 14, 31, 40, 27, 23, 25, 30, 26, 19, 21, 18, 23, 17, 14, 25]


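By default, sequences of different lengths produce different-sized feature arrays (one embedding per token, as the lengths printed above show). To fix this, let's apply padding during the tokenization step so that every sequence in a batch has the same length.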

In [136]:
# Load the BERT encoder and its tokenizer from the same checkpoint id so the
# vocabulary and the weights are guaranteed to match.
checkpoint = "bert-base-uncased"
model = BertModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [157]:
# Embed the first 10 training sentences as a single padded batch.
# Slice rows before selecting the column so only 10 sentences are materialized
# instead of the whole training column.
sequences = dataset['train'][:10]['sentence']
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print('tokens', tokens['input_ids'].shape)
# Inference only: disable autograd so no computation graph is built or kept.
with torch.no_grad():
    output = model(**tokens) # has two keys, 'last_hidden_state', 'pooler_output'
embs = output['pooler_output'].cpu().detach().numpy()
print('embeddings', embs.shape)

tokens torch.Size([10, 29])
embeddings (10, 768)


# custom feature extraction
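Let's map each sequence in the dataset to a single fixed-size embedding (tutorial [here](https://huggingface.co/course/chapter3/2?fw=pt)). Note: the code below uses BERT's `pooler_output` as the sequence embedding rather than the sum of the features obtained for each token.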

In [158]:
# NOTE(review): this repeats the checkpoint-loading cell from earlier in the
# notebook; it only matters if this section is run on its own.
checkpoint = "bert-base-uncased"
model = BertModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [174]:
def featurize_function(example):
    """Embed a batch of sentences using the model's pooled output.

    Written for ``Dataset.map(..., batched=True)``. It reads the notebook-level
    ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    example : mapping
        Batch containing a ``'sentence'`` entry, which is passed directly to
        the tokenizer.

    Returns
    -------
    dict
        ``{'embs': ndarray}`` holding the model's ``pooler_output`` converted
        to a NumPy array.
    """
    # Pad/truncate so the whole batch becomes one rectangular tensor.
    tokens = tokenizer(example['sentence'], padding=True, truncation=True, return_tensors="pt")
    # Inference only: disable autograd so mapping over a dataset does not
    # build and hold a computation graph for every batch.
    with torch.no_grad():
        output = model(**tokens) # has two keys, 'last_hidden_state', 'pooler_output'
    embs = output['pooler_output'].cpu().detach().numpy()
    return {'embs': embs}

In [175]:
# Build a tiny demo subset: validation sentences shorter than 20 characters.
validation_set = dataset['validation']
dset_small = validation_set.filter(lambda example: len(example["sentence"]) < 20)
dset_small

Loading cached processed dataset at /tmp/.xdg_cache_vision/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5/cache-7876086904868ef1.arrow


Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 8
})

In [176]:
# Run the featurizer over the subset in batches; its 'embs' output is added as a column.
featurized_dataset = dset_small.map(function=featurize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [184]:
# Stack the stored embeddings into one array and inspect its dimensions.
embs_matrix = np.array(featurized_dataset['embs'])
embs_matrix.shape

(8, 768)