## emotion数据集--嵌入向量提取

In [1]:
import torch
import pandas as pd
from datasets import load_dataset
from transformers import BertModel, BertTokenizer

In [2]:
# Pick the compute backend: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
device

'mps'

### 1. 提取嵌入

In [3]:
# Checkpoint name shared by the model and the tokenizer
model_name = "bert-base-uncased"
# Load the pretrained model and move it to the selected device in one step
model = BertModel.from_pretrained(model_name).to(device)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Load the dataset
ds = load_dataset("emotion")
ds

Using the latest cached version of the module from /Users/alex.zhou/.cache/huggingface/modules/datasets_modules/datasets/emotion/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd (last modified on Fri May 31 14:39:28 2024) since it couldn't be found locally at emotion, or remotely on the Hugging Face Hub.


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [4]:
# Show which input names this tokenizer declares for the model
tokenizer.model_input_names

['input_ids', 'token_type_ids', 'attention_mask']

In [5]:
# Tokenization
def tokenize_handler(batch):
    """Tokenize the "text" column of a batch of rows.

    Args:
        batch: Mapping that holds the raw sentences under the key "text".

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences,
        called with ``truncation=True`` and ``padding=True``.
    """
    # NOTE(review): padding=True presumably pads to the longest sentence of
    # this call only, so lengths may differ between calls - confirm.
    sentences = batch["text"]
    encoded = tokenizer(sentences, truncation=True, padding=True)
    return encoded

# Add three columns to ds: ['input_ids', 'token_type_ids', 'attention_mask']
# tokenize_handler is applied to chunks of 1000 rows at a time.
ds = ds.map(tokenize_handler, batched=True, batch_size=1000)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [6]:
# Extract the sentence embedding, i.e. the hidden state at the [CLS] position
def embedding_handler(batch):
    """Run the model on a batch and return the first-token hidden state.

    Args:
        batch: Mapping of column name to column value. Only the columns named
            in ``tokenizer.model_input_names`` are forwarded to the model, and
            each of those must already be a ``torch.Tensor``.

    Returns:
        A dict with the single key "hidden_state", holding
        ``last_hidden_state[:, 0]`` (one vector per row) moved to the CPU.

    Raises:
        ValueError: If a model-input column is not a ``torch.Tensor``.
    """
    # Collect the keyword arguments for the model call
    inputs = {}
    for k, v in batch.items():
        if k not in tokenizer.model_input_names:
            continue
        # Validate before touching the value: a non-tensor has no .to(), so
        # checking afterwards would surface an AttributeError instead.
        if not isinstance(v, torch.Tensor):
            print(k, type(v))
            raise ValueError("传递的数据不是张量")
        inputs[k] = v.to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        # Hidden states of the last layer
        last_hidden_state = model(**inputs).last_hidden_state

    # Keep only the first position ([CLS]) of every row
    return {"hidden_state": last_hidden_state[:, 0].cpu()}
    

In [7]:
# Request the "torch" format for the three model-input columns;
# embedding_handler requires torch.Tensor values and calls .to(device) on them.
ds.set_format("torch", columns=['input_ids', 'token_type_ids', 'attention_mask'])

In [8]:
# Compute the embeddings by mapping embedding_handler over ds in batches of 500 rows
# (%time reports how long the statement takes)
%time ds_embeddings = ds.map(embedding_handler, batched=True, batch_size=500)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

CPU times: user 34.4 s, sys: 35.9 s, total: 1min 10s
Wall time: 1min 38s


In [9]:
# Display the mapped dataset summary
ds_embeddings

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'hidden_state'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'hidden_state'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'hidden_state'],
        num_rows: 2000
    })
})

In [10]:
# Inspect the embedding of the first training example: its shape and its type
ds_embeddings["train"]["hidden_state"][0].shape, type(ds_embeddings["train"]["hidden_state"][0])

(torch.Size([768]), torch.Tensor)

In [11]:
# Take the training split and display its summary
train_ds = ds_embeddings["train"]
train_ds

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'hidden_state'],
    num_rows: 16000
})

In [12]:
# Show the feature schema (per-column types) of the training split
train_ds.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'hidden_state': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)}