In [35]:
import torch.utils.data as Data
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

In [36]:
# Fetch every split of the dair-ai/emotion dataset, then keep only the training split.
emotion_splits = load_dataset(path='dair-ai/emotion')
train_dataset = emotion_splits['train']

# Preview the first ten rows (returned as a dict of column name -> list of values).
train_dataset[:10]

{'text': ['i didnt feel humiliated',
  'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'im grabbing a minute to post i feel greedy wrong',
  'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
  'i am feeling grouchy',
  'ive been feeling a little burdened lately wasnt sure why that was',
  'ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny',
  'i feel as confused about life as a teenager or as jaded as a year old man',
  'i have been with petronas for years i feel that petronas has performed well and made a huge profit',
  'i feel romantic too'],
 'label': [0, 0, 3, 2, 3, 0, 5, 4, 1, 2]}

In [37]:
# Load the tokenizer from a local model directory.
# A raw string is used because the Windows path contains backslashes: in a normal
# literal, `\h` and `\Q` are unrecognized escape sequences, which recent Python
# versions warn about. The resulting path value is unchanged.
# NOTE(review): this is a machine-specific absolute path; adjust it for your environment.
tokenizer = AutoTokenizer.from_pretrained(r'E:\huggingface_models\Qwen2.5-0.5B-Instruct')

# Tokenize the `text` column in batches; the tokenizer output is added as new
# columns (`input_ids`, `attention_mask`) alongside the existing ones.
train_dataset = train_dataset.map(lambda samples: tokenizer(samples["text"]), batched=True)
train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 16000
})

In [38]:
# Drop the raw string column so that only numeric columns remain.
train_dataset = train_dataset.remove_columns('text')

# Add an extra numeric column `yy` holding each row's index.
# The length is derived from the dataset itself rather than hard-coded, so this
# still works if the dataset size changes (add_column needs one value per row).
train_dataset = train_dataset.add_column('yy', list(range(len(train_dataset))))
train_dataset

Dataset({
    features: ['label', 'input_ids', 'attention_mask', 'yy'],
    num_rows: 16000
})

In [None]:
# Data collator used for language modeling. Inputs are dynamically padded to the
# maximum length of a batch if they are not all of the same length.
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# All columns are kept in the collated batch (so every column must be numeric).
dac = Data.DataLoader(train_dataset,
                      collate_fn=causal_lm_collator,
                      batch_size=2)

# Inspect the first batch only, then stop iterating.
for batch in dac:
    print(batch.keys())
    print("yy: ", batch["yy"])
    print("label: ", batch['label'])

    # How the collator computes `labels` internally (when mlm=False):
    #     labels = batch["input_ids"].clone()
    #     if self.tokenizer.pad_token_id is not None:
    #         labels[labels == self.tokenizer.pad_token_id] = -100
    #     batch["labels"] = labels
    print("labels: ", batch['labels'])
    print("labels.shape: ", batch['labels'].shape)
    print('input_ids: ', batch['input_ids'])
    print("input_ids.shape: ", batch['input_ids'].shape)
    print("attention_mask.shape: ", batch['attention_mask'].shape)
    break

dict_keys(['label', 'input_ids', 'attention_mask', 'yy', 'labels'])
yy:  tensor([0, 1])
label:  tensor([0, 0])
labels:  tensor([[   72, 47607,  2666,  2784, 53773,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100],
        [   72,   646,   728,   504,  8266,   773, 74223,   311,   773, 67365,
         37550,  1101,   504,  1660,  2163,  4325,   879, 33572,   323,   374,
         34347]])
labels.shape:  torch.Size([2, 21])
input_ids:  tensor([[    72,  47607,   2666,   2784,  53773, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643],
        [    72,    646,    728,    504,   8266,    773,  74223,    311,    773,
          67365,  37550,   1101,    504,   1660,   2163,   4325,    879,  33572,
            323,    374,  34347]])
input_ids.shape:  torch.Size([2, 21])
attention_mask.shape:  torch.Size([2, 21])
