In [9]:
import torch.utils.data as Data
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
import random

In [10]:
# Load the 'train' split of the finance-alpaca instruction dataset.
train_dataset = load_dataset('gbharti/finance-alpaca', split='train')
# Preview the first ten rows (returned as a dict of column lists).
train_dataset[0:10]

Using the latest cached version of the dataset since gbharti/finance-alpaca couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\duanm\.cache\huggingface\datasets\gbharti___finance-alpaca\default\0.0.0\6ef1c54a9bfff3397b8d9807b1934f6d9e230eca (last modified on Wed Jun 18 11:12:06 2025).


{'instruction': ['For a car, what scams can be plotted with 0% financing vs rebate?',
  'Why does it matter if a Central Bank has a negative rather than 0% interest rate?',
  'Where should I be investing my money?',
  'Specifically when do options expire?',
  'Negative Balance from Automatic Options Exercise. What to do?',
  'Approximation of equity value for company in default',
  'Is it true that 90% of investors lose their money?',
  'Can a company charge you for services never requested or received?',
  'Working out if I should be registered as self-employed in the UK',
  'About eToro investments'],
 'input': ['', '', '', '', '', '', '', '', '', ''],
 'output': ["The car deal makes money 3 ways. If you pay in one lump payment. If the payment is greater than what they paid for the car, plus their expenses, they make a profit. They loan you the money. You make payments over months or years, if the total amount you pay is greater than what they paid for the car, plus their expenses, p

In [11]:
# Load the Qwen3-0.6B tokenizer from a local model directory.
# The path is a raw string: '\h' and '\Q' are not valid escape sequences, so a
# plain literal triggers an invalid-escape warning on recent Python versions
# (and is slated to become a syntax error). The runtime value is unchanged.
tokenizer = AutoTokenizer.from_pretrained(r'E:\huggingface_models\Qwen3-0.6B')

In [12]:
def process_func(example):
    """Convert one Alpaca-style record into causal-LM training features.

    The prompt (system turn + user turn, rendered through the tokenizer's chat
    template and ending with the generation prompt) is followed by the
    tokenized response and a single EOS token. Prompt positions are labelled
    -100 (the value ignored by the loss by convention), so only the response
    and the EOS token are supervised.

    Args:
        example: Mapping with 'instruction' and 'output' entries and an
            optional 'input' entry holding extra context for the instruction.

    Returns:
        dict with three equally long lists: 'input_ids', 'attention_mask'
        and 'labels'.
    """
    question = example['instruction'].strip()
    # The 'input' column carries optional context for the instruction. It used
    # to be dropped silently; append it to the user turn when it is non-empty.
    # Rows with an empty/missing 'input' produce exactly the same prompt as before.
    context = (example.get('input') or '').strip()
    user_content = f"{question}\n\n{context}" if context else question

    messages = [
        {"role": "system", "content": "You are a financial advisor."},
        {"role": "user", "content": user_content}
    ]
    instruction = tokenizer.apply_chat_template(conversation=messages,
                                                add_generation_prompt=True,
                                                tokenize=True,
                                                return_dict=True)
    # The chat template already emitted the special tokens for the prompt, so
    # the response is tokenized without adding any.
    response = tokenizer(f"{example['output']}", add_special_tokens=False)

    prompt_ids = instruction["input_ids"]
    # Response tokens plus the EOS that teaches the model where to stop.
    target_ids = response["input_ids"] + [tokenizer.eos_token_id]

    return {
        "input_ids": prompt_ids + target_ids,
        "attention_mask": instruction["attention_mask"] + response["attention_mask"] + [1],
        # Mask the prompt so the loss is computed on the answer only.
        "labels": [-100] * len(prompt_ids) + target_ids
    }


# Keep only the first 1000 rows and tokenize each of them with process_func.
train_dataset = (
    train_dataset
    .select(range(1000))
    .map(function=process_func)
)
train_dataset

Dataset({
    features: ['instruction', 'input', 'output', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [13]:
# Sanity check: for the first ten rows, input_ids and labels must be equally long.
for sample in train_dataset.select(range(10)):
    n_input_ids, n_labels = len(sample['input_ids']), len(sample['labels'])
    print(n_input_ids, n_labels)

482 482
119 119
120 120
120 120
633 633
179 179
598 598
159 159
159 159
98 98


In [14]:
# Rows per DataLoader batch; the synthetic 'yy' column also picks a new length
# every `batch_size` rows so that each batch shares one 'yy' length.
batch_size = 32

In [15]:
# Drop the raw text columns so that only numeric fields remain for batching.
# Only columns that are actually present are removed (including a 'yy' left over
# from a previous run): remove_columns raises on unknown names and add_column
# raises on a duplicate name, which used to make this cell fail when re-run.
stale_columns = [name for name in ('instruction', 'input', 'output', 'text', 'yy')
                 if name in train_dataset.column_names]
if stale_columns:
    train_dataset = train_dataset.remove_columns(stale_columns)

# Synthetic extra field 'yy': a list [0, 1, ..., length-1] per row. The length is
# drawn once per chunk of `batch_size` consecutive rows, so all rows that end up
# in the same (unshuffled) batch have equally long 'yy' lists.
# A local seeded generator keeps the lengths reproducible across runs without
# touching the global `random` state.
yy_rng = random.Random(42)
yy_list = []
for start in range(0, len(train_dataset), batch_size):
    length = yy_rng.randint(32, 64)
    rows_in_chunk = min(batch_size, len(train_dataset) - start)
    yy_list.extend(list(range(length)) for _ in range(rows_in_chunk))
train_dataset = train_dataset.add_column('yy', yy_list)
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'yy'],
    num_rows: 1000
})

In [16]:
# DataCollatorForSeq2Seq keeps every field of the rows (all fields must be numeric):
#   * tokenizer fields (e.g. input_ids, attention_mask) and label/labels are
#     dynamically padded to the longest sequence in the batch;
#   * any other field must already be convertible into a single tensor per batch.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                  padding=True)  # padding=True is the default
dac = Data.DataLoader(train_dataset, batch_size=batch_size, collate_fn=collator)

# Print the keys and the per-field tensor shapes of every batch.
for batch in dac:
    print(batch.keys())
    for field in ('input_ids', 'attention_mask', 'labels', 'yy'):
        print(batch[field].shape)
    print("#" * 50)
    

dict_keys(['input_ids', 'attention_mask', 'yy', 'labels'])
torch.Size([32, 659])
torch.Size([32, 659])
torch.Size([32, 659])
torch.Size([32, 39])
##################################################
dict_keys(['input_ids', 'attention_mask', 'yy', 'labels'])
torch.Size([32, 850])
torch.Size([32, 850])
torch.Size([32, 850])
torch.Size([32, 34])
##################################################
dict_keys(['input_ids', 'attention_mask', 'yy', 'labels'])
torch.Size([32, 1154])
torch.Size([32, 1154])
torch.Size([32, 1154])
torch.Size([32, 63])
##################################################
dict_keys(['input_ids', 'attention_mask', 'yy', 'labels'])
torch.Size([32, 796])
torch.Size([32, 796])
torch.Size([32, 796])
torch.Size([32, 41])
##################################################
dict_keys(['input_ids', 'attention_mask', 'yy', 'labels'])
torch.Size([32, 2133])
torch.Size([32, 2133])
torch.Size([32, 2133])
torch.Size([32, 47])
##################################################
dict_keys