In [1]:
from datasets import load_dataset

dataset = load_dataset("tau/commonsense_qa")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess_function(examples):
    inputs = []
    targets = []
    
    for question, choices, answer in zip(examples['question'], examples['choices'], examples['answerKey']):
        input_str = f"Question: {question}\nOptions:\n"
        for i, choice in enumerate(choices['text']):
            input_str += f"{chr(65+i)}. {choice}\n"
        
        target_str = f"Answer: {answer}\n"
        inputs.append(input_str)
        targets.append(target_str)

    return {"input_text": inputs, "target_text": targets}

processed_dataset = dataset.map(preprocess_function, batched=True)

print(processed_dataset['train'].column_names)  # 'input_text' & 'target_text'

['id', 'question', 'question_concept', 'choices', 'answerKey', 'input_text', 'target_text']


In [3]:
print(processed_dataset['train'][0]) 
print(processed_dataset['train'].column_names)

{'id': '075e483d21c29a511267ef62bedc0461', 'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?', 'question_concept': 'punishing', 'choices': {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']}, 'answerKey': 'A', 'input_text': 'Question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?\nOptions:\nA. ignore\nB. enforce\nC. authoritarian\nD. yell at\nE. avoid\n', 'target_text': 'Answer: A\n'}
['id', 'question', 'question_concept', 'choices', 'answerKey', 'input_text', 'target_text']


In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = processed_dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 9741/9741 [00:04<00:00, 2254.91 examples/s]
Map: 100%|██████████| 1221/1221 [00:00<00:00, 1445.07 examples/s]
Map: 100%|██████████| 1140/1140 [00:00<00:00, 1694.47 examples/s]


In [5]:
import os 
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
os.environ['CUDA_VISIBLE_DEVICES'] = "1,4" 


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

def preprocess_for_lm(examples):
    inputs = examples["input_text"]
    targets = examples["target_text"]

    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=128)
    labels = tokenizer(targets, padding="max_length", truncation=True, max_length=128)["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs

# 将 preprocess_for_lm 函数应用到数据集
tokenized_datasets = processed_dataset.map(preprocess_for_lm, batched=True)

# 使用 Trainer 进行训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

trainer.train()



Map: 100%|██████████| 9741/9741 [00:04<00:00, 1961.88 examples/s]
Map: 100%|██████████| 1221/1221 [00:00<00:00, 1541.14 examples/s]
Map: 100%|██████████| 1140/1140 [00:00<00:00, 1515.54 examples/s]
    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,0.013,0.012752
2,0.0129,0.01275
3,0.0127,0.01268


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(

TrainOutput(global_step=3654, training_loss=0.01819913815982236, metrics={'train_runtime': 1086.637, 'train_samples_per_second': 26.893, 'train_steps_per_second': 3.363, 'total_flos': 1908934262784000.0, 'train_loss': 0.01819913815982236, 'epoch': 3.0})