In [None]:
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Load the dataset.
# The IMDB dataset is used here as an example.
DATASET_NAME = "imdb"
dataset = load_dataset(DATASET_NAME)

# Load the BERT tokenizer that matches the checkpoint name below.
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)


# Encode (tokenize) the data
def encode_dataset(datasets):
    """Tokenize the ``'text'`` entry of ``datasets`` in batched mode.

    The module-level ``tokenizer`` is applied to each batch with
    ``padding='max_length'``, ``truncation=True`` and ``max_length=512``.

    Args:
        datasets: Object exposing ``map(function, batched=...)`` whose
            batches provide a ``'text'`` entry.

    Returns:
        Whatever ``datasets.map`` returns for the tokenizing function.
    """

    def _tokenize_batch(batch):
        # Pad/truncate every text to the same fixed length of 512 tokens.
        return tokenizer(
            batch['text'],
            padding='max_length',
            truncation=True,
            max_length=512,
        )

    return datasets.map(_tokenize_batch, batched=True)


# Tokenize the loaded dataset object.
# NOTE(review): this tokenizes the whole object passed in, although only
# SAMPLE_SIZE examples per split are used below — consider subsetting before
# encoding if nothing else needs the full `encoded_dataset`.
encoded_dataset = encode_dataset(dataset)

# Pick the training and evaluation subsets: shuffle with a fixed seed, then
# keep the first SAMPLE_SIZE examples of each split to keep the run short.
SAMPLE_SIZE = 1000
train_dataset = encoded_dataset['train'].shuffle(seed=42).select(range(SAMPLE_SIZE))
test_dataset = encoded_dataset['test'].shuffle(seed=42).select(range(SAMPLE_SIZE))

# Load the BERT model with a 2-label sequence-classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Configure the training run
training_args = TrainingArguments(
    output_dir='./results',  # output directory
    num_train_epochs=3,  # number of training epochs
    per_device_train_batch_size=8,  # training batch size per device
    per_device_eval_batch_size=8,  # evaluation batch size per device
    # Warm up over the first 10% of optimizer steps. A fixed warmup_steps=500
    # was longer than the entire run (1000 samples / batch 8 * 3 epochs = 375
    # steps on a single device), so the learning rate never reached its peak.
    warmup_ratio=0.1,
    weight_decay=0.01,  # weight decay
    logging_dir='./logs',  # log directory
    logging_steps=10,
    # Evaluate at the end of every epoch.
    # NOTE(review): newer transformers releases appear to name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
)

# Create the Trainer instance; the sampled test split is used for evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model once more after training and print the metrics
eval_result = trainer.evaluate()
print(f"评估结果: {eval_result}")


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]