In [3]:
from datasets import load_dataset  # Import the load_dataset function from the datasets library
# NOTE(review): `dateset` looks like a typo of `dataset`; the tokenization cell
# reads this exact name, so the two must be renamed together if it is fixed.
dateset = load_dataset('imdb')    # Load the IMDB movie-review sentiment analysis data

Using the latest cached version of the dataset since imdb couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'plain_text' at C:\Users\31163\.cache\huggingface\datasets\imdb\plain_text\0.0.0\e6281661ce1c48d982bc483cf8a173c1bbeb5d31 (last modified on Wed Jun 25 16:32:48 2025).


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint shared by the tokenizer and the classification model; kept in one
# place so the online and offline code paths cannot drift apart.
MODEL_NAME = 'bert-base-uncased'

try:
    # First attempt: resolve the checkpoint normally (may contact the Hub).
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
except Exception as e:
    # Fall back to files already present in the local cache. The original
    # error is printed rather than discarded so that a failure unrelated to
    # networking is not silently mistaken for a download problem.
    print("Could not download model from Hugging Face. Trying local cache...")
    print(f"Original error: {e!r}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, local_files_only=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2, local_files_only=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenizer_function(expamles):
    """Encode the ``text`` entries of a batch with the notebook-level tokenizer.

    The texts are truncated and padded with ``padding='max_length'``; the
    tokenizer's output is returned unchanged.
    """
    batch_texts = expamles['text']
    encoded_batch = tokenizer(
        batch_texts,
        truncation=True,
        padding='max_length',
    )
    return encoded_batch


# Apply the helper to the loaded dataset in batches.
tokenizer_datasets = dateset.map(tokenizer_function, batched=True)

In [6]:
# Display the tokenized dataset to inspect its splits, columns and row counts.
tokenizer_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [7]:
# %pip install tf-keras

In [None]:
from transformers import TrainingArguments, Trainer

# Tunable knobs for the quick experiment below.
SUBSET_SIZE = 2000   # number of rows taken from each of the train/test splits
SHUFFLE_SEED = 42    # fixed seed so the sampled subsets are reproducible

# Build the model-ready view under a NEW name instead of overwriting
# `tokenizer_datasets`: once the 'text' and 'label' columns have been removed /
# renamed, running the same statements again would fail, so rebinding the
# input made this cell impossible to re-run.
model_ready_datasets = (
    tokenizer_datasets
    .remove_columns(['text'])            # raw text is not a model input
    .rename_column('label', 'labels')    # target column name used for training
    .with_format('torch')                # return PyTorch tensors on access
)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',         # evaluate once per epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
)

# The IMDb splits are stored ordered by label, so taking the first N rows
# yields reviews of a single class. Shuffle with a fixed seed before
# sub-sampling so both classes are present in training and evaluation.
train_subset = model_ready_datasets['train'].shuffle(seed=SHUFFLE_SEED).select(range(SUBSET_SIZE))
eval_subset = model_ready_datasets['test'].shuffle(seed=SHUFFLE_SEED).select(range(SUBSET_SIZE))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)
trainer.train()

  0%|          | 0/250 [00:00<?, ?it/s]

In [None]:
# Print the installed transformers version (environment check).
import transformers
print(transformers.__version__)

4.40.1


In [None]:
# Print the installed accelerate version (environment check).
import accelerate
print(accelerate.__version__)


1.8.1


In [1]:
from datasets import load_dataset                    # Hugging Face datasets library, used to fetch the dataset
from transformers import (                           # Tools imported from the transformers library
    AutoTokenizer,                                   # Automatically downloads and loads the tokenizer
    AutoModelForSequenceClassification,              # Automatically downloads and loads the model used for text classification
    TrainingArguments,                               # Training configuration class
    Trainer                                          # Trainer class that wraps the training loop
)
import numpy as np                                   # Numerical computation
from sklearn.metrics import accuracy_score, f1_score # Accuracy and F1 score for model evaluation

# =============================
# 2. Load the IMDb dataset
# =============================
dataset = load_dataset("imdb")                       # Download and load the IMDb movie-review sentiment classification dataset
# The dataset has three parts: train (25,000 rows), test (25,000 rows) and unsupervised (not used for training below)

# =============================
# 3. Initialize the BERT tokenizer
# =============================
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Load the tokenizer of the uncased English BERT checkpoint
# Splits English text into tokens and converts them to the token IDs the model takes as input (WordPiece)

# =============================
# 4. Tokenization helper that encodes the samples automatically
# =============================
def tokenize_function(example):
    """Encode the ``text`` entries of ``example`` with the module-level tokenizer.

    Inputs are truncated and padded with ``padding="max_length"``; the
    tokenizer's output is returned as-is.
    """
    encoding = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
    )
    return encoding


# Tokenize the whole dataset in batches (an essential preprocessing step).
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# =============================
# 5. Reshape the data into the layout the model needs
# =============================
# Drop the raw text column and expose the target column as "labels"
# (the column name the original author notes Trainer requires by default).
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
# Switch the output format to PyTorch tensors for training.
tokenized_datasets.set_format("torch")

# =============================
# 6. Load the pretrained model and set it up for the classification task
# =============================
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)
# Notes:
# - Uses the pretrained BERT checkpoint
# - Replaces the final output layer so it supports 2-way classification (positive vs. negative)

# =============================
# 7. Define the training arguments
# =============================
training_args = TrainingArguments(
    output_dir="./results",                     # Directory where model outputs are saved
    evaluation_strategy="epoch",                # Evaluate once after every epoch
    per_device_train_batch_size=16,             # Training batch size per device
    per_device_eval_batch_size=64,              # Evaluation batch size per device
    num_train_epochs=2,                         # Train for 2 epochs
    weight_decay=0.01,                          # Weight decay, to guard against overfitting
    logging_dir='./logs',                       # Log directory (optional)
)

# =============================
# 8. Build the Trainer (wraps the training logic)
# =============================
# The IMDb splits are stored ordered by label, so taking the first N rows
# yields reviews of a single class. Shuffle with a fixed seed before
# sub-sampling so both classes are represented, and keep the held-out subset in
# a variable so per-epoch evaluation and the final prediction use the same rows.
SUBSET_SEED = 42
train_subset = tokenized_datasets["train"].shuffle(seed=SUBSET_SEED).select(range(2000))  # 2000 shuffled training samples
eval_subset = tokenized_datasets["test"].shuffle(seed=SUBSET_SEED).select(range(1000))    # 1000 shuffled test samples

trainer = Trainer(
    model=model,                                # Model to train
    args=training_args,                         # Training arguments
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)

# =============================
# 9. Start training
# =============================
trainer.train()                                 # Run the training loop

# =============================
# 10. Evaluate the model
# =============================
predictions = trainer.predict(eval_subset)      # Predict on the same 1000 held-out samples
preds = np.argmax(predictions.predictions, axis=-1)  # Predicted class = index of the largest logit
labels = predictions.label_ids                   # Ground-truth labels

print("Accuracy:", accuracy_score(labels, preds)) # Report accuracy
print("F1 Score:", f1_score(labels, preds))      # Report F1 score

KeyboardInterrupt: 