In [19]:
# 1. 初始化环境

import math
from collections import Counter

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    TrainerCallback,
)

In [20]:
# 2. Load the pretrained checkpoint: tokenizer first, then the seq2seq model.

model_name = "utrobinmv/t5_translate_en_ru_zh_small_1024"
tokenizer, model = (
    T5Tokenizer.from_pretrained(model_name),
    T5ForConditionalGeneration.from_pretrained(model_name),
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
# 3. 数据预处理函数

def preprocess_data(examples):
    """Tokenize one batch of zh -> ru comment pairs for seq2seq training.

    Intended for ``Dataset.map(..., batched=True)``: rows that fail the
    vocabulary check are dropped, so the output batch may be shorter than
    the input batch.

    Args:
        examples: Mapping with the columns ``"comment_zh"`` (source texts)
            and ``"comment_ru"`` (target texts), each a sequence of values.

    Returns:
        dict with the list-of-lists fields ``"input_ids"``,
        ``"attention_mask"`` and ``"labels"``. Padding positions in
        ``"labels"`` are set to -100 so they are excluded from the loss.
    """
    # Prefix every source text with the task prompt; str() guards against
    # non-string cell values.
    inputs = ["translate to ru: " + str(zh) for zh in examples["comment_zh"]]
    targets = [str(ru) for ru in examples["comment_ru"]]

    # Pad to a fixed length so the result is a rectangular NumPy array and the
    # validity check below can be vectorized.
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="np"
    )

    labels = tokenizer(
        text_target=targets,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="np"
    )["input_ids"]

    # Keep only rows in which every token id is below tokenizer.vocab_size.
    # This runs on the raw ids, before pad positions are rewritten to -100.
    valid_mask = (
        (model_inputs["input_ids"] < tokenizer.vocab_size).all(axis=1)
        & (labels < tokenizer.vocab_size).all(axis=1)
    )

    # The labels were padded to max_length, so without this step the loss
    # would also be computed over the padding. -100 is the index the loss
    # ignores.
    labels = np.where(labels == tokenizer.pad_token_id, -100, labels)

    return {
        "input_ids": model_inputs["input_ids"][valid_mask].tolist(),
        "attention_mask": model_inputs["attention_mask"][valid_mask].tolist(),
        "labels": labels[valid_mask].tolist()
    }

In [22]:
# 4. Data preparation pipeline


def _only_str_rows(frame, column):
    """Return the rows of ``frame`` whose ``column`` value is a Python ``str``."""
    return frame[frame[column].apply(lambda value: isinstance(value, str))]


# Machine-specific absolute path: replace with the path to your own CSV.
DATA_CSV_PATH = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\datasets\data.csv"

# Load the raw table.
df = pd.read_csv(DATA_CSV_PATH)
print(f"原始数据量: {len(df)}")

# Cleaning: drop rows with a missing source or target, then keep only rows
# whose two text columns actually hold strings.
df = _only_str_rows(df.dropna(subset=["comment_zh", "comment_ru"]), "comment_zh")
df = _only_str_rows(df, "comment_ru")
print(f"清洗后数据量: {len(df)}")

# 90/10 train/eval split with a fixed seed.
train_df, eval_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert both splits to Hugging Face Datasets, with the index reset.
train_dataset, eval_dataset = (
    Dataset.from_pandas(split.reset_index(drop=True))
    for split in (train_df, eval_df)
)

原始数据量: 1258
清洗后数据量: 1258


In [23]:
# 5. Apply the preprocessing function to both splits
#
# A failure is reported and then re-raised: if it were swallowed,
# train_dataset / eval_dataset would still hold the raw, untokenized rows and
# the later cells would fail with a far less obvious error.

try:
    train_dataset = train_dataset.map(
        preprocess_data,
        batched=True,
        batch_size=32,
        # Drop the original CSV columns so only the tokenized fields remain.
        remove_columns=train_df.columns.tolist()
    )
    print(f"有效训练样本: {len(train_dataset)}")
except Exception as e:
    print(f"训练集预处理错误: {str(e)}")
    raise

try:
    eval_dataset = eval_dataset.map(
        preprocess_data,
        batched=True,
        batch_size=32,
        remove_columns=eval_df.columns.tolist()
    )
    print(f"有效验证样本: {len(eval_dataset)}")
except Exception as e:
    print(f"验证集预处理错误: {str(e)}")
    raise

Map:   0%|          | 0/1132 [00:00<?, ? examples/s]

有效训练样本: 1132


Map:   0%|          | 0/126 [00:00<?, ? examples/s]

有效验证样本: 126


In [27]:
# 6：训练配置

class SafeTrainingCallback(TrainerCallback):
    """Stop training when the most recently logged loss looks divergent.

    A loss counts as abnormal when it is NaN, infinite, or larger than 1e5.
    """

    def on_step_end(self, args, state, control, **kwargs):
        """Inspect the latest log entry and request a stop on abnormal loss.

        Only the last entry of ``state.log_history`` is examined; entries
        without a ``"loss"`` key are ignored.
        """
        if not state.log_history:
            return
        loss = state.log_history[-1].get("loss")
        if loss is None:
            return
        # A bare `loss > 1e5` is False for NaN, so a NaN loss would slip
        # through; check finiteness explicitly.
        if not math.isfinite(loss) or loss > 1e5:
            print("\n检测到异常损失值，停止训练！")
            control.should_training_stop = True

training_args = Seq2SeqTrainingArguments(
    # Output and reporting
    output_dir="./t5-comment-translator",
    report_to="none",
    logging_steps=50,
    save_total_limit=2,
    # Evaluate and save on the same schedule (once per epoch).
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimization
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # Evaluation uses generate().
    predict_with_generate=True,
    # Best-model selection: the metric name must match a key reported by
    # compute_metrics; a higher BLEU score is better.
    load_best_model_at_end=True,
    metric_for_best_model="eval_bleu_score",
    greater_is_better=True,
)

In [28]:
# Sanity-check the training configuration before training starts.
print(
    "验证训练配置:",
    f"评估策略: {training_args.evaluation_strategy}",
    f"保存策略: {training_args.save_strategy}",
    f"最佳模型指标: {training_args.metric_for_best_model}",
    sep="\n",
)

# Expected output:
# 评估策略: epoch
# 保存策略: epoch
# 最佳模型指标: eval_bleu_score

验证训练配置:
评估策略: epoch
保存策略: epoch
最佳模型指标: eval_bleu_score


In [29]:
# 7. Create the Trainer

from transformers import DataCollatorForSeq2Seq


def _corpus_bleu(hypotheses, references, max_order=4):
    """Corpus-level BLEU (0-100) over whitespace-tokenized strings.

    Uses clipped n-gram precision for orders 1..max_order, a geometric mean,
    and the standard brevity penalty. There is one reference per hypothesis
    and no smoothing: the score is 0.0 as soon as any order has no match.
    """
    matches = [0] * max_order
    totals = [0] * max_order
    hyp_len = 0
    ref_len = 0
    for hyp, ref in zip(hypotheses, references):
        hyp_tokens = hyp.split()
        ref_tokens = ref.split()
        hyp_len += len(hyp_tokens)
        ref_len += len(ref_tokens)
        for order in range(1, max_order + 1):
            hyp_ngrams = Counter(
                tuple(hyp_tokens[i:i + order])
                for i in range(len(hyp_tokens) - order + 1)
            )
            ref_ngrams = Counter(
                tuple(ref_tokens[i:i + order])
                for i in range(len(ref_tokens) - order + 1)
            )
            # Counter intersection keeps the minimum count per n-gram, which
            # is exactly the clipped match count.
            matches[order - 1] += sum((hyp_ngrams & ref_ngrams).values())
            totals[order - 1] += sum(hyp_ngrams.values())
    if hyp_len == 0 or min(matches) == 0:
        return 0.0
    log_precision = sum(
        math.log(match / total) for match, total in zip(matches, totals)
    ) / max_order
    brevity_penalty = 1.0 if hyp_len > ref_len else math.exp(1.0 - ref_len / hyp_len)
    return 100.0 * brevity_penalty * math.exp(log_precision)


def compute_metrics(eval_preds):
    """Decode generated ids and labels, then report corpus BLEU.

    The result key is ``"bleu_score"``; the Trainer adds the ``eval_`` prefix,
    which yields the ``eval_bleu_score`` named by ``metric_for_best_model``.
    """
    predictions, label_ids = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 marks ignored positions and cannot be decoded; map it to the pad id
    # so skip_special_tokens removes it.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    label_ids = np.where(label_ids != -100, label_ids, pad_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    return {"bleu_score": _corpus_bleu(decoded_preds, decoded_labels)}


data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Without compute_metrics the evaluation metrics contain no
    # eval_bleu_score, and best-model selection fails at the first evaluation.
    compute_metrics=compute_metrics,
    callbacks=[SafeTrainingCallback()]
)

In [None]:
# 8. Start training (with crash protection)

try:
    print("启动训练...")
    train_result = trainer.train()
except Exception as e:
    # Best effort: keep whatever weights exist so the run can be inspected
    # or resumed.
    print(f"\n训练异常中断: {str(e)}")
    print("尝试保存当前进度...")
    trainer.save_model("./interrupted_model")
    print("已保存恢复检查点到 interrupted_model 目录")
else:
    # Only a run that finished without an exception is saved as the final
    # model and reported as complete.
    trainer.save_model("./t5-comment-translator/final_model")
    print("训练完成！")

启动训练...


  0%|          | 0/705 [00:00<?, ?it/s]