In [1]:
# 安装必要库

!pip install transformers datasets sacrebleu sentencepiece

Collecting sacrebleu
  Using cached sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu)
  Using cached portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Using cached sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Using cached portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu


ERROR: Could not install packages due to an OSError: [WinError 2] 系统找不到指定的文件。: 'C:\\Python310\\Scripts\\sacrebleu.exe' -> 'C:\\Python310\\Scripts\\sacrebleu.exe.deleteme'


[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import torch
import numpy as np
import pandas as pd
import logging
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers.trainer_callback import TrainerCallback
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer

# 配置日志记录
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [5]:
# 增强的错误处理回调
class EnhancedErrorHandlingCallback(TrainerCallback):
    def __init__(self):
        self.error_count = 0
        self.max_errors = 10  # 最大允许错误数
    
    def on_step_end(self, args, state, control, **kwargs):
        if "loss" not in state.log_history[-1]:
            self.error_count += 1
            logger.warning(f"跳过问题批次 (累计跳过: {self.error_count})")
            if self.error_count >= self.max_errors:
                logger.error("达到最大错误次数，终止训练")
                control.should_training_stop = True
            return control

In [6]:
# 增强的数据预处理函数
def enhanced_preprocess(example):
    try:
        # 空值检查
        if not example["comment_zh"] or not example["comment_ru"]:
            raise ValueError("空值样本")
        
        # 长度检查
        input_text = f"translate to ru: {example['comment_zh']}"
        if len(input_text) > 512 or len(example['comment_ru']) > 512:
            raise ValueError("文本过长")
        
        # 编码处理
        inputs = tokenizer(
            input_text,
            max_length=512,
            truncation=True,
            padding=False,
            return_tensors=None
        )
        
        targets = tokenizer(
            text_target=example["comment_ru"],
            max_length=512,
            truncation=True,
            padding=False,
            return_tensors=None
        )
        
        # 有效性检查
        if not inputs["input_ids"] or not targets["input_ids"]:
            raise ValueError("无效编码")
            
        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": targets["input_ids"]
        }
    except Exception as e:
        logger.debug(f"过滤样本: {e} - 原文: {example['comment_zh'][:50]}...")
        return None

In [7]:
# 加载并严格过滤数据
def load_and_filter_data(path):
    df = pd.read_csv(path)
    logger.info(f"原始数据量: {len(df)}")
    
    # 空值过滤
    df = df[["comment_zh", "comment_ru"]].dropna()
    df = df[(df["comment_zh"].str.len() > 0) & (df["comment_ru"].str.len() > 0)]
    logger.info(f"空值过滤后: {len(df)}")
    
    # 长度过滤
    df = df[df["comment_zh"].apply(lambda x: len(x) <= 500)]
    df = df[df["comment_ru"].apply(lambda x: len(x) <= 500)]
    logger.info(f"长度过滤后: {len(df)}")
    
    if len(df) == 0:
        raise ValueError("有效数据量为零，请检查数据文件！")
    
    return df

In [8]:
# 改进的数据整理函数
def robust_data_collator(features):
    try:
        # 过滤无效特征
        valid_features = []
        for f in features:
            if f and len(f["input_ids"]) > 0 and len(f["labels"]) > 0:
                valid_features.append(f)
        
        if not valid_features:
            logger.warning("收到空批次，生成虚拟样本")
            return {
                "input_ids": torch.zeros((1, 10), dtype=torch.long),
                "attention_mask": torch.ones((1, 10), dtype=torch.long),
                "labels": torch.zeros((1, 10), dtype=torch.long)
            }
        
        # 动态最大长度计算
        max_len = max(len(f["input_ids"]) for f in valid_features)
        max_len = min(max_len, 512)
        
        batch = tokenizer.pad(
            {"input_ids": [f["input_ids"] for f in valid_features]},
            padding="longest",
            max_length=max_len,
            return_tensors="pt"
        )
        
        labels = tokenizer.pad(
            {"input_ids": [f["labels"] for f in valid_features]},
            padding="longest",
            max_length=max_len,
            return_tensors="pt"
        )["input_ids"]
        
        # 替换pad_token_id
        labels[labels == tokenizer.pad_token_id] = -100
        
        return {
            "input_ids": batch["input_ids"],
            "attention_mask": batch["attention_mask"],
            "labels": labels
        }
    except Exception as e:
        logger.error(f"数据整理错误: {e}")
        return None

In [9]:
# 主流程
try:
    # 加载并预处理数据
    df = load_and_filter_data(r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\datasets\data.csv")  # 替换实际路径
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
    
    # 转换数据集格式
    train_dataset = Dataset.from_pandas(train_df).map(
        enhanced_preprocess,
        remove_columns=["comment_zh", "comment_ru"],
        num_proc=2,
        load_from_cache_file=False
    ).filter(lambda x: x is not None)
    
    val_dataset = Dataset.from_pandas(val_df).map(
        enhanced_preprocess,
        remove_columns=["comment_zh", "comment_ru"],
        num_proc=2,
        load_from_cache_file=False
    ).filter(lambda x: x is not None)
    
    logger.info(f"训练样本数: {len(train_dataset)}")
    logger.info(f"验证样本数: {len(val_dataset)}")
    
    # 模型初始化
    model = T5ForConditionalGeneration.from_pretrained("utrobinmv/t5_translate_en_ru_zh_small_1024")
    tokenizer = T5Tokenizer.from_pretrained("utrobinmv/t5_translate_en_ru_zh_small_1024")
    
    # 训练参数配置
    training_args = Seq2SeqTrainingArguments(
        output_dir="./enhanced-model",
        evaluation_strategy="steps",
        eval_steps=500,
        save_steps=500,
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        weight_decay=0.01,
        num_train_epochs=5,
        logging_dir="./logs",
        report_to="none",
        gradient_accumulation_steps=2,
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=2,
        remove_unused_columns=False,
        predict_with_generate=True,
        generation_max_length=512
    )
    
    # 初始化Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=robust_data_collator,
        tokenizer=tokenizer,
        callbacks=[EnhancedErrorHandlingCallback()]
    )
    
    # 训练流程
    try:
        logger.info("开始训练...")
        trainer.train()
    except Exception as e:
        logger.error(f"训练中断: {e}")
        trainer.save_model("./interrupted-model")
        logger.info("中间模型已保存到 ./interrupted-model")
    
    # 最终保存
    trainer.save_model("./enhanced-model")
    tokenizer.save_pretrained("./enhanced-model")
    logger.info("训练完成，模型已保存到 ./enhanced-model")

except Exception as e:
    logger.error(f"流程错误: {e}")
    raise

INFO:__main__:原始数据量: 1258
INFO:__main__:空值过滤后: 1258
INFO:__main__:长度过滤后: 1231


Map (num_proc=2):   0%|          | 0/1107 [00:00<?, ? examples/s]

ERROR:__main__:流程错误: name 'logger' is not defined


NameError: name 'logger' is not defined

In [4]:
# 自定义回调函数处理错误
from transformers.trainer_callback import TrainerCallback

class ErrorHandlingCallback(TrainerCallback):
    def on_train_begin(self, args, state, control, **kwargs):
        print("Starting training...")
        self.skipped_batches = 0

    def on_step_end(self, args, state, control, **kwargs):
        if "loss" not in state.log_history[-1]:
            print(f"Skipping problematic batch (total skipped: {self.skipped_batches})")
            self.skipped_batches += 1
            return control


In [5]:
# 数据预处理函数
def preprocess_data(example):
    try:
        # 添加目标语言前缀
        input_text = f"translate to ru: {example['comment_zh']}"
        target_text = example['comment_ru']
        
        # 编码处理
        inputs = tokenizer(
            input_text,
            max_length=512,
            truncation=True,
            padding=False,
            return_tensors=None
        )
        
        targets = tokenizer(
            text_target=target_text,
            max_length=512,
            truncation=True,
            padding=False,
            return_tensors=None
        )
        
        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": targets["input_ids"]
        }
    except Exception as e:
        print(f"Error processing sample: {e}")
        return None

In [6]:
# 加载数据集
df = pd.read_csv(r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\datasets\data.csv")  # 替换为实际路径
df = df[["comment_zh", "comment_ru"]].dropna()
df

Unnamed: 0,comment_zh,comment_ru
0,用于两个样本分位数的自助法t检验,t-тест самообслуживания для двух квартилей выб...
1,计算控制组和实验组的自助法分位数,Рассчитать квантили метода самообслуживания дл...
2,对自助法分位数进行t检验,Провести t-тест на квантиль самообслуживания
3,判断是否拒绝原假设,"Определение того, отклонять или нет первоначал..."
4,返回使用自助抽样法计算的分位数分布。,"Возвращает квантильное распределение, рассчита..."
...,...,...
1253,拟合模型。\n\n 参数\n -------------\n ...,Комбинированные модели.\n\nПараметры\n--------...
1254,将数据集减少到选择的特征。\n\n 参数\n ---------...,Снизить наборы данных до отдельных характерист...
1255,返回选择的特征数量。,Возвращает количество выбранных характеристик.
1256,生成数据集。\n\n 参数\n -------------\n n_sam...,Создаёт наборы данных.\n\nПараметры\n---------...


In [7]:
# 划分训练集和验证集
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# 转换为HuggingFace数据集格式
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [8]:
# 加载模型和tokenizer
model_name = "utrobinmv/t5_translate_en_ru_zh_small_1024"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# 应用预处理
train_dataset = train_dataset.map(
    preprocess_data,
    remove_columns=["comment_zh", "comment_ru"],
    num_proc=4
).filter(lambda x: x is not None)

val_dataset = val_dataset.map(
    preprocess_data,
    remove_columns=["comment_zh", "comment_ru"],
    num_proc=4
).filter(lambda x: x is not None)

Map (num_proc=4):   0%|          | 0/1132 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1132 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/126 [00:00<?, ? examples/s]

Filter:   0%|          | 0/126 [00:00<?, ? examples/s]

In [10]:
# 数据整理函数（添加错误处理）
def data_collator(features):
    filtered_features = [f for f in features if f is not None]
    if not filtered_features:
        return None  # 跳过空batch
    
    batch = tokenizer.pad(
        filtered_features,
        padding="longest",
        return_tensors="pt"
    )
    
    # 检查是否包含无效的token id
    if (batch["labels"] >= tokenizer.vocab_size).any():
        print("Detected invalid token IDs in labels, skipping batch")
        return None
        
    return {
        "input_ids": batch["input_ids"],
        "attention_mask": batch["attention_mask"],
        "labels": batch["labels"]
    }

In [None]:
# 训练参数配置
training_args = Seq2SeqTrainingArguments(
    output_dir="./fine-tuned-model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=5,
    predict_with_generate=True,
    logging_dir="./logs",
    report_to="none",
    gradient_accumulation_steps=2,
    fp16=torch.cuda.is_available(),
    dataloader_drop_last=True,  # 防止最后不完整的batch
    load_best_model_at_end=True,  # 保存最佳模型
)



In [12]:
# 初始化Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[ErrorHandlingCallback()]
)

# 开始训练
try:
    trainer.train()
except Exception as e:
    print(f"Training interrupted by error: {e}")
    # 自动保存检查点
    trainer.save_model("./partial-model")
    print("Partial model saved at ./partial-model")

Starting training...


  0%|          | 0/705 [00:00<?, ?it/s]

Training interrupted by error: list index out of range


  return table.fast_gather(key % table.num_rows)


Partial model saved at ./partial-model


In [None]:
# 保存最终模型
trainer.save_model("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")