### 【步骤1】微调模型

In [1]:
# 1. 加载所需库

import pandas as pd
import torch
from datasets import Dataset
import accelerate
import evaluate
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

import os
os.environ["WANDB_DISABLED"] = "true"

In [2]:
# 2. Load the CSV dataset

csv_path = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\datasets\data.csv"  # replace with the path to your CSV file
df = pd.read_csv(csv_path)

# comment_zh is used as the model input and comment_ru as the target text; this could be extended with code context.
# A translation-instruction prefix is prepended to every input so the model receives the task prompt.
prefix = "translated into russian: "
df["input_text"] = prefix + df["comment_zh"].astype(str)
df["target_text"] = df["comment_ru"].astype(str)

print("数据样本：")
print(df[["input_text", "target_text"]].head())

数据样本：
                                    input_text  \
0    translated into russian: 用于两个样本分位数的自助法t检验   
1    translated into russian: 计算控制组和实验组的自助法分位数   
2        translated into russian: 对自助法分位数进行t检验   
3           translated into russian: 判断是否拒绝原假设   
4  translated into russian: 返回使用自助抽样法计算的分位数分布。   

                                         target_text  
0  t-тест самообслуживания для двух квартилей выб...  
1  Рассчитать квантили метода самообслуживания дл...  
2       Провести t-тест на квантиль самообслуживания  
3  Определение того, отклонять или нет первоначал...  
4  Возвращает квантильное распределение, рассчита...  


In [3]:
# 3. Convert the DataFrame into a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# 4. Load the pretrained T5 translation model (t5_translate_en_ru_zh_small_1024)
model_name = "utrobinmv/t5_translate_en_ru_zh_small_1024"  # make sure this model exists and is accessible
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Force CPU execution
device = "cpu"
model.to(device)
print(f"Using device: {device}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Using device: cpu


In [4]:
# 5. Preprocessing function (tokenization and truncation)

def preprocess_function(examples):
    """Tokenize a batch of source/target texts for seq2seq fine-tuning.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with "input_text" and "target_text" entries.

    Returns:
        The tokenizer output for the inputs, with the token ids of the
        targets stored under "labels".
    """
    # `text_target=` tokenizes the targets in the same call and stores their
    # ids under "labels"; it replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    return tokenizer(
        examples["input_text"],
        text_target=examples["target_text"],
        max_length=512,
        truncation=True,
    )

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# 6. Split into training and validation sets (90% train, 10% validation)
# NOTE(review): no seed is passed here, so the split may differ between runs — confirm that is intended.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

Map:   0%|          | 0/1258 [00:00<?, ? examples/s]



In [9]:
# 7. Load the BLEU metric (via the evaluate library)
bleu = evaluate.load("bleu")

def safe_decode(sequences):
    """Decode token-id sequences after neutralizing ids the tokenizer cannot map.

    Any id outside ``[0, tokenizer.vocab_size)`` is replaced by the pad token
    id before the batch is decoded with special tokens skipped. Uses the
    module-level ``tokenizer``.

    Args:
        sequences: Iterable of token-id sequences.

    Returns:
        The list of decoded strings from ``tokenizer.batch_decode``.
    """
    vocab_limit = tokenizer.vocab_size
    pad_id = tokenizer.pad_token_id
    cleaned = [
        [tok if 0 <= tok < vocab_limit else pad_id for tok in ids]
        for ids in sequences
    ]
    return tokenizer.batch_decode(cleaned, skip_special_tokens=True)

def compute_metrics(eval_pred):
    """Compute BLEU for one evaluation pass.

    Uses the module-level ``tokenizer``, ``bleu`` and ``safe_decode``.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id sequences.

    Returns:
        ``{"bleu": <score>}`` taken from the metric result.
    """
    predictions, labels = eval_pred
    hypotheses = safe_decode(predictions)

    # -100 marks ignored label positions; map it to the pad id so the labels can be decoded.
    pad_id = tokenizer.pad_token_id
    unmasked = [[pad_id if tok == -100 else tok for tok in row] for row in labels]

    # The BLEU metric expects a list of reference lists, one list per prediction.
    references = [[text] for text in safe_decode(unmasked)]

    score = bleu.compute(predictions=hypotheses, references=references)
    return {"bleu": score["bleu"]}


# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     # 替换 -100 并解码标签
#     labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     decoded_labels = [[ref] for ref in decoded_labels]
#     result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
#     return {"bleu": result["bleu"]}

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     print("原始预测值：", predictions)
#     print("原始标签值：", labels)

#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
#     # 确保 labels 形状正确
#     labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    
#     # 确保 labels 不是空的
#     if not labels or not any(labels):
#         return {"bleu": 0.0}

#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     decoded_labels = [[ref] for ref in decoded_labels]

#     print("解码后的预测值：", decoded_preds)
#     print("解码后的标签值：", decoded_labels)

#     result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
#     return {"bleu": result["bleu"]}


# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

#     # 确保 labels 形状正确
#     labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]

#     # 确保 labels 不是空的
#     if not labels or not any(labels):
#         return {"bleu": 0.0}

#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     decoded_labels = [[ref] for ref in decoded_labels]

#     result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
#     return {"bleu": result["bleu"]}

# 8. Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./finetuned_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Best checkpoint is chosen by the "bleu" value returned from compute_metrics (higher is better).
    metric_for_best_model="bleu",
    greater_is_better=True,
)

# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Labels are padded with -100, the value compute_metrics maps back to the pad token id before decoding.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

# 9. Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# print(f"训练集大小: {len(train_dataset)}")
# print(f"验证集大小: {len(eval_dataset)}")

# 10. Start fine-tuning
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  0%|          | 0/426 [00:00<?, ?it/s]

{'loss': 0.274, 'grad_norm': 4.341250419616699, 'learning_rate': 4.882629107981221e-05, 'epoch': 0.07}
{'loss': 0.3109, 'grad_norm': 3.8970205783843994, 'learning_rate': 4.765258215962441e-05, 'epoch': 0.14}
{'loss': 0.3461, 'grad_norm': 2.294842481613159, 'learning_rate': 4.647887323943662e-05, 'epoch': 0.21}
{'loss': 0.3331, 'grad_norm': 5.237137794494629, 'learning_rate': 4.530516431924883e-05, 'epoch': 0.28}
{'loss': 0.3874, 'grad_norm': 2.663878917694092, 'learning_rate': 4.413145539906103e-05, 'epoch': 0.35}
{'loss': 0.308, 'grad_norm': 2.482166290283203, 'learning_rate': 4.295774647887324e-05, 'epoch': 0.42}
{'loss': 0.3301, 'grad_norm': 1.9523558616638184, 'learning_rate': 4.178403755868545e-05, 'epoch': 0.49}
{'loss': 0.4168, 'grad_norm': 4.1357855796813965, 'learning_rate': 4.0610328638497654e-05, 'epoch': 0.56}
{'loss': 0.4076, 'grad_norm': 2.5412566661834717, 'learning_rate': 3.943661971830986e-05, 'epoch': 0.63}
{'loss': 0.3844, 'grad_norm': 2.768216133117676, 'learning_ra

  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 1.1283717155456543, 'eval_bleu': 0.4703416941636425, 'eval_runtime': 129.3578, 'eval_samples_per_second': 0.974, 'eval_steps_per_second': 0.124, 'epoch': 1.0}
{'loss': 0.8059, 'grad_norm': 9.117254257202148, 'learning_rate': 3.23943661971831e-05, 'epoch': 1.06}
{'loss': 0.7815, 'grad_norm': 7.098135471343994, 'learning_rate': 3.1220657276995305e-05, 'epoch': 1.13}
{'loss': 0.8091, 'grad_norm': 11.68268871307373, 'learning_rate': 3.0046948356807513e-05, 'epoch': 1.2}
{'loss': 0.794, 'grad_norm': 10.478001594543457, 'learning_rate': 2.887323943661972e-05, 'epoch': 1.27}
{'loss': 0.7261, 'grad_norm': 5.437191486358643, 'learning_rate': 2.7699530516431926e-05, 'epoch': 1.34}
{'loss': 0.9052, 'grad_norm': 10.992965698242188, 'learning_rate': 2.6525821596244134e-05, 'epoch': 1.41}
{'loss': 0.8765, 'grad_norm': 9.82922649383545, 'learning_rate': 2.535211267605634e-05, 'epoch': 1.48}
{'loss': 0.8974, 'grad_norm': 9.882003784179688, 'learning_rate': 2.4178403755868547e-05, 'epoch'

  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.9961930513381958, 'eval_bleu': 0.5014578047365716, 'eval_runtime': 127.3893, 'eval_samples_per_second': 0.989, 'eval_steps_per_second': 0.126, 'epoch': 2.0}
{'loss': 0.7831, 'grad_norm': 9.551262855529785, 'learning_rate': 1.5962441314553993e-05, 'epoch': 2.04}
{'loss': 0.6167, 'grad_norm': 4.976161479949951, 'learning_rate': 1.4788732394366198e-05, 'epoch': 2.11}
{'loss': 0.6306, 'grad_norm': 6.303422927856445, 'learning_rate': 1.3615023474178404e-05, 'epoch': 2.18}
{'loss': 0.5847, 'grad_norm': 5.262000560760498, 'learning_rate': 1.2441314553990612e-05, 'epoch': 2.25}
{'loss': 0.7252, 'grad_norm': 3.9484033584594727, 'learning_rate': 1.1267605633802817e-05, 'epoch': 2.32}
{'loss': 0.7429, 'grad_norm': 9.201313018798828, 'learning_rate': 1.0093896713615023e-05, 'epoch': 2.39}
{'loss': 0.6501, 'grad_norm': 7.421340465545654, 'learning_rate': 8.92018779342723e-06, 'epoch': 2.46}
{'loss': 0.8193, 'grad_norm': 11.156578063964844, 'learning_rate': 7.746478873239436e-06, 'ep

  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.992005467414856, 'eval_bleu': 0.5073454349054234, 'eval_runtime': 128.7216, 'eval_samples_per_second': 0.979, 'eval_steps_per_second': 0.124, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


{'train_runtime': 1555.4569, 'train_samples_per_second': 2.183, 'train_steps_per_second': 0.274, 'train_loss': 0.6439104337647488, 'epoch': 3.0}


TrainOutput(global_step=426, training_loss=0.6439104337647488, metrics={'train_runtime': 1555.4569, 'train_samples_per_second': 2.183, 'train_steps_per_second': 0.274, 'total_flos': 104822176628736.0, 'train_loss': 0.6439104337647488, 'epoch': 3.0})

In [10]:
# 10. Evaluate the model
eval_results = trainer.evaluate()
print("评估结果：", eval_results)

# 11. Save the fine-tuned model
trainer.save_model("./finetuned_model")
print("微调后的模型已保存到 ./finetuned_model")

  0%|          | 0/16 [00:00<?, ?it/s]

评估结果： {'eval_loss': 0.992005467414856, 'eval_bleu': 0.5073454349054234, 'eval_runtime': 129.0013, 'eval_samples_per_second': 0.977, 'eval_steps_per_second': 0.124, 'epoch': 3.0}
微调后的模型已保存到 ./finetuned_model


### 【步骤2】实现 Python 文件中文注释翻译并生成新 .py 文件

### 【步骤3】评估翻译质量

In [14]:
# Example: score a batch of translations with the evaluate library
def evaluate_translations(references, predictions):
    """Print the BLEU score of ``predictions`` against ``references``.

    Args:
        references: List of reference translations (Russian text), one per prediction.
        predictions: List of model-generated translations (Russian text).
    """
    metric = evaluate.load("bleu")
    # The metric expects a list of reference lists, so each single reference is wrapped.
    scores = metric.compute(
        predictions=predictions,
        references=[[text] for text in references],
    )
    print("BLEU score:", scores["bleu"])

# Example call:
refs = ["Цель разработки - предоставить персонализированный синхронный перевод для пользователей."]
preds = ["Цель разработки - предоставить персонализированный синхронный перевод для пользователей."]
evaluate_translations(refs, preds)

BLEU score: 1.0


# -

In [17]:
# 1. 加载所需库

import pandas as pd
from datasets import Dataset
import evaluate
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [18]:
# 2. Load the CSV dataset

csv_path = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\datasets\data.csv"  # replace with the path to your CSV file
df = pd.read_csv(csv_path)

# Assumes the CSV has the columns: "file_path", "code", "code_comment_type", "comment_zh", "comment_ru", "comment_en"
# Here comment_zh is the input and comment_ru the target; the code itself could also be added as context
df["input_text"] = df["comment_zh"]  # can be extended if needed, e.g. "代码: " + df["code"] + "\n注释: " + df["comment_zh"]
df["target_text"] = df["comment_ru"]

# Preview a few rows
df.head()

Unnamed: 0,file_path,code,code_comment_type,comment_zh,comment_ru,comment_en,input_text,target_text
0,C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\datas...,import numpy as np\nfrom scipy.stats import tt...,Single-line,用于两个样本分位数的自助法t检验,t-тест самообслуживания для двух квартилей выб...,Self-service t-test for two quartiles of the s...,用于两个样本分位数的自助法t检验,t-тест самообслуживания для двух квартилей выб...
1,C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\datas...,import numpy as np\nfrom scipy.stats import tt...,Single-line,计算控制组和实验组的自助法分位数,Рассчитать квантили метода самообслуживания дл...,Calculate the self-help division of the contro...,计算控制组和实验组的自助法分位数,Рассчитать квантили метода самообслуживания дл...
2,C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\datas...,import numpy as np\nfrom scipy.stats import tt...,Single-line,对自助法分位数进行t检验,Провести t-тест на квантиль самообслуживания,t-test self-help splits,对自助法分位数进行t检验,Провести t-тест на квантиль самообслуживания
3,C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\datas...,import numpy as np\nfrom scipy.stats import tt...,Single-line,判断是否拒绝原假设,"Определение того, отклонять или нет первоначал...",Determining whether or not to reject the origi...,判断是否拒绝原假设,"Определение того, отклонять или нет первоначал..."
4,C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\datas...,import numpy as np\nfrom scipy.stats import tt...,Multi-line,返回使用自助抽样法计算的分位数分布。,"Возвращает квантильное распределение, рассчита...",Returns the fractional distribution calculated...,返回使用自助抽样法计算的分位数分布。,"Возвращает квантильное распределение, рассчита..."


In [None]:
# 3. Convert the pandas DataFrame into a Hugging Face Dataset

dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['file_path', 'code', 'code_comment_type', 'comment_zh', 'comment_ru', 'comment_en', 'input_text', 'target_text'],
    num_rows: 1258
})

In [None]:
# 4. Load the pretrained translation models

# Chinese -> English model
model_name_zh_en = "Helsinki-NLP/opus-mt-zh-en"
tokenizer_zh_en = MarianTokenizer.from_pretrained(model_name_zh_en)
model_zh_en = MarianMTModel.from_pretrained(model_name_zh_en)

# English -> Russian model
model_name_en_ru = "Helsinki-NLP/opus-mt-en-ru"
tokenizer_en_ru = MarianTokenizer.from_pretrained(model_name_en_ru)
model_en_ru = MarianMTModel.from_pretrained(model_name_en_ru)

def translate_zh_to_ru(text_zh):
    """Translate Chinese text to Russian by pivoting through English.

    Uses the module-level zh->en and en->ru Marian tokenizers and models.

    Args:
        text_zh: Text, or list of texts, handed to the zh->en tokenizer.

    Returns:
        List of Russian strings, one per generated sequence.
    """
    # Stage 1: Chinese -> English
    zh_batch = tokenizer_zh_en(text_zh, return_tensors="pt", padding=True, truncation=True)
    english = [
        tokenizer_zh_en.decode(ids, skip_special_tokens=True)
        for ids in model_zh_en.generate(**zh_batch)
    ]

    # Stage 2: English -> Russian
    en_batch = tokenizer_en_ru(english, return_tensors="pt", padding=True, truncation=True)
    return [
        tokenizer_en_ru.decode(ids, skip_special_tokens=True)
        for ids in model_en_ru.generate(**en_batch)
    ]

# Example
text_zh = ["这是一个测试句子。", "我们将中文翻译成俄文。"]
translated_text = translate_zh_to_ru(text_zh)
print(translated_text)



['Это испытательное предложение.', 'Мы перевели китайский на русский.']


In [13]:
# 4. Load a pretrained translation model (tries Helsinki-NLP/opus-mt-zh-ru; use another suitable model if it is unavailable)
# NOTE: the output recorded for this cell is an OSError reporting that this identifier is not a valid model.

model_name = "Helsinki-NLP/opus-mt-zh-ru"  # make sure this model exists; otherwise consider fine-tuning an existing Chinese->Russian model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

OSError: Helsinki-NLP/opus-mt-zh-ru is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Multilingual model, intended here for Chinese -> Russian translation
# NOTE(review): the identifier ends in "mul-en" and the output recorded for this cell is English — confirm the model can target Russian.
model_name = "Helsinki-NLP/opus-mt-mul-en"  # example multilingual model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate_zh_to_ru_multilingual(text_zh):
    """Translate texts with the module-level multilingual ``model``/``tokenizer``.

    Every text is prefixed with the ">>ru<<" target-language tag before
    tokenization.
    NOTE(review): the output recorded for this cell is English, so the tag
    may have no effect with the loaded model — confirm.

    Args:
        text_zh: Iterable of source strings.

    Returns:
        List of decoded strings, one per generated sequence.
    """
    tagged = []
    for sentence in text_zh:
        tagged.append(">>ru<< " + sentence)

    batch = tokenizer(tagged, return_tensors="pt", padding=True, truncation=True)
    return [
        tokenizer.decode(ids, skip_special_tokens=True)
        for ids in model.generate(**batch)
    ]

# Example
text_zh = ["这是一个测试句子。", "我们将中文翻译成俄文。"]
translated_text = translate_zh_to_ru_multilingual(text_zh)
print(translated_text)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


source.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/791k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.42M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/310M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

["It's a test sentence.", 'We translate Chinese into Russian.']


In [None]:
from transformers import MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-mul"
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Print the language codes supported by this tokenizer
print(tokenizer.supported_language_codes)

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.42M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

['>>ewe<<', '>>sna<<', '>>lin<<', '>>toi_Latn<<', '>>ceb<<', '>>oss<<', '>>run<<', '>>mfe<<', '>>ilo<<', '>>zlm_Latn<<', '>>pes<<', '>>smo<<', '>>hil<<', '>>niu<<', '>>sag<<', '>>fij<<', '>>cmn_Hans<<', '>>nya<<', '>>tso<<', '>>war<<', '>>gil<<', '>>hau_Latn<<', '>>umb<<', '>>glv<<', '>>tvl<<', '>>ton<<', '>>zul<<', '>>kal<<', '>>pag<<', '>>cmn_Hant<<', '>>pus<<', '>>abk<<', '>>pap<<', '>>hat<<', '>>mkd<<', '>>tuk_Latn<<', '>>yor<<', '>>tuk<<', '>>sqi<<', '>>tir<<', '>>mlg<<', '>>tur<<', '>>ido_Latn<<', '>>mai<<', '>>ibo<<', '>>srp_Cyrl<<', '>>srp_Latn<<', '>>kir_Cyrl<<', '>>heb<<', '>>bos_Latn<<', '>>bak<<', '>>ast<<', '>>som<<', '>>tah<<', '>>chv<<', '>>kek_Latn<<', '>>lug<<', '>>vie<<', '>>wln<<', '>>isl<<', '>>hye<<', '>>mah<<', '>>yue_Hant<<', '>>crh_Latn<<', '>>amh<<', '>>nds<<', '>>pan_Guru<<', '>>xho<<', '>>ukr<<', '>>cat<<', '>>afr<<', '>>tat<<', '>>guj<<', '>>jpn<<', '>>mon<<', '>>eus<<', '>>nob<<', '>>glg<<', '>>ind<<', '>>sin<<', '>>cym<<', '>>zho_Hant<<', '>>zho_Hans<<', '

In [None]:
import re

# Extract comments from a Python file while keeping the original line structure
def extract_comments(file_path):
    """Collect Chinese ``#`` comments and triple-quoted blocks from a file.

    The scan is line based (regular expressions, not a real parser), so a
    ``#`` inside an ordinary string literal is still treated as the start
    of a comment.

    Args:
        file_path: Path of the UTF-8 encoded Python file to read.

    Returns:
        A ``(code_lines, comments)`` tuple. ``code_lines`` holds every line
        of the file with its line ending. ``comments`` is a list of
        3-tuples: ``(comment_text, "Single-line", code_before_comment)`` for
        ``#`` comments containing Chinese characters, and
        ``(block_text, "Multi-line", indent)`` for every triple-quoted block
        whose opening line starts with the delimiter. A block that is still
        open at end of file is not reported.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        code_lines = f.readlines()  # keeps blank lines and line endings

    comments = []
    block_lines = []     # lines of the block currently being collected
    block_delim = None   # delimiter of the open block, None when outside one
    block_indent = ""

    for line in code_lines:
        if block_delim is not None:
            # Inside a block every line belongs to it, even one containing
            # "#"; the block ends on the first line holding the delimiter,
            # wherever on the line it appears.
            block_lines.append(line)
            if block_delim in line:
                comments.append(("".join(block_lines), "Multi-line", block_indent))
                block_delim = None
            continue

        opening = re.match(r"(\s*)('''|\"\"\")", line)
        if opening:
            block_indent, block_delim = opening.groups()
            if block_delim in line[opening.end():]:
                # Opened and closed on the same line (one-line docstring).
                comments.append((line, "Multi-line", block_indent))
                block_delim = None
            else:
                block_lines = [line]
            continue

        single = re.match(r"^(.*?)\s*(#.*)", line)
        if single and re.search("[\u4e00-\u9fff]", single.group(2)):
            # Only comments containing Chinese characters are collected.
            comments.append((single.group(2), "Single-line", single.group(1)))

    return code_lines, comments

# Translate a comment (Chinese -> Russian)
def translate_comment(text, prefix="translate to ru: "):
    """Translate one comment using the module-level ``model``, ``tokenizer`` and ``device``.

    NOTE(review): the fine-tuning inputs built earlier in this notebook use
    the prefix "translated into russian: ", which differs from this default
    — confirm which prompt the loaded model expects.

    Args:
        text: Comment text to translate.
        prefix: Task prompt placed in front of ``text``.

    Returns:
        The first decoded sequence produced by ``model.generate``.
    """
    encoded = tokenizer(prefix + text, return_tensors="pt", padding=True, truncation=True)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    output_ids = model.generate(**on_device, max_length=512)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

# Replace comments in the code while keeping formatting (indentation & blank lines)
def replace_comments_in_code(code_lines, comments_translations):
    """Return a copy of ``code_lines`` with comments swapped for their translations.

    Args:
        code_lines: Source lines, each including its line ending.
        comments_translations: List of ``(original, translated, kind, indent)``
            tuples, ``kind`` being "Single-line" or "Multi-line". ``indent``
            is only used for multi-line blocks: a single-line comment is
            replaced in place, so whatever precedes it on its line
            (indentation or code) is already preserved.

    Returns:
        A new list of lines; ``code_lines`` itself is not modified.
    """
    new_code_lines = code_lines[:]

    for orig, trans, ctype, indent in comments_translations:
        if trans == orig:
            continue  # untranslated entry: leave the source untouched

        if ctype == "Single-line":
            # Drop any "#" echoed by the translation so the marker is not doubled.
            replacement = "# " + trans.strip().lstrip("#").strip()
            for i, line in enumerate(new_code_lines):
                if orig in line:
                    new_code_lines[i] = line.replace(orig, replacement, 1)
                    break

        elif ctype == "Multi-line":
            # keepends=True so the pieces compare equal to the newline-terminated
            # entries of new_code_lines.
            orig_lines = orig.splitlines(keepends=True)
            if not orig_lines:
                continue

            text = trans.strip()
            delim = orig.lstrip()[:3]
            if delim in ('"""', "'''"):
                # Re-wrap in the original delimiter when the translation lost
                # it, so the result is still a string literal.
                if not text.startswith(delim):
                    text = delim + text
                if len(text) < 6 or not text.endswith(delim):
                    text = text + delim
            trans_lines = [(indent + part).rstrip() + "\n" for part in text.splitlines()]

            span = len(orig_lines)
            for i in range(len(new_code_lines) - span + 1):
                if new_code_lines[i:i + span] == orig_lines:
                    new_code_lines[i:i + span] = trans_lines
                    break

    return new_code_lines

# Process a Python file: translate its Chinese comments into Russian
def process_python_file(input_file_path, output_file_path):
    """Extract, translate and substitute the comments of one Python file.

    Relies on ``extract_comments``, ``translate_comment`` and
    ``replace_comments_in_code`` defined alongside it.

    Args:
        input_file_path: Path of the source file to read.
        output_file_path: Path the rewritten file is written to (UTF-8).
    """
    code_lines, comments = extract_comments(input_file_path)

    # Pair each comment with its translation; comments without Chinese
    # characters are paired with themselves.
    comments_translations = [
        (
            comment,
            translate_comment(comment) if re.search("[\u4e00-\u9fff]", comment) else comment,
            ctype,
            indent,
        )
        for comment, ctype, indent in comments
    ]

    new_code_lines = replace_comments_in_code(code_lines, comments_translations)

    with open(output_file_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(new_code_lines)

    print(f"新文件已生成：{output_file_path}")

# Example: process a single Python file
input_file = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\c2e2r.py"  # replace with the path of the file to translate
output_file = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\translated_test_6.py"  # replace with the desired output path
process_python_file(input_file, output_file)

In [None]:
import re


# Extract comments from a Python file while keeping the original line structure
def extract_comments(file_path):
    """Collect Chinese ``#`` comments and triple-quoted blocks from a file.

    The scan is line based (regular expressions, not a real parser), so a
    ``#`` inside an ordinary string literal is still treated as the start
    of a comment.

    Args:
        file_path: Path of the Python file; read as "utf-8-sig", so a
            leading byte-order mark is dropped.

    Returns:
        A ``(code_lines, comments)`` tuple. ``code_lines`` holds every line
        of the file with its line ending. ``comments`` is a list of
        3-tuples: ``(comment_text, "Single-line", code_before_comment)`` for
        ``#`` comments containing Chinese characters, and
        ``(block_text, "Multi-line", indent)`` for every triple-quoted block
        whose opening line starts with the delimiter. A block that is still
        open at end of file is not reported.
    """
    with open(file_path, "r", encoding="utf-8-sig") as f:
        code_lines = f.readlines()  # keeps blank lines and line endings

    comments = []
    block_lines = []     # lines of the block currently being collected
    block_delim = None   # delimiter of the open block, None when outside one
    block_indent = ""

    for line in code_lines:
        if block_delim is not None:
            # Inside a block every line belongs to it, even one containing
            # "#"; the block ends on the first line holding the delimiter,
            # wherever on the line it appears.
            block_lines.append(line)
            if block_delim in line:
                comments.append(("".join(block_lines), "Multi-line", block_indent))
                block_delim = None
            continue

        opening = re.match(r"(\s*)('''|\"\"\")", line)
        if opening:
            block_indent, block_delim = opening.groups()
            if block_delim in line[opening.end():]:
                # Opened and closed on the same line (one-line docstring).
                comments.append((line, "Multi-line", block_indent))
                block_delim = None
            else:
                block_lines = [line]
            continue

        single = re.match(r"^(.*?)\s*(#.*)", line)
        if single and re.search("[\u4e00-\u9fff]", single.group(2)):
            # Only comments containing Chinese characters are collected.
            comments.append((single.group(2), "Single-line", single.group(1)))

    return code_lines, comments

# Translate a comment (Chinese -> Russian)
def translate_comment(text, prefix="translate to ru: "):
    """Translate one comment using the module-level ``model``, ``tokenizer`` and ``device``.

    NOTE(review): the fine-tuning inputs built earlier in this notebook use
    the prefix "translated into russian: ", which differs from this default
    — confirm which prompt the loaded model expects.

    Args:
        text: Comment text to translate.
        prefix: Task prompt placed in front of ``text``.

    Returns:
        The first decoded sequence produced by ``model.generate``.
    """
    encoded = tokenizer(prefix + text, return_tensors="pt", padding=True, truncation=True)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    output_ids = model.generate(**on_device, max_length=512)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

# Replace comments in the code while keeping formatting (indentation & blank lines)
def replace_comments_in_code(code_lines, comments_translations):
    """Return a copy of ``code_lines`` with comments swapped for their translations.

    Args:
        code_lines: Source lines, each including its line ending.
        comments_translations: List of ``(original, translated, kind, indent)``
            tuples, ``kind`` being "Single-line" or "Multi-line". ``indent``
            is only used for multi-line blocks: a single-line comment is
            replaced in place, so whatever precedes it on its line
            (indentation or code) is already preserved.

    Returns:
        A new list of lines; ``code_lines`` itself is not modified.
    """
    new_code_lines = code_lines[:]

    for orig, trans, ctype, indent in comments_translations:
        if trans == orig:
            continue  # untranslated entry: leave the source untouched

        if ctype == "Single-line":
            # Drop any "#" echoed by the translation so the marker is not doubled.
            replacement = "# " + trans.strip().lstrip("#").strip()
            for i, line in enumerate(new_code_lines):
                if orig in line:
                    new_code_lines[i] = line.replace(orig, replacement, 1)
                    break

        elif ctype == "Multi-line":
            # keepends=True so the pieces compare equal to the newline-terminated
            # entries of new_code_lines.
            orig_lines = orig.splitlines(keepends=True)
            if not orig_lines:
                continue

            text = trans.strip()
            delim = orig.lstrip()[:3]
            if delim in ('"""', "'''"):
                # Re-wrap in the original delimiter when the translation lost
                # it, so the result is still a string literal.
                if not text.startswith(delim):
                    text = delim + text
                if len(text) < 6 or not text.endswith(delim):
                    text = text + delim
            trans_lines = [(indent + part).rstrip() + "\n" for part in text.splitlines()]

            span = len(orig_lines)
            for i in range(len(new_code_lines) - span + 1):
                if new_code_lines[i:i + span] == orig_lines:
                    new_code_lines[i:i + span] = trans_lines
                    break

    return new_code_lines

# Process a Python file: translate its Chinese comments into Russian
def process_python_file(input_file_path, output_file_path):
    """Extract, translate and substitute the comments of one Python file.

    Relies on ``extract_comments``, ``translate_comment`` and
    ``replace_comments_in_code`` defined alongside it.

    Args:
        input_file_path: Path of the source file to read.
        output_file_path: Path the rewritten file is written to (UTF-8).
    """
    code_lines, comments = extract_comments(input_file_path)

    # Pair each comment with its translation; comments without Chinese
    # characters are paired with themselves.
    comments_translations = [
        (
            comment,
            translate_comment(comment) if re.search("[\u4e00-\u9fff]", comment) else comment,
            ctype,
            indent,
        )
        for comment, ctype, indent in comments
    ]

    new_code_lines = replace_comments_in_code(code_lines, comments_translations)

    with open(output_file_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(new_code_lines)

    print(f"新文件已生成：{output_file_path}")

# Example: process a single Python file
input_file = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\c2e2r.py"  # replace with the path of the file to translate
output_file = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\translated_test_5.py"  # replace with the desired output path
process_python_file(input_file, output_file)

In [None]:
import re

# Extract comments from a Python file while keeping the original line structure
def extract_comments(file_path):
    """Collect full-line ``#`` comments and triple-quoted blocks from a file.

    The scan is line based (regular expressions, not a real parser).
    Comments that follow code on the same line are not collected.

    Args:
        file_path: Path of the Python file; read as "utf-8-sig", so a
            leading byte-order mark is dropped.

    Returns:
        A ``(code_lines, comments)`` tuple. ``code_lines`` holds every line
        of the file with its line ending. ``comments`` is a list of
        3-tuples: ``(comment_with_indent, "Single-line", comment_without_indent)``
        for each line consisting only of a ``#`` comment, and
        ``(block_text, "Multi-line", indent)`` for every triple-quoted block
        whose opening line starts with the delimiter. A block that is still
        open at end of file is not reported.
    """
    with open(file_path, "r", encoding="utf-8-sig") as f:
        code_lines = f.readlines()  # keeps blank lines and line endings

    comments = []
    block_lines = []     # lines of the block currently being collected
    block_delim = None   # delimiter of the open block, None when outside one
    block_indent = ""

    for line in code_lines:
        if block_delim is not None:
            # Inside a block every line belongs to it, even one starting
            # with "#"; the block ends on the first line holding the
            # delimiter, wherever on the line it appears.
            block_lines.append(line)
            if block_delim in line:
                comments.append(("".join(block_lines), "Multi-line", block_indent))
                block_delim = None
            continue

        opening = re.match(r"(\s*)('''|\"\"\")", line)
        if opening:
            block_indent, block_delim = opening.groups()
            if block_delim in line[opening.end():]:
                # Opened and closed on the same line (one-line docstring).
                comments.append((line, "Multi-line", block_indent))
                block_delim = None
            else:
                block_lines = [line]
            continue

        single = re.match(r"(\s*#.*)", line)
        if single:
            text = single.group(1)
            comments.append((text, "Single-line", text.lstrip()))

    return code_lines, comments

# Translate a comment (Chinese -> Russian)
def translate_comment(text, prefix="translate to ru: "):
    """Translate one comment using the module-level ``model``, ``tokenizer`` and ``device``.

    NOTE(review): the fine-tuning inputs built earlier in this notebook use
    the prefix "translated into russian: ", which differs from this default
    — confirm which prompt the loaded model expects.

    Args:
        text: Comment text to translate.
        prefix: Task prompt placed in front of ``text``.

    Returns:
        The first decoded sequence produced by ``model.generate``.
    """
    encoded = tokenizer(prefix + text, return_tensors="pt", padding=True, truncation=True)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    output_ids = model.generate(**on_device, max_length=512)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

# Replace comments in the code while keeping formatting (indentation & blank lines)
def replace_comments_in_code(code_lines, comments_translations):
    """Return a copy of ``code_lines`` with comments swapped for their translations.

    Args:
        code_lines: Source lines, each including its line ending.
        comments_translations: List of ``(original, translated, kind, indent)``
            tuples, ``kind`` being "Single-line" or "Multi-line". For
            single-line comments the indentation is taken from the leading
            whitespace of ``original`` and ``indent`` is not used; for
            multi-line blocks ``indent`` is prepended to every translated
            line.

    Returns:
        A new list of lines; ``code_lines`` itself is not modified.
    """
    new_code_lines = code_lines[:]

    for orig, trans, ctype, indent in comments_translations:
        if trans == orig:
            continue  # untranslated entry: leave the source untouched

        if ctype == "Single-line":
            stripped = orig.lstrip()
            lead = orig[:len(orig) - len(stripped)]
            text = trans.strip()
            if stripped.startswith("#") and not text.startswith("#"):
                # Keep the line a comment even if the translation lost its marker.
                text = "# " + text
            replacement = lead + text
            for i, line in enumerate(new_code_lines):
                if orig in line:
                    new_code_lines[i] = line.replace(orig, replacement, 1)
                    break

        elif ctype == "Multi-line":
            # keepends=True so the pieces compare equal to the newline-terminated
            # entries of new_code_lines.
            orig_lines = orig.splitlines(keepends=True)
            if not orig_lines:
                continue

            text = trans.strip()
            delim = orig.lstrip()[:3]
            if delim in ('"""', "'''"):
                # Re-wrap in the original delimiter when the translation lost
                # it, so the result is still a string literal.
                if not text.startswith(delim):
                    text = delim + text
                if len(text) < 6 or not text.endswith(delim):
                    text = text + delim
            trans_lines = [(indent + part).rstrip() + "\n" for part in text.splitlines()]

            span = len(orig_lines)
            for i in range(len(new_code_lines) - span + 1):
                if new_code_lines[i:i + span] == orig_lines:
                    new_code_lines[i:i + span] = trans_lines
                    break

    return new_code_lines

# Process a Python file: translate its Chinese comments into Russian
def process_python_file(input_file_path, output_file_path):
    """Extract, translate and substitute the comments of one Python file.

    Relies on ``extract_comments``, ``translate_comment`` and
    ``replace_comments_in_code`` defined alongside it.

    Args:
        input_file_path: Path of the source file to read.
        output_file_path: Path the rewritten file is written to (UTF-8).
    """
    code_lines, comments = extract_comments(input_file_path)

    # Pair each comment with its translation; comments without Chinese
    # characters are paired with themselves.
    comments_translations = [
        (
            comment,
            translate_comment(comment) if re.search("[\u4e00-\u9fff]", comment) else comment,
            ctype,
            indent,
        )
        for comment, ctype, indent in comments
    ]

    new_code_lines = replace_comments_in_code(code_lines, comments_translations)

    with open(output_file_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(new_code_lines)

    print(f"新文件已生成：{output_file_path}")

# Example: process a single Python file
input_file = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\c2e2r.py"  # replace with the path of the file to translate
output_file = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\translated_test_4.py"  # replace with the desired output path
process_python_file(input_file, output_file)

In [None]:
import re

# 提取 Python 文件中的注释，并保留代码的原始行结构
def extract_comments(file_path):
    """Collect the comments of a Python file while keeping its lines intact.

    Two kinds of entries are reported, in file order:

    * ``(text, "Single-line")`` for a line whose first non-blank character
      is ``#``. ``text`` is the line without its terminator, leading
      whitespace included. Trailing comments after code are not collected.
    * ``(text, "Multi-line")`` for a triple-quoted block that starts a line.
      ``text`` is the concatenation of every line of the block, terminators
      included, from the opening line to the line holding the closing
      delimiter.

    A block is closed by the same delimiter that opened it, wherever it
    appears on a line, so a triple-quoted string written on a single line is
    reported as a one-line block. Lines inside an open block are never
    treated as single-line comments. A block still open at end of file is
    dropped.

    Args:
        file_path: Path of the file to read (UTF-8, an optional BOM is
            stripped).

    Returns:
        A tuple ``(code_lines, comments)``: the file's lines with their
        terminators, and the list of comment entries described above.
    """
    with open(file_path, "r", encoding="utf-8-sig") as f:
        code_lines = f.readlines()  # keeps blank lines and line terminators

    comments = []
    open_delimiter = ""  # ''' or """ of the block being collected; "" when outside one
    block_lines = []  # lines gathered for the block currently open

    for line in code_lines:
        if open_delimiter:
            # Inside a block every line is content, including lines that
            # look like "#" comments.
            block_lines.append(line)
            if open_delimiter in line:
                comments.append(("".join(block_lines), "Multi-line"))
                open_delimiter = ""
                block_lines = []
            continue

        single_match = re.match(r"(\s*#.*)", line)
        if single_match:
            comments.append((single_match.group(1), "Single-line"))
            continue

        # Only a homogeneous run of three quotes opens a block.
        opening = re.match(r"\s*('''|\"\"\")", line)
        if opening:
            if opening.group(1) in line[opening.end():]:
                # Opened and closed on the same line.
                comments.append((line, "Multi-line"))
            else:
                open_delimiter = opening.group(1)
                block_lines = [line]

    return code_lines, comments

# 翻译注释（中文 -> 俄文）
def translate_comment(text, prefix="translate to ru: "):
    """Translate one comment with the module-level model and tokenizer.

    ``prefix`` is prepended to ``text``, the result is tokenized into
    PyTorch tensors, moved to the module-level ``device`` and passed to
    ``model.generate`` with ``max_length=512``. The first decoded sequence
    (special tokens removed) is returned.

    NOTE(review): the fine-tuning data built earlier in this file uses the
    prefix "translated into russian: "; confirm which prefix the loaded
    model expects.

    Args:
        text: Comment text to translate.
        prefix: Instruction string placed in front of ``text``.

    Returns:
        The decoded translation as a string.
    """
    encoded = tokenizer(prefix + text, return_tensors="pt", padding=True, truncation=True)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    output_ids = model.generate(**on_device, max_length=512)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

# 替换代码中的中文注释，保持原有格式（缩进 & 空行）
def replace_comments_in_code(code_lines, comments_translations):
    """Return a copy of ``code_lines`` with comments swapped for translations.

    Args:
        code_lines: Source lines, each with its line terminator.
        comments_translations: Iterable of
            ``(original, translation, comment_type)`` where ``comment_type``
            is ``"Single-line"`` or ``"Multi-line"``.

    Returns:
        A new list of lines; ``code_lines`` itself is not modified.

    Behaviour:
        * Entries whose original is empty, whose translation is blank, or
          whose translation equals the original are skipped, so a failed
          translation never deletes or corrupts code.
        * Single-line: the first line containing ``original`` is rewritten.
          The leading whitespace of ``original`` is re-applied to the
          translation so the comment keeps its indentation.
        * Multi-line: ``original`` is split into lines with their
          terminators and the first identical run of lines is replaced.
          The replacement is given a final newline when the original had
          one, so the following code line is not merged into it.
    """
    new_code_lines = code_lines[:]  # work on a copy, leave the input untouched

    for orig, trans, ctype in comments_translations:
        if not orig or not trans.strip() or orig == trans:
            continue

        if ctype == "Single-line":
            # Keep the indentation of the original comment even if the
            # translation came back without it.
            indent = orig[: len(orig) - len(orig.lstrip())]
            replacement = indent + trans.lstrip()
            for i, line in enumerate(new_code_lines):
                if orig in line:
                    new_code_lines[i] = line.replace(orig, replacement, 1)
                    break

        elif ctype == "Multi-line":
            # code_lines keep their "\n", so the pieces compared against
            # them must keep it too.
            orig_lines = _split_keeping_newlines(orig)
            trans_lines = _split_keeping_newlines(trans)
            if orig_lines[-1].endswith("\n") and not trans_lines[-1].endswith("\n"):
                trans_lines[-1] += "\n"
            width = len(orig_lines)
            for i in range(len(new_code_lines) - width + 1):
                if new_code_lines[i:i + width] == orig_lines:
                    new_code_lines[i:i + width] = trans_lines
                    break

    return new_code_lines


def _split_keeping_newlines(text):
    """Split ``text`` on "\\n" only, keeping the terminator on each piece."""
    pieces = text.split("\n")
    lines = [piece + "\n" for piece in pieces[:-1]]
    if pieces[-1]:
        lines.append(pieces[-1])
    return lines

# 处理 Python 文件，翻译中文注释为俄语
def process_python_file(input_file_path, output_file_path):
    """Translate the Chinese comments of one Python file and save the result.

    The file is read with ``extract_comments``; every comment that contains
    at least one character in the range U+4E00-U+9FFF is passed through
    ``translate_comment``; all other comments are kept as they are. The
    pairs are then handed to ``replace_comments_in_code`` and the resulting
    lines are written to ``output_file_path`` as UTF-8.

    Args:
        input_file_path: Path of the Python file to read.
        output_file_path: Path of the file to write.
    """
    source_lines, found_comments = extract_comments(input_file_path)

    def _maybe_translate(text):
        # Only comments containing Chinese characters go through the model.
        return translate_comment(text) if re.search("[\u4e00-\u9fff]", text) else text

    comments_translations = [
        (text, _maybe_translate(text), kind) for text, kind in found_comments
    ]

    updated_lines = replace_comments_in_code(source_lines, comments_translations)

    # The lines still carry their own terminators, so writelines is enough.
    with open(output_file_path, "w", encoding="utf-8") as out:
        out.writelines(updated_lines)

    print(f"新文件已生成：{output_file_path}")

# Example: process a single Python file
input_file = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\c2e2r.py"  # replace with the path of the file to translate
output_file = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\translated_test_3.py"  # replace with the path of the generated output file
process_python_file(input_file, output_file)

In [None]:
import re

# 提取 Python 文件中的注释，并保留代码的原始行结构
def extract_comments(file_path):
    """Collect the comments of a Python file while keeping its lines intact.

    Two kinds of entries are reported, in file order:

    * ``(text, "Single-line")`` for a line whose first non-blank character
      is ``#``. ``text`` is the line without its terminator, leading
      whitespace included. Trailing comments after code are not collected.
    * ``(text, "Multi-line")`` for a triple-quoted block that starts a line.
      ``text`` is the concatenation of every line of the block, terminators
      included, from the opening line to the line holding the closing
      delimiter.

    A block is closed by the same delimiter that opened it, wherever it
    appears on a line, so a triple-quoted string written on a single line is
    reported as a one-line block. Lines inside an open block are never
    treated as single-line comments. A block still open at end of file is
    dropped.

    Args:
        file_path: Path of the file to read (UTF-8, an optional BOM is
            stripped).

    Returns:
        A tuple ``(code_lines, comments)``: the file's lines with their
        terminators, and the list of comment entries described above.
    """
    with open(file_path, "r", encoding="utf-8-sig") as f:
        code_lines = f.readlines()  # keeps blank lines and line terminators

    comments = []
    open_delimiter = ""  # ''' or """ of the block being collected; "" when outside one
    block_lines = []  # lines gathered for the block currently open

    for line in code_lines:
        if open_delimiter:
            # Inside a block every line is content, including lines that
            # look like "#" comments.
            block_lines.append(line)
            if open_delimiter in line:
                comments.append(("".join(block_lines), "Multi-line"))
                open_delimiter = ""
                block_lines = []
            continue

        single_match = re.match(r"(\s*#.*)", line)
        if single_match:
            comments.append((single_match.group(1), "Single-line"))
            continue

        # Only a homogeneous run of three quotes opens a block.
        opening = re.match(r"\s*('''|\"\"\")", line)
        if opening:
            if opening.group(1) in line[opening.end():]:
                # Opened and closed on the same line.
                comments.append((line, "Multi-line"))
            else:
                open_delimiter = opening.group(1)
                block_lines = [line]

    return code_lines, comments

# 翻译注释（中文 -> 俄文）
def translate_comment(text, prefix="translate to ru: "):
    """Translate one comment with the module-level model and tokenizer.

    ``prefix`` is prepended to ``text``, the result is tokenized into
    PyTorch tensors, moved to the module-level ``device`` and passed to
    ``model.generate`` with ``max_length=512``. The first decoded sequence
    (special tokens removed) is returned.

    NOTE(review): the fine-tuning data built earlier in this file uses the
    prefix "translated into russian: "; confirm which prefix the loaded
    model expects.

    Args:
        text: Comment text to translate.
        prefix: Instruction string placed in front of ``text``.

    Returns:
        The decoded translation as a string.
    """
    encoded = tokenizer(prefix + text, return_tensors="pt", padding=True, truncation=True)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    output_ids = model.generate(**on_device, max_length=512)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

# 替换代码中的中文注释，保持原有格式（缩进 & 空行）
def replace_comments_in_code(code_lines, comments_translations):
    """Return a copy of ``code_lines`` with comments swapped for translations.

    Args:
        code_lines: Source lines, each with its line terminator.
        comments_translations: Iterable of
            ``(original, translation, comment_type)`` where ``comment_type``
            is ``"Single-line"`` or ``"Multi-line"``.

    Returns:
        A new list of lines; ``code_lines`` itself is not modified.

    Behaviour:
        * Entries whose original is empty, whose translation is blank, or
          whose translation equals the original are skipped, so a failed
          translation never deletes or corrupts code.
        * Single-line: the first line containing ``original`` is rewritten.
          The leading whitespace of ``original`` is re-applied to the
          translation so the comment keeps its indentation.
        * Multi-line: ``original`` is split into lines with their
          terminators and the first identical run of lines is replaced.
          The replacement is given a final newline when the original had
          one, so the following code line is not merged into it.
    """
    new_code_lines = code_lines[:]  # work on a copy, leave the input untouched

    for orig, trans, ctype in comments_translations:
        if not orig or not trans.strip() or orig == trans:
            continue

        if ctype == "Single-line":
            # Keep the indentation of the original comment even if the
            # translation came back without it.
            indent = orig[: len(orig) - len(orig.lstrip())]
            replacement = indent + trans.lstrip()
            for i, line in enumerate(new_code_lines):
                if orig in line:
                    new_code_lines[i] = line.replace(orig, replacement, 1)
                    break

        elif ctype == "Multi-line":
            # code_lines keep their "\n", so the pieces compared against
            # them must keep it too.
            orig_lines = _split_keeping_newlines(orig)
            trans_lines = _split_keeping_newlines(trans)
            if orig_lines[-1].endswith("\n") and not trans_lines[-1].endswith("\n"):
                trans_lines[-1] += "\n"
            width = len(orig_lines)
            for i in range(len(new_code_lines) - width + 1):
                if new_code_lines[i:i + width] == orig_lines:
                    new_code_lines[i:i + width] = trans_lines
                    break

    return new_code_lines


def _split_keeping_newlines(text):
    """Split ``text`` on "\\n" only, keeping the terminator on each piece."""
    pieces = text.split("\n")
    lines = [piece + "\n" for piece in pieces[:-1]]
    if pieces[-1]:
        lines.append(pieces[-1])
    return lines

# 处理 Python 文件，翻译中文注释为俄语
def process_python_file(input_file_path, output_file_path):
    """Translate the Chinese comments of one Python file and save the result.

    The file is read with ``extract_comments``; every comment that contains
    at least one character in the range U+4E00-U+9FFF is passed through
    ``translate_comment``; all other comments are kept as they are. The
    pairs are then handed to ``replace_comments_in_code`` and the resulting
    lines are written to ``output_file_path`` as UTF-8.

    Args:
        input_file_path: Path of the Python file to read.
        output_file_path: Path of the file to write.
    """
    source_lines, found_comments = extract_comments(input_file_path)

    def _maybe_translate(text):
        # Only comments containing Chinese characters go through the model.
        return translate_comment(text) if re.search("[\u4e00-\u9fff]", text) else text

    comments_translations = [
        (text, _maybe_translate(text), kind) for text, kind in found_comments
    ]

    updated_lines = replace_comments_in_code(source_lines, comments_translations)

    # The lines still carry their own terminators, so writelines is enough.
    with open(output_file_path, "w", encoding="utf-8") as out:
        out.writelines(updated_lines)

    print(f"新文件已生成：{output_file_path}")

# Example: process a single Python file
input_file = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\c2e2r_2.py"  # replace with the path of the file to translate
output_file = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\translated_test_2.py"  # replace with the path of the generated output file
process_python_file(input_file, output_file)

In [None]:
import re

# 提取 Python 文件中的注释
def extract_comments(file_path):
    """Read a Python file and list its comments with a type label.

    Whole-line ``#`` comments are found with a multi-line regular
    expression and reported as ``(text, "Single-line")``. Text enclosed by a
    matching pair of triple single quotes or triple double quotes is
    reported as ``(stripped_text, "Multi-line")``. Only a run of three
    identical quote characters counts as a delimiter, so ordinary code such
    as a quoted quote character next to another string is not mistaken for
    a block.

    All single-line entries come first, followed by all multi-line entries.

    Args:
        file_path: Path of the file to read (UTF-8, an optional BOM is
            stripped).

    Returns:
        A tuple ``(code, comments)``: the full file content as one string
        and the list of comment entries described above.
    """
    with open(file_path, "r", encoding="utf-8-sig") as f:
        code = f.read()

    # Lines whose first non-blank character is "#".
    single_line_comments = re.findall(r"(^\s*#.*$)", code, flags=re.MULTILINE)

    # Group 1 is the delimiter, group 2 the enclosed text; the backreference
    # requires the closing delimiter to be the same as the opening one.
    multi_line_matches = re.findall(r"('''|\"\"\")([\s\S]*?)\1", code)

    comments = [(comment, "Single-line") for comment in single_line_comments]
    comments.extend((match[1].strip(), "Multi-line") for match in multi_line_matches)

    return code, comments

# 使用微调后的模型进行翻译（从中文到俄文）
def translate_comment(text, prefix="translate to ru: "):
    """Translate one comment with the module-level model and tokenizer.

    ``prefix`` is prepended to ``text`` to tell the model the translation
    direction. The result is tokenized into PyTorch tensors, moved to the
    module-level ``device`` and passed to ``model.generate`` with
    ``max_length=512``. The first decoded sequence (special tokens removed)
    is returned.

    NOTE(review): the fine-tuning data built earlier in this file uses the
    prefix "translated into russian: "; confirm which prefix the loaded
    model expects.

    Args:
        text: Comment text to translate.
        prefix: Instruction string placed in front of ``text``.

    Returns:
        The decoded translation as a string.
    """
    encoded = tokenizer(prefix + text, return_tensors="pt", padding=True, truncation=True)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    output_ids = model.generate(**on_device, max_length=512)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

# 根据原代码内容替换注释为翻译后的俄文注释
def replace_comments_in_code(code, comments_translations):
    """Substitute each original comment text in ``code`` with its translation.

    The entries are applied one after another, in order, and every
    occurrence of an original text is replaced, so a later entry also sees
    the text produced by earlier ones.

    Args:
        code: Full source text as one string.
        comments_translations: Iterable of
            ``(original, translation, comment_type)``; the type is not used
            for the substitution.

    Returns:
        The source text after all substitutions.
    """
    result = code
    for source_text, target_text, _kind in comments_translations:
        # Plain text substitution keyed on the exact original comment.
        result = result.replace(source_text, target_text)
    return result

# 主流程：上传文件 -> 翻译注释 -> 生成新 .py 文件
def process_python_file(input_file_path, output_file_path):
    """Translate the Chinese comments of one Python file and save the result.

    The file is read with ``extract_comments``; every comment that contains
    at least one character in the range U+4E00-U+9FFF is passed through
    ``translate_comment``; all other comments are kept as they are. The
    pairs are then handed to ``replace_comments_in_code`` and the resulting
    text is written to ``output_file_path`` as UTF-8.

    Args:
        input_file_path: Path of the Python file to read.
        output_file_path: Path of the file to write.
    """
    source_text, found_comments = extract_comments(input_file_path)

    def _maybe_translate(text):
        # Only comments containing Chinese characters go through the model.
        return translate_comment(text) if re.search("[\u4e00-\u9fff]", text) else text

    comments_translations = [
        (text, _maybe_translate(text), kind) for text, kind in found_comments
    ]

    updated_text = replace_comments_in_code(source_text, comments_translations)

    with open(output_file_path, "w", encoding="utf-8") as out:
        out.write(updated_text)
    print(f"新文件已生成：{output_file_path}")

# Example: process one Python file
input_file = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\c2e2r_2.py"   # replace with the path of the Python file to translate
output_file = r"C:\Users\gdnjr5233_YOLO\Desktop\ВКР_2025\c2r_test.py"  # replace with the output file path
process_python_file(input_file, output_file)