# 源数据处理

## 1 导入相关包

In [1]:
import pandas as pd
import random
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import PeftModel
from transformers import pipeline

bin D:\anac\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


In [4]:
en_path = r'H:\datasets\data\翻译1\test.en.txt'
ch_path = r'H:\datasets\data\翻译1\test.ch.txt'
csv_path=r'C:\Users\30535\Desktop'

In [3]:
class TextToCsv:
    ## 定义tokenizer,对原始数据进行处理
    def __init__(self, en_path, ch_path,csv_path,text_pair_nums=30000):
        """
        初始化
        :param en_path: 英文数据路径
        :param ch_path: 中文数据路径
        :csv_path 文件保存路径
        :text_pair_nums: 使用多少对数据
        """
        self.en_path = en_path  # 英文路径
        self.ch_path = ch_path  # 中文路径
        self.text_pair_nums=text_pair_nums
        
        # 读取原始英文数据
        self.en_data = self.__read_ori_data(en_path)
        # 读取原始中文数据
        self.ch_data = self.__read_ori_data(ch_path)
        self.x=self.return_csv(csv_path)

    def __read_ori_data(self, path):
        """
        读取原始数据
        :param path: 数据路径
        :return: 返回一个列表，每个元素是一条数据
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = f.read().split('\n')[:-1]
            self.text_pair_nums =self.text_pair_nums if self.text_pair_nums <=len(data) else len(data)
            data = data[:self.text_pair_nums] 
        return data
    
    def return_csv(self,csv_path):
        """
        将源数据处理成csv文件
        :csv_path 文件保存路径
        """
        data=[]
        # 遍历所有数据，长度大于127的数据抛弃
        for i in range(self.text_pair_nums):
            if len(self.en_data[i])>127 or len(self.en_data[i])>127:
                continue
            # 英文→中文
            data.append([
                self.en_data[i],
                self.ch_data[i]]
            )
            # 中文→英文
            data.append([
                self.ch_data[i],
                self.en_data[i]]
            )
        random.shuffle(data) # 数据随机打乱
        csv_train=os.path.join(csv_path,'train.csv') # 训练集文件
        csv_test=os.path.join(csv_path,'test.csv') # 测试集文件
        dat=pd.DataFrame(data[:len(data)-500],columns=['src','tgt']) # 训练集
        dat2=pd.DataFrame(data[len(data)-500:],columns=['src','tgt']) # 测试集
        dat.to_csv(csv_train,index=False) # 转换为csv文件
        dat2.to_csv(csv_test,index=False)
        

In [5]:
TextToCsv(en_path,ch_path,csv_path)

<__main__.TextToCsv at 0x176b22e8850>

## 1 导入相关包

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

## 2 加载数据集

In [3]:
data_train=r'C:\Users\30535\Desktop\train.csv'
data_test=r'C:\Users\30535\Desktop\test.csv'
ds=load_dataset('csv',data_files={'train':data_train, 'test': data_test},
                                split=['train', 'test'])
ds

[Dataset({
     features: ['src', 'tgt'],
     num_rows: 92644
 }),
 Dataset({
     features: ['src', 'tgt'],
     num_rows: 1000
 })]

## 4 数据处理

In [4]:
model_path=r'H:\models\bloom-1b4-zh'
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [5]:
def process_func(examples):
    MAX_LENGTH = 150
    contents='机器翻译:\n' + examples['src']
    # 对输入与label进行编码
    inputs=tokenizer(contents)
    labels = tokenizer(text_target=examples['tgt'] + tokenizer.eos_token)
    input_ids=inputs["input_ids"]+labels["input_ids"]
    attention_mask=inputs["attention_mask"] + labels["attention_mask"]
    labels = [-100] * len(inputs["input_ids"]) + labels["input_ids"]
    # 数据截断
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [6]:
tokenized_train=ds[0].map(process_func, remove_columns=ds[0].column_names)

In [7]:
tokenized_train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 55750
})

## 5 创建模型

In [7]:
model=AutoModelForCausalLM.from_pretrained(model_path)

In [8]:
model = model.half()
model=model.to('cuda')

In [13]:
x="机器翻译:\n{}".format("what is this。").strip()
ipt = tokenizer(x,return_tensors='pt').to('cuda')
print(tokenizer.decode(model.generate(**ipt,max_length=256, do_sample=False)[0],skip_special_tokens=True)[len(x):])

 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译: 翻译


## 6 使用Lora进行微调

In [9]:
# 6.1 创建配置文件
from peft import LoraConfig,get_peft_model,TaskType
comfig = LoraConfig(task_type=TaskType.CAUSAL_LM)
comfig

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})

In [10]:
# 6.2 创建模型
model_lora = get_peft_model(model,comfig)

In [11]:
model_lora=model_lora.half()

In [12]:
model_lora.print_trainable_parameters()

trainable params: 1,572,864 || all params: 1,304,684,544 || trainable%: 0.120555118647899


## 7 配置训练参数

In [13]:
import os
os.environ["WANDB_DISABLED"] = "true" # 防止日志输出到wandb.ai
args= TrainingArguments(
                                  output_dir='./modelcheak/m2',
                                  logging_dir=r'./modelcheak/m2',
                                  per_device_train_batch_size=8,  # batch_size
                                  gradient_accumulation_steps=4,
                                  logging_steps=20,
                                  optim="adafactor",  # 使用特定的优化器优化显存
                                  save_strategy='epoch',  # 每一轮保存一个模型
                                  num_train_epochs=1,
                                  adam_epsilon=1e-4
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## 8 创建训练器

In [14]:
trainr=Trainer(
    args=args,
    model=model_lora,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer)
)

In [15]:
trainr.train()

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
20,4.2162
40,3.8813
60,3.4356
80,3.1482
100,2.9386
120,2.8034
140,2.758
160,2.715
180,2.7437
200,2.5815


TrainOutput(global_step=2895, training_loss=2.3527866607297065, metrics={'train_runtime': 1000.5068, 'train_samples_per_second': 92.597, 'train_steps_per_second': 2.894, 'total_flos': 3.09147635810304e+16, 'train_loss': 2.3527866607297065, 'epoch': 1.0})

## 9 权重合并与

In [16]:
from peft import PeftModel
# model_id 是checkpoint那个路径
prft_model=PeftModel.from_pretrained(model=model,model_id=r"C:\Users\30535\Desktop\CodeProgram\Python\deepstudy\code2\使用Transformer进行中英文翻译\modelcheak\m2\checkpoint-2895")
# 权重合并
merge_model=prft_model.merge_and_unload()

In [7]:
# 模型保存
merge_model.save_pretrained('./modelcheak/trans11')

In [14]:
x="机器翻译:\n{}".format("what is this。").strip()
ipt = tokenizer(x,return_tensors='pt').to('cuda')
print(tokenizer.decode(merge_model.generate(**ipt,max_length=256, do_sample=False)[0],skip_special_tokens=True)[len(x):])

这是什么？


In [19]:
x="机器翻译:\n{}".format("这又是什么呢？").strip()
ipt = tokenizer(x,return_tensors='pt').to('cuda')
print(tokenizer.decode(merge_model.generate(**ipt,max_length=256, do_sample=False)[0],skip_special_tokens=True)[len(x):])

What is this?


In [17]:
import re
import sacrebleu
def is_english_sentence(sentence):
    # 使用正则表达式检查句子中是否包含英文字母
    english_pattern = re.compile(r'[a-zA-Z]')
    match = english_pattern.search(sentence)
    
    if match:
        return True
    else:
        return False
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

smooth = SmoothingFunction().method1
bleu_scores=[]
m1,m2=[],[]
m3,m4=[],[]
import time
t=time.time()
for i in range(100):
    if i%40==0:
        print(i/len(ds[1]['src']))
    x="机器翻译:\n{}".format(ds[1]['src'][i]).strip()
    ipt = tokenizer(x,return_tensors='pt').to('cuda')
    y=tokenizer.decode(merge_model.generate(**ipt,max_length=150, do_sample=False)[0],skip_special_tokens=True)[len(x):]
    if is_english_sentence(ds[1]['tgt'][i]):
        m1.append(ds[1]['tgt'][i])
        m2.append([y])
    else:
        m3.append(list(ds[1]['tgt'][i][:-1]))
        m4.append([list(y)[:-1]])
print('时间',time.time()-t)
smooth = SmoothingFunction().method1
b1=[sacrebleu.sentence_bleu(candidate, refs).score for candidate, refs in zip(m1, m2)]
for i in range(len(m4)):
    b2 = sentence_bleu(m4[i], m3[i], weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)*100
    b1.append(b2)
print(sum(b1)/100)

0.0
0.04
0.08
时间 18.47494339942932
15.121825586870461


## 9 模型推理

In [32]:
from transformers import pipeline

In [33]:
pipe=pipeline('text2text-generation',model=merge_model,tokenizer=tokenizer,device=0)

The model 'BloomForCausalLM' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].


In [35]:
pipe('机器翻译:\n'+'我有一个苹果',max_length=30,do_sample=False)

[{'generated_text': '机器翻译:\n我有一个苹果I have a Apple'}]