# 源数据处理

## 1 导入相关包

In [1]:
import pandas as pd
import random
import os

In [3]:
en_path = r'H:\datasets\data\training-parallel-nc-v13\news-commentary-v13.zh-en.en'
ch_path = r'H:\datasets\data\training-parallel-nc-v13\news-commentary-v13.zh-en.zh'
json_path=r'C:\Users\30535\Desktop'

In [6]:
class TextToCsv:
    ## 定义tokenizer,对原始数据进行处理
    def __init__(self, en_path, ch_path,csv_path,text_pair_nums=200000):
        """
        初始化
        :param en_path: 英文数据路径
        :param ch_path: 中文数据路径
        :json_path 文件保存路径
        :text_pair_nums: 使用多少对数据
        """
        self.en_path = en_path  # 英文路径
        self.ch_path = ch_path  # 中文路径
        self.text_pair_nums=text_pair_nums
        
        # 读取原始英文数据
        self.en_data = self.__read_ori_data(en_path)
        # 读取原始中文数据
        self.ch_data = self.__read_ori_data(ch_path)
        self.x=self.return_csv(csv_path)

    def __read_ori_data(self, path):
        """
        读取原始数据
        :param path: 数据路径
        :return: 返回一个列表，每个元素是一条数据
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = f.read().split('\n')[:-1]
            self.text_pair_nums =self.text_pair_nums if self.text_pair_nums <=len(data) else len(data)
            data = data[:self.text_pair_nums] 
        return data
    
    def return_csv(self,csv_path):
        """
        将源数据处理成json文件
        :json_path 文件保存路径
        """
        data=[]
        for i in range(self.text_pair_nums):
            if len(self.en_data[i])>127 or len(self.en_data[i])>127:
                continue
            data.append([
                self.en_data[i],
                self.ch_data[i]]
            )
            data.append([
                self.ch_data[i],
                self.en_data[i]]
            )
        random.shuffle(data)
        csv_train=os.path.join(csv_path,'train.csv')
        csv_test=os.path.join(csv_path,'test.csv')
        dat=pd.DataFrame(data[:len(data)-500],columns=['src','tgt'])
        dat2=pd.DataFrame(data[len(data)-500:],columns=['src','tgt'])
        dat.to_csv(csv_train,index=False)
        dat2.to_csv(csv_test,index=False)
        

In [7]:
TextToCsv(en_path,ch_path,json_path)

<__main__.TextToCsv at 0x212417d7bd0>

## 1 导入相关包

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

## 2 加载数据集

In [3]:
data_train=r'C:\Users\30535\Desktop\train.csv'
data_test=r'C:\Users\30535\Desktop\test.csv'
ds=load_dataset('csv',data_files={'train':data_train, 'test': data_test},
                                split=['train', 'test'])
ds

[Dataset({
     features: ['src', 'tgt'],
     num_rows: 191778
 }),
 Dataset({
     features: ['src', 'tgt'],
     num_rows: 500
 })]

## 4 数据处理

In [4]:
model_path=r'H:\models\bloom-389m-zh'
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [5]:
def process_func(examples):
    MAX_LENGTH = 150
    contents='机器翻译:\n' + examples['src']
    # 对输入与label进行编码
    inputs=tokenizer(contents)
    labels = tokenizer(text_target=examples['tgt'] + tokenizer.eos_token)
    input_ids=inputs["input_ids"]+labels["input_ids"]
    attention_mask=inputs["attention_mask"] + labels["attention_mask"]
    labels = [-100] * len(inputs["input_ids"]) + labels["input_ids"]
    # 数据截断
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [6]:
tokenized_train=ds[0].map(process_func, remove_columns=ds[0].column_names)

In [14]:
tokenized_train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 191778
})

## 5 创建模型

In [7]:
model=AutoModelForCausalLM.from_pretrained(model_path)

In [8]:
model = model.half()
model=model.to()

In [10]:
x="机器翻译:\n{}".format("what is this。").strip()
ipt = tokenizer(x,return_tensors='pt').to('cuda')
print(tokenizer.decode(model.generate(**ipt,max_length=256, do_sample=False)[0],skip_special_tokens=True)[len(x):])



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

## 6 使用Lora进行微调

In [16]:
# 6.1 创建配置文件
from peft import LoraConfig,get_peft_model,TaskType
comfig = LoraConfig(task_type=TaskType.CAUSAL_LM)
comfig

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})

In [17]:
# 6.2 创建模型
model_lora = get_peft_model(model,comfig)

In [18]:
model_lora=model_lora.half()

In [19]:
model_lora.print_trainable_parameters()

trainable params: 786,432 || all params: 346,555,392 || trainable%: 0.22692822508443325


## 7 配置训练参数

In [21]:
import os
os.environ["WANDB_DISABLED"] = "true" # 防止日志输出到wandb.ai
args= TrainingArguments(
                                  output_dir='./modelcheak/trans1',
                                  logging_dir=r'./modelcheak/trans1',
                                  per_device_train_batch_size=4,  # batch_size
                                  gradient_accumulation_steps=8,
                                  logging_steps=20,
                                  optim="adafactor",  # 使用特定的优化器优化显存
                                  save_strategy='epoch',  # 每一轮保存一个模型
                                  num_train_epochs=1,
                                  adam_epsilon=1e-4
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## 8 创建训练器

In [22]:
trainr=Trainer(
    args=args,
    model=model_lora,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer)
)

In [23]:
trainr.train()

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
20,4.4067
40,4.0447
60,3.8683
80,3.7067
100,3.5873
120,3.5214
140,3.3367
160,3.3098
180,3.2238
200,3.1285


TrainOutput(global_step=5993, training_loss=2.61267729424563, metrics={'train_runtime': 2310.065, 'train_samples_per_second': 83.018, 'train_steps_per_second': 2.594, 'total_flos': 2.029660253631283e+16, 'train_loss': 2.61267729424563, 'epoch': 1.0})

## 9 权重合并与

In [24]:
from peft import PeftModel
# model_id 是checkpoint那个路径
prft_model=PeftModel.from_pretrained(model=model,model_id="./modelcheak/trans1/checkpoint-5993")
# 权重合并
merge_model=prft_model.merge_and_unload()

In [25]:
# 模型保存
merge_model.save_pretrained('./modelcheak/trans')

In [55]:
x="机器翻译:\n{}".format("what is this。").strip()
ipt = tokenizer(x,return_tensors='pt').to('cuda')
print(tokenizer.decode(merge_model.generate(**ipt,max_length=256, do_sample=False)[0],skip_special_tokens=True)[len(x):])

这又是什么呢？


In [56]:
x="机器翻译:\n{}".format("这又是什么呢？").strip()
ipt = tokenizer(x,return_tensors='pt').to('cuda')
print(tokenizer.decode(merge_model.generate(**ipt,max_length=256, do_sample=False)[0],skip_special_tokens=True)[len(x):])

What is this?


## 9 模型推理

In [32]:
from transformers import pipeline

In [33]:
pipe=pipeline('text2text-generation',model=merge_model,tokenizer=tokenizer,device=0)

The model 'BloomForCausalLM' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].


In [35]:
pipe('机器翻译:\n'+'我有一个苹果',max_length=30,do_sample=False)

[{'generated_text': '机器翻译:\n我有一个苹果I have a Apple'}]