In [2]:

### Load Model From huggingface
import os
import tqdm
import joblib
import numpy as np
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModel

import peft
import loralib as lora
from peft import LoraConfig

import json
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import accelerate
from accelerate import Accelerator, DeepSpeedPlugin
from transformers import get_linear_schedule_with_warmup
from tokenization_chatglm import ChatGLMTokenizer
import transformers
# from tokenizaion


In [3]:
# from tokenizaion

# --- Base model and numeric precision ---------------------------------------
checkpoint = "THUDM/chatglm-6b"
mixed_precision = 'bf16'

# --- Batching ---------------------------------------------------------------
# accumulate_step is handed to DeepSpeed/Accelerate as gradient_accumulation_steps.
accumulate_step = 8
# Upper bound used when filtering training pairs (compared against character counts).
MAX_LENGTH = 750

# --- Optimisation schedule --------------------------------------------------
# LR = 2e-5
LR = 8e-5

NUM_EPOCHS = 2
# Presumably the fraction of training steps spent warming up the learning rate;
# the scheduler call is not visible in this part of the notebook.
warm_up_ratio = 0.1

# --- LoRA adapter -----------------------------------------------------------
# from peft import LoraConfig,get_peft_model
config = LoraConfig(
    peft_type="LORA",
    target_modules=["q", "k", "v"],
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
)
'''
Args:
    r (int): Lora attention dimension
    target_modules (Union[List[str],str]): The names of the modules to apply Lora to.
    lora_alpha (float): The alpha parameter for Lora scaling.
    lora_dropout (float): The dropout probability for Lora layers. merge_weights (bool):
        Whether to merge the weights of the Lora layers with the base transformer model in eval mode.
    fan_in_fan_out (bool): Set this to True if the layer to replace stores weight like (fan_in, fan_out)
    enable_lora ( List[bool]): Used with lora.MergedLinear.
    bias (str): Bias type for Lora. Can be 'none', 'all' or 'lora_only'
    modules_to_save (List[str]):List of modules apart from LoRA layers to be set as trainable
        and saved in the final checkpoint.
'''

"warm_up_ratio" 是机器学习优化过程中常用的一个超参数，表示学习率预热（warm-up）阶段占总训练步数的比例。具体来说，如果我们将整个训练过程分为若干个迭代步骤，那么在前 warm_up_ratio 比例的迭代步骤中，学习率会从一个很小的值（通常为 0）逐渐增大到设定的最大值（即 LR），以避免训练初期因学习率过大而导致参数更新不稳定，并帮助模型更好地适应数据分布。预热阶段结束之后，学习率通常会按照调度策略逐渐衰减（例如 get_linear_schedule_with_warmup 采用线性衰减），以便在后续的迭代步骤中更好地收敛到最优解。

In [4]:
# Load tokenizer and model from the SAME checkpoint id so that changing
# `checkpoint` can never pair a model with a mismatched tokenizer.
# NOTE(review): transformers warns that `revision` should be pinned when
# trust_remote_code=True is used; consider passing an explicit revision to
# both from_pretrained calls once a vetted commit has been chosen.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)

# DeepSpeed ZeRO stage 2, using the notebook-wide gradient accumulation setting.
deepspeed_plugin = DeepSpeedPlugin(zero_stage=2, gradient_accumulation_steps=accumulate_step)
# Precision control: mixed precision mode comes from the configuration cell.
accelerator = Accelerator(mixed_precision=mixed_precision, gradient_accumulation_steps=accumulate_step, deepspeed_plugin=deepspeed_plugin)
device = accelerator.device

### print setting

max_memory = accelerate.utils.get_max_memory()
accelerator.print("max_memory", max_memory)

### Insert LoRA to model

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# Freeze the model through loralib's helper, then report how many parameters
# remain trainable.
# NOTE: this only gives a meaningful report when it runs AFTER the LoRA layers
# have been inserted into `model`; executed before that, the count is 0.
lora.mark_only_lora_as_trainable(model)

# numel() gives the element count directly as a Python int.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
non_trainable_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)

# The percentage is trainable parameters relative to the frozen parameters
# (not relative to the total); guard the degenerate "nothing frozen" case.
if non_trainable_params:
    trainable_ratio = trainable_params / non_trainable_params * 100
else:
    trainable_ratio = float('inf') if trainable_params else float('nan')

accelerator.print('trainable_params:{} ({:.2f}%), non_trainable_params:{}'.format(trainable_params, trainable_ratio, non_trainable_params))

if trainable_params == 0:
    # Make the out-of-order-execution case visible instead of silently printing 0.
    accelerator.print('warning: no trainable parameters found - insert the LoRA layers before running this cell')


In [None]:
class QKV_layer(torch.nn.Module):
    """Fused query/key/value projection expressed as three separate linear layers.

    The layer maps ``in_features`` to the concatenation of three projections
    (``linear_q``, ``linear_k``, ``linear_v``), each of width
    ``out_features // 3``, so that each projection can be targeted
    individually (e.g. by LoRA) while the output layout stays
    ``[q, k, v]`` along the last dimension.
    """

    def __init__(self, in_features, out_features):
        """Create the three projections.

        Args:
            in_features: size of the last dimension of the input.
            out_features: output width of the fused layer being replaced;
                each projection gets ``out_features // 3`` outputs.
        """
        super(QKV_layer, self).__init__()
        third = out_features // 3
        self.linear_q = torch.nn.Linear(in_features, third)
        self.linear_k = torch.nn.Linear(in_features, third)
        self.linear_v = torch.nn.Linear(in_features, third)

    def update(self, target_layer):
        """Copy the parameters of a fused linear layer into the three projections.

        Rows ``[0, n)``, ``[n, 2n)`` and ``[2n, end)`` of ``target_layer.weight``
        (with ``n = target_layer.out_features // 3``) go to q, k and v
        respectively; the bias is split the same way. The assigned tensors are
        slices of the target's parameters, not copies.

        Args:
            target_layer: a linear-like module exposing ``weight``, ``bias``
                (may be ``None``) and ``out_features``.
        """
        third = target_layer.out_features // 3
        row_ranges = (
            (self.linear_q, slice(None, third)),
            (self.linear_k, slice(third, third * 2)),
            (self.linear_v, slice(third * 2, None)),
        )
        for linear, rows in row_ranges:
            linear.weight.data = target_layer.weight[rows, :].data
            if target_layer.bias is None:
                # A bias-free fused layer must stay bias-free after the split.
                linear.bias = None
            elif linear.bias is None:
                # Bias was dropped by an earlier update(); re-create it.
                linear.bias = torch.nn.Parameter(target_layer.bias[rows].data)
            else:
                linear.bias.data = target_layer.bias[rows].data

    def forward(self, x):
        """Apply the three projections and concatenate them as ``[q, k, v]`` on the last dim."""
        q = self.linear_q(x)
        k = self.linear_k(x)
        v = self.linear_v(x)
        return torch.cat([q, k, v], dim=-1)


# Wrap every attention block's fused query_key_value projection with LoRA.
# The module list is snapshotted first because the loop replaces submodules,
# and mutating the tree while walking the live named_modules() generator
# would make the traversal depend on the replacements.
for key, module in list(model.named_modules()):
    # Only modules named "...attention" that actually own a fused projection.
    if not key.endswith('attention') or not hasattr(module, 'query_key_value'):
        continue
    if isinstance(module.query_key_value, peft.tuners.lora.LoraModel):
        # Already wrapped (e.g. the cell was re-run): rebuild the LoRA wrapper
        # around the inner model with the current config.
        module.query_key_value = peft.tuners.lora.LoraModel(config, module.query_key_value.model)
    else:
        # Here we split the query_key_value layer into three linear layer for LoRA. But you can also use merged linear.
        qkv_layer = QKV_layer(module.query_key_value.in_features, module.query_key_value.out_features)
        qkv_layer.update(module.query_key_value)
        module.query_key_value = peft.tuners.lora.LoraModel(config, qkv_layer)

In [None]:
### Data processing
### Dataset

# NOTE(review): presumably the end-of-sequence token id for this tokenizer;
# it is not used in this cell - verify against the tokenizer vocabulary.
EOS_ID = 150005

# Alpaca-style prompt templates, with and without an additional input field.
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}

# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open('data/alpaca_data.json', 'r', encoding='utf-8') as f:
    content = json.load(f)


pairs = []

for line in content:
    # Records without an "input" key are treated like records with an empty input.
    if line.get('input', '') == '':
        prompt = PROMPT_DICT['prompt_no_input'].format_map(line)
    else:
        prompt = PROMPT_DICT['prompt_input'].format_map(line)
    completion = line['output']+'</s>'
    # Length filter works on character counts, not token counts, so it is
    # only an approximation of the tokenized sequence length.
    if len(prompt) + len(completion) < MAX_LENGTH:
        pairs.append({'prompt':prompt, 'completion':completion})


In [None]:
## jupyter notebook
from accelerate import notebook_launcher