In [None]:
!pip install tf-keras --upgrade
!pip uninstall keras -y  # 卸载现有的Keras 3
!pip install -q h5py typing-extensions wheel
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [None]:
# Show the GPU status reported by the driver before the model is loaded.
!nvidia-smi

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Hugging Face Hub id of the chat model to fine-tune.
# NOTE(review): confirm that this repo id exists on the Hub before running — TODO verify.
model_id = "deepseek-ai/deepseek-llm-14b-chat"

# 4-bit quantization settings handed to `from_pretrained` below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True, # Activate nested quantization for 4-bit base models (double quantization)
    bnb_4bit_quant_type="nf4", # Quantization type (fp4 or nf4). According to the QLoRA paper, nf4 should be used when training on top of a 4-bit base model (e.g. with LoRA adapters)
    bnb_4bit_compute_dtype=torch.bfloat16
)
# Alternative kept for reference: place the whole model on GPU 0 instead of using device_map="auto".
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

# Model loading configuration
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",  # let the library assign devices automatically (CPU/GPU)
    low_cpu_mem_usage=True  # reduce CPU memory usage while loading
)

# Tokenizer matching the model; later cells read its chat template and special tokens.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing on the quantized base model.
model.gradient_checkpointing_enable()
# Let PEFT prepare the k-bit (here 4-bit) model for training; the returned model replaces `model`.
model = prepare_model_for_kbit_training(model)

In [None]:
from peft import LoraConfig, get_peft_model

# You can try different parameter-efficient strategies for model training; for more info, see https://github.com/huggingface/peft
config = LoraConfig(
    r=24,                # balance model capacity against the amount of data
    lora_alpha=48,       # alpha = 2 * r
    lora_dropout=0.4,    # stronger regularization
    target_modules=["q_proj", "v_proj", "k_proj"],  # k_proj added to give the attention mechanism more flexibility
    task_type="CAUSAL_LM",
)
# Wrap the prepared model with LoRA adapters on the modules listed above.
model = get_peft_model(model, config)

In [None]:
from jinja2 import Template
# Render the tokenizer's chat template for a single user message and show both
# the raw message and the exact string that would be sent to the model.
template = Template(tokenizer.chat_template)
message = "Please introduce yourself"
print("message:\n" + message + "\n")
message_send_to_model = template.render(
    messages=[dict(role="user", content=message)],
    bos_token=tokenizer.bos_token,
    add_generation_prompt=True,
)
print("message_send_to_model:\n" + str(message_send_to_model))

In [None]:
template = Template(tokenizer.chat_template)
@torch.no_grad()
def generate(prompt):
    """Generate a greedy reply to one user prompt and return only the newly generated text.

    The prompt is wrapped with the tokenizer's chat template (generation prompt
    included), tokenized without extra special tokens and passed to
    ``model.generate`` with sampling disabled. The rendered input and the full
    decoded output (prompt + reply) are printed for inspection.

    Args:
        prompt: Text of the user message.

    Returns:
        The decoded continuation (special tokens kept), i.e. everything the
        model produced after the prompt tokens.
    """
    modelInput = template.render(
        messages=[{"role": "user", "content": prompt}],
        bos_token=tokenizer.bos_token,
        add_generation_prompt=True,
    )
    print("-"*80)
    print(f"model_input_string:\n{modelInput}")
    # The model is loaded with device_map="auto", so follow the model's own
    # device instead of assuming it lives on "cuda:0".
    input_ids = tokenizer.encode(modelInput, add_special_tokens=False, return_tensors='pt').to(model.device)
    # A single un-padded sequence: every position is a real token.
    attention_mask = torch.ones_like(input_ids)
    outputs = model.generate(input_ids, attention_mask=attention_mask, do_sample=False)
    # Index the single sequence explicitly rather than unpacking the batch.
    model_return_string = tokenizer.decode(outputs[0], skip_special_tokens=False)
    print("-"*80)
    print(f"model_return_string:\n{model_return_string}")
    # Drop the prompt tokens so only the continuation is returned.
    generated_ids = outputs[0, input_ids.shape[1]:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=False)
    return generated_text

# Try the model on one prompt before training and print the query and reply,
# each preceded by a divider line.
query = "Please introduce yourself"
print("-" * 80, "query:\n" + query, sep="\n")
response = generate(query)
print("-" * 80, "response:\n" + response, sep="\n")

In [None]:
from datasets import load_dataset

def _to_conversations(sample):
    """Convert one dataset row into the ``conversations`` format used for training.

    Rows that carry ``question``/``answer`` columns become a single
    human/gpt pair. Otherwise the row is expected to provide the parallel
    ``free_messages``/``guided_messages`` lists, which are interleaved into
    alternating human/gpt turns.

    NOTE(review): the public ``blended_skill_talk`` schema appears to expose
    ``free_messages``/``guided_messages`` and no ``question``/``answer``
    columns — confirm against the dataset card. Also note that
    ``InstructDataset.__getitem__`` keeps only the first pair of a row when
    indexed with an int.
    """
    if "question" in sample and "answer" in sample:
        pairs = [(sample["question"], sample["answer"])]
    elif "free_messages" in sample and "guided_messages" in sample:
        pairs = list(zip(sample["free_messages"], sample["guided_messages"]))
    else:
        raise KeyError(
            "expected 'question'/'answer' or 'free_messages'/'guided_messages' "
            f"columns, got: {sorted(sample.keys())}"
        )

    turns = []
    for human_text, reply_text in pairs:
        turns.append({"from": "human", "value": human_text})
        turns.append({"from": "gpt", "value": reply_text})
    return {"conversations": turns}


dataset = load_dataset("blended_skill_talk")
# Only the training split is used; every row gains a `conversations` column.
dataset = dataset['train'].map(_to_conversations, batched=False)

In [None]:
from torch.utils.data import random_split
# Carve a fixed-size train/validation subset out of the mapped dataset; the
# remaining rows are discarded.
train_dataset_size, val_dataset_size = 800, 200
unused_size = len(dataset) - train_dataset_size - val_dataset_size
if unused_size < 0:
    raise ValueError(
        f"dataset has {len(dataset)} rows, fewer than the "
        f"{train_dataset_size + val_dataset_size} requested for train + validation"
    )
# Seed the split so that the same rows are selected on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, _ = random_split(
    dataset,
    [train_dataset_size, val_dataset_size, unused_size],
    generator=split_generator,
)
print(train_dataset[0]['conversations'])

In [None]:
import transformers
from typing import Dict, Sequence, List
from torch.utils.data import Dataset
from dataclasses import dataclass

def preprocess(
    sources,
    tokenizer: "transformers.PreTrainedTokenizer",
) -> Dict:
    """Tokenize human/assistant conversations into padded ``input_ids``/``labels`` tensors.

    Each conversation is a list of ``{"from": ..., "value": ...}`` turns. A
    leading non-human turn is dropped, then turns are consumed as
    (human, assistant) pairs; a trailing unpaired turn is ignored. Every pair
    becomes one example rendered through ``tokenizer.apply_chat_template`` so
    that all special tokens the template refers to (BOS, EOS, ...) are
    available. Prompt tokens are replaced by ``-100`` in ``labels``.

    Args:
        sources: Iterable of conversations.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``encode``,
            ``model_max_length`` and ``eos_token_id``.

    Returns:
        Dict with ``input_ids`` and ``labels`` LongTensors of shape
        ``(num_examples, max_len)``. Shorter rows are right-padded with
        ``eos_token_id`` (inputs) and ``-100`` (labels). Both tensors have
        shape ``(0, 0)`` when no complete pair is found.

    Raises:
        AssertionError: If a turn value is ``None`` or the tokenized prompt is
            longer than the tokenized prompt + reply.
    """
    ignore_index = -100
    max_seq_len = tokenizer.model_max_length
    examples = []
    for source in sources:
        if source and source[0]["from"] != "human":
            # Skip the first one if it is not from human
            source = source[1:]

        # Pair up turns as (human, assistant); zip drops a trailing unpaired turn.
        for human_turn, reply_turn in zip(source[0::2], source[1::2]):
            q = human_turn["value"]
            a = reply_turn["value"]
            assert q is not None and a is not None, f'q:{q} a:{a}'

            full_text = tokenizer.apply_chat_template(
                [{"role": "user", "content": q}, {"role": "assistant", "content": a}],
                tokenize=False,
                add_generation_prompt=False,
            )
            input_ids = tokenizer.encode(full_text, add_special_tokens=False)

            prompt_text = tokenizer.apply_chat_template(
                [{"role": "user", "content": q}],
                tokenize=False,
                add_generation_prompt=True,
            )
            query_ids = tokenizer.encode(prompt_text, add_special_tokens=False)

            # Mask the prompt so only the reply tokens carry a label.
            labels = [ignore_index] * len(query_ids) + input_ids[len(query_ids):]
            assert len(labels) == len(input_ids)
            if not input_ids:
                continue
            # Keep the tail when the example exceeds the model's maximum length.
            examples.append({"input_ids": input_ids[-max_seq_len:], "labels": labels[-max_seq_len:]})

    if not examples:
        empty = torch.empty((0, 0), dtype=torch.long)
        return {"input_ids": empty, "labels": empty.clone()}

    max_len = min(max(len(example["input_ids"]) for example in examples), max_seq_len)

    def _pad(sequence, fill_value):
        return sequence[:max_len] + [fill_value] * (max_len - len(sequence))

    input_ids = torch.tensor(
        [_pad(example["input_ids"], tokenizer.eos_token_id) for example in examples],
        dtype=torch.long,
    )
    labels = torch.tensor(
        [_pad(example["labels"], ignore_index) for example in examples],
        dtype=torch.long,
    )
    return {
        "input_ids": input_ids,
        "labels": labels
    }


class InstructDataset(Dataset):
    """Dataset view that tokenizes ``conversations`` rows on access via ``preprocess``.

    Indexing with an ``int`` yields one example (1-D ``input_ids``/``labels``
    tensors); any other index is forwarded to the underlying data and the
    whole batch returned by ``preprocess`` is handed back unchanged.
    """

    def __init__(self, data: Sequence, tokenizer: transformers.PreTrainedTokenizer) -> None:
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index) -> Dict[str, torch.Tensor]:
        is_single = isinstance(index, int)
        fetched = self.data[index]
        # A single row is wrapped so both cases go through the same batch path.
        rows = [fetched] if is_single else fetched
        conversations = [row['conversations'] for row in rows]
        batch = preprocess(conversations, self.tokenizer)
        if not is_single:
            return batch
        # Unwrap the one-row batch into per-example tensors.
        return {"input_ids": batch["input_ids"][0], "labels": batch["labels"][0]}


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Pad a list of ``{"input_ids", "labels"}`` examples into one batch.

    ``input_ids`` are right-padded with the tokenizer's pad token id (falling
    back to its EOS id when no pad token is configured) and ``labels`` with
    ``ignore_index``. The attention mask is derived from each example's
    original length, so real tokens whose id equals the pad id stay visible.
    """

    tokenizer: "transformers.PreTrainedTokenizer"
    # Label value written into padded positions.
    ignore_index: int = -100

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Collate ``instances`` into padded ``input_ids``, ``labels`` and ``attention_mask``.

        Args:
            instances: Examples holding 1-D ``input_ids`` and ``labels`` tensors.

        Returns:
            Dict with ``input_ids`` and ``labels`` of shape
            ``(batch, longest_sequence)`` and a boolean ``attention_mask`` of
            the same shape that is ``True`` on non-padded positions.

        Raises:
            ValueError: If the tokenizer defines neither a pad nor an EOS token id.
        """
        input_ids = [instance["input_ids"] for instance in instances]
        labels = [instance["labels"] for instance in instances]

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = self.tokenizer.eos_token_id
        if pad_id is None:
            raise ValueError("tokenizer defines neither pad_token_id nor eos_token_id")

        padded_inputs = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=pad_id)
        padded_labels = torch.nn.utils.rnn.pad_sequence(
            labels,
            batch_first=True,
            padding_value=self.ignore_index)

        # Mask by original length rather than by comparing against the pad id.
        lengths = torch.tensor([len(sequence) for sequence in input_ids], device=padded_inputs.device)
        positions = torch.arange(padded_inputs.shape[1], device=padded_inputs.device)
        attention_mask = positions.unsqueeze(0) < lengths.unsqueeze(1)

        return dict(
            input_ids=padded_inputs,
            labels=padded_labels,
            attention_mask=attention_mask,
        )

In [None]:
# Wrap the raw splits so that rows are tokenized on access. The names
# `train_dataset`/`val_dataset` are rebound here, so the guards keep a re-run
# of this cell from wrapping an already-wrapped dataset a second time.
if not isinstance(train_dataset, InstructDataset):
    train_dataset = InstructDataset(train_dataset, tokenizer)
if not isinstance(val_dataset, InstructDataset):
    val_dataset = InstructDataset(val_dataset, tokenizer)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

In [None]:
# Label id marking masked positions in `labels`.
IGNORE_INDEX = -100
# Look at one preprocessed training example: raw ids first, then decoded text.
sample_data = train_dataset[9]

print("=" * 80)
print("Debuging: ")
print("Input_ids\n{}".format(sample_data['input_ids']))
print("Label_ids\n{}".format(sample_data['labels']))
print("-" * 80)
print("Input:\n{}".format(tokenizer.decode(sample_data['input_ids'])))
print("-" * 80)
# Masked label positions cannot be decoded, so they are shown as the token for "N".
N_id = tokenizer.encode("N", add_special_tokens=False)[0]
print("Label:\n{}".format(
    tokenizer.decode([N_id if label_id == IGNORE_INDEX else label_id for label_id in sample_data['labels']])
))
print("=" * 80)


In [None]:
# Set training parameters
training_arguments = transformers.TrainingArguments(
    # Passed explicitly because older transformers releases require it; the
    # value matches the default that newer releases fall back to.
    output_dir="trainer_output",
    num_train_epochs=5,
    learning_rate=2.5e-5,                  # fine-tuning learning rate
    # Cosine annealing with restarts.
    # NOTE(review): no `num_cycles` is passed; confirm that the scheduler's
    # default actually produces restarts.
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.1,
    weight_decay=0.01,
)

In [None]:
import inspect

# Put the model in training mode, build the Trainer and run fine-tuning.
model.train()
trainer_kwargs = dict(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)
# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Pick whichever name the installed Trainer accepts.
trainer_params = inspect.signature(transformers.Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"
trainer_kwargs[tokenizer_kwarg] = tokenizer

trainer = transformers.Trainer(**trainer_kwargs)
trainer.train()

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Sums ``numel()`` over ``model.named_parameters()`` and reports how many of
    those elements belong to parameters with ``requires_grad=True``, together
    with the percentage. A model without parameters reports ``0.0`` percent
    instead of raising ``ZeroDivisionError``.

    NOTE(review): for quantized weights ``numel()`` may reflect the packed
    storage rather than the logical parameter count — confirm before relying
    on the absolute numbers.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio so a parameter-free model does not divide by zero.
    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )

# Report trainable vs. total parameter counts using the method exposed by the PEFT-wrapped model.
model.print_trainable_parameters()

In [None]:
import math
# accelerate is already installed by the setup cell at the top of the
# notebook, so it is not re-installed here in the middle of the run.
eval_results = trainer.evaluate()
eval_loss = eval_results['eval_loss']
try:
    perplexity = math.exp(eval_loss)
except OverflowError:
    # A very large loss would overflow exp(); report it as infinite perplexity.
    perplexity = float("inf")
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# Print the current working directory; the relative `output_path` below is resolved against it.
!pwd
# Directory name handed to `trainer.save_model`.
output_path = "ilora"
trainer.save_model(output_path)

In [None]:
template = Template(tokenizer.chat_template)
@torch.no_grad()
def generate(prompt, max_new_tokens=10):
    """Generate a short greedy reply to one user prompt and return only the new text.

    The prompt is wrapped with the tokenizer's chat template (generation prompt
    included) and decoded greedily. The full decoded output (prompt + reply)
    is printed for inspection.

    Args:
        prompt: Text of the user message.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (special tokens kept), i.e. everything the
        model produced after the prompt tokens.
    """
    modelInput = template.render(
        messages=[{"role": "user", "content": prompt}],
        bos_token=tokenizer.bos_token,
        add_generation_prompt=True,
    )
    # Follow the model's own device instead of assuming "cuda:0".
    input_ids = tokenizer.encode(modelInput, add_special_tokens=False, return_tensors='pt').to(model.device)
    outputs = model.generate(
        input_ids,
        attention_mask=torch.ones_like(input_ids),  # single un-padded sequence
        do_sample=False,                # sampling disabled (greedy decoding)
        temperature=1.0,                # has no effect while do_sample=False
        max_new_tokens=max_new_tokens,  # limit the length of the reply
    )
    # Index the single sequence explicitly rather than unpacking the batch.
    model_return_string = tokenizer.decode(outputs[0], skip_special_tokens=False)
    print("-"*80)
    print(f"model_return_string:\n{model_return_string}")
    # Drop the prompt tokens so only the continuation is returned.
    generated_ids = outputs[0, input_ids.shape[1]:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=False)
    return generated_text

# Query the model once after training and print the prompt and the reply.
query = "I get hit"
print("query:\n" + query)
response = generate(query)
print("-" * 80, "response:\n" + response, sep="\n")

In [None]:
# Empty VRAM
# del model
# del trainer
import gc
import torch
# Collect unreachable Python objects first so the tensors they own are handed
# back to PyTorch's allocator; only then can emptying the cache release that
# memory as well.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Show the GPU status again after the cleanup cell above.
!nvidia-smi

In [None]:
@torch.no_grad()
def generate(prompts):
    """Generate replies for several user prompts in one padded batch.

    Each prompt is wrapped with the chat template (generation prompt
    included), the batch is tokenized with left padding and passed to
    ``model.generate`` with at most 100 new tokens per sequence.

    Args:
        prompts: Iterable of user message strings.

    Returns:
        List with one decoded reply per prompt (special tokens removed), in
        the same order as ``prompts``. An empty input yields an empty list.
    """
    prompts = list(prompts)
    if not prompts:
        return []

    model_inputs = [
        template.render(
            messages=[{"role": "user", "content": prompt}],
            bos_token=tokenizer.bos_token,
            add_generation_prompt=True,
        )
        for prompt in prompts
    ]

    # Generation continues from the last position of every row, so padding
    # must sit on the left. The tokenizer's own setting is restored afterwards.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        encoded = tokenizer(model_inputs, add_special_tokens=False, return_tensors='pt', padding=True)
    finally:
        tokenizer.padding_side = previous_padding_side
    # Follow the model's own device instead of assuming "cuda:0".
    encoded = encoded.to(model.device)

    outputs = model.generate(encoded.input_ids, attention_mask=encoded.attention_mask, max_new_tokens=100)

    # Every row shares the same padded prompt length; what follows is the reply.
    prompt_length = encoded.input_ids.shape[1]
    generated_texts = []
    for row in range(len(prompts)):
        generated_ids = outputs[row, prompt_length:]
        generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        generated_texts.append(generated_text)

    return generated_texts

# Smoke test: generate replies for two prompts in one batch and print them separated by a blank line.
print("\n\n".join(generate(["I get hit", "Who are you?"])))