# EEVE-10.8B QLoRA Fine Tuning

## 1. Install Modules

In [1]:
!pip install -U datasets
!pip install -U bitsandbytes
!pip install -U accelerate
!pip install -U peft
!pip install -U trl
!pip install -U typing_extensions
!pip install -U torch

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m 

## 2. Import Modules

In [2]:
import os
import torch
import transformers
from datasets import load_dataset
from transformers import (
    BitsAndBytesConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TextStreamer,
    pipeline,
)
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    TaskType,
    PeftModel,
    PeftConfig,
)
from trl import SFTTrainer

  warn(


## 3. Set PLM Into QLoRA Form

In [None]:
from huggingface_hub import login
# The access token is imported from a local config module instead of being
# written into the notebook itself.
from config.secret_keys import HUGGINGFACE_ACCESS_TOKEN

# Log this session in to the Hugging Face Hub with that token.
login(token=HUGGINGFACE_ACCESS_TOKEN)

In [4]:
# Hub ID of the pre-trained language model (PLM) that will be fine-tuned.
model_name = "yanolja/EEVE-Korean-Instruct-10.8B-v1.0"

# bitsandbytes quantization settings for QLoRA.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
    bnb_4bit_use_double_quant=True,         # enable double quantization
    bnb_4bit_quant_type="nf4",              # 4-bit NF4 quantization type
    load_in_4bit=True,                      # load the weights in 4-bit precision
)

# Load the PLM with the quantization settings above.
plm = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # apply the bitsandbytes config
    device_map="auto",               # let HF Accelerate decide where each layer is placed
    trust_remote_code=True,          # kept from the original setup for the EEVE model
    token=True,                      # NOTE(review): presumably picks up the Hub login — confirm
)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
# Load the tokenizer from the same checkpoint as the model; reusing `model_name`
# keeps the model and tokenizer from drifting apart if the checkpoint changes.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

## 4. Set Hyperparameters

In [6]:
# LoRA hyperparameters.
lora_rank = 32
lora_alpha = 32
lora_dropout = 0.05

# Adapter configuration: which projection layers receive LoRA weights.
peft_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj'],
    task_type='CAUSAL_LM',
    bias='none',
)

# Prepare the quantized PLM for k-bit training, then wrap it with the adapters
# to obtain the model that will be fine-tuned (FLM).
flm = get_peft_model(prepare_model_for_kbit_training(plm), peft_config)
flm.print_trainable_parameters()

trainable params: 69,206,016 || all params: 10,874,130,432 || trainable%: 0.6364


In [7]:
# Display the wrapped model's module tree (shows where the LoRA layers sit).
flm

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(40960, 4096)
        (layers): ModuleList(
          (0-47): 48 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

In [8]:
# Training configuration passed to the trainer.
training_arguments =  transformers.TrainingArguments(
    output_dir = './train_output',
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 2,    # gradient accumulation steps (raised to make up for the small batch size)
    optim = 'paged_adamw_32bit',        # paged optimizer for better memory management
    save_strategy='steps',              # checkpoint save strategy used during training
    save_steps = 10,                    # number of update steps between two checkpoint saves
    logging_steps = 10,                 # number of update steps between two logs
    learning_rate = 2e-4,               # learning rate for the AdamW optimizer
    max_grad_norm = 0.3,                # maximum gradient norm (gradient clipping)
    max_steps = 60,                     # train for 60 steps
    warmup_ratio = 0.03,                # fraction of steps used for linear warmup from 0 to learning_rate
    lr_scheduler_type = 'cosine',       # learning-rate scheduler
    report_to = 'none',                 # no reporting integration (e.g. wandb), so no API key is needed
)

## 5. Fine Tuning

In [None]:
# NOTE(review): `ds` is not defined in any cell visible in this notebook —
# a dataset-loading cell (e.g. via `load_dataset`) must run before this one. TODO confirm.

# Build the supervised fine-tuning trainer.
# `flm` was already wrapped with `get_peft_model(..., peft_config)` earlier in the
# notebook, so `peft_config` is intentionally NOT passed again here: handing the
# trainer an already-wrapped model plus a config asks it to set up the adapter a
# second time.
trainer = SFTTrainer(
    model=flm,
    train_dataset=ds['train'],
    dataset_text_field='QA_text',   # dataset column that holds the training text
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
)

# Cast every module whose name contains 'norm' to float32.
# `Module.to` converts the module in place, so the result is not re-assigned.
for name, module in trainer.model.named_modules():
    if 'norm' in name:
        module.to(torch.float32)

In [None]:
# Switch off the `use_cache` setting on the model config before training.
flm.config.use_cache = False
# Run the fine-tuning loop.
trainer.train()

## 6. Compare between PLM and FLM

In [None]:
# Example input text
input_text = "골프보험 알려줘"

# Generate with the fine-tuned model (FLM).
# Leave training mode first so LoRA dropout is not applied during generation.
flm.eval()
# The model was placed by `device_map='auto'`; move the tokenized inputs to the
# model's device so inputs and weights are on the same device.
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(flm.device)
with torch.no_grad():
    output = flm.generate(**inputs, max_length=100)
result = tokenizer.decode(output[0], skip_special_tokens=True)
print("Output:", result)

In [None]:
# Example input text
input_text = "골프보험 알려줘"

# Generate with the pre-trained model (PLM) as the baseline.
# `flm` was built from `plm` with `get_peft_model`, which injects the LoRA layers
# into the underlying model itself, so calling `plm.generate` directly would still
# run through the trained adapter. Disabling the adapter for the duration of the
# call gives the true base-model output for the comparison.
plm.eval()  # leave training mode so dropout is inactive
# Move the tokenized inputs to the model's device (placed by `device_map='auto'`).
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(plm.device)
with torch.no_grad(), flm.disable_adapter():
    output = plm.generate(**inputs, max_length=100)
result = tokenizer.decode(output[0], skip_special_tokens=True)
print("Output:", result)