In [1]:
import os

# Expose only physical GPU 3 to this process; the model-loading cells later
# address that single visible GPU as device index 0.
os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

In [2]:
import glob
import json
from datetime import datetime
from zoneinfo import ZoneInfo
from datasets import Dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, EarlyStoppingCallback, TrainingArguments
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model
from transformers.integrations import MLflowCallback, WandbCallback
from peft import prepare_model_for_kbit_training
# mlflow_process.terminate()

In [3]:
# Collect the JSONL shards of each split.
# glob returns matches in arbitrary, filesystem-dependent order, so the lists
# are sorted to keep the row order of the resulting datasets stable across
# machines and re-runs (otherwise the fixed training seed does not guarantee
# the same data order).
train_path = sorted(
    glob.glob("../data/Small_Business_Customer_Order_QnA/Training/labeling_data_train_json/*jsonl")
)
val_path = sorted(
    glob.glob("../data/Small_Business_Customer_Order_QnA/Validation/labeling_data_val_json/*jsonl")
)

In [4]:
# mlflow.set_experiment("250416_v1_finetuning_llama-3.2-Korean-Bllossom-3B")
# mlflow.set_tracking_uri("http://localhost:5000")

In [5]:
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of a file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way
    # regardless of the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at end of file) carry no record
        # and would otherwise make json.loads fail.
        return [json.loads(line) for line in f if line.strip()]

# Flatten the records of every shard into one list per split, keeping the
# order in which the shard paths are listed.
train_data = [record for shard in train_path for record in load_jsonl(shard)]
val_data = [record for shard in val_path for record in load_jsonl(shard)]

# Wrap the in-memory record lists as Hugging Face datasets.
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

In [6]:
# Display the training split summary (feature names and row count).
train_dataset

Dataset({
    features: ['messages'],
    num_rows: 2303354
})

In [7]:
# Display the validation split summary (feature names and row count).
val_dataset

Dataset({
    features: ['messages'],
    num_rows: 295153
})

In [8]:
# Base model identifier and tokenizer.
model_name = "Bllossom/llama-3.2-Korean-Bllossom-3B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # NOTE(review): executes tokenizer code shipped in the model repo; confirm
    # this repo actually needs it.
    trust_remote_code=True,
    padding_side="right",    # pad on the right-hand side of each sequence
    model_max_length=4096,   # upper bound on tokenized sequence length
)

# Fall back to EOS for padding only when the tokenizer does not define a pad
# token of its own. Unconditionally overwriting an existing pad token with EOS
# would make padding indistinguishable from genuine end-of-sequence tokens.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Quantization settings (BitsAndBytesConfig) applied when the base model is
# loaded in 4-bit.
nf4_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,         # double quantization to reduce VRAM use
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bfloat16
    llm_int8_skip_modules=["lm_head"],      # presumably keeps the output head unquantized — TODO confirm
)
nf4_config = BitsAndBytesConfig(**nf4_options)

# Load the quantized base model.
# CUDA_VISIBLE_DEVICES is set to "3" in the first cell, so device index 0 here
# refers to that single visible GPU.
device_map = {"": 0}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir="../ai_models/llama-3.2-Korean-Bllossom-3B",
    device_map=device_map,
    torch_dtype=torch.bfloat16,
    quantization_config=nf4_config,
    attn_implementation="flash_attention_2",  # chosen for training speed
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Print the module tree of the freshly loaded model; the attention and MLP
# projections should appear as Linear4bit layers.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,)

In [10]:
# Run PEFT's k-bit training preparation on the 4-bit model.
# `model` is rebound in place, so this cell is meant to run exactly once on the
# freshly loaded model from the load cell.
model = prepare_model_for_kbit_training(model)

In [11]:
# Print the module tree again after the k-bit training preparation step.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,)

In [12]:
# Projection layers that receive LoRA adapters: the four attention projections
# and the three MLP projections of every decoder layer.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,                  # LoRA rank
    lora_alpha=128,        # LoRA alpha
    # Rank-stabilized LoRA scaling: with this flag PEFT scales the adapter by
    # lora_alpha / sqrt(r) (16 here) rather than lora_alpha / r (2).
    use_rslora=True,
    lora_dropout=0.05,     # dropout applied on the LoRA path
    bias="lora_only",      # only biases of LoRA-adapted layers are trainable
    target_modules=lora_target_modules,
)

# Wrap the prepared base model with the LoRA adapters; `model` is rebound to
# the resulting PEFT model.
model = get_peft_model(model, peft_config)


In [13]:
# Print the module tree of the PEFT-wrapped model; targeted projections should
# now appear as lora.Linear4bit wrappers with lora_A / lora_B sub-modules.
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [14]:
# Report (rows, columns) of the training split, then of the validation split.
for split in (train_dataset, val_dataset):
    print(split.shape)

(2303354, 1)
(295153, 1)


In [15]:
# Early stopping with a patience of 1, i.e. how many evaluations without
# improvement to wait before stopping.
# An optional `early_stopping_threshold` (e.g. 0.005) would set the minimum
# change that counts as an improvement; it is left at its default here.
early_stopping_patience = 1
early_stopping = EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)


In [16]:
# Log the current wall-clock time in the Asia/Seoul time zone.
seoul_now = datetime.now(ZoneInfo("Asia/Seoul"))
print(seoul_now.strftime('%D,%H:%M:%S'))

04/29/25,16:15:22


In [None]:
# Log when the training cell started (Asia/Seoul time).
print("start_time", datetime.now(ZoneInfo("Asia/Seoul")).strftime('%D,%H:%M:%S'))

# Run identifier: used for the checkpoint directory and as the run name.
dir_name = "250429_v1_finetuning_llama-3.2-Korean-Bllossom-3B"

# Batch size and learning-rate schedule.
batch_and_schedule = dict(
    per_device_train_batch_size=16,  # per-device batch (sized for an A100 80GB per the original note)
    gradient_accumulation_steps=1,   # no accumulation: 16 sequences per optimizer step per device
    num_train_epochs=3,
    learning_rate=3e-5,
    warmup_ratio=0.03,               # warm up over the first 3% of steps
)

# Optimizer and memory-saving options.
optimization = dict(
    optim="adamw_torch_fused",
    weight_decay=0.01,
    max_grad_norm=0.3,               # gradient clipping threshold
    gradient_checkpointing=True,     # trade compute for lower activation memory
)

# Evaluation, checkpointing and best-model selection.
# NOTE(review): the full validation split is evaluated every 1000 steps —
# confirm that evaluation cost is acceptable for a split of this size.
eval_and_checkpoints = dict(
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=3,
    metric_for_best_model="eval_loss",
    greater_is_better=False,         # lower eval loss is better
    load_best_model_at_end=True,
)

# Numeric precision.
precision = dict(
    bf16=True,
    tf32=True,
    fp16=False,
)

# Reproducibility and run naming.
reproducibility = dict(
    seed=42,
    data_seed=42,
    run_name=dir_name,
)

training_args = TrainingArguments(
    output_dir=f"./finetuned_llm/{dir_name}",
    **batch_and_schedule,
    **optimization,
    **eval_and_checkpoints,
    **precision,
    **reproducibility,
)

# Build the supervised fine-tuning trainer.
# The tokenizer configured in the tokenizer cell is passed explicitly: when no
# processing class is given, SFTTrainer loads a default tokenizer for the model
# on its own, so the padding side, maximum length and pad token chosen in this
# notebook would silently not be used for training.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    callbacks=[early_stopping],  # stop early when the eval metric stops improving
)

# Run fine-tuning, then log the wall-clock finish time (Asia/Seoul).
trainer.train()
finished_at = datetime.now(ZoneInfo("Asia/Seoul")).strftime('%D,%H:%M:%S')
print("end_time", finished_at)

start_time 04/29/25,16:15:22


Converting train dataset to ChatML:   0%|          | 0/2303354 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/2303354 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2303354 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2303354 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/295153 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/295153 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/295153 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/295153 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mfirst202301[0m ([33mfirst202301-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss


In [None]:
# Log the current wall-clock time in the Asia/Seoul time zone.
seoul_now = datetime.now(ZoneInfo("Asia/Seoul"))
print(seoul_now.strftime('%D,%H:%M:%S'))

04/29/25,09:39:38


In [None]:
# Checkpoint path that the trainer state records as the best one.
best_model_path = trainer.state.best_model_checkpoint
print("Best checkpoint:", best_model_path)

# # Alternatively, load it explicitly
# model = AutoModelForCausalLM.from_pretrained(best_model_path)


Best checkpoint: ./finetuned_llm/250423_v1_finetuning_llama-3.2-Korean-Bllossom-3B/checkpoint-140000


In [None]:
# Persist the trainer's current model (the best checkpoint is reloaded at the
# end of training because load_best_model_at_end=True) outside the checkpoint
# directory.
saved_adapter_dir = "../best_llm/250429_v1_finetuning_llama-3.2-Korean-Bllossom-3B"
trainer.save_model(saved_adapter_dir)

# Q4 & GGUF 변환

In [30]:
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Step 1: load the base model without quantization; the adapter is merged into
# this copy.
model_name = "Bllossom/llama-3.2-Korean-Bllossom-3B"
device_map = {"": 0}

base = AutoModelForCausalLM.from_pretrained(
    model_name,
    return_dict=True,
    device_map=device_map,
    cache_dir="../ai_models/llama-3.2-Korean-Bllossom-3B",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
# Step 2: attach the trained QLoRA adapter to the base model.
# NOTE(review): this directory carries run id 250423, whereas the training and
# save cells earlier in the notebook use 250429 — confirm which adapter is
# meant to be merged.
adapter_to_merge = "../best_llm/250423_v1_finetuning_llama-3.2-Korean-Bllossom-3B"
lora = PeftModel.from_pretrained(base, adapter_to_merge)

In [33]:
# Display the adapter-wrapped model to check that the LoRA layers are attached.
lora

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_f

In [34]:
# Step 3: merge (fold the LoRA weights completely into the base model)
merged = lora.merge_and_unload()


In [35]:
# Step 4: save the merged model in Transformers format.
merged_model_dir = "../merged_model/250423_v1-merged_model_finetuning_llama-3.2-Korean-Bllossom-3B"
merged.save_pretrained(merged_model_dir)

In [36]:
# Save the tokenizer files into the same directory as the merged weights; the
# returned tuple of written file paths is shown as the cell output.
tokenizer.save_pretrained("../merged_model/250423_v1-merged_model_finetuning_llama-3.2-Korean-Bllossom-3B")  # essential step

('../merged_model/250423_v1-merged_model_finetuning_llama-3.2-Korean-Bllossom-3B/tokenizer_config.json',
 '../merged_model/250423_v1-merged_model_finetuning_llama-3.2-Korean-Bllossom-3B/special_tokens_map.json',
 '../merged_model/250423_v1-merged_model_finetuning_llama-3.2-Korean-Bllossom-3B/tokenizer.json')

In [None]:
# 

In [None]:
# python convert_hf_to_gguf.py ../merged_model/250423_v1-merged_model_finetuning_llama-3.2-Korean-Bllossom-3B --outfile ../gguf_models --outtype f16
# ../gguf_models/250423_v1-Merged_Model_Finetuning_Llama-3.2-Korean-Bllossom-3B-3B-250423_v1-merged_model_finetuning_llama-Korean-Bllossom-3.2-F16.gguf

# ./llama-quantize ../gguf_models/250423_v1-Merged_Model_Finetuning_Llama-3.2-Korean-Bllossom-3B-3B-250423_v1-merged_model_finetuning_llama-Korean-Bllossom-3.2-F16.gguf ../gguf_models/250423_v1-Merged_Model_Finetuning_Llama-3.2-Korean-Bllossom-3B-3B-250423_v1-merged_model_finetuning_llama-Korean-Bllossom-3.2-F16-Q4_K_M.gguf Q4_K_M

In [None]:
# trainer.model.save_pretrained("./llama3.2-qlora")
# tokenizer.save_pretrained("./llama3.2-qlora")

# # 텍스트 생성 예시
# from transformers import pipeline

# pipe = pipeline("text-generation", model="./llama3.2-qlora", tokenizer=tokenizer, device_map="auto")

# print(pipe("질문: 서울의 명소는?\n답변:", max_new_tokens=100)[0]["generated_text"])
