In [None]:
!pip3 install -q -U bitsandbytes==0.39.1
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.1
!pip3 install -q -U huggingface_hub==0.23.0
!pip3 install -q -U triton==2.0.0
!pip3 install -q -U scipy

In [None]:
# NOTE(review): repeats the scipy install that already ends the previous cell;
# this cell looks redundant.
!pip3 install -q -U scipy

In [None]:
from huggingface_hub import login
import os

# Pull the Hugging Face access token from the environment so it never has to be
# written into the notebook itself. The lookup yields None when the variable
# "Hugging_Access_Token" is not set.
Hugging_Access_Token = os.environ.get("Hugging_Access_Token")

# Authenticate this session against the Hugging Face Hub.
login(token=Hugging_Access_Token)

In [None]:
import pandas as pd
import time
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    TrainerCallback,
)
from datasets import Dataset
import torch

In [None]:
# Read the training and test tables, in that order. "utf-8-sig" strips a
# leading byte-order mark if the CSV files carry one.
train, test = (
    pd.read_csv(csv_path, encoding="utf-8-sig")
    for csv_path in ("./combined_data.csv", "./test.csv")
)

In [None]:
# Build a small human-readable preview of the training pairs.
# head(10) yields at most ten rows, so a training table with fewer than ten
# rows no longer raises a KeyError (the previous range(10) lookup did).
preview_rows = train.head(10)

samples = []
for input_text, output_text in zip(preview_rows["input"], preview_rows["output"]):
    samples.append(f"input : {input_text} \n output : {output_text}")

In [None]:
# Wrap the training DataFrame in a Hugging Face Dataset so it can be mapped
# through the tokenizer further down.
dataset = Dataset.from_pandas(df=train)

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GemmaTokenizer,
)


# Checkpoint to fine-tune, and the device placement strategy handed to
# from_pretrained via device_map.
model_id = "beomi/KoAlpaca-KoRWKV-6B"
device = "auto"

# The tokenizer is loaded from the same checkpoint as the model weights.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model with 8-bit quantization, using the plain load_in_8bit=True
# option rather than a BitsAndBytesConfig object.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    load_in_8bit=True,
)

In [None]:
# Show the GPU status right after the model-loading cell above.
!nvidia-smi

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"input"`` column and attach causal-LM labels.

    Every example is padded/truncated to exactly 512 tokens. Labels are a copy
    of ``input_ids`` in which every padding position (attention mask == 0) is
    replaced by -100, so that padding does not contribute to the loss.
    Previously the labels were an unmasked copy, which made the model train on
    predicting pad tokens.

    Args:
        examples: A mapping with an ``"input"`` entry holding either a list of
            strings (``batched=True``) or a single string.

    Returns:
        The tokenizer output with an added ``"labels"`` entry of the same shape
        as ``"input_ids"``.

    NOTE(review): only the ``"input"`` column is tokenized; the ``"output"``
    column of the training table is not used here — confirm this is intended.
    """
    ignore_index = -100  # label value skipped by the cross-entropy loss

    def _mask_padding(token_ids, attention):
        # Keep real tokens as labels, blank out padded positions.
        return [
            tok if keep else ignore_index for tok, keep in zip(token_ids, attention)
        ]

    # Tokenize the model input; the attention mask is requested explicitly
    # because it is needed to tell real tokens from padding.
    inputs = tokenizer(
        examples["input"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_attention_mask=True,
    )

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one list of token ids per example.
        inputs["labels"] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single example: a flat list of token ids.
        inputs["labels"] = _mask_padding(input_ids, attention_mask)
    return inputs


# Run the tokenizer over the whole dataset; batched=True hands
# tokenize_function many rows per call instead of one row at a time.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)
tokenized_datasets


class DynamicMaxTokensCallback(TrainerCallback):
    """Record the longest tokenized input of a batch on the trainer control.

    NOTE(review): this callback is never passed to the ``Trainer`` that is
    built later in this notebook, and ``on_batch_begin`` does not appear to be
    one of the events ``TrainerCallback`` dispatches (the per-step hook is
    ``on_step_begin``) — confirm before relying on it.
    """

    def on_batch_begin(self, args, state, control, **kwargs):
        """Set ``control.max_new_tokens`` to the longest input in the batch.

        Args:
            args, state, control: The standard callback arguments; only
                ``control`` is modified.
            **kwargs: Expected to carry ``inputs``, a mapping whose ``"input"``
                entry is an iterable of raw texts.

        Returns:
            ``control``. It is returned unchanged when no batch texts are
            available, instead of raising ``KeyError`` (missing ``inputs``) or
            ``ValueError`` (``max()`` of an empty batch).
        """
        batch = kwargs.get("inputs")
        if batch is None or "input" not in batch:
            return control

        # Token count of every text in the batch.
        token_counts = [len(tokenizer(text)["input_ids"]) for text in batch["input"]]
        if not token_counts:
            return control

        # Use the longest input of this batch as the dynamic max_new_tokens.
        max_input_length = max(token_counts)
        control.max_new_tokens = max_input_length
        print(f"Dynamic max_new_tokens: {max_input_length}")
        return control

In [None]:
# # LoRA config
# lora_r = 16 #lora 가운데 차원
# lora_alpha = 16 #lora 스케일링 alpha/r
# lora_dropout = 0.05
# lora_target_modules = ["gate_proj", "down_proj", "up_proj"]

# LoRA config (active values): these four names are passed to LoraConfig(...)
# in the next cell.
lora_r = 16  # LoRA inner (middle) dimension, i.e. the rank of the adapter
lora_alpha = 16  # LoRA scaling factor; the applied scaling is alpha / r
lora_dropout = 0.05
# Names of the submodules that receive LoRA adapters.
# NOTE(review): presumably these match layer names of the loaded model —
# verify against the print(model) output.
lora_target_modules = ["key", "value", "output"]

In [None]:
from peft import LoraConfig
from peft import (
    get_peft_model,
    prepare_model_for_kbit_training,
    prepare_model_for_int8_training,
)

# LoRA adapter settings, built from the hyperparameters defined in the cell above.
lora_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    # Where the LoRA adapters are attached; any submodules (attention side,
    # MLP side, ...) can be chosen by name.
    target_modules=lora_target_modules,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)

# Prepare the quantized model loaded above (8-bit via load_in_8bit=True) so
# that LoRA adapters can be trained on top of it.
# prepare_model_for_kbit_training is the replacement for the deprecated
# prepare_model_for_int8_training and is already imported in this cell.
# NOTE(review): this helper has a use_gradient_checkpointing argument that
# defaults to True in peft, which may enable checkpointing even though
# TrainingArguments sets gradient_checkpointing=False — confirm.
model = prepare_model_for_kbit_training(model)
print(model)
# Attach the LoRA adapters.
# NOTE: this cell rebinds `model`, so re-running it wraps the model again;
# reload the model first.
model = get_peft_model(model, lora_config)  # Applying LoRA
print(model)

In [None]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     per_device_train_batch_size=2,   # 각 GPU/CPU에서 학습할 배치 크기
#     gradient_accumulation_steps=8,   # 실제 batch size = 2 * 8 = 16
#     fp16=False,
#     max_grad_norm=0.0,
#     save_steps=1000,
#     logging_steps=50,
#     evaluation_strategy="no",       # 예시에서는 검증 생략
#     num_train_epochs=3,
#     save_total_limit=2,
#     gradient_checkpointing=True,    # 메모리 절약
#     learning_rate=2e-5,
#     report_to="none"
# )

# Active training configuration (fast-prototype settings).
training_args = TrainingArguments(
    # --- output / checkpointing / logging ---
    output_dir="./results",
    save_steps=1000,
    save_total_limit=2,
    logging_steps=50,
    report_to="none",
    evaluation_strategy="no",
    # --- schedule ---
    num_train_epochs=1,  # a single epoch to keep prototyping quick
    learning_rate=2e-5,
    # --- batching ---
    per_device_train_batch_size=16,  # raised to 16 (assumes spare GPU memory)
    gradient_accumulation_steps=1,  # no accumulation: update on every batch
    dataloader_num_workers=4,
    # --- precision / optimizer / memory ---
    bf16=True,  # use BF16 on an A100
    optim="adamw_bnb_8bit",  # 8-bit optimizer (speed and memory benefits)
    max_grad_norm=0.0,  # gradient clipping disabled
    gradient_checkpointing=False,  # checkpointing off for higher speed
)

In [None]:
# Assemble the Trainer from the LoRA-wrapped model, the training configuration
# and the tokenized training set. No evaluation set or callbacks are supplied.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets,
)

In [None]:
# Time the training run with a monotonic clock: unlike time.time(),
# time.perf_counter() cannot jump when the system clock is adjusted during a
# long run, so the measured duration stays correct.
start_time = time.perf_counter()

trainer.train()

end_time = time.perf_counter()
elapsed_time = end_time - start_time  # seconds

print(f"Total training time for sample dataset: {elapsed_time / 60:.2f} minutes")

In [None]:
# Show the GPU status again, after the training cell above.
!nvidia-smi

In [None]:
# Display the current working directory (a bare `pwd` relies on IPython's
# automagic; the relative paths used in this notebook resolve against it).
pwd

In [None]:
# beomi/KoAlpaca-KoRWKV-6B

# 8비트
# [ 3/1408 01:07 < 26:21:53, 0.01 it/s, Epoch 0.00/1]
# 6973
# 24793

# 4비트
# [ 7/1408 04:29 < 21:00:41, 0.02 it/s, Epoch 0.00/1]
# 4325MiB
# 18555MiB

# 4비트 32 * 2
# [ 3/352 03:16 < 19:04:18, 0.01 it/s, Epoch 0.01/1]
# 4325MiB
# 31663MiB

# 4비트 48 * 2
# [ 3/235 04:47 < 18:30:59, 0.00 it/s, Epoch 0.01/1]
# 4325MiB
# 39411MiB

# beomi/KoAlpaca-KoRWKV-1.5B
# 8비트
# [ 3/352 01:42 < 9:57:31, 0.01 it/s, Epoch 0.01/1]
# 1909MiB

# 8비트 + 데이터 전처리
# [ 3/1408 00:30 < 11:42:57, 0.03 it/s, Epoch 0.00/1]
# 1909MiB
# 13743MiB

# 8비트 64*4
# [ 3/88 06:19 < 8:57:03, 0.00 it/s, Epoch 0.02/1]
# 1909MiB
# 26421MiB


# 4비트 32 * 2
# [ 3/352 01:29 < 8:41:15, 0.01 it/s, Epoch 0.01/1]
# 1357
# 20191MiB