In [1]:
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import transformers
from torch.amp import autocast, GradScaler
from trl import SFTTrainer

import os
# Ask the CUDA caching allocator for expandable segments to limit memory fragmentation.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})
# Use the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# CodeSearchNet; the second argument selects the language subset (e.g. "python", "java").
dataset = load_dataset("code_search_net", "python")
train_data, valid_data = dataset["train"], dataset["validation"]

In [3]:
# Checkpoint location passed to `from_pretrained` for both the tokenizer and the model.
BASE_MODEL = "./DeepSeek-R1-Distill-Llama-8B"

In [4]:
# Load the tokenizer and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit quantization settings (required for QLoRA).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Load the base model with the 4-bit configuration applied.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.float16,
)

  _ = torch.tensor([0], device=i)
Loading checkpoint shards: 100%|██████████| 2/2 [00:28<00:00, 14.41s/it]


In [5]:
# LoRA adapter settings; `target_modules` lists the module names that receive adapters.
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# The training arguments enable gradient checkpointing. With a frozen base model the
# embedding output must require grad, otherwise no gradient reaches the adapters
# through the checkpointed layers.
model.enable_input_require_grads()

# Wrap the base model with the LoRA adapters.
model = get_peft_model(model, lora_config)

# Train only the adapter weights. Parameter names are full dotted paths
# (e.g. "...q_proj.lora_A.default.weight"), so they can never equal one of the short
# names in `target_modules`; testing membership there would freeze every parameter,
# adapters included. Match on the "lora_" marker in the name instead.
for name, param in model.named_parameters():
    if "lora_" not in name:
        param.requires_grad = False

# Report after freezing so the printed count reflects what will actually be trained.
model.print_trainable_parameters()

trainable params: 10,485,760 || all params: 8,040,747,008 || trainable%: 0.1304


In [6]:
def tokenize_function(examples, max_length=512):
    """Tokenize (documentation, code) pairs from a batch of examples.

    Args:
        examples: Batch mapping that provides the "func_documentation_string" and
            "func_code_string" columns.
        max_length: Upper bound on the number of tokens per encoded pair; longer
            pairs are truncated. The default of 512 is a tunable choice.

    Returns:
        The tokenizer output for the batch. Sequences are not padded here; padding
        is left to the data collator so each batch is only padded to its own longest
        sequence instead of the tokenizer's model maximum.
    """
    return tokenizer(
        examples["func_documentation_string"],
        examples["func_code_string"],
        truncation=True,
        max_length=max_length,
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [7]:
# Switch the tokenized data to the "torch" output format (applied in place; nothing is reassigned).
tokenized_datasets.set_format(type="torch")

In [8]:
training_args = TrainingArguments(
    # Output locations.
    output_dir="./deepseek-lora",
    logging_dir="./logs",
    # Batching: small per-device batches, compensated by gradient accumulation.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    dataloader_pin_memory=False,
    # Memory and precision.
    gradient_checkpointing=True,  # save GPU memory
    fp16=True,  # 16-bit arithmetic
    optim="adamw_torch",
    # Schedule: evaluate and checkpoint once per epoch.
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)



In [None]:
# The model was already placed on its devices by `device_map="auto"` at load time.
# `model.to_empty(...)` must not be called here: it re-allocates parameters and
# buffers without copying their storage, which would discard the loaded weights.

# Gradient checkpointing is incompatible with the generation cache; disable it
# explicitly instead of relying on the runtime warning.
model.config.use_cache = False

# Causal-LM collator: pads each batch and derives `labels` from `input_ids`, so the
# model returns a loss. A padding-only collator never creates `labels`.
# Padding positions are excluded from the loss; because the pad token is the EOS
# token here, EOS positions are excluded as well.
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


In [None]:
# Write the model and its tokenizer to the same output directory.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("./deepseek-code-search")