In [1]:
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import transformers
from torch.amp import autocast, GradScaler
from trl import SFTTrainer
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

import os
# Configure the CUDA caching allocator and pick the device used by the notebook.
# NOTE(review): this is assigned after `import torch`; confirm the allocator still
# picks the setting up (it has to be in place before CUDA is initialised).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" # reduce memory fragmentation
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "python" configuration of CodeSearchNet; another language
# configuration (e.g. "java") can be selected by changing the second argument.
dataset = load_dataset("code_search_net", "python")

# Direct handles to the raw (untokenized) training and validation splits.
train_data, valid_data = dataset["train"], dataset["validation"]

In [3]:
# Path of the base checkpoint; the tokenizer and the model are both loaded from it.
BASE_MODEL = "./DeepSeek-R1-Distill-Llama-8B"

In [4]:
# --- Tokenizer ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Padding reuses the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# --- 4-bit quantization settings (the base model must be quantized for QLoRA) -
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

# --- Base model, quantized while loading --------------------------------------
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",  # "sequential" was the alternative tried here
)

# --- LoRA adapter settings ----------------------------------------------------
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    # Modules that receive LoRA adapters. A wider variant that was tried also
    # listed "gate_proj", "up_proj" and "down_proj".
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj"],
)


  _ = torch.tensor([0], device=i)
Loading checkpoint shards: 100%|██████████| 2/2 [00:38<00:00, 19.36s/it]


In [5]:
# Workaround for the training-time failure
# "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn".
model.enable_input_require_grads()

# Wrap the base model with the LoRA adapters and print the trainable-parameter summary.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Mark every parameter whose name contains "lora" as trainable.
# NOTE(review): nothing here sets requires_grad to False, so freezing of the base
# weights is presumably done by get_peft_model itself — confirm.
for name, param in model.named_parameters():
    if "lora" not in name:
        continue
    param.requires_grad_(True)

trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


In [6]:
def tokenize_function(examples, max_length=512):
    """Tokenize source-code strings and attach causal-LM labels.

    Args:
        examples: Mapping with a "func_code_string" entry. It is a list of
            strings when called through ``dataset.map(..., batched=True)``;
            a single string is handled as well.
        max_length: Length every sequence is truncated and padded to.
            Defaults to 512.

    Returns:
        The tokenizer output with an additional "labels" entry. Labels are a
        copy of "input_ids" in which every padded position is replaced by
        -100, so that padding does not contribute to the loss.
    """
    # Value skipped by the cross-entropy loss (PyTorch's default ignore_index).
    ignore_index = -100

    code = examples["func_code_string"]
    tokenized = tokenizer(
        code,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    def _mask_padding(ids, mask):
        # Keep real tokens, blank out padded positions.
        return [tok if keep else ignore_index for tok, keep in zip(ids, mask)]

    input_ids = tokenized["input_ids"]
    # The pad token is the EOS token in this notebook, so padding cannot be
    # recognised by token id; the attention mask is the reliable marker.
    attention_mask = tokenized.get("attention_mask")

    if attention_mask is None:
        # No mask available: fall back to an unmasked copy of the ids.
        if isinstance(code, str):
            labels = list(input_ids)
        else:
            labels = [list(ids) for ids in input_ids]
    elif isinstance(code, str):
        # Single example: ids and mask are flat lists.
        labels = _mask_padding(input_ids, attention_mask)
    else:
        labels = [
            _mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]

    # Plain lists are used on purpose, matching the type of "input_ids".
    tokenized["labels"] = labels
    return tokenized

# Run tokenize_function over `dataset` in batched mode.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Disabled: would switch the tokenized datasets to torch format on DEVICE.
#tokenized_datasets.set_format(type="torch", device=DEVICE)

Map: 100%|██████████| 22176/22176 [00:07<00:00, 3147.35 examples/s]


In [7]:
training_args = TrainingArguments(
    # Output locations
    output_dir="./deepseek-lora",
    logging_dir="./logs",
    # Schedule: 3 epochs, saving and evaluating once per epoch.
    # NOTE(review): newer transformers releases name this option
    # `eval_strategy` — confirm against the installed version.
    num_train_epochs=3,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    # Batching: small per-device batches, compensated by gradient accumulation
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    dataloader_pin_memory=False,
    # Memory and precision: checkpointing to save GPU memory, 16-bit compute
    gradient_checkpointing=True,
    fp16=True,
    optim="paged_adamw_8bit",
)



In [8]:
# Release cached GPU memory and restart peak-memory tracking before training.
# Guarded so the cell does not raise on a machine without CUDA, matching the
# CPU fallback used when DEVICE is chosen.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

In [None]:
# Put the model in training mode before it is handed to the trainer.
model.train()

# Tokenized splits used for optimisation and for evaluation.
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["validation"]

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Start fine-tuning.
trainer.train()

In [None]:
# Write the model and the tokenizer into the same output directory.
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this
# presumably stores the adapter weights rather than a full checkpoint — confirm.
SAVE_DIR = "./deepseek-code-search"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)