In [None]:
!pip install trl peft accelerate datasets transformers huggingface_hub wandb

In [None]:
import gc
import json
import os
import time

import torch
import wandb
from datasets import Dataset
from huggingface_hub import hf_hub_download
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    default_data_collator,
)
from trl import SFTConfig, SFTTrainer

In [None]:
# --- Model and data sources -------------------------------------------------
# Base checkpoint that every LoRA run starts from.
MODEL_NAME: str = "facebook/opt-350m"
# Hub dataset repository and the JSON file downloaded from it.
DATASET_NAME: str = "sahil2801/CodeAlpaca-20k"
DATA_FILE: str = "code_alpaca_20k.json"

# --- Experiment grid --------------------------------------------------------
# One training run is launched per LoRA rank listed here.
LORA_RANKS: list = [8, 128, 256]
# Sequence length handed to SFTConfig and recorded in each wandb run config.
MAX_SEQ_LENGTH: int = 128
# wandb project that collects the per-rank runs and the summary run.
PROJECT_NAME: str = "lora_rank_experiment"

In [None]:
# Load the fast tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Fixed-length padding needs a pad token; fall back to EOS if the checkpoint defines none.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Download the raw JSON file from the Hub dataset repository. `local_json` is the
# local path that the next cell opens. (`hf_hub_download` is imported in the
# notebook's import cell, so it is not re-imported here.)
local_json = hf_hub_download(
    repo_id=DATASET_NAME,
    filename=DATA_FILE,
    repo_type="dataset",
)

In [None]:
# Parse the downloaded JSON file into a list of records. `json` and `Dataset`
# come from the notebook's import cell, so they are not re-imported here.
with open(local_json, "r", encoding="utf-8") as f:
    records = json.load(f)

# Keep only the first 1000 records so every LoRA run trains on the same small subset.
records = records[:1000]
raw_dataset = Dataset.from_list(records)

In [None]:
# Sanity check: number of examples kept after truncating the records to the first 1000.
len(raw_dataset)

In [None]:
# Show the dataset summary (columns and row count).
raw_dataset

In [None]:
# Inspect one raw example (index 3) to see the fields consumed by prepare_data.
raw_dataset[3]

In [None]:
def prepare_data(examples, tokenizer, max_length=512):
    """Format a batch of instruction records as prompts and tokenize them for SFT.

    Args:
        examples: Batched mapping with parallel lists under the keys
            "instruction", "input" and "output".
        tokenizer: Callable tokenizer. It is invoked with ``truncation=True``,
            ``padding="max_length"`` and ``return_tensors="pt"``, and must return
            "input_ids" and "attention_mask" tensors.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        dict with "input_ids", "attention_mask" and "labels". ``labels`` is a copy
        of ``input_ids`` in which padded positions (``attention_mask == 0``) are
        replaced by -100 so the loss ignores them.
    """
    # Join prompt and response into one training string per record. The
    # "### Input:" section is emitted only when the record has a non-empty input.
    texts = [
        (
            f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
            if input_text
            else f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
        )
        for instruction, input_text, output in zip(
            examples["instruction"],
            examples["input"],
            examples["output"],
        )
    ]

    # Tokenize to a fixed length so every row has the same shape.
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt"
    )

    # Mask by attention_mask rather than by pad token id: when the pad token is
    # aliased to EOS, masking by id would also hide genuine EOS tokens.
    labels = tokenized["input_ids"].clone()
    labels[tokenized["attention_mask"] == 0] = -100

    # Layout expected for supervised fine-tuning of a causal LM.
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": labels,
    }

# Tokenize the whole dataset in batches and drop the raw text columns.
# max_length is passed explicitly: the trainer is configured to skip its own dataset
# preparation, so the length chosen here is the length the model actually trains on.
# Leaving it at prepare_data's default (512) would contradict MAX_SEQ_LENGTH, which is
# what gets logged to wandb.
tokenized_dataset = raw_dataset.map(
    lambda batch: prepare_data(batch, tokenizer, max_length=MAX_SEQ_LENGTH),
    batched=True,
    remove_columns=raw_dataset.column_names,
)

In [None]:
# Show the tokenized dataset summary; only the columns returned by prepare_data remain.
tokenized_dataset

In [None]:
# Collator for causal-LM training (mlm=False disables masked-LM corruption).
# DataCollatorForLanguageModeling is imported in the notebook's import cell.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
import wandb

In [None]:
# Hyperparameters shared by every run. They are defined once so the values logged to
# wandb cannot drift from the values actually passed to SFTConfig.
LEARNING_RATE = 2e-4
PER_DEVICE_BATCH_SIZE = 4
GRAD_ACCUM_STEPS = 4
NUM_EPOCHS = 3

for r in LORA_RANKS:
    # Release the previous iteration's model/trainer BEFORE loading the next model.
    # Otherwise their weights and optimizer state are still allocated when the
    # peak-memory counter is reset below, which inflates the reported peak for later
    # ranks and can run the GPU out of memory. The last model/trainer stay bound
    # after the loop, as before.
    model = trainer = None
    gc.collect()
    torch.cuda.empty_cache()

    run_name = f"lora_r_{r}"
    output_dir = os.path.join("./results", run_name)

    wandb.init(
        project=PROJECT_NAME,
        name=run_name,
        reinit=True,
        config={
            "lora_rank": r,
            "model_name": MODEL_NAME,
            "max_seq_length": MAX_SEQ_LENGTH,
            "learning_rate": LEARNING_RATE,
            "batch_size": PER_DEVICE_BATCH_SIZE,
            "gradient_accumulation_steps": GRAD_ACCUM_STEPS,
            "num_epochs": NUM_EPOCHS
        }
    )

    try:
        # NOTE(review): the base weights are loaded in fp16 and fp16 mixed precision is
        # also enabled in SFTConfig. Confirm the installed PEFT keeps the trainable
        # adapter weights in fp32, otherwise gradient unscaling can fail.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16
        ).cuda()

        # Only the attention query/value projections receive LoRA adapters.
        peft_config = LoraConfig(
            r=r,
            lora_alpha=16,
            lora_dropout=0.05,
            target_modules=["q_proj", "v_proj"],
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, peft_config)
        model.config.use_cache = False

        # Start the memory and wall-clock measurements for this rank.
        torch.cuda.reset_peak_memory_stats()
        start_time = time.time()

        trainer = SFTTrainer(
            model=model,
            train_dataset=tokenized_dataset,
            args=SFTConfig(
                output_dir=output_dir,
                max_seq_length=MAX_SEQ_LENGTH,
                # The dataset is already tokenized, so the trainer must not re-process it.
                dataset_kwargs={"skip_prepare_dataset": True},
                learning_rate=LEARNING_RATE,
                num_train_epochs=NUM_EPOCHS,
                per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
                gradient_accumulation_steps=GRAD_ACCUM_STEPS,
                warmup_steps=100,
                logging_steps=10,
                save_strategy="no",  # checkpoint saving disabled
                eval_strategy="no",
                load_best_model_at_end=False,
                disable_tqdm=False,
                label_names=["labels"],
                fp16=True,
                remove_unused_columns=False,
                report_to="wandb"
            ),
            data_collator=collator,
        )
        # train() returns a TrainOutput; its training_loss is the average training loss,
        # which is more robust than indexing into the trainer's log history.
        train_result = trainer.train()

        duration = time.time() - start_time
        peak_memory_gb = torch.cuda.max_memory_allocated() / 1024**3
        steps_per_sec = trainer.state.global_step / duration if duration > 0 else 0.0

        wandb.log({
            "duration_sec": duration,
            "peak_memory_gb": peak_memory_gb,
            "steps_per_sec": steps_per_sec,
            "final_loss": train_result.training_loss,
            "total_steps": trainer.state.global_step,
        })

        model.save_pretrained(output_dir)
        # Upload the saved adapter directory to wandb as a model artifact.
        artifact = wandb.Artifact(
            name=f"model-lora-r-{r}",
            type="model",
            description=f"LoRA model with rank {r}"
        )
        artifact.add_dir(output_dir)
        wandb.log_artifact(artifact)
    except Exception:
        # Close the run as failed so it is not left open or reported as successful.
        wandb.finish(exit_code=1)
        raise

    wandb.finish()

# After all experiments have finished, create a summary run in wandb that gathers the
# headline metrics of each per-rank run.
wandb.init(project=PROJECT_NAME, name="experiment_summary", reinit=True)

# The API client and project path do not depend on the rank, so build them once.
api = wandb.Api()
project_path = f"{wandb.run.entity}/{PROJECT_NAME}"

for r in LORA_RANKS:
    # Filter on display_name, the human-readable run name given to wandb.init(name=...).
    # Newest first, so runs[0] is the latest run when the notebook has been executed
    # more than once.
    runs = api.runs(
        project_path,
        filters={"display_name": f"lora_r_{r}"},
        order="-created_at",
    )
    if runs:
        run = runs[0]
        wandb.log({
            f"rank_{r}_final_loss": run.summary.get("final_loss"),
            f"rank_{r}_duration": run.summary.get("duration_sec"),
            f"rank_{r}_memory": run.summary.get("peak_memory_gb"),
            f"rank_{r}_speed": run.summary.get("steps_per_sec")
        })
wandb.finish()