In [1]:
!pip install trl peft accelerate datasets transformers huggingface_hub wandb

Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-c

In [2]:
import time
import json
import os
import torch
import wandb
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, default_data_collator
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig
from huggingface_hub import hf_hub_download

In [3]:
MODEL_NAME = "facebook/opt-350m"
DATASET_NAME = "sahil2801/CodeAlpaca-20k"
DATA_FILE = "code_alpaca_20k.json"
LORA_RANKS = [8, 128, 256]
MAX_SEQ_LENGTH = 128
PROJECT_NAME = "lora_rank_experiment"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [5]:
from huggingface_hub import hf_hub_download
local_json = hf_hub_download(
    repo_id=DATASET_NAME,
    filename=DATA_FILE,
    repo_type="dataset"
)

code_alpaca_20k.json:   0%|          | 0.00/8.06M [00:00<?, ?B/s]

In [6]:
import json
from datasets import Dataset

with open(local_json, 'r', encoding='utf-8') as f:
    records = json.load(f)
records = records[:1000]
raw_dataset = Dataset.from_list(records)

In [7]:
len(raw_dataset)

1000

In [8]:
raw_dataset

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 1000
})

In [9]:
raw_dataset[3]

{'instruction': 'Create an array of length 15 containing numbers divisible by 3 up to 45.',
 'input': '',
 'output': 'arr = [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45]'}

In [10]:
def prepare_data(examples, tokenizer, max_length=512):
    """데이터 전처리 함수"""
    # 프롬프트와 응답을 결합
    texts = []
    for instruction, input_text, output in zip(
        examples["instruction"],
        examples["input"],
        examples["output"]
    ):
        # 입력이 있는 경우와 없는 경우 구분
        if input_text:
            text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
        else:
            text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
        texts.append(text)

    # 토크나이징
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt"
    )

    # SFT 형식에 맞게 데이터 구성
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": tokenized["input_ids"].clone()
    }

tokenized_dataset = raw_dataset.map(
    lambda x: prepare_data(x, tokenizer),
    batched=True,
    remove_columns=raw_dataset.column_names
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [12]:
from transformers import DataCollatorForLanguageModeling
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [13]:
import wandb

In [19]:
for r in LORA_RANKS:
    run_name = f"lora_r_{r}"
    wandb.init(
        project=PROJECT_NAME,
        name=run_name,
        reinit=True,
        config={
            "lora_rank": r,
            "model_name": MODEL_NAME,
            "max_seq_length": MAX_SEQ_LENGTH,
            "learning_rate": 2e-4,
            "batch_size": 4,
            "gradient_accumulation_steps": 4,
            "num_epochs": 3
        }
    )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16
    ).cuda()

    peft_config = LoraConfig(
        r=r,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, peft_config)
    model.config.use_cache = False

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    trainer = SFTTrainer(
        model=model,
        train_dataset=tokenized_dataset,
        args=SFTConfig(
            output_dir=os.path.join("./results", run_name),
            max_seq_length=MAX_SEQ_LENGTH,
            dataset_kwargs={"skip_prepare_dataset": True},
            learning_rate=2e-4,
            num_train_epochs=3,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            warmup_steps=100,
            logging_steps=10,
            save_strategy="no",  # 저장 비활성화
            eval_strategy="no",
            load_best_model_at_end=False,
            disable_tqdm=False,
            label_names=["labels"],
            fp16=True,
            remove_unused_columns=False,
            report_to="wandb"
        ),
        data_collator=collator,
    )
    trainer.train()

    duration = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / 1024**3
    steps_per_sec = trainer.state.global_step / duration if duration > 0 else 0.0

    wandb.log({
        "duration_sec": duration,
        "peak_memory_gb": peak_memory_gb,
        "steps_per_sec": steps_per_sec,
        "final_loss": trainer.state.log_history[-1]["train_loss"] if trainer.state.log_history else None,
        "total_steps": trainer.state.global_step,
    })

    model.save_pretrained(os.path.join("./results", run_name))
    # wandb에 모델 아티팩트로 저장
    artifact = wandb.Artifact(
        name=f"model-lora-r-{r}",
        type="model",
        description=f"LoRA model with rank {r}"
    )
    artifact.add_dir(os.path.join("./results", run_name))
    wandb.log_artifact(artifact)

    wandb.finish()

# 모든 실험이 끝난 후 wandb에 요약 리포트 생성
wandb.init(project=PROJECT_NAME, name="experiment_summary", reinit=True)
for r in LORA_RANKS:
    api = wandb.Api()
    runs = api.runs(f"{wandb.run.entity}/{PROJECT_NAME}", filters={"name": f"lora_r_{r}"})
    if runs:
        run = runs[0]
        wandb.log({
            f"rank_{r}_final_loss": run.summary.get("final_loss"),
            f"rank_{r}_duration": run.summary.get("duration_sec"),
            f"rank_{r}_memory": run.summary.get("peak_memory_gb"),
            f"rank_{r}_speed": run.summary.get("steps_per_sec")
        })
wandb.finish()

0,1
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██
train/grad_norm,▇▅▅▃▄▃▇▄▃▁▄▄▆▃▆▂▅█
train/learning_rate,▁▂▃▃▄▅▆▆▇█▇▆▅▅▄▃▂▁
train/loss,██▆▅▅▄▄▃▂▂▂▂▁▁▁▂▂▂
train/mean_token_accuracy,▂▁▂▃▃▅▅▆▆▆▇▆▇▇█▇▇▇▇
train/num_tokens,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██

0,1
total_flos,2765621381038080.0
train/epoch,2.96
train/global_step,186.0
train/grad_norm,1.14588
train/learning_rate,2e-05
train/loss,1.6251
train/mean_token_accuracy,0.65903
train/num_tokens,348884.0
train_loss,1.83293
train_runtime,197.0764


Step,Training Loss
10,2.4146
20,2.4395
30,2.2313
40,2.1027
50,2.0609
60,1.891
70,1.8857
80,1.799
90,1.7178
100,1.7264


[34m[1mwandb[0m: Adding directory to artifact (./results/lora_r_8)... Done. 0.0s


0,1
duration_sec,▁
final_loss,▁
peak_memory_gb,▁
steps_per_sec,▁
total_steps,▁
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇███
train/grad_norm,▆▅▅▃▄▃▇▄▃▁▄▃▆▃▆▃▅█
train/learning_rate,▁▂▃▃▄▅▆▆▇█▇▆▅▅▄▃▂▁
train/loss,██▆▅▅▄▄▃▂▂▂▂▁▁▁▂▂▂

0,1
duration_sec,196.29961
final_loss,1.8353
peak_memory_gb,3.92775
steps_per_sec,0.94753
total_flos,2765621381038080.0
total_steps,186.0
train/epoch,2.96
train/global_step,186.0
train/grad_norm,1.1105
train/learning_rate,2e-05


Step,Training Loss
10,2.4148
20,2.4407
30,2.2351
40,2.107
50,2.0652
60,1.8969
70,1.8932
80,1.8015
90,1.7198
100,1.7262


[34m[1mwandb[0m: Adding directory to artifact (./results/lora_r_128)... Done. 0.2s


0,1
duration_sec,▁
final_loss,▁
peak_memory_gb,▁
steps_per_sec,▁
total_steps,▁
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇███
train/grad_norm,▆▅▅▃▄▃▇▄▃▁▅▄▆▃▆▂▅█
train/learning_rate,▁▂▃▃▄▅▆▆▇█▇▆▅▅▄▃▂▁
train/loss,██▆▅▅▄▄▃▂▂▂▂▁▁▁▂▂▂

0,1
duration_sec,199.89806
final_loss,1.83609
peak_memory_gb,4.14525
steps_per_sec,0.93047
total_flos,2872888189255680.0
total_steps,186.0
train/epoch,2.96
train/global_step,186.0
train/grad_norm,0.27718
train/learning_rate,2e-05


Step,Training Loss
10,2.4149
20,2.4413
30,2.2364
40,2.1083
50,2.0662
60,1.8973
70,1.8933
80,1.802
90,1.7199
100,1.7264


[34m[1mwandb[0m: Adding directory to artifact (./results/lora_r_256)... Done. 0.4s


0,1
duration_sec,▁
final_loss,▁
peak_memory_gb,▁
steps_per_sec,▁
total_steps,▁
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇███
train/grad_norm,▆▅▅▃▄▃▇▄▃▁▅▅▆▃▇▂▅█
train/learning_rate,▁▂▃▃▄▅▆▆▇█▇▆▅▅▄▃▂▁
train/loss,██▆▅▅▄▄▃▂▂▂▂▁▁▁▂▂▂

0,1
duration_sec,204.96328
final_loss,1.83646
peak_memory_gb,4.51167
steps_per_sec,0.90748
total_flos,2987306118021120.0
total_steps,186.0
train/epoch,2.96
train/global_step,186.0
train/grad_norm,0.19512
train/learning_rate,2e-05
