In [1]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import load_dataset
import gc

# Load fine-tuned LLaMA model (Replace with your model path)
model_name = "../../Llama-3.2-1B-Instruct"
train_ds_folder = "../finetune_summaries_json/"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    ),
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

files = os.listdir(train_ds_folder)
N_FILES = len(files)

device = "cuda"
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), 

In [2]:
# QLoRA config
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["lm_head"],
    task_type="CAUSAL_LM",
)

# Add adapters to model
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)
gc.collect()
torch.cuda.empty_cache()
gc.collect()
model = get_peft_model(model, lora_config)

# disable KV cache due to memory consumption (no need here)
model.config.use_cache = False

gc.collect()
torch.cuda.empty_cache()
gc.collect()

0

In [3]:
# Load dataset
dataset = load_dataset(
    "json",
    data_files={
        "train": [train_ds_folder + filename for filename in files],
    }
)

def create_target_chat(question, answer):
    return [
        {"role": "user", "content": f"{question}"},
        {"role": "assistant", "content": f"{answer}"},
    ]

# function to tokenize dataset
def tokenize(ds_element):
    target_chat = tokenizer.apply_chat_template(
        create_target_chat(ds_element["question"], ds_element["answer"]), 
        tokenize=False, 
        add_generation_prompt=False
    )
    tokenized_text = tokenizer(target_chat, truncation=False, padding=False, add_special_tokens=False)
    return {
        "input_ids": tokenized_text["input_ids"],
        "labels": tokenized_text["input_ids"],
        "attention_mask": tokenized_text["attention_mask"],
    }

# apply tokenize
dataset_tokenized = dataset.map(
    tokenize,
    batched=False,
    num_proc=os.cpu_count(),  # multithreaded
    remove_columns=["question", "answer"],
)

Downloading data: 100%|██████████| 747/747 [00:00<00:00, 466936.67files/s]
Generating train split: 747 examples [00:00, 887.93 examples/s]
Map (num_proc=12): 100%|██████████| 747/747 [00:01<00:00, 662.74 examples/s] 


In [4]:
IGNORE_INDEX = -100
ATTN_IGNORE_INDEX = 0

# function to batch inputs
def collate(elements):
    # Extract input_ids from each element and find the maximum length among them
    tokens = [e["input_ids"] for e in elements]
    tokens_maxlen = max([len(t) for t in tokens])

    for e in elements:
        input_ids = e["input_ids"]
        labels = e["labels"]
        attention_mask = e["attention_mask"]

        # Calculate the padding length required to match the maximum token length
        pad_len = tokens_maxlen - len(input_ids)

        # Pad 'input_ids' with the pad token ID, 'labels' with IGNORE_INDEX, and 'attention_mask' with 0
        input_ids.extend(pad_len * [tokenizer.pad_token_id])
        labels.extend(pad_len * [IGNORE_INDEX])
        attention_mask.extend(pad_len * [ATTN_IGNORE_INDEX])

    # create and return batch with all the data in elements
    batch = {
        "input_ids": torch.tensor([e["input_ids"] for e in elements]),
        "labels": torch.tensor([e["labels"] for e in elements]),
        "attention_mask": torch.tensor([e["attention_mask"] for e in elements]),
    }
    return batch

In [5]:
# Hyperparemeters
BS = 4  # batch size
GA_STEPS = 4  # gradient acc. steps
EPOCHS = 10
LR = 2e-5

steps_per_epoch = len(dataset_tokenized["train"]) // (BS * GA_STEPS)

args = TrainingArguments(
    output_dir="qlora_checkpoints",
    per_device_train_batch_size=BS,
    per_device_eval_batch_size=BS,
    eval_strategy="steps",
    logging_steps=1,
    eval_steps=steps_per_epoch * 1,  # eval once per epoch
    save_steps=steps_per_epoch * 1,  # save once per epoch
    gradient_accumulation_steps=GA_STEPS,
    num_train_epochs=EPOCHS,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",
    learning_rate=LR,
    group_by_length=True,
    bf16=True,
    ddp_find_unused_parameters=False,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=collate,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["train"],
)

# training loop
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
46,1.7173,1.692706
92,1.4733,1.558116
138,1.7203,1.494418
184,1.5078,1.453456
230,1.6138,1.423345
276,1.5121,1.397536
322,1.3541,1.374594
368,1.3297,1.353431
414,1.3256,1.333605
460,1.3376,1.31434


TrainOutput(global_step=460, training_loss=1.4794208067914714, metrics={'train_runtime': 2099.4632, 'train_samples_per_second': 3.558, 'train_steps_per_second': 0.219, 'total_flos': 1.0535074097108582e+17, 'train_loss': 1.4794208067914714, 'epoch': 9.79144385026738})