In [1]:
import os
import numpy as np

import torch
from torchinfo import summary

# disable keras backend for transformers
os.environ["USE_TF"] = "0"
os.environ["USE_JAX"] = "0"

import evaluate
from datasets import load_dataset
from transformers import pipeline
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Load the Dataset

In [3]:
dataset = load_dataset("yahma/alpaca-cleaned")
dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 51760
    })
})

In [4]:
def format_instruction(example):
    return {
        "text": f"### Instruction:\n{example['instruction']}\n### Input:\n{example['input']}\n### Response:\n{example['output']}"
    }

dataset = dataset.map(format_instruction)
dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction', 'text'],
        num_rows: 51760
    })
})

In [5]:
dataset['train'][0]

{'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.',
 'input': '',
 'instruction': 'Give three tips for staying healthy.',
 'text': '### Instruction:\nGive three tips for staying healthy.\n### Input:\n\n### Response:\n1. Eat a balanced and nutritious diet: Make sure your meals ar

### Load Base Model and Tokenizer

In [6]:
base_model_name = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    load_in_4bit=True,   # Optional for memory efficiency
    trust_remote_code=True
).to(device)

def tokenize_function(example):
    result = tokenizer(
        example["text"],
        truncation=True,
        max_length=1024,
        padding="max_length"
    )
    result["labels"] = result["input_ids"].copy()
    return result

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

# prepare smaller subset for quick fine-tuning
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))

summary(model, depth=5)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Layer (type:depth-idx)                             Param #
LlamaForCausalLM                                   --
├─LlamaModel: 1-1                                  --
│    └─Embedding: 2-1                              262,668,288
│    └─ModuleList: 2-2                             --
│    │    └─LlamaDecoderLayer: 3-1                 --
│    │    │    └─LlamaAttention: 4-1               --
│    │    │    │    └─Linear4bit: 5-1              (2,097,152)
│    │    │    │    └─Linear4bit: 5-2              (524,288)
│    │    │    │    └─Linear4bit: 5-3              (524,288)
│    │    │    │    └─Linear4bit: 5-4              (2,097,152)
│    │    │    └─LlamaMLP: 4-2                     --
│    │    │    │    └─Linear4bit: 5-5              (8,388,608)
│    │    │    │    └─Linear4bit: 5-6              (8,388,608)
│    │    │    │    └─Linear4bit: 5-7              (8,388,608)
│    │    │    │    └─SiLUActivation: 5-8          --
│    │    │    └─LlamaRMSNorm: 4-3                 2,048
│    │

### LoRA Fine-Tuning Setup (PEFT)

In [7]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
summary(model)

Layer (type:depth-idx)                                            Param #
PeftModelForCausalLM                                              --
├─LoraModel: 1-1                                                  --
│    └─LlamaForCausalLM: 2-1                                      --
│    │    └─LlamaModel: 3-1                                       750,979,072
│    │    └─Linear: 3-2                                           (262,668,288)
Total params: 1,013,647,360
Trainable params: 1,703,936
Non-trainable params: 1,011,943,424

### Train the Model

In [8]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./llama3_checkpoints",
    #remove_unused_columns=False,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    warmup_steps=50,
    max_steps=200,
    learning_rate=2e-4,
    num_train_epochs=1,
    fp16=True,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    optim="paged_adamw_8bit",
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    #eval_dataset=eval_dataset,
    data_collator=data_collator,
    #tokenizer=tokenizer,
    #compute_metrics=compute_metrics,
)

# fine tune the model
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


### Evaluate the Model

In [None]:
metrics = trainer.evaluate()
for key, value in metrics.items():
    print(f"{key}: {value}")

In [None]:
# save the model
finetuned_model_path = "./llama3-finetuned"

model.save_pretrained(finetuned_model_path)
tokenizer.save_pretrained(finetuned_model_path)

from peft import merge_and_unload

model = merge_and_unload(model)
model.save_pretrained(f"{finetuned_model_path}-merged")

### Prediction

In [None]:
finetuned_pipeline = pipeline("text-generation", model=finetuned_model_path, tokenizer=tokenizer)

prompt = "Explain why the sky is blue in simple terms."
print(finetuned_pipeline(prompt, max_new_tokens=100)[0]["generated_text"])