# Training Large Models on Limited Hardware

This notebook demonstrates techniques for training large language models on limited hardware resources.

## 1. Setup and Imports

First, let's import the necessary libraries:

In [1]:
import torch
import transformers
import accelerate
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb
from tqdm import tqdm
import math

# Print the versions of the core libraries so the run can be reproduced later.
print(f"transformers version: {transformers.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"accelerate version: {accelerate.__version__}")

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


transformers version: 4.45.1
PyTorch version: 2.4.1+cu121
accelerate version: 0.34.2


## 2. Load Pre-trained Model and Dataset

We'll use the smallest GPT-2 checkpoint (`gpt2`) as our base model and the WikiText-2 dataset for fine-tuning:

In [9]:
# One checkpoint name drives both the tokenizer and the model.
model_name = "gpt2"

# Tokenizer first: reuse the end-of-sequence token as the padding token
# so that batches of unequal length can be padded.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Base causal language model to be fine-tuned.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Raw (untokenized) WikiText-2 splits.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

## 3. Implement QLoRA for Parameter Efficiency

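QLoRA (Quantized Low-Rank Adaptation) allows us to fine-tune large models with significantly reduced memory requirements. Note that full QLoRA also loads the base model in 4-bit precision; the code below keeps the base model in the precision it was loaded in and applies the low-rank adapter part of the technique — it freezes the base weights, enables gradient checkpointing, and trains only small LoRA adapters: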

In [10]:
def prepare_model_for_qlora(model):
    """Freeze a model and ready it for adapter-based fine-tuning.

    The model is modified in place:

    * every parameter has ``requires_grad`` switched off;
    * every 1-D parameter (e.g. layernorm weights/biases) is cast to float32;
    * gradient checkpointing and input gradients are enabled via the model's
      own ``gradient_checkpointing_enable`` / ``enable_input_require_grads``;
    * ``model.lm_head`` is wrapped so that its output is returned as float32.

    Args:
        model: A module exposing ``parameters()``, ``lm_head``,
            ``gradient_checkpointing_enable()`` and
            ``enable_input_require_grads()``.

    Returns:
        The same ``model`` object, after modification.
    """

    class CastOutputToFloat(torch.nn.Sequential):
        """Sequential container whose forward result is cast to float32."""

        def forward(self, x):
            result = super().forward(x)
            return result.to(torch.float32)

    for weight in model.parameters():
        # Base weights stay frozen; adapters are attached and trained later.
        weight.requires_grad = False
        is_vector = weight.ndim == 1
        if is_vector:
            # Small 1-D tensors are kept in fp32 for numerical stability.
            weight.data = weight.data.to(torch.float32)

    # Store fewer activations during the forward pass.
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()

    wrapped_head = CastOutputToFloat(model.lm_head)
    model.lm_head = wrapped_head

    return model

# Freeze the base model and enable gradient checkpointing before adding adapters.
model = prepare_model_for_qlora(model)

# LoRA adapter settings: rank-8 adapters on the attention projection `c_attn`.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn"],
    # GPT-2 implements `c_attn` as a transformers `Conv1D` layer, which stores
    # its weight as (fan_in, fan_out). PEFT needs `fan_in_fan_out=True` for such
    # layers; set it explicitly instead of relying on PEFT's warn-and-override.
    fan_in_fan_out=True,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the frozen model so that only the LoRA adapter weights are trainable.
model = get_peft_model(model, config)

## 4. Apply Gradient Accumulation

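We'll use gradient accumulation to simulate larger batch sizes. With a per-device batch size of 4 and 4 accumulation steps, each optimizer update sees an effective batch of 16 sequences per device while only 4 are held in memory at a time: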

In [11]:
training_args = TrainingArguments(
    # Output location, checkpointing and logging cadence
    output_dir="./results",
    save_steps=1000,
    logging_steps=100,
    push_to_hub=False,
    # Optimization schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    # 4 sequences per device x 4 accumulation steps
    # = 16 sequences per optimizer step per device
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # fp16 mixed-precision training
    fp16=True,
    # Dataset column handling and evaluation loop selection
    remove_unused_columns=False,
    use_legacy_prediction_loop=True,
)

## 5. Utilize CPU Offloading for Optimizer States

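We'll use the bitsandbytes 8-bit AdamW optimizer, which stores its optimizer states in 8-bit precision to reduce GPU memory usage. Note that this quantizes the optimizer states rather than moving them to the CPU: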

In [12]:
# Give the optimizer only the parameters that still require gradients.
# The frozen base weights never receive gradients, so passing them would
# just register tensors the optimizer can never update.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = bnb.optim.AdamW8bit(trainable_params, lr=training_args.learning_rate)

## 6. Prepare Dataset and Trainer

Now let's prepare our dataset and set up the Trainer:

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of raw examples, truncating each to at most 512 tokens.

    No padding is applied here: the data collator pads each batch to the length
    of its longest sequence, which avoids storing and processing 512 tokens for
    every short line.

    Args:
        examples: A batch from the dataset with a "text" column.

    Returns:
        The tokenizer output for the batch (variable-length sequences).
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Drop blank rows before tokenizing. A blank row tokenizes to nothing, so after
# padding every label is masked out; a batch made only of such rows has no
# target tokens and can produce a NaN loss.
non_empty_dataset = dataset.filter(lambda example: example["text"].strip() != "")

tokenized_datasets = non_empty_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=dataset["train"].column_names,
)

# Collator for causal language modeling (mlm=False, i.e. no masked-LM objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Pick the splits used for training and for evaluation.
train_split = tokenized_datasets["train"]
validation_split = tokenized_datasets["validation"]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=validation_split,
    # Supply our own optimizer; no learning-rate scheduler is passed (None).
    optimizers=(optimizer, None),
)

## 7. Train the Model

Finally, let's train our model:

In [None]:
# Start fine-tuning; as the cell's last expression, the returned value is shown as the cell output.
trainer.train()

## 8. Evaluate the Model

After training, we can evaluate our model:

In [None]:
# Evaluate, then convert the mean loss into perplexity: exp(loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")