# Continual pretraining

Run continual pretraining on a Hugging Face base model with a given raw text dataset.

This notebook is optimized to run on a T4 machine via Google Colab.

Builds upon the Unsloth project: https://unsloth.ai/

In [None]:
# Normally, `pip install unsloth` is enough.
# Temporary workaround (as of Jan 31st, 2025): Colab has some issues with PyTorch.
# A plain `pip install unsloth` takes about 3 minutes, whilst the commands below take under 1 minute:
%pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
%pip install --no-deps cut_cross_entropy unsloth_zoo
%pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
%pip install --no-deps unsloth
# Revert to pip install unsloth when the issue is resolved

In [None]:
# Imports
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from google.colab import runtime

In [None]:
# Parameters
# Cell-local imports, only needed to obtain the access token below.
import os
from getpass import getpass

# Hugging Face access token, used both to load the base model and to push the
# trained model. It is read from the HF_TOKEN environment variable when that is
# set and non-empty; otherwise it is requested through a hidden prompt, so the
# secret is never written into the notebook source.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

# Base model to continue pretraining from (uncomment exactly one alternative).
base_model = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
#base_model = "unsloth/Qwen2.5-7B-bnb-4bit"
#base_model = "unsloth/Llama-3.2-3B-bnb-4bit"
raw_text_file = "stories_es_F.txt" # raw text file used for continual pretraining
trained_model = "trained-model-saved-to-your-huggingface-account" # Hub repository the result is pushed to
num_steps = 22 # number of training steps (passed as max_steps to the trainer)

In [None]:
# Print the first line of the training data file as a quick sanity check.
# The file is opened explicitly as UTF-8 so that non-ASCII text is decoded the
# same way regardless of the platform's default encoding.
# Errors are only reported, not re-raised: this cell is a best-effort preview.
try:
  with open(raw_text_file, 'r', encoding='utf-8') as file:
    first_line = file.readline().strip()
    print(first_line)
except FileNotFoundError:
  print(f"Error: File '{raw_text_file}' not found.")
except Exception as e:
  print(f"An error occurred: {e}")

In [None]:
# Load the base model together with its tokenizer
load_in_4bit = True    # 4-bit quantization reduces memory usage
dtype = None           # None means the dtype is detected automatically
max_seq_length = 2048  # also reused when tokenizing and when building the trainer

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=HF_TOKEN,
)

In [None]:
# Wrap the loaded model with LoRA adapters (PEFT)
# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, but 0 is the optimized setting
    bias="none",     # any value is supported, but "none" is the optimized setting
    use_gradient_checkpointing="unsloth",  # True, or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

In [None]:
# Load the raw text file as the "train" split of a dataset
dataset = load_dataset("text", data_files={"train": raw_text_file})

def tokenize_function(examples):
    """Tokenize a batch of raw text examples for causal language modeling.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)``; its "text"
            entry is passed to the tokenizer as-is.

    Returns:
        The tokenizer output for the batch, truncated to ``max_seq_length``.

    Sequences are intentionally NOT padded here. Padding every example to
    ``max_seq_length`` would make each batch as long as the maximum sequence
    length no matter how short the texts are. Padding is left to the data
    collator below, which pads each batch to its longest sequence only.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_seq_length)

# The raw "text" column is dropped so only the tokenizer outputs remain.
tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=2, remove_columns=["text"])

# Collator for causal language modeling; it pads each batch dynamically.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Set to False for causal language modeling (text completion)
)

In [None]:
# Build the trainer
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Optimizer, schedule and precision settings for the run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=num_steps,
    learning_rate=2e-4,
    # Use bf16 when it is supported, otherwise fall back to fp16.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

# The dataset is already tokenized; packing is disabled and the causal-LM
# collator is passed explicitly.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    data_collator=data_collator,
    args=training_args,
)

In [None]:
# Report the GPU in use and the memory reserved before training starts
gpu_stats = torch.cuda.get_device_properties(0)
# Both figures are converted from bytes to GB (1024**3 bytes) and rounded to 3 decimals.
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run the training; the returned stats are kept for the timing report
trainer_stats = trainer.train()
# Push the model and then the tokenizer to the Hub repository named by trained_model
for artifact in (model, tokenizer):
    artifact.push_to_hub(trained_model, token=HF_TOKEN)

In [None]:
# Report peak memory usage and training time
# Peak reserved memory in GB (1024**3 bytes), rounded to 3 decimals.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
# Growth relative to the amount reserved before training started.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
train_runtime = trainer_stats.metrics["train_runtime"]
print(f"{train_runtime} seconds used for training.")
print(f"{round(train_runtime / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
# Disconnect from the Google Colab machine when done
# `runtime` is the module imported from google.colab at the top of the notebook.
# NOTE(review): presumably anything stored only on the runtime is lost after this
# call — confirm the push to the Hub succeeded before running this cell.
runtime.unassign()