In [None]:
%%capture
# !pip install transformers bitsandbytes datasets sentencepiece accelerate trl peft

In [1]:
# --- Model / data ---------------------------------------------------------
model_name = "mistralai/Mistral-7B-v0.1"
max_seq_length = 2048          # passed to the tokenizer and to SFTTrainer

# --- Batching -------------------------------------------------------------
# Samples per optimizer step = batch_size * gradient_accumulation_steps.
batch_size = 4
gradient_accumulation_steps = 4

# --- Optimisation ---------------------------------------------------------
optimizer = "adamw_8bit"
learning_rate = 2e-4
weight_decay = 0.01
lr_scheduler_type = "linear"
warmup_steps = 10
max_steps = 2 * 120            # total optimizer steps

# --- Memory / reproducibility --------------------------------------------
use_gradient_checkpointing = True
random_state = 3407            # seed shared by set_seed and TrainingArguments

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Access token forwarded to both from_pretrained calls below.
# Keep it None for public checkpoints; read it from the environment rather
# than hard-coding a real token in the notebook.
token = None
# Use bfloat16 when the GPU reports support for it, otherwise float16.
HAS_BFLOAT16 = torch.cuda.is_bf16_supported()
dtype = torch.bfloat16 if HAS_BFLOAT16 else torch.float16
load_in_4bit = True

# 4-bit NF4 quantization with double quantization; compute dtype is `dtype`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit              = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type       = "nf4",
    bnb_4bit_compute_dtype    = dtype,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map = "auto",
    torch_dtype = dtype,
    # Only quantize when requested; None loads the weights in `dtype`.
    quantization_config = bnb_config if load_in_4bit else None,
    token = token,  # was hard-coded to None, which silently ignored `token`
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length = max_seq_length,
    padding_side = "right",
    token = token,  # was hard-coded to None, which silently ignored `token`
)
# Reuse the unknown token as the padding token, and mirror that choice in the
# model config so the model and tokenizer agree on the pad id.
tokenizer.add_special_tokens({"pad_token" : tokenizer.unk_token})
tokenizer.pad_token = tokenizer.unk_token
# config.update() mutates in place and returns None, so assigning its result
# left `config` bound to None. Update first, then bind the config object.
model.config.update({"pad_token_id" : tokenizer.unk_token_id})
config = model.config

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, TaskType, get_peft_model

from transformers import set_seed as transformers_set_seed
# Seed before the adapters are created: LoRA weights are initialised at
# get_peft_model() time, so the seed must already be set by then.
transformers_set_seed(random_state)

# Projection layers that receive a LoRA adapter.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_config = LoraConfig(
    task_type      = TaskType.CAUSAL_LM,
    r              = 16,
    lora_alpha     = 16,
    lora_dropout   = 0,
    bias           = "none",
    target_modules = lora_target_modules,
)

if not load_in_4bit:
    # Unquantized path: only gradient checkpointing needs switching on.
    if use_gradient_checkpointing:
        model.gradient_checkpointing_enable()
else:
    # Quantized path: the PEFT helper prepares the k-bit model and takes the
    # gradient-checkpointing switch itself.
    model = prepare_model_for_kbit_training(
        model,
        use_gradient_checkpointing = use_gradient_checkpointing,
    )

model = get_peft_model(model, lora_config)

In [4]:
#@title Slim Orca data prep
from datasets import load_dataset
# Load the "train" split of Open-Orca/SlimOrca; the formatter below reads its
# "conversations" column.
dataset = load_dataset("Open-Orca/SlimOrca", split = "train")
def formatting_prompts_func(examples):
    """Flatten a batch of conversations into single training strings.

    Args:
        examples: batch mapping with a "conversations" entry; each
            conversation is a sequence of turns, and each turn is a mapping
            with a "from" role ("system", "human" or "gpt") and a "value".

    Returns:
        ``{"text": [...]}`` with one string per conversation. Every turn is
        rendered as ``"<PREFIX> <value><suffix>"``.

    Raises:
        KeyError: if a turn's role is not one of the three known roles.
    """
    role_prefix = {"system" : "SYSTEM:", "human" : "USER:", "gpt" : "ASSISTANT:"}
    role_suffix = {"system" : "\n\n", "human" : "\n", "gpt" : "</s>\n"}

    rendered = []
    for conversation in examples["conversations"]:
        pieces = []
        for turn in conversation:
            role = turn["from"]
            pieces.append(f"{role_prefix[role]} {turn['value']}{role_suffix[role]}")
        rendered.append("".join(pieces))
    return {"text": rendered}
pass  # no-op at module level; looks like a leftover end-of-function marker
# Run the formatter over the dataset in batches; it returns a "text" list per
# batch, which is the field the trainer is later pointed at.
dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/986M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/517982 [00:00<?, ? examples/s]

In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments
from transformers.utils import logging
# Raise transformers' log level to INFO for the cells that follow.
logging.set_verbosity_info()

# Hyperparameters come from the config cell; mixed precision follows the
# bfloat16 capability detected when the model was loaded.
training_args = TrainingArguments(
    output_dir = "outputs",
    seed = random_state,
    max_steps = max_steps,
    per_device_train_batch_size = batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    optim = optimizer,
    learning_rate = learning_rate,
    weight_decay = weight_decay,
    lr_scheduler_type = lr_scheduler_type,
    warmup_steps = warmup_steps,
    bf16 = HAS_BFLOAT16,
    fp16 = not HAS_BFLOAT16,
    logging_steps = 1,
)

# Supervised fine-tuning on the pre-rendered "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    args = training_args,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Map:   0%|          | 0/517982 [00:00<?, ? examples/s]

You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


In [6]:
# Baseline GPU figures, recorded before training so the later report can
# subtract them from the peak.
bytes_per_gib = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090. Max memory = 23.691 GB.
2.414 GB of memory reserved.


In [7]:
# Run the fine-tuning loop. The result is kept because the reporting cells
# below read trainer_stats.metrics['train_runtime'] from it.
trainer_stats = trainer.train()

***** Running training *****
  Num examples = 517,982
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 240
  Number of trainable parameters = 41,943,040
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,1.8623
2,1.6874
3,1.6828
4,1.6134
5,1.5763
6,1.6429
7,1.4487
8,1.5563
9,1.5567
10,1.5617




Training completed. Do not forget to share your model on huggingface.co/models =)




In [8]:
# Peak reserved memory after training, in GiB, and the share of it that was
# reserved on top of the pre-training baseline.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Same figures as a percentage of the card's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_runtime = trainer_stats.metrics['train_runtime']
report_lines = [
    f"{train_runtime} seconds used for training.",
    f"{round(train_runtime/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
for report_line in report_lines:
    print(report_line)

3740.7986 seconds used for training.
62.35 minutes used for training.
Peak reserved memory = 20.754 GB.
Peak reserved memory for training = 18.34 GB.
Peak reserved memory % of max memory = 87.603 %.
Peak reserved memory for training % of max memory = 77.413 %.


In [9]:
# Print the training runtime (seconds, then minutes rounded to 2 decimals)
# taken from the trainer's metrics.
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
# Shell escape: show the driver-level view of GPU memory and utilisation.
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


3740.7986 seconds used for training.
62.35 minutes used for training.
Sat Feb 10 17:34:51 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:00:10.0 Off |                  N/A |
| 68%   75C    P2             287W / 350W |  20254MiB / 24576MiB |     41%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------