In [None]:
# https://www.philschmid.de/instruction-tune-llama-2

In [3]:
!pip install "transformers" -q 
!pip install "torch" -q
!pip install "datasets" -q
!pip install "peft" -q
!pip install "sentencepiece" -q
!pip install "fire" -q
!pip install bitsandbytes==0.41.0



In [4]:
from typing import List

import torch
import torch.nn as nn

import transformers
from datasets import load_dataset
from functools import partial

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
    PrefixTuningConfig,
    TaskType
)
from transformers import LlamaForCausalLM, LlamaTokenizer

from transformers import  LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments

False
'CUDASetup' object has no attribute 'cuda_available'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [5]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
base_model = "meta-llama/Llama-2-7b-hf"
device_map = "auto"
n_gpus = torch.cuda.device_count()
max_memory = f'{15920}MB'
seed = 42
print("Number of GPUs: ", n_gpus)

Number of GPUs:  4


In [7]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map=device_map, # dispatch efficiently the model on the available ressources
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(base_model, use_auth_token=True)

# Needed for LLaMA tokenizer
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


In [8]:
for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
      # cast the small parameters (e.g. layernorm) to fp32 for stability
      param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    def forward(self, x): return super().forward(x).to(torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)

model = LlamaForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=True, # Add this for using int8
        torch_dtype=torch.float16,
        device_map=device_map,
    )

tokenizer = LlamaTokenizer.from_pretrained(base_model)
tokenizer.pad_token_id = 0

In [9]:
def tokenize(data):
        source_ids = tokenizer.encode(data['input'])
        target_ids = tokenizer.encode(data['output'])

        input_ids = source_ids + target_ids + [tokenizer.eos_token_id]
        labels = [-100] * len(source_ids) + target_ids + [tokenizer.eos_token_id]

        return {
            "input_ids": input_ids,
            "labels": labels
        }

In [10]:
def format_instruction(sample):
    return f"""### Instruction:
    Use the Input below to create an instruction, which could have been used to generate the input using an LLM.

    ### Input:
    {sample['instruction']}

    ### Response:
    {sample['response']}
    """

In [11]:
# Load the databricks dataset from Hugging Face
from datasets import load_dataset

dataset = load_dataset("databricks/databricks-dolly-15k", split="train")



In [12]:
from random import randrange

print(format_instruction(dataset[randrange(len(dataset))]))

### Instruction:
    Use the Input below to create an instruction, which could have been used to generate the input using an LLM.

    ### Input:
    Given these paragraphs about Large language models, what is an LLM?

    ### Response:
    A large language model (LLM) is a language model consisting of a neural network with many parameters (typically billions of weights or more), trained on large quantities of unlabelled text using self-supervised learning.
    


In [36]:
# model/data params
# data_path: str = "",
# output_dir: str = "",

micro_batch_size: int = 2
gradient_accumulation_steps: int = 4
num_epochs: int = 3
learning_rate: float = 3e-4
val_set_size: int = 2000

# lora hyperparams
lora_r = 8
lora_alpha = 16
lora_dropout= 0.1

In [29]:
type(model)

peft.peft_model.PeftModelForCausalLM

In [30]:
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [31]:
def format_instruction(sample):
    return f"""### Instruction:
    Use the Input below to create an instruction, which could have been used to generate the input using an LLM.

    ### Input:
    {sample['response']}

    ### Response:
    {sample['instruction']}
    """

In [37]:
peft_config = LoraConfig(
      r=lora_r,
      lora_alpha=lora_alpha,
      lora_dropout=lora_dropout,
      bias="none",
      task_type="CAUSAL_LM",
  )
# model = prepare_model_for_int8_training(model) # Add this for using int8
model = get_peft_model(model, peft_config)

In [38]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
print_trainable_parameters(model)

trainable params: 4194304 || all params: 6742609920 || trainable%: 0.06220594176090199


In [39]:
output_dir = "./outputs"

In [41]:
args = TrainingArguments(
    output_dir="llama-7-int4-dolly",
    num_train_epochs=3,
    per_device_train_batch_size=micro_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=False,
    tf32=False,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=False # disable tqdm since with packing values are in correct
)

In [42]:
from trl import SFTTrainer

max_seq_length = 2048 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=format_instruction,
    args=args,
)

In [43]:
# train
trainer.train() # there will not be a progress bar since tqdm is disabled

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument tensors in method wrapper_CUDA_cat)

In [None]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=micro_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=2,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=200,
        save_steps=200,
        output_dir=output_dir,
        save_total_limit=3
    ),
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)

trainer.train()

In [None]:
base_model