# PEFT Finetuning with QLoRA
The following notebook is an example of performing QLoRA fine-tuning on an LLM using an instruction-following dataset. This script produces the same instruction-following adapter as shown in the AMP adapters_prebuilt directory and the CML Job "Job for fine-tuning on Instruction Dataset".

Note: This notebook does not distribute fine-tuning across multiple CML Workers; that requires launching the accelerate CLI with the fine-tuning Python scripts. See the implementation in distributed_peft_scripts for examples.

### Install Dependencies

In [1]:
!pip install -q --no-cache-dir -r requirements.txt

### Load the base model with 4bit quantization

In [2]:
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, PeftModel
from trl import SFTTrainer
import datasets


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/cdsw/.local/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.0
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/cdsw/.local/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so...


  warn(msg)
  warn(msg)
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Tokenizer and 4-bit quantized base model used for fine-tuning
base_model = "bigscience/bloom-1b1"

tokenizer = AutoTokenizer.from_pretrained(base_model)
# Reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token

# bitsandbytes settings: NF4 4-bit weights with double quantization,
# computing in float16
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    quantization_config=bnb_config,
)

### Get Peft Model with LoRA training configuration

In [4]:
# LoRA adapter settings: rank 16, alpha 32, dropout 0.05, no bias training.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    # Only real module names belong here; the former "xxx" entry was a
    # placeholder, not an actual module name, and has been dropped.
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# Attach the LoRA adapter to the quantized base model.
model = get_peft_model(model, lora_config)

### Get and modify dataset

In [5]:
# Train on just the leading slice of the dataset (percentage of the train split)
dataset_fraction = 30
data = datasets.load_dataset(
    "teknium/GPTeacher-General-Instruct",
    split="train[:{}%]".format(dataset_fraction),
)

# Merge function to combine two columns of the dataset to have examples that look like
#<Instruction>: %s
#<Input>: %s
#<Response>: %s
#    or
#<Instruction>: %s
#<Response>: %s
def merge_columns(example):
    """Build the training text for one example and store it under "prediction".

    The text always contains an ``<Instruction>`` line and a ``<Response>``
    line; an ``<Input>`` line is placed between them only when the example's
    "input" field is truthy. The example is updated in place and returned.
    """
    optional_input = example["input"]

    # (label, value) pairs in the order they appear in the final text
    labelled_fields = [("Instruction", example["instruction"])]
    if optional_input:
        labelled_fields.append(("Input", optional_input))
    labelled_fields.append(("Response", example["response"]))

    example["prediction"] = "\n".join("<%s>: %s" % field for field in labelled_fields)
    return example

# Apply merge_columns to the dataset so each example gains a "prediction" text field
finetuning_data = data.map(merge_columns)

Found cached dataset json (/home/cdsw/.cache/huggingface/datasets/teknium___json/teknium--GPTeacher-General-Instruct-3d3eb51407944fd2/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
Loading cached processed dataset at /home/cdsw/.cache/huggingface/datasets/teknium___json/teknium--GPTeacher-General-Instruct-3d3eb51407944fd2/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-d68d4531b7a1dc54.arrow


### Set up SFTTrainer for PEFT fine-tuning

In [6]:
# Training hyperparameters for the fine-tuning run
training_args = TrainingArguments(
    # Output and logging
    output_dir="outputs",
    report_to="tensorboard",
    logging_steps=1,
    disable_tqdm=True,
    # Run length and batching (batch of 1 with 4 gradient-accumulation steps)
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    # NOTE(review): in the recorded run the logged learning rate is 2e-4 from
    # the very first step, so warmup_ratio appears to have no effect with the
    # "constant" scheduler - confirm whether "constant_with_warmup" was intended.
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    # Mixed-precision training
    fp16=True,
)

# Supervised fine-tuning trainer reading the "prediction" text field, with packing enabled
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=lora_config,
    train_dataset=finetuning_data,
    dataset_text_field="prediction",
    packing=True,
    # Causal language-modeling collator (mlm disabled)
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)





### Launch fine-tuning
Fine-tuning takes approximately 14 minutes on a V100 GPU

In [7]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result
trainer.train()

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.5416, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.4226, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.5232, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.2795, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.3107, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.3264, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.2317, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.1416, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.2079, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.3161, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.2311, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.097, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.0517, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.0757, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.0103, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.1402, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.2105, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.1347, 'learning_rate': 0.0002, 'epoch'

TrainOutput(global_step=848, training_loss=1.9399923396841534, metrics={'train_runtime': 845.3739, 'train_samples_per_second': 31.676, 'train_steps_per_second': 7.918, 'train_loss': 1.9399923396841534, 'epoch': 0.13})

### Save adapter
NOTE: SFTTrainer's `save_model()` saves the adapter only

In [8]:
# Write the fine-tuned adapter to the adapters_custom directory (reloaded later for inference)
trainer.save_model("adapters_custom/bloom1b1-lora-instruct-notebook")

### Reset CUDA device for inferencing
Remove the originally loaded quantized model to free up GPU memory, so that the model can then be loaded normally.

In [9]:
import gc

# Drop the notebook's references to the training objects
del trainer, model, tokenizer

# Run garbage collection first, then release PyTorch's cached CUDA memory
gc.collect()
torch.cuda.empty_cache()

### Load base model and tokenizer

In [10]:
# Reload the tokenizer and the base model (no quantization config this time) onto the GPU
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b1")
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-1b1",
    device_map="cuda",
    return_dict=True,
)

### Load adapter for use with the base model

In [11]:
# Wrap the base model with the adapter saved earlier, registered under an explicit adapter name
model = PeftModel.from_pretrained(
    model=model,
    model_id="adapters_custom/bloom1b1-lora-instruct-notebook",
    adapter_name="bloom1b1-lora-instruct-notebook",
)

### Inference

In [12]:
# Example prompt in the same <Instruction>/<Input>/<Response> layout as the training text
prompt = """<Instruction>: Classify the following items into two categories: fruits and vegetables.
<Input>: tomato, apple, cucumber, carrot, banana, zucchini, strawberry, cauliflower
<Response>:"""
# Tokenize into PyTorch tensors and move them to the GPU in a single expression
batch = tokenizer(prompt, return_tensors="pt").to("cuda")

#### Base Model

In [13]:
# Inference with base model only:
# the adapter is disabled for the duration of the generate() call.
with model.disable_adapter():
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch, max_new_tokens=60)

# Keep only the newly generated tokens. The generated sequence starts with the
# prompt tokens, so slicing by token count is reliable, whereas slicing the
# decoded string by len(prompt) breaks whenever decoding does not reproduce
# the prompt text character-for-character.
prompt_token_count = batch["input_ids"].shape[1]
print(tokenizer.decode(output_tokens[0][prompt_token_count:], skip_special_tokens=True))

 green, yellow, red, orange, red, yellow, green, blue, yellow, red, orange, red, yellow, green, blue, yellow, red, orange, red, yellow, green, blue, yellow, red, orange, red, yellow, green, blue, yellow,


#### Fine-tuned adapter

In [14]:
# Inference with fine-tuned adapter:
model.set_adapter("bloom1b1-lora-instruct-notebook")
with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=60)

# Keep only the newly generated tokens. The generated sequence starts with the
# prompt tokens, so slicing by token count is reliable, whereas slicing the
# decoded string by len(prompt) breaks whenever decoding does not reproduce
# the prompt text character-for-character.
prompt_token_count = batch["input_ids"].shape[1]
print(tokenizer.decode(output_tokens[0][prompt_token_count:], skip_special_tokens=True))

 Fruits: Tomato, Apple, Cucumber, Carrot, Banana, Zucchini, Strawberry, Cauliflower. Vegetables: Tomato, Apple, Cucumber, Carrot, Banana, Zucchini, Strawberry, Cauliflower


#### The finetuned adapter output is not perfect, but it is a step closer in the direction of downstream task completion