Sam Witeveen
https://www.youtube.com/watch?v=Us5ZFp16PaU
https://colab.research.google.com/drive/14xo6sj4dARk8lXZbOifHEn1f_70qNAwy?usp=sharing

Medium: https://medium.com/@venkata_sai/unleashing-the-potential-of-peft-parameter-efficient-fine-tuning-in-training-large-language-b7a87e8a4eb9

In [1]:
!pip install -q transformers peft datasets bitsandbytes  accelerate loralib

In [2]:
import torch
import torch.nn as nn


def memory_stats():
    print("Units in MB")
    print("Total available memory:",torch.cuda.mem_get_info()[1]/1024**2)
    print("Total free memory:",torch.cuda.mem_get_info()[0]/1024**2)
    print("Total allocated memory:",torch.cuda.memory_allocated()/1024**2)
    print("Total reserved memory:",torch.cuda.memory_reserved()/1024**2)

memory_stats()

Units in MB
Total available memory: 15101.8125
Total free memory: 14998.8125
Total allocated memory: 0.0
Total reserved memory: 0.0


In [4]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
!nvidia-smi -L # nividia system management interface -display a list of all of the NVIDIA GPUs in your system, along with detailed information (L) about each GPU.

In [6]:
import torch
torch.cuda.is_available()

True

In [7]:
torch.cuda.current_device()

0

## Setup the Model

In [10]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0" #tells the CUDA runtime which GPU devices are visible to the current process.

In [11]:
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

In [12]:
model_name_path = "bigscience/bloom-7b1"
model_name_path = 'bigscience/bloom-560m'

cache_dir="/content/model" # where the model will be cached

model = AutoModelForCausalLM.from_pretrained(
    model_name_path,
    load_in_8bit=True,
    device_map='auto',
    cache_dir=cache_dir
)

tokenizer = AutoTokenizer.from_pretrained(model_name_path,cache_dir=cache_dir)

Downloading (…)lve/main/config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

### Model Size

In [13]:
import os

def get_folder_size(folder_path):
  """
  Gets the total size of a folder and its subfolders, in bytes.

  Args:
    folder_path: The path to the folder.

  Returns:
    The total size of the folder and its subfolders, in bytes.
  """

  total_size = 0
  for root, dirs, files in os.walk(folder_path):
    for file in files:
      file_path = os.path.join(root, file)
      file_size = os.path.getsize(file_path)
      print(file_path,round(file_size / 1024 / 1024,2))
      total_size += file_size

      # Convert the size to megabytes
      model_file_size_mb = round(total_size / 1024 / 1024,2)


  return model_file_size_mb

# Get the size of the current directory
current_dir_size = get_folder_size(cache_dir)

# Print the size of the current directory
print(f"Current directory size: {current_dir_size} MB")

/content/model/models--bigscience--bloom-560m/.no_exist/ac2ae5fab2ce3f9f40dc79b5ca9f637430d24971/generation_config.json 0.0
/content/model/models--bigscience--bloom-560m/.no_exist/ac2ae5fab2ce3f9f40dc79b5ca9f637430d24971/adapter_config.json 0.0
/content/model/models--bigscience--bloom-560m/.no_exist/ac2ae5fab2ce3f9f40dc79b5ca9f637430d24971/added_tokens.json 0.0
/content/model/models--bigscience--bloom-560m/refs/main 0.0
/content/model/models--bigscience--bloom-560m/blobs/a8702498162c95d68d2724e7f333c83d7be08de81cfc091455c38730682116d3 1066.65
/content/model/models--bigscience--bloom-560m/blobs/a9f31df161b949147c63449ead0bd4e5fc70770d 0.0
/content/model/models--bigscience--bloom-560m/blobs/25bc39604f72700b3b8e10bd69bb2f227157edd1 0.0
/content/model/models--bigscience--bloom-560m/blobs/e7016b49fcff7e162946ec012d3c7b4db0b66d87 0.0
/content/model/models--bigscience--bloom-560m/blobs/3fa39cd4b1500feb205bcce3b9703a4373414cafe4970e0657b413f7ddd2a9d3 13.83
/content/model/models--bigscience--bl

In [14]:
!pip install torchinfo
from torchinfo import summary

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [15]:
summary(model)

Layer (type:depth-idx)                        Param #
BloomForCausalLM                              --
├─BloomModel: 1-1                             --
│    └─Embedding: 2-1                         256,901,120
│    └─LayerNorm: 2-2                         2,048
│    └─ModuleList: 2-3                        --
│    │    └─BloomBlock: 3-1                   12,596,224
│    │    └─BloomBlock: 3-2                   12,596,224
│    │    └─BloomBlock: 3-3                   12,596,224
│    │    └─BloomBlock: 3-4                   12,596,224
│    │    └─BloomBlock: 3-5                   12,596,224
│    │    └─BloomBlock: 3-6                   12,596,224
│    │    └─BloomBlock: 3-7                   12,596,224
│    │    └─BloomBlock: 3-8                   12,596,224
│    │    └─BloomBlock: 3-9                   12,596,224
│    │    └─BloomBlock: 3-10                  12,596,224
│    │    └─BloomBlock: 3-11                  12,596,224
│    │    └─BloomBlock: 3-12                  12,596,224
│    

In [None]:
# for param in model.parameters():
#   print(param.requires_grad)

### Freezing the original weights

Option 1: No custom *changes*

In [16]:
import peft
from peft import LoraConfig, get_peft_model

In [17]:
tokenizer

BloomTokenizerFast(name_or_path='bigscience/bloom-560m', vocab_size=250680, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [18]:
model

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1024)
    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear8bitLt(in_features=1024, out_features=3072, bias=True)
          (dense): Linear8bitLt(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear8bitLt(in_features=1024, out_features=4096, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear8bitLt(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementw

In [19]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=['query_key_value'],
    lora_dropout=0.05,
    bias = "none",
    task_type="CAUSAL_LM"
    )

In [20]:
peft_model = get_peft_model(model,lora_config)
peft_model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 560,787,456 || trainable%: 0.2804741766549072


In [21]:
summary(peft_model)

Layer (type:depth-idx)                                            Param #
PeftModelForCausalLM                                              --
├─LoraModel: 1-1                                                  --
│    └─BloomForCausalLM: 2-1                                      --
│    │    └─BloomModel: 3-1                                       560,787,456
│    │    └─Linear: 3-2                                           (256,901,120)
Total params: 817,688,576
Trainable params: 1,572,864
Non-trainable params: 816,115,712

Option 2 : Custom *changes*

In [47]:
import peft
from peft import LoraConfig, get_peft_model

In [48]:
for param in model.parameters():
  param.requires_grad = False # freeze the model
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to FP32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self,x):
    return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)


### Setting up the LoRa Adapters

In [49]:
from peft import LoraConfig, get_peft_model

lora_config2 = LoraConfig(
    r=16, #attention heads
    lora_alpha=32, #alpha scaling
    target_modules=['query_key_value'],
    # target_modules=["q_proj", "v_proj"], #if you know the
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)


In [50]:
peft_model2 = get_peft_model(model,lora_config2)

In [51]:
peft_model2.print_trainable_parameters()

trainable params: 1,572,864 || all params: 560,787,456 || trainable%: 0.2804741766549072


### Data

In [52]:
import transformers
from datasets import load_dataset
data = load_dataset("Abirate/english_quotes")

In [53]:
data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 2508
    })
})

In [54]:
data['train'][0]

{'quote': '“Be yourself; everyone else is already taken.”',
 'author': 'Oscar Wilde',
 'tags': ['be-yourself',
  'gilbert-perreira',
  'honesty',
  'inspirational',
  'misattributed-oscar-wilde',
  'quote-investigator']}

In [55]:
def prompt_template(example):
    # print(example)
    prompt_start = "Create the tags for below quote: \n"
    prompt_end = "\n\n Tags: \n"
    # Convert the list of tags to a string
    tags_str = ', '.join(example['tags'])

    example['prompted_input'] = prompt_start + example['quote'] + prompt_end + tags_str

    return example

In [56]:
print(prompt_template(data['train'][0]))

{'quote': '“Be yourself; everyone else is already taken.”', 'author': 'Oscar Wilde', 'tags': ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator'], 'prompted_input': 'Create the tags for below quote: \n“Be yourself; everyone else is already taken.”\n\n Tags: \nbe-yourself, gilbert-perreira, honesty, inspirational, misattributed-oscar-wilde, quote-investigator'}


In [57]:
data = data.map(prompt_template,batched=False) # batched = false as we are doing row-wise operation
data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'prompted_input'],
        num_rows: 2508
    })
})

In [58]:
print(data['train']['prompted_input'][200])

Create the tags for below quote: 
“The fear of death follows from the fear of life. A man who lives fully is prepared to die at any time.”

 Tags: 
death, life


#### Applying tokenizers

In [59]:
data = data.map(lambda x:tokenizer(x['prompted_input']),batched=True)

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [60]:
data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'prompted_input', 'input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

In [61]:
print(f"Shapes of the datasets:")
print(f"Training: {data['train'].shape}")

Shapes of the datasets:
Training: (2508, 6)


### Fine-Tune the Model with the Preprocessed Dataset

Now utilize the built-in Hugging Face `Trainer` class (see the documentation [here](https://huggingface.co/docs/transformers/main_classes/trainer)). Pass the preprocessed dataset with reference to the original model. Other training parameters are found experimentally and there is no need to go into details about those at the moment.

In [62]:
from transformers import TrainingArguments, Trainer

#### Free up memory

In [63]:
memory_stats()

Units in MB
Total available memory: 15101.8125
Total free memory: 12394.8125
Total allocated memory: 833.41650390625
Total reserved memory: 2566.0


In [64]:
import torch
torch.cuda.empty_cache()

In [65]:
memory_stats()

Units in MB
Total available memory: 15101.8125
Total free memory: 14054.8125
Total allocated memory: 833.41650390625
Total reserved memory: 906.0


In [66]:
import time
output_dir =  f'./training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps= 4,
    learning_rate=1e-5,
    warmup_steps= 1000,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16=True,
    logging_steps=10,
    max_steps=50

    )

In [67]:
memory_stats()

Units in MB
Total available memory: 15101.8125
Total free memory: 14054.8125
Total allocated memory: 833.41650390625
Total reserved memory: 906.0


In [68]:
trainer = Trainer(
    model=peft_model2,
    args=training_args,
    train_dataset=data['train'],
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False,return_tensors='pt')
)

In [69]:
memory_stats()

Units in MB
Total available memory: 15101.8125
Total free memory: 14054.8125
Total allocated memory: 833.41650390625
Total reserved memory: 906.0


In [72]:
trainer.train()

Step,Training Loss
10,4.1186
20,4.1053
30,4.1644
40,4.2212
50,4.1062


TrainOutput(global_step=50, training_loss=4.143156585693359, metrics={'train_runtime': 118.437, 'train_samples_per_second': 6.755, 'train_steps_per_second': 0.422, 'total_flos': 174469654315008.0, 'train_loss': 4.143156585693359, 'epoch': 0.32})

In [74]:
memory_stats()

Units in MB
Total available memory: 15101.8125
Total free memory: 260.8125
Total allocated memory: 864.4267578125
Total reserved memory: 14690.0


## Share adapters on the 🤗 Hub

In [77]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [92]:
model_hub_path = "bond001/bloom-560m-LORA-v1"

In [91]:
trainer.push_to_hub("bond001/bloom-560m-LORA-v1")

'https://huggingface.co/bond001/training-1698155518/tree/main/'

## Save the model

In [84]:
peft_model_path="./peft-dialogue-summary-checkpoint-local"

trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('./peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local/tokenizer.json')

### Check the peft model size

The -alh flags tell the ls command to display the following information:

-a: Show all files, including hidden files.

-l: Display the file in long format, which includes the file mode, owner, group, size, last modified date and time, and filename.

-h: Display the file size in human-readable format.

In [89]:
!ls -alh peft-dialogue-summary-checkpoint-local/adapter_model.bin

## Load adapters from the Hub

In [14]:
cache_dir="/content/model" # where the model will be cached

In [8]:
!pip install -q peft transformers datasets accelerate bitsandbytes

In [10]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
peft_model_id = "bond001/training-1698155518"
config = PeftConfig.from_pretrained(peft_model_id)

In [11]:
config.base_model_name_or_path

'bigscience/bloom-560m'

In [59]:
cache_dir  =''
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path, #changed from earlier
    load_in_8bit=True,
    device_map='auto',
    cache_dir=cache_dir,
    return_dict = True #??
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path,
                                          cache_dir=cache_dir,
                                          return_dict = True #??
                                          )

# Load the Lora model
peft_model3 = PeftModel.from_pretrained(base_model, peft_model_id)

In [60]:
peft_model3.print_trainable_parameters()

In [61]:
peft_model3

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BloomForCausalLM(
      (transformer): BloomModel(
        (word_embeddings): Embedding(250880, 1024)
        (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (h): ModuleList(
          (0-23): 24 x BloomBlock(
            (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (self_attention): BloomAttention(
              (query_key_value): Linear8bitLt(
                in_features=1024, out_features=3072, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): Parame

### Evaluate the Model Qualitatively (Human Evaluation)


In [62]:
import transformers
from datasets import load_dataset
data = load_dataset("Abirate/english_quotes")

In [63]:
data['train'][0]

{'quote': '“Be yourself; everyone else is already taken.”',
 'author': 'Oscar Wilde',
 'tags': ['be-yourself',
  'gilbert-perreira',
  'honesty',
  'inspirational',
  'misattributed-oscar-wilde',
  'quote-investigator']}

In [84]:
index = 1
input = data['train'][index]['quote']
label = data['train'][index]['tags']
print(input,'\n\n',label)

In [85]:
input

"“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.”"

In [86]:
label

['best', 'life', 'love', 'mistakes', 'out-of-control', 'truth', 'worst']

In [87]:
# Convert the input into prompt
def prompt_template_inf(example):
    # print(example)
    prompt_start = "Create the tags for below quote: \n"
    prompt_end = "\n\n Tags: \n"

    input_inf = prompt_start + example + prompt_end

    return input_inf

In [88]:
prompt_template_inf(input)

"Create the tags for below quote: \n“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.”\n\n Tags: \n"

In [93]:
input_ids = tokenizer(prompt_template_inf(input), return_tensors="pt").to('cuda')

In [94]:
input_ids

{'input_ids': tensor([[ 17132,    368,  42335,    613,  12312,  71824,     29,   5306,   1502,
          10203, 239002,     15, 136192,   1049,    530,    267,  10512,   3131,
         133716,     17,    473,   5219, 120496,     15,    473,    912,   1800,
            461,   5048,    530,    919,  11866,  12587,    427,  21053,     17,
           7702,   1320,   1152,  11229,  21053,   1074,    919,   2670,  69583,
             15,   3816,   1152,  11097,    661,  62798,   5926, 158808,   1074,
            919,   2670,   7733,     17,    982,    603,  43367,   6633,   5306]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [95]:
type(input_ids)

transformers.tokenization_utils_base.BatchEncoding

#### Inference part I

In [96]:
with torch.cuda.amp.autocast():
  output_tokens = peft_model3.generate(**input_ids, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))



In [97]:
tokenizer.decode(output_tokens[0])

'Create the tags for below quote: \n“I\'m selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can\'t handle me at my worst, then you sure as hell don\'t deserve me at my best.”\n\n Tags: \n"I\'m selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can\'t handle me at my worst, then you sure as hell don\'t deserve me at my best.'

In [98]:
with torch.cuda.amp.autocast():
  output_tokens2 = base_model.generate(**input_ids, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens2[0], skip_special_tokens=True))

In [99]:
tokenizer.decode(output_tokens2[0])

'Create the tags for below quote: \n“I\'m selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can\'t handle me at my worst, then you sure as hell don\'t deserve me at my best.”\n\n Tags: \n"I\'m selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can\'t handle me at my worst, then you sure as hell don\'t deserve me at my best.'

In [None]:
original_model_outputs = base_model.generate(input_ids=input_ids,
                                             generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)