In [1]:
import os
import gc

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

import json
import wandb
from tqdm import tqdm



In [2]:
# Print current working directory
# !ls -alh /var/model/Phind-CodeLlama-34B-v2
# Change to /var/model/Phind-CodeLlama-34B-v2
os.chdir( "/var/model/Phind-CodeLlama-34B-v2" )
# Print current working directory
os.getcwd()

'/var/model/Phind-CodeLlama-34B-v2'

In [3]:

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mricardo-felipe-ruiz[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
%env WANDB_PROJECT=Phind-CodeLlama-34B-v2-peft-fine-tuning

env: WANDB_PROJECT=Phind-CodeLlama-34B-v2-peft-fine-tuning


In [6]:
dataset_name = "iamtarun/python_code_instructions_18k_alpaca"
split = "train[:10%]"
# finetunes_model_name = "output/codellama-7b-finetuned-int4-python-18k-alpaca"

In [7]:
from datasets import load_dataset

dataset = load_dataset( dataset_name, split=split )

In [14]:
dataset[ 0 ]

{'instruction': 'Create a function to calculate the sum of a sequence of integers.',
 'input': '[1, 2, 3, 4, 5]',
 'output': '# Python code\ndef sum_sequence(sequence):\n  sum = 0\n  for num in sequence:\n    sum += num\n  return sum',
 'prompt': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a function to calculate the sum of a sequence of integers.\n\n### Input:\n[1, 2, 3, 4, 5]\n\n### Output:\n# Python code\ndef sum_sequence(sequence):\n  sum = 0\n  for num in sequence:\n    sum += num\n  return sum'}

In [None]:
# Delete a row from dataset
# del dataset[ 0 ]

In [9]:
# Use the Task below and the Input given to write the Response, which is a programmatic instruction that can solve the following Task:
def prompt_instruction_format( sample ):
    
  return f"""### Instruction:
    Use the Task below and the Input given to write a Response that can solve the following Task:

    ### Task:
    {sample['instruction']}

    ### Input:
    {sample['input']}

    ### Response:
    {sample['output']}
    """

In [27]:
for line in prompt_instruction_format( dataset[ 0 ] ).split( "\n" ): print( line )

### Instruction:
    Use the Task below and the Input given to write the Response, which is programmatic instruction that can solve the following Task:

    ### Task:
    Create a function to calculate the sum of a sequence of integers.

    ### Input:
    [1, 2, 3, 4, 5]

    ### Response:
    # Python code
def sum_sequence(sequence):
  sum = 0
  for num in sequence:
    sum += num
  return sum
    


In [15]:
def generate_text( foo_tokenizer, model, product, max_new_tokens=128 ):
    
    instruction = f"""### Instruction:
    Use the Task below and the Input given to write the Response, which is programmatic instruction that can solve the following Task:

    ### Task:
    Create a detailed description for the following product

    ### Input:
    {product}

    ### Response:
    """
    
    
    device = "cuda:0"
    inputs = foo_tokenizer( instruction, return_tensors="pt" ).to( device )
    
    generation_output = model.generate(
        input_ids=inputs[ "input_ids" ],
        attention_mask=inputs[ "attention_mask" ],
        max_new_tokens=max_new_tokens
    )
        
    print( "generation_output[ 0 ]:", generation_output[ 0 ], end="\n\n" )
    print( "generation_output[ 0 ].shape:", generation_output[ 0 ].shape, end="\n\n" )
    
    raw_output = foo_tokenizer.decode( generation_output[ 0 ] )
    
    print( "raw_output:", raw_output, end="\n\n" )
    print(  "len( raw_output ):", len( raw_output ), end="\n\n")
    
    response   = raw_output.split( "### Response:" )[ 1 ]
    
    return response

product = "Corelogic Smooth Mouse, belonging to category: Optical Mouse"

# for line in generate_text( tokenizer, base_model, product ).split( "\n" ): print( line )

In [11]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
print( bnb_config )
tokenizer              = AutoTokenizer.from_pretrained( "." )
tokenizer.pad_token    = tokenizer.eos_token
tokenizer.padding_side = "right"

# ¡OJO! Why are All the examples I'm finding online turning off the cache here? It makes a huge performance difference: 21 vs 14 tokens per second!
base_model = AutoModelForCausalLM.from_pretrained(
    ".", quantization_config=bnb_config, device_map="auto", low_cpu_mem_usage=True, use_cache=True, use_flash_attention_2=True
)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


BitsAndBytesConfig {
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
for line in generate_text( tokenizer, base_model, product ).split( "\n" ): print( line )

In [30]:
def ppl_model( model, tokenizer, dataset ):
    nlls = [ ]
    max_length = 4096 #2048
    stride = 512
    for s in tqdm( range( len( dataset[ 'prompt' ] ) ) ):
        encodings = tokenizer( dataset[ 'prompt' ][ s ], return_tensors="pt", truncation=True, max_length=max_length )
        seq_len = encodings.input_ids.size( 1 )
        prev_end_loc = 0
        for begin_loc in range( 0, seq_len, stride ):
            end_loc = min( begin_loc + max_length, seq_len )
            trg_len = end_loc - prev_end_loc
            input_ids = encodings.input_ids[ :, begin_loc:end_loc ].to( "cuda" )
            target_ids = input_ids.clone()
            target_ids[ :, :-trg_len ] = -100
            with torch.no_grad():
                outputs = model( input_ids, labels=target_ids )
                neg_log_likelihood = outputs.loss
            nlls.append( neg_log_likelihood )
            prev_end_loc = end_loc
            if end_loc == seq_len:
                break
    ppl = torch.exp( torch.stack( nlls ).mean() )
    return ppl.item()

# 4.7532477378845215

In [None]:
ppl_model( base_model, tokenizer, dataset )

In [12]:
len( dataset )

1861

In [17]:
# Iterate dataset line by line and find all strings longer than 4096  
count = 0
temp = [ ]
for s in range( len( dataset[ 'prompt' ] ) ):
    
    if len( dataset[ 'prompt' ][ s ] ) > 4096:
        count += 1
        print( len( dataset[ 'prompt' ][ s ] ) )
    else:
        temp.append( dataset[ s ] )
        
print( count )
len( temp )

7976
5733
13286
9491
4192
10394
8083
6716
5412
9311
9028
8722
4850
5961
5733
8359
6524
6241
13248
9596
13128
15029
10664
13881
24


1837

In [20]:
dataset = temp

In [44]:
import gc
del base_model
torch.cuda.empty_cache()
gc.collect()

2802

In [11]:
base_model.hf_device_map

{'model.embed_tokens': 0,
 'model.layers.0': 0,
 'model.layers.1': 0,
 'model.layers.2': 0,
 'model.layers.3': 0,
 'model.layers.4': 0,
 'model.layers.5': 0,
 'model.layers.6': 0,
 'model.layers.7': 0,
 'model.layers.8': 0,
 'model.layers.9': 0,
 'model.layers.10': 0,
 'model.layers.11': 0,
 'model.layers.12': 0,
 'model.layers.13': 0,
 'model.layers.14': 0,
 'model.layers.15': 0,
 'model.layers.16': 0,
 'model.layers.17': 0,
 'model.layers.18': 0,
 'model.layers.19': 0,
 'model.layers.20': 0,
 'model.layers.21': 1,
 'model.layers.22': 1,
 'model.layers.23': 1,
 'model.layers.24': 1,
 'model.layers.25': 1,
 'model.layers.26': 1,
 'model.layers.27': 1,
 'model.layers.28': 1,
 'model.layers.29': 1,
 'model.layers.30': 1,
 'model.layers.31': 1,
 'model.layers.32': 1,
 'model.layers.33': 1,
 'model.layers.34': 1,
 'model.layers.35': 1,
 'model.layers.36': 1,
 'model.layers.37': 1,
 'model.layers.38': 1,
 'model.layers.39': 1,
 'model.layers.40': 1,
 'model.layers.41': 1,
 'model.layers.42'

In [40]:
# for name, param in base_model.named_parameters():
#     print(f"Parameter {name} is on device {param.device}")

## Set up training arguments

In [None]:
# base_model

In [21]:
from peft import LoraConfig, get_peft_config, PeftModel, PeftConfig, get_peft_model, AutoPeftModelForCausalLM

peft_config = LoraConfig(
    r=16, 
    lora_alpha=32, 
    # When target_modules was disabled, it was causing detention layers to be assigned to the CPU, throwing this runtime error:
    # RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! 
    # (when checking argument for argument mat2 in method wrapper_CUDA_mm)
    target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj" ], 
    lora_dropout=0.10, 
    bias="none", 
    task_type="CAUSAL_LM"
)

In [22]:
# Define the training arguments
trainingArgs = TrainingArguments(
    output_dir="./training-results", # Output directory where the model predictions and checkpoints will be stored
    num_train_epochs=1, # Number of training epochs
    per_device_train_batch_size=1, # Batch size per GPU for training
    gradient_accumulation_steps=4,  # Number of update steps to accumulate the gradients for
    gradient_checkpointing=True,# Enable gradient checkpointing
    optim="paged_adamw_32bit", # Optimizer to use
    #save_steps=save_steps,
    logging_steps=5,
    save_strategy="epoch",
    learning_rate=2e-4,
    weight_decay=0.001,
    # fp16=True,
    bf16=False,
    # tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    #max_steps=max_steps,
    group_by_length=False,
    lr_scheduler_type="cosine",
    disable_tqdm=True,
    report_to="wandb",
    seed=42
)
# Create the trainer
trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=4096, #2048,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=prompt_instruction_format,
    args=trainingArgs,
)

In [23]:
def print_trainable_parameters( model ):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params:,} || all params: {all_param:,} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [24]:
print_trainable_parameters( base_model )

trainable params: 108,920,832 || all params: 17,243,447,296 || trainable%: 0.63


In [42]:
# for name, param in base_model.named_parameters():
#     print(f"Parameter {name} is on {param.device}")


In [25]:
trainer.train()

#stop reporting to wandb
wandb.finish()

# save model
trainer.save_model()

print( "Model saved" )

# {'loss': 0.8568, 'learning_rate': 7.142857142857143e-05, 'epoch': 0.01}
# {'loss': 0.6902, 'learning_rate': 0.00014285714285714287, 'epoch': 0.02}
# {'loss': 0.5357, 'learning_rate': 0.00019999750800065415, 'epoch': 0.03}
# {'loss': 0.4925, 'learning_rate': 0.00019991030106398364, 'epoch': 0.04}
# {'train_runtime': 1982.9587, 'train_samples_per_second': 0.926, 'train_steps_per_second': 0.231, 'train_loss': 0.6177679002285004, 'epoch': 0.05}

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.


{'loss': 0.8568, 'learning_rate': 7.142857142857143e-05, 'epoch': 0.01}
{'loss': 0.6902, 'learning_rate': 0.00014285714285714287, 'epoch': 0.02}
{'loss': 0.5357, 'learning_rate': 0.00019999750800065415, 'epoch': 0.03}
{'loss': 0.4925, 'learning_rate': 0.00019991030106398364, 'epoch': 0.04}
{'train_runtime': 1982.9587, 'train_samples_per_second': 0.926, 'train_steps_per_second': 0.231, 'train_loss': 0.6177679002285004, 'epoch': 0.05}


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▃▄▆█
train/global_step,▁▃▅▇█
train/learning_rate,▁▅██
train/loss,█▅▂▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,0.05
train/global_step,24.0
train/learning_rate,0.0002
train/loss,0.4925
train/total_flos,8.00760415565906e+16
train/train_loss,0.61777
train/train_runtime,1982.9587
train/train_samples_per_second,0.926
train/train_steps_per_second,0.231


Model saved


In [19]:
# wandb.finish()

In [None]:
for line in generate_text( tokenizer, base_model, product ).split( "\n" ): print( line )

In [26]:
# Drops 16.4/19.0 GB per GPU down to 3.25 GB per GPU!
import gc
# del base_model
torch.cuda.empty_cache() 
gc.collect()

566

In [51]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
print( bnb_config )
tokenizer              = AutoTokenizer.from_pretrained( "." )
tokenizer.pad_token    = tokenizer.eos_token
tokenizer.padding_side = "right"

# ¡OJO! Why were we turning off the cash here? 
# We're not! It makes a huge performance difference: 21 vs 14 tokens per second!
base_model = AutoModelForCausalLM.from_pretrained(
    ".", quantization_config=bnb_config, device_map="auto", low_cpu_mem_usage=True, use_cache=True, use_flash_attention_2=True
)

BitsAndBytesConfig {
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [52]:
for line in generate_text( tokenizer, base_model, product ).split( "\n" ): print( line )

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


generation_output[ 0 ]: tensor([    1,   835,  2799,  4080, 29901,    13,  1678,  4803,   278,  9330,
         2400,   322,   278, 10567,  2183,   304,  2436,   278, 13291, 29892,
          607,   338,  1824, 29885,  2454, 15278,   393,   508,  4505,   278,
         1494,  9330, 29901,    13,    13,  1678,   835,  9330, 29901,    13,
         1678,  6204,   263, 13173,  6139,   363,   278,  1494,  3234,    13,
           13,  1678,   835, 10567, 29901,    13,  1678,  2994,   295,   468,
          293,  4116,  6983, 25992, 29892, 23329,   304,  7663, 29901, 20693,
          936, 25992,    13,    13,  1678,   835, 13291, 29901,    13,   268,
        29896, 29889, 10969,  4408, 29901,  2994,   295,   468,   293,  4116,
         6983, 25992,    13,   268, 29906, 29889, 10969, 12953, 29901,   450,
         2994,   295,   468,   293,  4116,  6983, 25992,   338,   263,  1880,
        29899, 29567, 27070,  9495,  8688,   304,  3867,   263, 10597,   322,
        18378,  7271,   363,  4160, 2988

In [50]:
# Drops 16.4/19.0 GB per GPU down to 3.25 GB per GPU!
import gc
del adapter_plus_model
torch.cuda.empty_cache() 
gc.collect()

48878

In [53]:
adapter_plus_model = PeftModel.from_pretrained( base_model, "training-results/checkpoint-85", use_flash_attention_2=True )

In [54]:
for line in generate_text( tokenizer, adapter_plus_model, product ).split( "\n" ): print( line )

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


generation_output[ 0 ]: tensor([    1,   835,  2799,  4080, 29901,    13,  1678,  4803,   278,  9330,
         2400,   322,   278, 10567,  2183,   304,  2436,   278, 13291, 29892,
          607,   338,  1824, 29885,  2454, 15278,   393,   508,  4505,   278,
         1494,  9330, 29901,    13,    13,  1678,   835,  9330, 29901,    13,
         1678,  6204,   263, 13173,  6139,   363,   278,  1494,  3234,    13,
           13,  1678,   835, 10567, 29901,    13,  1678,  2994,   295,   468,
          293,  4116,  6983, 25992, 29892, 23329,   304,  7663, 29901, 20693,
          936, 25992,    13,    13,  1678,   835, 13291, 29901,    13,   268,
           13,  1678,   450,  2994,   295,   468,   293,  4116,  6983, 25992,
          338,   263,  1880, 29899, 29567, 27070,  9495,  8688,   363, 18378,
          322, 10597, 10298, 29889,   739,  5680,   263, 12844,  1416,   322,
        11071,  2874, 29892,  3907,   372,  4780,   304,  8677,  2820, 29889,
          450,  9495,   756,   263, 2987