# WizardCoder QLoRa Fine-tuning
You need around 40 GB of VRAM to run this notebook. If you don't have that much, try reducing the batch size and/or `max_seq_length`. You can also try reducing the number of epochs, but that will likely result in lower accuracy.

Install dependencies. We are using `transformers` version **4.30.2**; you will have to manually upload the adapter to the hub.

In [1]:
# Install the Python dependencies with %pip so they land in the environment
# of the running kernel.
%pip install sentencepiece
# The requirement must be quoted: unquoted, the shell reads `>` as an output
# redirect, installs an unconstrained bitsandbytes and writes a file named "=0.39.0".
%pip install "bitsandbytes>=0.39.0"
%pip install einops
#%pip install gradio
%pip install scipy
%pip install tensorboardX
# transformers is pinned to a fixed commit.
%pip install git+https://github.com/huggingface/transformers@de9255de27abfcae4a1f816b904915f0b1e23cd9
#%pip install git+https://github.com/huggingface/transformers@main
# NOTE(review): peft, accelerate and trl are installed from their default/main
# branches, so a re-run may pick up different code than the original run.
%pip install -q -U git+https://github.com/huggingface/peft.git
%pip install -q -U git+https://github.com/huggingface/accelerate.git@main
%pip install git+https://github.com/lvwerra/trl.git@main
# -y answers the confirmation prompt, which cannot be answered from a notebook cell.
!apt-get install -y git-lfs
!git lfs install

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Collecting einops
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Load the model with bnb

In [2]:
# Hub id of the base checkpoint that the following cells load and fine-tune.
model_name = 'WizardLM/WizardCoder-15B-V1.0'

In [None]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer # if LlamaModel, use LlamaTokenizer

# Quantisation settings passed to from_pretrained: 4-bit loading with the
# "nf4" quant type and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the base checkpoint with the quantisation config applied;
# device_map='auto' leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    trust_remote_code=True,
    quantization_config=bnb_config,
    #low_cpu_mem_usage=True,
)
# Disable the generation cache for training (QLoRA does the same).
model.config.use_cache = False

# Tokenizer for the same checkpoint (for a Llama model, use LlamaTokenizer instead).
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
#tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # For LlamaTokenizer

# Reuse the EOS id as the padding id in both the model config and the generation config.
model.config.pad_token_id = model.generation_config.pad_token_id = model.config.eos_token_id

Downloading (…)lve/main/config.json:   0%|          | 0.00/997 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/31.0G [00:00<?, ?B/s]

Check the layer names for your model.

In [5]:
# List the qualified name of every nn.Linear submodule, one per line; the last
# component of each name is the kind of value the LoRA config's target_modules lists.
for layer_name, layer in model.named_modules():
    if not isinstance(layer, torch.nn.Linear):
        continue
    print(layer_name)

transformer.h.0.attn.c_attn
transformer.h.0.attn.c_proj
transformer.h.0.mlp.c_fc
transformer.h.0.mlp.c_proj
transformer.h.1.attn.c_attn
transformer.h.1.attn.c_proj
transformer.h.1.mlp.c_fc
transformer.h.1.mlp.c_proj
transformer.h.2.attn.c_attn
transformer.h.2.attn.c_proj
transformer.h.2.mlp.c_fc
transformer.h.2.mlp.c_proj
transformer.h.3.attn.c_attn
transformer.h.3.attn.c_proj
transformer.h.3.mlp.c_fc
transformer.h.3.mlp.c_proj
transformer.h.4.attn.c_attn
transformer.h.4.attn.c_proj
transformer.h.4.mlp.c_fc
transformer.h.4.mlp.c_proj
transformer.h.5.attn.c_attn
transformer.h.5.attn.c_proj
transformer.h.5.mlp.c_fc
transformer.h.5.mlp.c_proj
transformer.h.6.attn.c_attn
transformer.h.6.attn.c_proj
transformer.h.6.mlp.c_fc
transformer.h.6.mlp.c_proj
transformer.h.7.attn.c_attn
transformer.h.7.attn.c_proj
transformer.h.7.mlp.c_fc
transformer.h.7.mlp.c_proj
transformer.h.8.attn.c_attn
transformer.h.8.attn.c_proj
transformer.h.8.mlp.c_fc
transformer.h.8.mlp.c_proj
transformer.h.9.attn.c_attn


Create the LoRA config. According to the QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance.

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA hyperparameters.
# NOTE(review): the QLoRA paper's training setup appears to list dropout 0.1 for
# models up to 13B and 0.05 for 33B/65B; 0.1 is the value used here — confirm
# against the paper before changing it.
lora_dropout = 0.1
lora_r = 32  # adapter rank

# Adapter configuration for a causal LM. target_modules holds the short names of
# the linear layers that receive adapters (the attention and MLP projections).
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=16,
    lora_dropout=lora_dropout,
    bias="none",
    target_modules=["c_proj", "c_attn", "c_fc"],
)

### Load an instruct style Dataset

In [None]:
from datasets import load_dataset

# Hub id of the instruct-style dataset; only its "train" split is loaded.
dataset_name = "richardr1126/spider-skeleton-context-instruct"

# Load and shuffle in one step; the fixed seed keeps the shuffled order repeatable.
sql = load_dataset(dataset_name, split="train").shuffle(seed=77)

### Load the HuggingFace Trainer

In [None]:
from transformers import TrainingArguments

# Where checkpoints are written and how many saved adapters are kept on disk
# (they might take a lot of storage).
output_dir = "./qlora-wizard-coder-adapter"
save_total_limit = 2

# According to QLoRA, (batch * grad_accum) should equal 16.
per_device_train_batch_size = 16
gradient_accumulation_steps = 1

# Training length is given as a fixed number of steps rather than epochs.
max_steps = 1000
warmup_ratio = 0.03
lr_scheduler_type = "linear"  # according to QLoRA

# Optimisation settings.
learning_rate = 2e-4  # found by trial and error
max_grad_norm = 0.3
optim = "paged_adamw_32bit"  # according to QLoRA

# Cadence in steps; save_steps is also how often PeftSavingCallback gets called.
save_steps = 500
logging_steps = 10

training_arguments = TrainingArguments(
    # output / checkpointing
    output_dir=output_dir,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    push_to_hub=False,
    # logging
    logging_steps=logging_steps,
    report_to='tensorboard',
    # batch geometry
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # optimiser and schedule
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    # mixed precision
    fp16=True,
)

Finally, initialize the Trainer

In [None]:
from trl import SFTTrainer
from transformers import TrainerCallback
import os

# Needed for this transformers version
class PeftSavingCallback(TrainerCallback):
    """Trainer callback that stores the model via ``save_pretrained`` on every save event."""

    def on_save(self, args, state, control, **kwargs):
        """Save ``kwargs["model"]`` into the current checkpoint folder.

        The folder is ``<args.output_dir>/checkpoint-<state.global_step>``. After
        saving, a ``pytorch_model.bin`` file in that folder is deleted if present.
        """
        ckpt_dir = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
        kwargs["model"].save_pretrained(ckpt_dir)

        weights_file = "pytorch_model.bin"
        if weights_file not in os.listdir(ckpt_dir):
            return
        os.remove(os.path.join(ckpt_dir, weights_file))

# Name of the dataset field that is passed to SFTTrainer as the text field.
dataset_text_field = "text"
# Max token length for each dataset sequence.
# NOTE(review): whether SFTTrainer pads every sequence to this length or only
# truncates to it likely depends on the installed trl version — confirm.
max_seq_length = 1376

# Wire the quantised model, tokenizer, dataset, LoRA config and training
# arguments together; PeftSavingCallback is registered as a callback.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=sql,
    dataset_text_field=dataset_text_field,
    max_seq_length=max_seq_length,
    callbacks=[PeftSavingCallback],
)

### Train the model

Pre-process the model by upcasting the layer norms to float32 for more stable training.

In [None]:
# Upcast the normalisation layers to float32.
# Matching on the module name alone is not enough: this model's linear layers are
# named like "transformer.h.0.attn.c_attn" (GPT-2 style naming), and in that scheme
# the layer norms are called ln_1 / ln_2 / ln_f, which do not contain "norm".
# Checking the module type catches them; the name check is kept for
# architectures whose custom norm classes are only identifiable by name.
for name, module in trainer.model.named_modules():
    if isinstance(module, torch.nn.LayerNorm) or "norm" in name:
        # nn.Module.to() converts the module's parameters in place.
        module.to(torch.float32)

In [None]:
# Run the fine-tuning loop on the trainer built in the cells above.
trainer.train()

In [None]:
# Ask the trainer to generate a model card for this run
# (presumably written under the training output_dir — confirm).
trainer.create_model_card()

##### Download the checkpoint-XXX folder with your adapter and config JSONs from the `output_dir` to get the model LoRA adapter and config.

```bibtex
@article{dettmers2023qlora,
  title={QLoRA: Efficient Finetuning of Quantized LLMs},
  author={Dettmers, Tim and Pagnoni, Artidoro and Holtzman, Ari and Zettlemoyer, Luke},
  journal={arXiv preprint arXiv:2305.14314},
  year={2023}
}
```

# Merge PEFT and QLoRa
- Provided with help from TheBloke. I have not tested this using a local path to the adapter. I have only tested it using the QLoRa adapter from the hub.
- Need about 40GB of CPU RAM or VRAM to merge 16B models, if you want to use VRAM, you will have to change the `device_map`.

In [None]:
from huggingface_hub import login

import os

# Never paste the access token into the notebook: it would be saved with the file
# and leaked whenever the notebook is shared. Read it from the HF_TOKEN environment
# variable instead; when the variable is unset, token=None makes login() ask for
# the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Inputs for the merge step below.
# Base checkpoint (hub id or local path) that the adapter is merged into.
base_model_hf_or_path = "WizardLM/WizardCoder-15B-V1.0"
# Adapter location: a hub repo, or a local path produced by this notebook such as
# 'qlora-wizard-coder-adapter/checkpoint-<step>'.
qlora_adapter_hf_or_path = "<hf-username>/<adapter-name>"
# Name used both as the local output folder and as the hub repo to push to.
output_dir = "<model-name>"
# True: push the merged model to the hub; False: only save it locally.
push_to_hub = True

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer # if LlamaModel, use LlamaTokenizer
from peft import PeftModel
import torch
import logging
import os

# Timestamped INFO logging so each long-running step of the merge is visible.
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)

# Ensure the output directory exists; exist_ok avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

# Load the un-quantised base model in float16.
logger.info("Loading base model: %s", base_model_hf_or_path)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_hf_or_path,
    return_dict=True,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

# Attach the trained adapter on top of the base model.
logger.info("Loading PEFT: %s", qlora_adapter_hf_or_path)
model = PeftModel.from_pretrained(base_model, qlora_adapter_hf_or_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

# Replace the adapter-wrapped model with the result of merge_and_unload().
logger.info("Running merge_and_unload")
model = model.merge_and_unload()

# The tokenizer is taken from the base model (for a Llama model, use LlamaTokenizer).
tokenizer = AutoTokenizer.from_pretrained(base_model_hf_or_path)

if push_to_hub:
    logger.info("Saving to hub ...")
    model.push_to_hub(output_dir, use_temp_dir=False, max_shard_size='950MB')
    tokenizer.push_to_hub(output_dir, use_temp_dir=False)
else:
    model.save_pretrained(output_dir)
    # torch_dtype describes model weights and has no meaning for tokenizer files,
    # so it is not passed here.
    tokenizer.save_pretrained(output_dir)
    logger.info("Model saved to %s", output_dir)
