## QLoRA vs OpenGPT on Colab

⚠ VERY EXPERIMENTAL ⚠

Initial attempts to train an LLM using QLoRA on a free T4 GPU on Colab along the lines of the OpenGPT approach.

Borrowing heavily from and combining the following:
- https://github.com/CogStack/OpenGPT
- https://github.com/artidoro/qlora

Major thanks to the developers of both.


In [None]:
# **Make sure the runtime type is set to GPU: QLoRA currently doesn't work without one**

In [None]:
# Install all the things!
# Quantization backend plus the Hugging Face stack; transformers, peft and
# accelerate are installed from their GitHub repositories rather than PyPI.
! pip install -U bitsandbytes
! pip install -U git+https://github.com/huggingface/transformers.git
! pip install -U git+https://github.com/huggingface/peft.git
! pip install -U git+https://github.com/huggingface/accelerate.git

# OpenGPT supplies the config, dataset and collator helpers imported further down.
! pip install -U opengpt

! pip install einops xformers

# Tokenizer dependencies; protobuf is pinned to 3.20.3.
! pip install -U tokenizers
! pip install -U protobuf==3.20.3
! pip install -U sentencepiece

import os

# Send signal 9 to this very process. This deliberately kills the running
# kernel; presumably so the runtime restarts and picks up the freshly
# installed packages. Continue from the next cell afterwards.
os.kill(os.getpid(), 9)

In [None]:
# Once packages installed - run from here

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    pipeline,
    BitsAndBytesConfig,
    LlamaTokenizer,
    DataCollatorForLanguageModeling,
)

import datasets


from opengpt.config import Config
from opengpt.model_utils import add_tokens_to_model_and_tokenizer
from opengpt.dataset_utils import create_labels, pack_examples
from opengpt.data_collator import DataCollatorWithPadding

import torch

from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model

In [None]:
# Show the name of the current CUDA device as a check that a GPU is attached.
print(torch.cuda.get_device_name())

In [None]:
# Download the configs and data from OpenGPT repo https://github.com/CogStack/OpenGPT
# The first three CSVs are the ones later assigned to config.train.datasets.
# The example training config is loaded in the next cell. The example-project
# CSV is downloaded but not referenced by the dataset list in this notebook.

! wget https://raw.githubusercontent.com/CogStack/OpenGPT/main/data/nhs_uk_full/prepared_generated_data_for_nhs_uk_qa.csv
! wget https://raw.githubusercontent.com/CogStack/OpenGPT/main/data/nhs_uk_full/prepared_generated_data_for_nhs_uk_conversations.csv
! wget https://raw.githubusercontent.com/CogStack/OpenGPT/main/data/medical_tasks_gpt4/prepared_generated_data_for_medical_tasks.csv
! wget https://raw.githubusercontent.com/CogStack/OpenGPT/main/configs/example_train_config.yaml
! wget https://raw.githubusercontent.com/CogStack/OpenGPT/main/data/example_project_data/prepared_generated_data_for_example_project.csv

In [None]:
# Load the config - we actually don't use lots of it
# The fields this notebook reads are: config.special_tokens.user / .ai,
# config.train.shuffle_dataset, config.train.max_seq_len,
# config.train.packing_type and config.train.ignore_index.
# config.train.datasets is overwritten later with the downloaded CSV paths.
config = Config(yaml_path="./example_train_config.yaml")

# This config can be used as a template
# (last expression of the cell, so the notebook displays the resulting dict)
config.train.to_dict()

In [None]:
# Identifier of the base model; passed to both the tokenizer and the model
# `from_pretrained` calls in the next cell.
model_id = "decapoda-research/llama-13b-hf"

In [None]:
# Load the model and tokenizer
# Quantization settings: 4-bit loading, "nf4" quantization type, double
# quantization enabled, and bfloat16 as the compute dtype.
# NOTE(review): the Trainer below is configured with fp16=True while the
# compute dtype here is bfloat16 - confirm this combination is intended for
# the T4 GPU this notebook targets.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# NOTE(review): use_fast=True is passed to the concrete LlamaTokenizer class
# rather than AutoTokenizer - confirm the flag has any effect here.
tokenizer = LlamaTokenizer.from_pretrained(model_id, use_fast=True)
# device_map maps the root module ("") to device 0.
# NOTE(review): trust_remote_code=True permits executing code shipped with
# the model repository - confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
)

In [None]:
# Hard-coded to 1024 instead of taking config.train.max_seq_len.
# NOTE(review): packing and the data collator further down still use
# config.train.max_seq_len, so the two limits may differ - confirm.
tokenizer.model_max_length = 1024  # config.train.max_seq_len
# The table in this blog post is the stated basis for choosing 1024:
# https://huggingface.co/blog/4bit-transformers-bitsandbytes#what-other-consequences-are-there

# Register the special tokens from the config on both tokenizer and model
# (OpenGPT helper; presumably also resizes the embeddings - verify).
add_tokens_to_model_and_tokenizer(config, tokenizer, model)

# Replace the dataset list from the YAML with the three CSVs downloaded above.
config.train.datasets = [
    "./prepared_generated_data_for_nhs_uk_qa.csv",
    "./prepared_generated_data_for_nhs_uk_conversations.csv",
    "./prepared_generated_data_for_medical_tasks.csv",
]

# Load datasets and shuffle if needed
# All CSV paths are passed to a single from_csv call, giving one Dataset.
train_dataset = datasets.Dataset.from_csv(config.train.datasets)
if config.train.shuffle_dataset:
    # No seed is passed to shuffle() here.
    train_dataset = train_dataset.shuffle()
    print("Shuffling dataset!")

# Last expression of the cell: display the dataset summary.
train_dataset

In [None]:
# Display the raw "text" field of the first training example.
train_dataset[0]["text"]

In [None]:
# Keep only the "text" column: copy the column names, take "text" out of the
# copy, and remove every remaining column from the dataset.
# list.remove raises ValueError if there is no "text" column.
to_remove = list(train_dataset.column_names)
to_remove.remove("text")
train_dataset = train_dataset.remove_columns(to_remove)

In [None]:
# Display the first example again to confirm only "text" is left.
train_dataset[0]

In [None]:
# Minor fix to create_labels (was required locally at least)
def create_labels(examples, config, tokenizer):
    r"""Add a ``labels`` column so that only AI-written tokens are trained on.

    This is used with a prepared HF dataset that is already tokenized. It is a
    notebook-local replacement for the ``create_labels`` imported from
    ``opengpt.dataset_utils``.

    Each sequence is scanned left to right with an "ignore" flag that starts
    out True. Seeing the user special token turns the flag on, seeing the AI
    special token turns it off. The flag is updated *before* the current token
    is labelled, so the AI token itself is kept as a label while the user
    token itself is ignored. Tokens before the first special token are ignored.

    Args:
        examples: Batch mapping with an ``"input_ids"`` entry holding one
            sequence of token ids per example. It is modified in place.
        config: Object exposing ``special_tokens.user``, ``special_tokens.ai``
            and ``train.ignore_index``.
        tokenizer: Object whose ``get_vocab()`` returns a token -> id mapping
            that contains both special tokens.

    Returns:
        The same ``examples`` object, with ``examples["labels"]`` set to one
        list per input sequence: ``config.train.ignore_index`` for ignored
        positions and the token id itself otherwise.

    Raises:
        KeyError: If either special token is missing from the vocabulary.
    """
    # Build the vocabulary mapping once and reuse it for both lookups.
    vocab = tokenizer.get_vocab()
    user_token_id = vocab[config.special_tokens.user]
    ai_token_id = vocab[config.special_tokens.ai]
    ignore_index = config.train.ignore_index

    # Everything written by an AI will be used for training, and everything
    # written by a user will be ignored.
    examples["labels"] = []
    for input_ids in examples["input_ids"]:
        labels = []
        ignore = True
        for tkn_id in input_ids:
            if tkn_id == user_token_id:
                ignore = True
            elif tkn_id == ai_token_id:
                ignore = False

            labels.append(ignore_index if ignore else tkn_id)
        examples["labels"].append(labels)
    return examples

In [None]:
# Ignore max_seq_len warning, it is handled by the packer or data_collator
# Tokenize the "text" column in batches without adding special tokens
# (add_special_tokens=False); the prepared text already contains markers such
# as <|user|> / <|ai|> / <|eos|>. The raw "text" column is dropped afterwards,
# leaving only the tokenizer outputs.
train_dataset = train_dataset.map(
    lambda examples: tokenizer(examples["text"], add_special_tokens=False),
    batched=True,
    num_proc=1,
    remove_columns=["text"],
)

In [None]:
# Create labels for supervised training (meaning we do not train on questions, but only on answers)

# Llama Temp Fix in create_labels function
# (the two lookups below are the ones performed inside the notebook-local
# create_labels defined earlier, which shadows the one imported from opengpt)
# user_token_id = tokenizer.get_vocab()[config.special_tokens.user]
# ai_token_id = tokenizer.get_vocab()[config.special_tokens.ai]

# Adds a "labels" column, processing 1,000 examples per batch in one process.
train_dataset = train_dataset.map(
    lambda examples: create_labels(examples, config, tokenizer),
    batched=True,
    batch_size=1_000,
    num_proc=1,
)

In [None]:
# We only do packing for the train set
# pack_examples is the OpenGPT helper imported at the top; it is given the
# maximum sequence length and packing type from the config. Presumably it
# merges/splits examples into sequences of up to max_seq_len tokens - verify
# against opengpt.dataset_utils.
train_dataset = train_dataset.map(
    lambda examples: pack_examples(
        examples, config.train.max_seq_len, packing_type=config.train.packing_type
    ),
    batched=True,
    batch_size=1_000,
    num_proc=1,
)

In [None]:
# Check the new train_dataset (take note of how the labels look).
# The USER (Question) part of the input should have a label of -100,
# and the AI part (Answer) should have labels equal to input_ids
#
# The first example is fetched once instead of on every access, and at most
# its first 50 positions are printed. Slicing plus zip means an example
# shorter than 50 tokens prints fewer rows rather than raising IndexError.
first_example = train_dataset[0]
for input_id, label, mask in zip(
    first_example["input_ids"][:50],
    first_example["labels"][:50],
    first_example["attention_mask"][:50],
):
    print(input_id, label, mask)

In [None]:
# PEFT apply
# Turn on gradient checkpointing, then pass the quantized model through
# PEFT's k-bit training preparation helper before LoRA adapters are attached.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, sums ``numel()`` over all parameters
    and over those with ``requires_grad`` set, then prints both totals and the
    trainable share as a percentage. Nothing is returned.
    """
    total_count = 0
    trainable_count = 0
    for _name, tensor in model.named_parameters():
        size = tensor.numel()
        total_count += size
        if tensor.requires_grad:
            trainable_count += size

    percentage = 100 * trainable_count / total_count
    print(
        f"trainable params: {trainable_count} || "
        f"all params: {total_count} || trainable %: "
        f"{percentage}"
    )

In [None]:
# The bound method is evaluated but not called, so the notebook just shows its
# repr; presumably used to look up module names for target_modules below.
model.modules

In [None]:
# LoRA adapter settings: rank 8, alpha 32, dropout 0.05, no bias terms,
# causal language modelling task. Adapters target the four projection modules
# named below.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",  # Should we include this?
    ],  # ["query_key_value"],  # ["k_proj", "v_proj", "q_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the prepared model with the adapters and report the trainable share.
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

In [None]:
# needed for gpt-neo-x tokenizer in demo - do we need this for Llama?
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# OpenGPT's DataCollatorWithPadding (imported from opengpt.data_collator),
# given the pad token id, the label ignore index and the configured maximum
# sequence length.
dc = DataCollatorWithPadding(
    tokenizer.pad_token_id,
    config.train.ignore_index,
    max_seq_len=config.train.max_seq_len,
)

# Training is bounded by max_steps (1,000) rather than by epochs, with a
# per-device batch size of 1 and no gradient accumulation. Loss is logged at
# every step. The save_* options are commented out, so checkpointing follows
# the TrainingArguments defaults.
# NOTE(review): fp16=True here while the quantization compute dtype set when
# loading the model is bfloat16 - confirm this is intended.
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        # warmup_steps=2,
        max_steps=1_000,
        learning_rate=1e-4,  # 3e-4,  # 1.2e-4,  # 2e-4,
        # lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        logging_strategy="steps",
        logging_steps=1,
        # save_strategy="steps",
        # save_steps=250,
        seed=11,
        # num_train_epochs=1,
        fp16=True,
        max_grad_norm=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        # load_best_model_at_end=True,
    ),
    data_collator=dc,  # DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

## Eval on some OpenGPT examples

In [None]:
def responder(t, *, max_length=128, temperature=0.2, do_sample=True):
    """Generate a reply to the prompt ``t`` with the notebook's global model.

    Relies on the module-level ``tokenizer`` and ``model`` created earlier in
    the notebook.

    Args:
        t: Prompt text. The evaluation cells use the special-token format
            ``"<|user|> ... <|eos|> <|ai|>"``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
            Defaults to 128, the value previously hard-coded here.
        temperature: Forwarded to ``model.generate``. Defaults to 0.2.
        do_sample: Forwarded to ``model.generate``. Defaults to True.

    Returns:
        The first decoded sequence from ``tokenizer.batch_decode``. Special
        tokens are not stripped, since ``skip_special_tokens`` is not passed.
    """
    # Tokenize to PyTorch tensors and move the ids to the model's device.
    input_ids = tokenizer(t, return_tensors="pt").input_ids
    input_ids = input_ids.to(model.device)

    generate_kwargs = dict(
        input_ids=input_ids,
        do_sample=do_sample,
        max_length=max_length,
        temperature=temperature,
    )

    # Inference only: no gradients need to be tracked.
    with torch.no_grad():
        outputs = model.generate(**generate_kwargs)

    # A single prompt was passed in, so take the first decoded sequence.
    return tokenizer.batch_decode(outputs.detach().cpu().numpy())[0]

In [None]:
# The cache was switched off just before training; switch it back on here.
model.config.use_cache = True  # Re-enable

# Put the model into evaluation mode before generating.
model.eval()

t = "<|user|> What is diabetes? <|eos|> <|ai|>"  # The format with special tokens is required, because of training

print(responder(t))

In [None]:
# Second evaluation prompt, in the same <|user|> ... <|eos|> <|ai|> format.
t = "<|user|> What is vitamin d3 and should I take it? <|eos|> <|ai|>"

print(responder(t))

In [None]:
# Third evaluation prompt; the question uses an abbreviation ("HTN").
t = "<|user|> What is HTN? <|eos|> <|ai|>"

print(responder(t))

In [None]:
# Fourth evaluation prompt: a general-knowledge question, presumably to see
# how the fine-tuned model handles input outside the medical training data.
t = "<|user|> What is the capital of France? <|eos|> <|ai|>"

print(responder(t))