<a href="https://colab.research.google.com/github/chris-hoertnagl/AI-Dojo/blob/main/LLM/fine_tune_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# loralib
%pip install -q -U torch
%pip install -q -U transformers
%pip install -q -U peft
%pip install -q -U datasets
%pip install -q -U git+https://github.com/huggingface/accelerate.git
%pip install -q -U bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

In [3]:
def load_model(id: str) -> tuple:
    model = AutoModelForCausalLM.from_pretrained(
        id,
        load_in_8bit=True,
        device_map="auto",
    )

    tokenizer = AutoTokenizer.from_pretrained(id)
    return model, tokenizer

def generate_text(prompt: str, model: AutoModelForCausalLM, tokenizer:AutoTokenizer, max_new_tokens:int=20) -> str:
    batch = tokenizer(prompt, return_tensors='pt')

    with torch.cuda.amp.autocast():
      output_tokens = model.generate(**batch, max_new_tokens=max_new_tokens)

    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

## Load & Evaluate base model

In [4]:
model, tokenizer = load_model(id="databricks/dolly-v2-3b")

Downloading (…)lve/main/config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

In [5]:
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 2560)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear8bitLt(in_features=2560, out_features=7680, bias=True)
          (dense): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear8bitLt(in_features=2560, out_features=10240, bias=True)
          (dense_4h_to_h): Linear8bitLt(in_features=10240, out_fe

In [6]:
generate_text("What is your name?", model, tokenizer)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'What is your name?\n\nMy name is Tara.\n\nWhat is your age?\n\nI am 22'

In [7]:
generate_text("Tell me your name!", model, tokenizer)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Tell me your name!\n\n- My name is Kiki.\n\n- Nice to meet you, Kiki.'

## Apply LoRA preprocessing to the model

In [14]:
config = {
    "LORA_R": 16,
    "LORA_ALPHA": 32,
    "LORA_DROPOUT": 0.05,
    "PER_DEVICE_TRAIN_BATCH_SIZE": 4,
    "GRADIENT_ACCUMULATION_STEPS": 4,
    "WARMUP_STEPS": 100,
    "MAX_STEPS": 40,
    "LEARNING_RATE": 0.0002
}

In [9]:
class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)


# Parameter freezing
for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)
model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()
model.embed_out = CastOutputToFloat(model.embed_out)
# LORA
lora_config = LoraConfig(
    r=config["LORA_R"],
    lora_alpha=config["LORA_ALPHA"],
    lora_dropout=config["LORA_DROPOUT"],
    bias="none",
    task_type="CAUSAL_LM",
)
# Add Low Rank Adapters + freezing
model = get_peft_model(model, lora_config)
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()

print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")
model

trainable params: 5242880 || all params: 2780328960 || trainable%: 0.18857049203271256


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(50280, 2560)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): Linear8bitLt(
                in_features=2560, out_features=7680, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_feature

## Load the data

In [12]:
def generate_prompt(data_point):
    return f"""{data_point["instruction"]}

{data_point["output"]}"""


def load_data():
    data = load_dataset("json", data_files="./chris_train.json")

    data = data.shuffle().map(
        lambda data_point: tokenizer(
            generate_prompt(data_point),
            truncation=True,
            max_length=256,
            padding="max_length",
        )
    )
    return data

data = load_data()
data

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-b7512721a6776c40/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-b7512721a6776c40/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'input_ids', 'attention_mask'],
        num_rows: 128
    })
})

## Define & Start training

In [15]:
trainer = transformers.Trainer(
        model=model,
        train_dataset=data["train"],
        args=transformers.TrainingArguments(
            per_device_train_batch_size=config["PER_DEVICE_TRAIN_BATCH_SIZE"],
            gradient_accumulation_steps=config["GRADIENT_ACCUMULATION_STEPS"],
            warmup_steps=config["WARMUP_STEPS"],
            max_steps=config["MAX_STEPS"],
            learning_rate=config["LEARNING_RATE"],
            fp16=True,
            logging_steps=1,
            output_dir="outputs",
        ),
        data_collator=transformers.DataCollatorForLanguageModeling(
            tokenizer, mlm=False
        ),
    )
model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )
trainer.train()

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.8505
2,3.0786
3,2.7541
4,2.7459
5,2.5741
6,2.5663
7,2.9441
8,3.0562
9,2.7024
10,3.0265


TrainOutput(global_step=40, training_loss=2.0035834699869155, metrics={'train_runtime': 393.5642, 'train_samples_per_second': 1.626, 'train_steps_per_second': 0.102, 'total_flos': 2606640817766400.0, 'train_loss': 2.0035834699869155, 'epoch': 5.0})

## Evaluate fine-tuned model

In [16]:
generate_text("What is your name?", model, tokenizer)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'What is your name?\n\nChris.\n\nWhat is your name?\n\nChris.\n\nChris is my'

In [17]:
generate_text("Tell me your name!", model, tokenizer)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


"Tell me your name!\n\nChris.\n\nChris is a great name! It's easy to say, easy to"