In [1]:
from IPython.display import HTML, display

def set_css(*_event_args):
  """Inject a CSS rule so that `<pre>` output wraps instead of overflowing.

  Intended to be used as an IPython `pre_run_cell` callback (see the
  registration below), so the style is re-applied before every cell runs.

  Args:
    *_event_args: Ignored. IPython may hand `pre_run_cell` callbacks an
      execution-info object; accepting (and discarding) positional arguments
      keeps the callback working whether or not one is passed, and keeps a
      plain `set_css()` call valid.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Run set_css before each cell execution.
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
# Enable the autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import torch

#@title Show current memory stats
def _bytes_to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to three decimal places."""
    return round(num_bytes / 1024 ** 3, 3)

# Properties of CUDA device 0, and the peak memory reserved so far in this session.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gib(gpu_stats.total_memory)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
0.0 GB of memory reserved.


In [4]:
# Record the exact package versions installed in this runtime, for reproducibility.
! pip freeze

absl-py==1.4.0
accelerate==0.26.1
aiohttp==3.9.3
aiosignal==1.3.1
alabaster==0.7.16
albumentations==1.3.1
altair==4.2.2
annotated-types==0.6.0
anyio==3.7.1
appdirs==1.4.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.5.1
arviz==0.15.1
astropy==5.3.4
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.1.0
attrs==23.2.0
audioread==3.0.1
autograd==1.6.2
Babel==2.14.0
backcall==0.2.0
beautifulsoup4==4.12.3
bidict==0.23.1
bigframes==1.0.0
bitsandbytes==0.42.0
bleach==6.1.0
blinker==1.4
blis==0.7.11
blosc2==2.0.0
bokeh==3.3.4
bqplot==0.12.43
branca==0.7.1
build==1.2.1
CacheControl==0.14.0
cachetools==5.3.3
catalogue==2.0.10
certifi==2024.2.2
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
chex==0.1.86
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
cloudpathlib==0.16.0
cloudpickle==2.2.1
cmake==3.27.9
cmdstanpy==1.2.2
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.4
cons==0.4.6
contextlib2==21.6.0
contourpy==1.2.1
cryptography==42.0.

# Fine-tuning a 7B model on a 4090 card  
> Or, on any card with >=24 GB of memory

## Creating the environment  
> Note: already ran in the terminal

Pause: why are we not using the `llm-env` we worked so hard to create? Short answer: Mac vs. NVIDIA. That environment was built for a Mac, while this notebook needs an NVIDIA (CUDA) GPU.

In [5]:
# # Install Pytorch & other libraries
# !pip install "torch==2.1.2" tensorboard

# # Install Hugging Face libraries
# !pip install  --upgrade \
#   "transformers==4.36.2" \
#   "datasets==2.16.1" \
#   "accelerate==0.26.1" \
#   "evaluate==0.4.1" \
#   "bitsandbytes==0.42.0"
# #   "trl==0.7.10" # \
# #   "peft==0.7.1" \

# # install peft & trl from github
# !pip install git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
# !pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade

# # install flash-attn
# !pip install ninja packaging
# !MAX_JOBS=2 pip install flash-attn --no-build-isolation

In [6]:
# from google.colab import drive
# drive.mount('/content/drive')

# Imports

Bring in everything we will need, ordered roughly by when we use it.

In [7]:
import gc
import os
import shutil
from getpass import getpass
from random import randint

import torch
from datasets import load_dataset
from huggingface_hub import login
from peft import AutoPeftModelForCausalLM, LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import SFTTrainer, setup_chat_format

# Logging into HuggingFace hub so we can download datasets (and models)

In [8]:
# Log in to the HF hub so we can download datasets and models.
#
# SECURITY: never paste an access token into a notebook cell -- anyone who can
# read the notebook (or its git history) can use it. The token is read from the
# HF_TOKEN environment variable, falling back to a hidden interactive prompt,
# so it never appears in the saved notebook.
# Any token that was previously hardcoded in this cell must be treated as leaked
# and revoked at https://huggingface.co/settings/tokens.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(
  token=hf_token,
  add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Processing the Diataxis dataset for fine-tuning.

In [9]:
# System prompt that is prepended to every conversation in the dataset
system_message = (
    "You are a professional writer of excellent technical documentation. "
    "Users will ask you questions about writing good technical content, "
    "based on a framework called Diátaxis. "
    "You will answer based on the framework's four general documents: "
    "References, Tutorials, How-To's, and Explanations."
)


def create_conversation(sample):
    """Turn one raw Q&A sample into a three-turn chat conversation.

    Args:
        sample: A mapping whose "conversations" entry is a sequence of dicts;
            element 0 carries the question and element 1 the answer, each
            under the "value" key.

    Returns:
        A dict with a single "messages" key holding the system, user and
        assistant turns (in that order), each as a role/content dict.
    """
    turns = sample["conversations"]
    # NOTE: formatting fix from last time -- the question and answer text are
    # read from the "value" field of the first two turns.
    question = turns[0]["value"]
    answer = turns[1]["value"]

    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": question}
    assistant_turn = {"role": "assistant", "content": answer}
    return {"messages": [system_turn, user_turn, assistant_turn]}

# Load our dataset from the hub
dataset = load_dataset("enzokro/some-test-dataset")

# Shuffle with a fixed seed so a fresh "Restart & Run All" yields the same
# sample order every time (an unseeded shuffle changes on every run).
dataset = dataset.shuffle(seed=42)

# Convert dataset to OAI messages (adds a "messages" column to every sample)
dataset = dataset.map(create_conversation)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/232 [00:00<?, ? examples/s]

In [10]:
# check how many samples we have; the result maps each split name to its shape
dataset.shape

{'train': (232, 2)}

In [11]:
# view one formatted conversation (index 100 is a fixed pick, not a random draw)
print(dataset["train"][100]["messages"])

[{'content': "You are a professional writer of excellent technical documentation. Users will ask you questions about writing good technical content, based on a framework called Diátaxis. You will answer based on the framework's four general documents: References, Tutorials, How-To's, and Explanations.", 'role': 'system'}, {'content': 'Q: How does the structure of documentation form when using Diátaxis?', 'role': 'user'}, {'content': 'A: When using Diátaxis, the structure of documentation forms from the inside as changes are made according to Diátaxis principles. At a certain point, the changes made will demand that material be moved under a certain Diátaxis heading, and that is how the top-level structure will form.', 'role': 'assistant'}]


Next we make a simple test dataset from ~15% of our Q&A pairs (34 of the 232).  

This part is crucial, and here we only show a simple scaffolding. We will compare the LLM's answers before/after fine-tuning to see what it learned.

In [12]:
# Create a simple, dummy test set by holding out 34 of the 232 messages (~15%).
# - test_size is given as an integer sample count, which avoids any float
#   rounding ambiguity from a fraction such as 34/232.
# - seed pins the split so the same questions land in the test set on every run.
dataset = dataset['train'].train_test_split(test_size=34, seed=42)

In [13]:
# Persist both splits to disk as JSON records so later cells can reload them
train_split = dataset["train"]
test_split = dataset["test"]
train_split.to_json("train_dataset.json", orient="records")
test_split.to_json("test_dataset.json", orient="records")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

44420

Now, we have everything we need to start fine-tuning.

# Fine-tuning the model

In [14]:
# Load the train dataset from disk.
# NOTE: this rebinds `dataset` -- from here on it holds only the training
# records read back from train_dataset.json, not the train/test split object.
dataset = load_dataset("json", data_files="train_dataset.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

Next, we set which model we want to load.

In [15]:
# Hugging Face model id of the base model to fine-tune (used below to load both
# the model weights and the tokenizer)
model_id = "NousResearch/Hermes-2-Pro-Mistral-7B"
# model_id = "mistralai/Mistral-7B-Instruct-v0.2" # <- plug in anything you'd like

The full model won't fit on the GPU for training. This is due to the Gradient Descent step. Specifically, the optimizer we use, Adam, keeps two moving averages of the gradients. That means that each weight technically has three values to track: the current gradient, its moving average, and the moving average of its squared values.

You can think of it like tracking the speed and velocity of the gradients, as opposed to only tracking their current position.

To get around this, we have to load the model in 4-bit precision. The excellent `bitsandbytes` library comes to the rescue, and with a simple config setup the entire model will fit on the card.

In [16]:
# 4-bit BitsAndBytes settings so the model fits on the card for training:
# NF4 quantization with double quantization, computing in bfloat16.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

Now, we can load the model telling it to use this 4-bit config (note the `quantization_config` argument below).

In [17]:
# Load model and tokenizer.
# The base model is loaded with the 4-bit `bnb_config` defined earlier.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="flash_attention_2",  # relies on flash-attn (see the install cell)
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
# The tokenizer comes from the same model id so it matches the base model.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right' # to prevent warnings

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
# Display the module tree to confirm the linear layers were loaded as Linear4bit
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32032, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
   

Next up we can apply the specific chat format to our Q&A pairs that prepares them for fine-tuning. In other words, this is a crucial text pre-processing step that makes sure the model sees Q&A pairs in a familiar format. It is not uncommon for the entire model to fall apart because certain words in its chat template were messed up. LLMs are powerful, but finicky...

In [19]:
# set chat template to OAI chatML, remove if you start from a fine-tuned model
# NOTE(review): the selected base model (Hermes-2-Pro) looks like it is already
# a chat fine-tune -- confirm whether this call is still needed for it.
model, tokenizer = setup_chat_format(model, tokenizer)

# Visualizing our pre-processed samples

In [20]:
# Let's apply the template to one of our Q&A pairs.
# Index the row first (dataset[0]) and then pick the column: the reverse order,
# dataset['messages'][0], materialises the whole "messages" column just to read
# a single conversation.
first_conversation = dataset[0]['messages']
input_ids = tokenizer.apply_chat_template(first_conversation)
input_ids

[32001,
 1587,
 13,
 1976,
 460,
 264,
 5024,
 6953,
 302,
 8099,
 10067,
 12905,
 28723,
 25777,
 622,
 1460,
 368,
 4224,
 684,
 3653,
 1179,
 10067,
 3036,
 28725,
 2818,
 356,
 264,
 10782,
 1987,
 6216,
 4566,
 8405,
 28723,
 995,
 622,
 4372,
 2818,
 356,
 272,
 10782,
 28742,
 28713,
 2308,
 2952,
 10181,
 28747,
 1298,
 8831,
 28725,
 25048,
 8525,
 28713,
 28725,
 1602,
 28733,
 1551,
 28742,
 28713,
 28725,
 304,
 1529,
 11009,
 697,
 28723,
 32000,
 28705,
 13,
 32001,
 2188,
 13,
 28824,
 28747,
 10934,
 624,
 1464,
 298,
 4160,
 2475,
 467,
 12655,
 302,
 771,
 1159,
 20449,
 739,
 1413,
 6216,
 4566,
 8405,
 28804,
 32000,
 28705,
 13,
 32001,
 13892,
 13,
 28741,
 28747,
 1770,
 28725,
 624,
 1023,
 4461,
 272,
 16372,
 352,
 298,
 4160,
 2475,
 467,
 12655,
 302,
 771,
 1159,
 20449,
 28723,
 4203,
 3707,
 297,
 272,
 1103,
 5007,
 349,
 4407,
 20449,
 5347,
 28723,
 32000,
 28705,
 13]

In [21]:
# IMPORTANT! Always decode the token ids back to text and read it: this is the
# check that the chat template produced what the model is meant to see, not gibberish.
print(tokenizer.decode(input_ids))

<|im_start|> system
You are a professional writer of excellent technical documentation. Users will ask you questions about writing good technical content, based on a framework called Diátaxis. You will answer based on the framework's four general documents: References, Tutorials, How-To's, and Explanations.<|im_end|> 
<|im_start|> user
Q: Should one try to complete large tranches of work before publishing when using Diátaxis?<|im_end|> 
<|im_start|> assistant
A: No, one should avoid the temptation to complete large tranches of work before publishing. Every step in the right direction is worth publishing immediately.<|im_end|> 



If your string outputs look weird in a notebook, make sure you use `print`.

# LORA config

The following LORA config setup is what lets us actually train the model on the 4090. Technically, we are not learning or training the entire model. We are only doing gradient descent on the LORA adapters.  

The config and parameters in the next section are good "best-practice" starting points. As you iterate on the model, you can modify certain parameters. It quickly gets overwhelming, so before any serious parameter experimentation you want to have some sort of eval harness - a way of measuring whether your model is getting better or worse.

In [22]:
# LoRA adapter settings, following the QLoRA paper and Sebastian Raschka's experiments.
# Reference: https://magazine.sebastianraschka.com/p/practical-tips-for-finetuning-llms
# NOTE(review): lora_alpha (128) is half of r (256) -- confirm this ratio is intentional.
peft_config = LoraConfig(
    r=256,
    lora_alpha=128,
    lora_dropout=0.05,
    target_modules="all-linear",
    bias="none",
    task_type="CAUSAL_LM",
)

# Preparing the Training Arguments

The section below also includes excellent starting-point parameters for the values.

In [23]:

# preparing the training arguments
args = TrainingArguments(
    output_dir="hermes-mistral-7b-diataxis", # directory to save and repository id
    num_train_epochs=5,                      # number of training epochs
    per_device_train_batch_size=3,           # batch size per device during training
    gradient_accumulation_steps=2,           # micro-batches accumulated before each optimizer update
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=1,                        # log every step
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper; NOTE(review): may have no effect with the "constant" scheduler below -- confirm
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=False,                      # do NOT push the model to the hub
    report_to="tensorboard",                # report metrics to tensorboard
)


# Preparing the model for training

In [24]:
# max sequence length passed to the trainer
max_seq_length = 3072

# this trainer will run the fine-tuning
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,  # the training records reloaded from train_dataset.json
    peft_config=peft_config,  # LoRA adapter settings defined earlier
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,  # presumably packs several samples into each sequence -- confirm in the TRL docs
    # we add the following kwargs since the chatML is handling everything for us
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)

Generating train split: 0 examples [00:00, ? examples/s]

# Training the model

In [25]:
# magic starts: fine-tune the model on the Diataxis Q&A dataset
# (the recorded run below finished in 10 steps, roughly two minutes)
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss
1,1.6127
2,3.0095
3,1.4852
4,0.9939
5,0.8458
6,0.7192
7,0.7143
8,0.4709
9,0.5182
10,0.4605




TrainOutput(global_step=10, training_loss=1.083012008666992, metrics={'train_runtime': 121.3308, 'train_samples_per_second': 0.412, 'train_steps_per_second': 0.082, 'total_flos': 7171780431052800.0, 'train_loss': 1.083012008666992, 'epoch': 5.0})

## Save the model, once the loop is done. Don't lose all our hard work!

In [26]:
# save model (called without a path; the listing in the next cell shows the
# hermes-mistral-7b-diataxis folder that holds the training outputs)
# NOTE: files live inside of /content on the mounted VM
trainer.save_model()

In [27]:
# view our saved outputs
!ls .

hermes-mistral-7b-diataxis  sample_data  test_dataset.json  train_dataset.json


In [28]:
from google.colab import files

# `hermes-mistral-7b-diataxis` is a directory, and files.download() sends a
# single file to the browser. Zip the folder first, then download the archive.
adapter_archive = shutil.make_archive(
    "hermes-mistral-7b-diataxis",           # archive base name -> hermes-mistral-7b-diataxis.zip
    "zip",
    root_dir="hermes-mistral-7b-diataxis",  # folder holding the saved training outputs
)
files.download(adapter_archive)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

NOTE: the calls below can be a lifesaver

In [29]:
# clean up gpu memory
del model
del trainer
# Dropping the names is not enough if the objects still reference each other:
# run the garbage collector first so their tensors are actually released, and
# only then ask PyTorch to hand its cached blocks back to the GPU.
gc.collect()
torch.cuda.empty_cache()

## Eventually, we will need to merge in these new weights to deploy the model...

In [30]:
#### COMMENT IN TO MERGE PEFT AND BASE MODEL ####
# from peft import AutoPeftModelForCausalLM

# # Load PEFT model on CPU
# model = AutoPeftModelForCausalLM.from_pretrained(
#     args.output_dir,
#     torch_dtype=torch.float16,
#     low_cpu_mem_usage=True,
# )
# # Merge LoRA and base model and save
# merged_model = model.merge_and_unload()
# merged_model.save_pretrained(args.output_dir,safe_serialization=True, max_shard_size="2GB")

# Drumroll... did our model learn anything?

> Reference: https://hamel.dev/blog/posts/evals/

Now we find out: we load in the fine-tuned model (specifically, the model + the learned LORA adapters) and check its answer to some questions in the test set.

The sections below are somewhat of a patch. We will compare the original Answer from the Q&A to what the model outputs. It has not seen the questions in the test dataset, so this will be a fair comparison.

Granted, this is a fully subjective and arbitrary measure. One of the first things to do in a proper development setup would be to create an eval harness. In this case, we only have ~34 test set questions. We could set up a way to measure how much the model's outputs align with the expected answer.

In [31]:
# load the fine-tuned model we just saved (same folder as the training output_dir)
peft_model_id = "./hermes-mistral-7b-diataxis"

# Load the model with the PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="cuda", # <- sometimes weird errors appear if you don't specifically place the model on the GPU
  torch_dtype=torch.float16 # <- no 4-bit quantization config here: weights load as 16-bit floats (half precision; training used bfloat16)
)
# The tokenizer is loaded from the same saved folder as the adapter.
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


With the PEFT model, we can hook in our patchy eval: a text-generation pipeline (hello HuggingFace pipelines my old friend...)

In [32]:
# load into pipeline: wrap the fine-tuned model + tokenizer for text generation
# (the "not supported for text-generation" message printed below did not stop
# the later generation cell from producing an answer)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

## Eval loop

First, start by loading the test set data.

In [33]:
# Load our test dataset from the JSON file written earlier.
# split="train" is how the single JSON file is addressed here; the records
# themselves come from test_dataset.json.
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

Now, run the following code many times to get an idea of what/if the model learned

In [47]:
# picks a random dataset sample (deliberately unseeded: re-run the cell to see
# a different question each time)
rand_idx = randint(0, len(eval_dataset) - 1)
# Read the row once instead of re-indexing the dataset for every field.
sample_messages = eval_dataset[rand_idx]["messages"]

# Test the model on this sample.
# Only the system + user turns go into the prompt; the assistant turn is the
# reference answer we compare against, so the model must not see it.
prompt = pipe.tokenizer.apply_chat_template(sample_messages[:2], tokenize=False, add_generation_prompt=True)

# Greedy decoding (do_sample=False) makes the answer deterministic. The sampling
# knobs (temperature / top_k / top_p) only apply when do_sample=True, so they
# are not passed here -- previously they contradicted do_sample=False.
outputs = pipe(
    prompt,
    max_new_tokens=1024,
    do_sample=False,
    eos_token_id=pipe.tokenizer.eos_token_id,
    pad_token_id=pipe.tokenizer.pad_token_id,
)

# what does she know?
# generated_text contains the prompt followed by the completion, so the prompt
# prefix is sliced off before printing.
print(f"Query:\n{sample_messages[1]['content']}")
print(f"Original Answer:\n{sample_messages[2]['content']}")
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

Query:
Q: How can focusing on the underlying principles of Diátaxis, rather than a particular structural template, help documentation authors to create more effective and user-friendly content?
Original Answer:
A: Focusing on the underlying principles of Diátaxis, such as identifying and addressing different user needs, can help documentation authors to create more effective and user-friendly content by ensuring that the structure and organization of the documentation emerge from a deep understanding of the audience and their goals. This approach allows for greater flexibility in the face of complex content and unique product requirements while still maintaining the benefits of a structured, purposeful documentation system.
Generated Answer:
A: Focusing on the underlying principles of Diátaxis, such as addressing user needs, organizing content by user requirements, and using language appropriately, can help documentation authors to create more effective and user-friendly content by gui