### Import prerequisites

In [1]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
import torch
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from huggingface_hub import login

### Load Accelerator
For efficient training, we will leverage the Fully Sharded Data Parallel Plugin and Accelerator. This will allow for optimized memory usage and potentially faster training times.

If you wish to do so, you can also plug in different accelerators in this cell to see how they effect training

In [2]:
## Load accelerator
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

### Data Preparation
Next, we will load our training dataset, shuffle it, and split it into training, validation, and test subsets.

The dataset chosen for this fine tune is vicgalle/alpaca-gpt4, which contains English Instruction-Following generated by GPT-4 using Alpaca prompts for fine-tuning LLMs.

The dataset contains approximately 56k samples, of which we will only use 10k.

The following section is where you can import different datasets and perform some exploratory data analysis to decide what information you want to use to train.

In [3]:
# Load the full training dataset
full_train_dataset = load_dataset('vicgalle/alpaca-gpt4', split='train')

# Shuffle and select the first 10,000 samples
subset_dataset = full_train_dataset.shuffle(seed=42).select(range(10000))

# Split the subset into train, validation, and test sets
train_dataset = subset_dataset.train_test_split(test_size=0.2)['train']  # 8,000 samples for training
temp_dataset = subset_dataset.train_test_split(test_size=0.2)['test']   # 2,000 samples left
eval_dataset = temp_dataset.train_test_split(test_size=0.5)['train']    # 1,000 samples for validation
test_dataset = temp_dataset.train_test_split(test_size=0.5)['test']     # 1,000 samples for testing

# Print the datasets
print(train_dataset)
print(eval_dataset)
print(test_dataset)

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 8000
})
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 1000
})
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 1000
})


### Model and Tokenizer Loading
Load the base model with specific configurations and the tokenizer that will be used to preprocess our data.

In [4]:
## Load Base Model
base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Tokenization Setup
In this section, we adjust the tokenizer settings and define a tokenization function to preprocess our dataset.

In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)

## setup tokenize function to make labels and input_ids the same.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

## Format all samples into this prompt format
def generate_and_tokenize_prompt(data_point):
    full_prompt =f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}
"""
    return tokenize(full_prompt)

### Data Tokenization
For our model to understand and process the data, we need to convert the raw data into a format that it understands. Tokenization breaks down the raw text into tokens, which are integer representations of words or characters

In [6]:
# tokenize each sample based on the prompt format
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

print(tokenized_train_dataset[4]['input_ids'])

# check that the sample has the max length of 512
print(len(tokenized_train_dataset[4]['input_ids']))

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 20811, 349, 396, 13126, 369, 13966, 264, 3638, 28725, 5881, 1360, 395, 396, 2787, 369, 5312, 3629, 2758, 28723, 12018, 264, 2899, 369, 6582, 1999, 2691, 274, 272, 2159, 28723, 13, 27332, 3133, 3112, 28747, 13, 1976, 460, 3638, 286, 395, 3524, 582, 395, 264, 2623, 354, 17032, 3408, 16298, 297, 264, 2990, 28723, 13, 13, 27332, 11232, 28747, 13, 13, 13, 27332, 12107, 28747, 13, 657, 1745, 298, 7643, 3408, 16298, 297, 264, 2990, 28725, 478, 829, 4808, 272, 2296, 2623, 28747, 13, 13, 28740, 28723, 26256, 3408, 12832, 7034, 28747, 415, 2990, 541, 2405, 5593, 19953, 1771, 298, 11305, 304, 8689, 354, 582, 6336, 288, 298, 3408, 28733, 28627, 14933, 20904, 28725, 15912, 28725, 304, 382, 28790, 1645, 4918, 28723, 13, 13, 28750, 28723, 1294

### Evaluate the Base Model
Before fine-tuning, it's a good practice to evaluate the base model on our task to get a sense of its initial performance.

This way, you can design a prompt that can be validated against once we finish training the model.

In [7]:
print("Instruction: " + test_dataset[1]['instruction'])
print("Input: " + test_dataset[1]['input'])
print("Response: " + test_dataset[1]['output'] + "\n")

eval_prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{test_dataset[1]['instruction']}

### Input:
{test_dataset[1]['input']}

### Response:
"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
        print(tokenizer.decode(model.generate(**model_input, max_new_tokens=256, pad_token_id=2)[0], skip_special_tokens=True))

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Instruction: Rewrite this sentence in simple language
Input: The surreptitious actions of the individual evinced a lack of sincerity
Response: The person secretly did things that showed they were not being honest.

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Rewrite this sentence in simple language

### Input:
The surreptitious actions of the individual evinced a lack of sincerity

### Response:
мтрвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтвтв


### Begin Fine-tuning with PEFT
Next, we will start the fine-tuning process using PEFT. PEFT stands for "Progressive Embedding Fine-Tuning", a method that helps in optimizing the model's weights.

In [8]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# print model to examine layers
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

### Integrating LoRa
LoRa (Low-Rank Adapters) is a technique to train neural networks more efficiently by adding low-rank transformations to the intermediate representations of a pre-trained model.

In [9]:
# Define the LoRa config
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705


### Applying the Accelerator
To further optimize our model training, we apply the accelerator. This step optimizes the training across multiple devices if available.

In [10]:
# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

### Examining the Updated Model
After integrating LoRa and applying the accelerator, let's print the model to observe the changes.

In [11]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): L

### Tracking Training with Weights & Biases (wandb)
Weights & Biases (wandb) provides tools to track and visualize the training process. By integrating with wandb, you can monitor your model's performance, visualize metrics, and more.

In [12]:
## Track the training stats on wandb
import wandb, os
wandb.login()

wandb_project = "marlin-finetune"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcorticalstack[0m ([33mcorticalstackteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


### Training the Model
Now, we'll start the fine-tuning process. This involves specifying various training parameters and using the Trainer class from the transformers library.

In [13]:
import transformers
from datetime import datetime

max_steps = 10 # was 1000
eval_steps = 5 # ws 50
logging_steps = 5 # was 50
save_steps = 5 # was 50
warmup_steps = 2 # was 5

project = "marlin-finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=warmup_steps,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=max_steps,
        learning_rate=2.5e-5, 
        logging_steps=logging_steps,
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        
        save_strategy="steps",       
        save_steps=save_steps,                
        evaluation_strategy="steps", 
        eval_steps=eval_steps,               
        do_eval=True,                
        report_to="wandb",           
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}" # Name of the W&B run (optional)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112329522216493, max=1.0…



Step,Training Loss,Validation Loss
5,1.3994,1.234447
10,1.2455,1.162489


Checkpoint destination directory ./mistral-marlin-finetune/checkpoint-5 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./mistral-marlin-finetune/checkpoint-10 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=10, training_loss=1.3224186420440673, metrics={'train_runtime': 648.4818, 'train_samples_per_second': 0.247, 'train_steps_per_second': 0.015, 'total_flos': 3505481507143680.0, 'train_loss': 1.3224186420440673, 'epoch': 0.02})

### Evaluating the Fine-tuned Model
After training, it's important to evaluate the fine-tuned model to understand its performance. This involves loading the base model and comparing its outputs to the fine-tuned model.

In [15]:
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

from peft import PeftModel
base_model = model = AutoModelForCausalLM.from_pretrained("mistralai/mistral-7b-v0.1", trust_remote_code=True, torch_dtype=torch.float32)
ft_model = PeftModel.from_pretrained(base_model, "mistral-marlin-finetune/checkpoint-100")

ft_model.eval()
with torch.no_grad():
        print(tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=100, pad_token_id=2)[0], skip_special_tokens=True))



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

### Pushing the model to Huggingface Hub
At this point you have completed the fine tuning of mistral-7b-v0.1 on a Stanford Alpaca style dataset ranging a variety of topics. You can stop now if you wish.

However, there are a few more steps if you wish to upload the model to share your completion of this project.

You just need to merge the fine tuned model with the base model, and then push the merged model to your profile.

In [16]:
## push to hub
model = ft_model.merge_and_load()
model.push_to_hub("CorticalStack/marlin-mistral-7b-v0.1") 

AttributeError: 'MistralForCausalLM' object has no attribute 'merge_and_load'