<a href="https://colab.research.google.com/github/edcalderin/LLM_Tech/blob/master/unleash_mistral7b_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unleash Mistral 7B's Power: How to Efficiently Fine-tune an LLM on Your Own Data

In [9]:
import torch

# Report which GPU this runtime provides so the reader can judge whether the
# quantized 7B model loaded further down will fit in its memory.
if torch.cuda.is_available():
    # Pass the device explicitly: on PyTorch releases where `device` is a required
    # argument, a bare get_device_properties() call raises TypeError.
    device_properties = torch.cuda.get_device_properties(torch.cuda.current_device())
    print("Device name:", device_properties.name)
    # total_memory is reported in bytes; convert to GiB for readability.
    print("Device memory:", round(device_properties.total_memory/(1024**3), 2), "GB")
else:
    # Fail soft with a readable hint instead of an opaque CUDA initialization error.
    print("No CUDA device detected; select a GPU runtime before running the cells below.")

Device name: Tesla T4
Device memory: 14.74 GB


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login

# Authenticate with the Hugging Face Hub. The Trainer configured later in this
# notebook sets push_to_hub=True, so credentials are needed for the upload;
# presumably the token must have write access, and the model download may also
# require it if the checkpoint is gated — TODO confirm.
login()

In [None]:
!pip install -q transformers accelerate bitsandbytes datasets peft

In [None]:
# 4-bit quantization settings used to load the 7B checkpoint on a single GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    # float16 instead of bfloat16: the Trainer in this notebook runs with fp16=True,
    # and the Tesla T4 reported above has no native bfloat16 support, so the
    # de-quantized matmuls should use the same half-precision dtype as training.
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="fp4")

model_id: str = "mistralai/Mistral-7B-v0.3"

# device_map="auto" delegates weight placement to the library instead of a manual .to(device).
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
# add_eos_token=True is requested so encoded training texts end with the EOS token.
# Note that this also applies to any prompt encoded with this tokenizer at inference time.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

In [None]:
def get_completion(query: str, model, tokenizer) -> str:
  """Sample a completion for ``query`` using an instruction-style prompt.

  The prompt uses the same "### Instruction" / "### Response" layout that
  ``generate_prompt`` produces for the fine-tuning data, so the model is
  queried in the format it is trained on.

  Args:
      query: The user question inserted into the prompt.
      model: Object exposing ``generate(**inputs, ...)`` (and normally ``device``).
      tokenizer: Callable returning a mapping of tensors; must also expose
          ``eos_token_id`` and ``batch_decode``.

  Returns:
      The decoded text of the first generated sequence.
  """
  # Built from explicit pieces (not an indented triple-quoted string) so no stray
  # leading whitespace from the source indentation leaks into the prompt.
  prompt = (
      "Below is an instruction that describes a task. Write a response that "
      "appropriately completes the request.\n\n"
      f"### Instruction:\n{query}\n\n"
      "### Response:\n"
  )

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

  # The notebook's tokenizer is created with add_eos_token=True, which appends EOS
  # to every encoded text. A prompt that already ends with EOS asks the model to
  # continue *after* end-of-sequence, so drop that trailing token before generating.
  input_ids = encodeds["input_ids"]
  eos_id = tokenizer.eos_token_id
  has_trailing_eos = (
      eos_id is not None
      and input_ids.shape[-1] > 1
      and int(input_ids[0, -1]) == eos_id
  )

  # Follow the model's own placement; fall back to the previous hard-coded device.
  device = getattr(model, "device", "cuda:0")
  model_inputs = {
      name: (tensor[:, :-1] if has_trailing_eos else tensor).to(device)
      for name, tensor in encodeds.items()
  }

  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=eos_id)
  decoded = tokenizer.batch_decode(generated_ids)
  return decoded[0]

In [None]:
# Sample an answer from the model as loaded above (this cell precedes the training cell).
result = get_completion("Will capital gains affect my tax bracket?", model, tokenizer)
print(result)

In [None]:
from datasets import load_dataset
# Load only the first 10% of the "train" split of the finance-alpaca dataset.
data = load_dataset("gbharti/finance-alpaca", split="train[:10%]")

# Convert to a DataFrame and preview the first ten rows.
df = data.to_pandas()
df.head(n=10)

In [None]:
# Define a function to generate a prompt text based on a data point
def generate_prompt(data_point: dict) -> str:
    """
    Build the training text for one instruction-following example.

    Args:
        data_point: Mapping with "instruction" and "output" entries and an
            optional "input" entry carrying additional context.
    Returns:
        The prompt as a plain (untokenized) string that ends with the response.
    """
    # .get() so records without an "input" key are treated like records whose
    # "input" is empty, instead of raising KeyError.
    context = data_point.get("input")

    if context:
        # Instruction + context + response
        sections = [
            'Below is an instruction that describes a task, paired with an input that provides'
            ' further context. Write a response that appropriately completes the request.\n\n',
            f'### Instruction:\n{data_point["instruction"]}\n\n',
            f'### Input:\n{context}\n\n',
            f'### Response:\n{data_point["output"]}',
        ]
    else:
        # Instruction + response only
        sections = [
            'Below is an instruction that describes a task. Write a response that '
            'appropriately completes the request.\n\n',
            f'### Instruction:\n{data_point["instruction"]}\n\n',
            f'### Response:\n{data_point["output"]}',
        ]
    return ''.join(sections)

# One seed for every random step in this cell so the shuffle AND the train/test
# split are reproducible across kernel restarts.
SEED = 1234

# NOTE: this cell rebinds `data` several times (and finally to the train/test
# split), so re-run the dataset-loading cell first if you need to re-run it.

# Add the "prompt" column in the dataset by applying the generate_prompt function to each data point
text_column = [generate_prompt(data_point) for data_point in data]
data = data.add_column("prompt", text_column)

# Shuffle the dataset with the fixed seed
data = data.shuffle(seed=SEED)

# Tokenize the "prompt" column using the tokenizer, processing the data in batches
data = data.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

# Split the dataset into training and testing subsets, with 90% for training and 10% for testing.
# The seed is passed explicitly: without it the split differs on every run even
# though the preceding shuffle is seeded.
data = data.train_test_split(test_size=0.1, seed=SEED)
train_data = data["train"]
test_data = data["test"]

In [None]:
# Import the helper from the "peft" library that prepares a quantized model for k-bit training
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing on the model before it is passed to the helper below.
# NOTE(review): presumably this trades extra compute for lower activation memory — confirm in the transformers docs.
model.gradient_checkpointing_enable()

# Rebind `model` to the object returned by prepare_model_for_kbit_training.
# Re-running this cell therefore applies the preparation to an already-prepared model.
model = prepare_model_for_kbit_training(model)

# Helper that reports how many of the model's parameters require gradients
def print_trainable_parameters(model):
    """
    Print the trainable parameter count, the total parameter count, and the
    percentage of parameters that are trainable.
    """
    # Collect (element count, requires_grad) once per parameter
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    percent = 100 * trainable_params / all_param
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent}")

# Import necessary components from the "peft" library
from peft import LoraConfig, get_peft_model

# Define a configuration for the LoRA (Low-Rank Adaptation) method
lora_config = LoraConfig(
    r=8,                                   # Rank of the low-rank update matrices
    lora_alpha=32,                         # Scaling factor for the LoRA update
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], # Modules to apply LoRA to (attention projections)
    lora_dropout=0.05,                     # Dropout probability on the LoRA layers
    bias="none",                           # Bias handling ("none" here)
    task_type="CAUSAL_LM"                  # Task type (in this case, Causal Language Modeling)
)

# Wrap the model with LoRA adapters using the defined configuration (rebinds `model`)
model = get_peft_model(model, lora_config)

# Print the number of trainable parameters in the model after applying LoRA
print_trainable_parameters(model)

In [None]:
import transformers

# Set the pad_token of the tokenizer to be the same as the eos_token
# NOTE(review): DataCollatorForLanguageModeling excludes pad-token positions from the
# loss; with pad == eos this presumably also hides the appended EOS from training —
# verify, and consider a dedicated pad token if the model fails to learn to stop.
tokenizer.pad_token = tokenizer.eos_token

# Hyper-parameters are built separately so they can be inspected before training.
training_args = transformers.TrainingArguments(
    # Explicit output directory: it is a required argument on many transformers
    # releases, and it gives checkpoints a descriptive location instead of a
    # generic default. With push_to_hub=True it presumably also determines the
    # Hub repository name — confirm before pushing.
    output_dir="mistral-7b-finance-alpaca-lora",
    per_device_train_batch_size=1,        # Batch size per device during training
    gradient_accumulation_steps=4,        # Number of gradient accumulation steps
    warmup_steps=1,                       # Number of warm-up steps for learning rate
    max_steps=100,                        # Maximum number of training steps
    learning_rate=2e-4,                   # Learning rate
    fp16=True,                            # Enable mixed-precision training
    logging_steps=1,                      # Logging frequency during training
    optim="paged_adamw_8bit",             # Optimizer type
    save_strategy="epoch",                # Strategy for saving checkpoints
    push_to_hub=True                      # Push to the Hugging Face model hub
)

# Create a trainer for fine-tuning a model
trainer = transformers.Trainer(
    model=model,  # The model to be trained
    train_dataset=train_data,  # Training dataset
    eval_dataset=test_data,    # Evaluation dataset
    args=training_args,
    # Data collator for language modeling task (mlm=False selects the causal-LM objective)
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [None]:
# Switch the model back to inference settings before sampling: training leaves it
# in train mode (dropout active) and the training cell disabled the KV cache with
# an explicit reminder to re-enable it for inference.
model.eval()
model.config.use_cache = True

result = get_completion(query="Will capital gains affect my tax bracket?", model=model, tokenizer=tokenizer)
print(result)