<a href="https://colab.research.google.com/github/dsmueller3760/aerospace_chatbot/blob/llm_training/scripts/finetuning_colab_medium.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets trl peft uuid pandas evaluate transformers bitsandbytes torch

Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.7.6-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.7.1-py3-none-any.whl (168 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uuid
  Downloading uuid-1.30.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.41.3.post2-py3-none-any.whl (92.6 MB)
[2K     [9

In [43]:
### Imports

from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model

import os
from uuid import uuid4
import pandas as pd

import subprocess
import evaluate
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
import torch

In [44]:
### Functions

def max_token_len(dataset,tokenizer):
    """
    Calculate the maximum token length of text entries in a dataset using a specified tokenizer.
    Args:
    dataset (iterable): A dataset where each entry is a dictionary containing at least a 'text' key.
                        The dataset should be iterable (like a list or a dataset object from popular
                        data processing libraries).
    tokenizer (Tokenizer): An instance of a tokenizer compatible with the text entries in the dataset.

    Returns:
    int: The maximum length (in terms of number of tokens) of any text entry in the dataset after
         tokenization.
    """
    max_seq_length = 0
    for row in dataset:
        tokens = len(tokenizer(row['text'])['input_ids'])
        if tokens > max_seq_length:
            max_seq_length = tokens
    return max_seq_length

from transformers import AutoModel

def calculate_model_parameters(model):
    """
    Calculate the total number of parameters in the model.

    Args:
    model: The Hugging Face model.

    Returns:
    int: Total number of parameters.
    """
    return sum(p.numel() for p in model.parameters())

def calculate_lora_parameters(lora_model):
    """
    Calculate the number of LoRA parameters in the model.

    This function assumes that LoRA parameters have a specific naming convention or identifiable structure.
    Adjust the function depending on how LoRA parameters are implemented in your model.

    Args:
    model: The Hugging Face model.

    Returns:
    int: Number of LoRA parameters.
    """
    # This works too but is more complicated
    # lora_param_count = 0
    # for name, param in lora_model.named_parameters():
    #     if 'lora' in name:  # Assuming LoRA parameters have 'lora' in their names
    #         lora_param_count += param.numel()
    # return lora_param_count

    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def estimate_full_model_memory(model, batch_size, seq_length):
    """
    Estimate the memory required for full model fine-tuning with additional factors.

    Args:
    model: The hugging face model.
    batch_size (int): The batch size used in training.
    seq_length (int): The sequence length of the input.

    Returns:
    float: Estimated memory required in gigabytes.
    """
    print("\nFull training model parameters")
    model_params = calculate_model_parameters(model)

    # Memory for model parameters (INT4)
    model_memory_bytes = model_params / 2  # INT4: 4 bits per parameter
    print(f"Model Memory: {model_memory_bytes / (1024 ** 3)} GB")

    # Memory for gradients (FP16)
    gradient_memory_bytes = 2 * model_params * 2  # FP16: 16 bits per parameter
    print(f"Gradient Memory: {gradient_memory_bytes / (1024 ** 3)} GB")

    # Optimizer state memory (FP16, assuming Adam optimizer)
    optimizer_memory_bytes = 2 * gradient_memory_bytes
    print(f"Optimizer State Memory: {optimizer_memory_bytes / (1024 ** 3)} GB")

    # Activation memory (FP16, estimate)
    activation_memory_bytes = 4 * batch_size * seq_length * 2  # FP16: 16 bits per activation
    print(f"Activation Memory: {activation_memory_bytes / (1024 ** 3)} GB")

    # Total memory
    total_memory_bytes = model_memory_bytes + gradient_memory_bytes + optimizer_memory_bytes + activation_memory_bytes
    total_memory_gb = total_memory_bytes / (1024 ** 3)
    print(f"Total Estimated Memory: {total_memory_gb} GB")
    return total_memory_gb

def estimate_lora_memory(model, lora_model, batch_size, seq_length):
    """
    Estimate the memory required for LoRA fine-tuning with additional factors.

    Args:
    model: The hugging face model.
    batch_size (int): The batch size used in training.
    seq_length (int): The sequence length of the input.

    Returns:
    float: Estimated memory required in gigabytes.
    """
    print("\nLoRA training model parameters")
    model_params = calculate_model_parameters(model)
    lora_params = calculate_lora_parameters(lora_model)

    # Memory for model parameters (INT4) and LoRA parameters (INT4)
    model_memory_bytes = model_params / 2  # INT4: 4 bits per parameter
    lora_memory_bytes = lora_params / 2  # INT4: 4 bits per parameter
    print(f"Model Memory: {model_memory_bytes / (1024 ** 3)} GB")
    print(f"LoRA Memory: {lora_memory_bytes / (1024 ** 3)} GB")

    # Memory for gradients (FP16)
    gradient_memory_bytes = 2 * lora_params * 2  # FP16: 16 bits per parameter
    print(f"Gradient Memory: {gradient_memory_bytes / (1024 ** 3)} GB")

    # Optimizer state memory (FP16, assuming Adam optimizer)
    optimizer_memory_bytes = 2 * gradient_memory_bytes
    print(f"Optimizer State Memory: {optimizer_memory_bytes / (1024 ** 3)} GB")

    # Activation memory (FP16, estimate)
    activation_memory_bytes = 4 * batch_size * seq_length * 2  # FP16: 16 bits per activation
    print(f"Activation Memory: {activation_memory_bytes / (1024 ** 3)} GB")

    # Total memory
    total_memory_bytes = model_memory_bytes + lora_memory_bytes + gradient_memory_bytes + optimizer_memory_bytes + activation_memory_bytes
    total_memory_gb = total_memory_bytes / (1024 ** 3)
    print(f"Total Estimated Memory: {total_memory_gb} GB")
    return total_memory_gb

In [45]:
### Set tokenizer, model

model_name = 'mistralai/Mistral-7B-v0.1'  # Example robust model
# model_name = 'distilbert-base-uncased'  # Example tiny model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_max_length = tokenizer.model_max_length
print("Model Max Length:", model_max_length)

Model Max Length: 1000000000000000019884624838656


In [56]:
### Read data for training/validation, determine max length

# dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset_name = 'ai-aerospace/ams_data_train_mistral_v0.1_100'
dataset = load_dataset(dataset_name)

# Write dataset files into data directory
# data_directory = './fine_tune_data/'

# Create the data directory if it doesn't exist
# os.makedirs(data_directory, exist_ok=True)

# Write the train data to a CSV file
# train_data='train_data'
# train_filename = os.path.join(data_directory, train_data)
# dataset['train'].to_pandas().to_csv(train_filename+'.csv', columns=['text'], index=False)
max_token_length_train=max_token_len(dataset['train'],tokenizer)
print('Max token length train: '+str(max_token_length_train))

# Write the validation data to a CSV file
# validation_data='validation_data'
# validation_filename = os.path.join(data_directory, validation_data)
# dataset['validation'].to_pandas().to_csv(validation_filename+'.csv', columns=['text'], index=False)
max_token_length_validation=max_token_len(dataset['validation'],tokenizer)
print('Max token length validation: '+str(max_token_length_validation))

max_token_length=max(max_token_length_train,max_token_length_validation)
# max_token_length=max_token_length_train
if max_token_length > model_max_length:
    raise ValueError("Maximum token length exceeds model limits.")
block_size=2*max_token_length
print('Block size: '+str(block_size))

# Define project parameters
username='ai-aerospace'
project_name='./llms/'+'ams_data_train-100_'+str(uuid4())
repo_name='ams-data-train-100-'+str(uuid4())

Downloading readme:   0%|          | 0.00/686 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/220k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/146 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Max token length train: 1124
Max token length validation: 42
Block size: 2248


In [57]:
### Set model parameters
model_params={
  "project_name": project_name,
  "model_name": model_name,
  "repo_id": username+'/'+repo_name,
  "block_size": block_size,
  "model_max_length": max_token_length,
  "logging_steps": -1,
  "evaluation_strategy": "epoch",
  "save_total_limit": 1,
  "save_strategy": "epoch",
  "mixed_precision": "fp16",
  "lr": 0.00003,
  "epochs": 3,
  "batch_size": 2,
  "warmup_ratio": 0.1,
  "gradient_accumulation": 1,
  "optimizer": "adamw_torch",
  "scheduler": "linear",
  "weight_decay": 0,
  "max_grad_norm": 1,
  "seed": 42,
  "quantization": "int4",
  "lora_r": 16,
  "lora_alpha": 32,
  "lora_dropout": 0.05
}
for key, value in model_params.items():
  os.environ[key] = str(value)

print(model_params)

{'project_name': './llms/ams_data_train-100_edeb0c6c-2eb4-4822-bd30-77e295ffcff0', 'model_name': 'mistralai/Mistral-7B-v0.1', 'repo_id': 'ai-aerospace/ams-data-train-100-4e756251-cc72-4e94-bbb7-0b61f656019a', 'block_size': 2248, 'model_max_length': 1124, 'logging_steps': -1, 'evaluation_strategy': 'epoch', 'save_total_limit': 1, 'save_strategy': 'epoch', 'mixed_precision': 'fp16', 'lr': 3e-05, 'epochs': 3, 'batch_size': 2, 'warmup_ratio': 0.1, 'gradient_accumulation': 1, 'optimizer': 'adamw_torch', 'scheduler': 'linear', 'weight_decay': 0, 'max_grad_norm': 1, 'seed': 42, 'quantization': 'int4', 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05}


In [67]:
args_custom=transformers.TrainingArguments(
    per_device_train_batch_size=model_params['batch_size'],
    per_device_eval_batch_size=model_params['batch_size'],
    gradient_accumulation_steps=model_params['gradient_accumulation'],
    warmup_ratio=model_params['warmup_ratio'],
    num_train_epochs=model_params['epochs'],
    learning_rate=model_params['lr'],
    fp16=True,
    logging_steps=model_params['logging_steps'],
    save_total_limit=model_params['save_total_limit'],
    evaluation_strategy=model_params['evaluation_strategy'],
    metric_for_best_model="f1",
    output_dir='model_outputs',
    logging_dir='model_outputs',
    optim=model_params['optimizer'],
    max_grad_norm=model_params['max_grad_norm'],
    weight_decay=model_params['weight_decay'],
    lr_scheduler_type=model_params['scheduler'],
    remove_unused_columns =False,
    # label_names=['text']
)

# Args from medium article
args_medium=transformers.TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=12276,
    learning_rate=2e-4,
    fp16=True,
    eval_steps= 1000,
    logging_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    do_eval=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    output_dir='model_outputs',
    logging_dir='model_outputs',
    remove_unused_columns =False,
    # report_to='wandb',  # enable logging to W&B
    # label_names=['text']
)

In [59]:
### Start trainer
# trainer = SFTTrainer(
#     model_name,
#     train_dataset=dataset,
#     dataset_text_field="text",
#     max_seq_length=512,
# )

peft_config = LoraConfig(
    r=model_params['lora_r'],
    lora_alpha=model_params['lora_alpha'],
    lora_dropout=model_params['lora_dropout']
)

In [22]:
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [60]:
lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940290959023318


In [61]:
full_model_memory = estimate_full_model_memory(model,
                                               model_params['batch_size'],
                                               model_params['model_max_length'])
lora_model_memory = estimate_lora_memory(model,
                                         lora_model,
                                         model_params['batch_size'],
                                         model_params['model_max_length'])


Full training model parameters
Model Memory: 1.7503681182861328 GB
Gradient Memory: 14.002944946289062 GB
Optimizer State Memory: 28.005889892578125 GB
Activation Memory: 1.6748905181884766e-05 GB
Total Estimated Memory: 43.7592197060585 GB

LoRA training model parameters
Model Memory: 1.7503681182861328 GB
LoRA Memory: 0.003173828125 GB
Gradient Memory: 0.025390625 GB
Optimizer State Memory: 0.05078125 GB
Activation Memory: 1.6748905181884766e-05 GB
Total Estimated Memory: 1.8297305703163147 GB


In [68]:
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    results = {}
    results.update(f1_metric.compute(predictions=predictions, references = labels, average="macro"))
    results.update(recall_metric.compute(predictions=predictions, references = labels, average="macro"))
    results.update(accuracy_metric.compute(predictions=predictions, references = labels))
    results.update(precision_metric.compute(predictions=predictions, references = labels, average="macro"))

    return results

# See https://towardsdatascience.com/fine-tune-your-llm-without-maxing-out-your-gpu-db2278603d78 for details
# Not working but it's cause there's some stuff I have to do with the labels on the dataset, see article with colab book
trainer = transformers.Trainer(
    model=lora_model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    # compute_metrics=compute_metrics,
    args=args_custom
)
trainer.train()

TypeError: ignored