<a href="https://colab.research.google.com/github/dsmueller3760/aerospace_chatbot/blob/llm_training/scripts/train_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Build local packages

In [2]:
# Build everything in one cell
# Works from /workspace and installs the notebook's Python dependencies into the
# virtualenv at /workspace/.venv. Each `!` command below is a single shell line
# chained with `&&`, so a failing step stops the rest of that chain.
import os
os.chdir('/workspace')
if os.path.exists('requirements.txt'):
    # Reuse the pinned package list written by a previous from-scratch run.
    # NOTE(review): this branch activates .venv without creating it, so it
    # assumes the virtualenv already exists alongside requirements.txt.
    print('requirements.txt exists, installing from it...')
    !cd /workspace && \
    source .venv/bin/activate && \
    pip install -r requirements.txt
else: 
    # No pinned list yet: recreate the virtualenv, install the packages by name,
    # then freeze the resolved versions into requirements.txt for the next run.
    print('No requirements.txt, installing from scratch...')
    !cd /workspace && \
    rm -rf .venv && \
    pip install virtualenv && \
    python3 -m venv .venv && \
    source .venv/bin/activate && \
    pip install ipykernel datasets trl peft uuid pandas evaluate transformers bitsandbytes torch huggingface_hub accelerate ipywidgets scipy && \
    pip freeze > requirements.txt

requirements.txt exists, installing from it...

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Test for GPU access first

In [3]:
# Test for gpu access first
# Prints True when PyTorch reports a usable CUDA device, False otherwise.
# Background reading: https://stackabuse.com/how-to-use-gpus-with-pytorch/
import torch
print(torch.cuda.is_available())

True


In [4]:
# Report the visible GPUs, then choose the device (CUDA when available, else CPU).
print("Number of GPUs: "+str(torch.cuda.device_count()))
if torch.cuda.is_available():
    # current_device() needs an initialised CUDA runtime and raises on a CPU-only
    # machine, so it is only queried when CUDA is actually available. Without this
    # guard the CPU fallback below could never be reached.
    print("Current GPU device: "+str(torch.cuda.current_device()))
else:
    print("Current GPU device: none (falling back to CPU)")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

Number of GPUs: 1
Current GPU device: 0


device(type='cuda')

# LLM Training

In [5]:
### Imports

from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, PeftModel

import os
from uuid import uuid4
import pandas as pd

import subprocess
import evaluate
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
import torch
from huggingface_hub import notebook_login
import time

In [8]:
# Show the Hugging Face login widget (notebook_login comes from huggingface_hub).
# Presumably required for the push_to_hub call at the end of the notebook - TODO confirm
# whether the model/dataset downloads also need it.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
### Functions 

def max_token_len(dataset,tokenizer):
    """
    Find the longest tokenized 'text' entry in a dataset.

    Args:
    dataset (iterable): Iterable of rows; each row is indexed with the key 'text'.
    tokenizer (callable): Called as tokenizer(text). The result is indexed with
                          'input_ids', and the length of that value is the token
                          count for the row.

    Returns:
    int: The largest token count across all rows, or 0 when the dataset yields no rows.
    """
    # Lazily compute one token count per row; nothing is materialised as a list.
    token_counts = (len(tokenizer(entry['text'])['input_ids']) for entry in dataset)
    # default=0 reproduces the "no rows" result of a running maximum that starts at 0.
    return max(token_counts, default=0)

# def calculate_model_parameters(model):
#     """
#     Calculate the total number of parameters in the model.

#     Args:
#     model: The Hugging Face model.

#     Returns:
#     int: Total number of parameters.
#     """
#     return sum(p.numel() for p in model.parameters())

# def calculate_lora_parameters(lora_model):
#     """
#     Calculate the number of LoRA parameters in the model.

#     This function assumes that LoRA parameters have a specific naming convention or identifiable structure.
#     Adjust the function depending on how LoRA parameters are implemented in your model.

#     Args:
#     model: The Hugging Face model.

#     Returns:
#     int: Number of LoRA parameters.
#     """
#     # This works too but is more complicated
#     # lora_param_count = 0
#     # for name, param in lora_model.named_parameters():
#     #     if 'lora' in name:  # Assuming LoRA parameters have 'lora' in their names
#     #         lora_param_count += param.numel()
#     # return lora_param_count

#     return sum(p.numel() for p in lora_model.parameters() if p.requires_grad)


# def estimate_full_model_memory(model, batch_size, seq_length):
#     """
#     Estimate the memory required for full model fine-tuning with additional factors.

#     Args:
#     model: The hugging face model.
#     batch_size (int): The batch size used in training.
#     seq_length (int): The sequence length of the input.

#     Returns:
#     float: Estimated memory required in gigabytes.
#     """
#     print("\nFull training model parameters")
#     model_params = calculate_model_parameters(model)

#     # Memory for model parameters (INT4)
#     model_memory_bytes = model_params / 2  # INT4: 4 bits per parameter
#     print(f"Model Memory: {model_memory_bytes / (1024 ** 3)} GB")

#     # Memory for gradients (FP16)
#     gradient_memory_bytes = 2 * model_params * 2  # FP16: 16 bits per parameter
#     print(f"Gradient Memory: {gradient_memory_bytes / (1024 ** 3)} GB")

#     # Optimizer state memory (FP16, assuming Adam optimizer)
#     optimizer_memory_bytes = 2 * gradient_memory_bytes
#     print(f"Optimizer State Memory: {optimizer_memory_bytes / (1024 ** 3)} GB")

#     # Activation memory (FP16, estimate)
#     activation_memory_bytes = 4 * batch_size * seq_length * 2  # FP16: 16 bits per activation
#     print(f"Activation Memory: {activation_memory_bytes / (1024 ** 3)} GB")

#     # Total memory
#     total_memory_bytes = model_memory_bytes + gradient_memory_bytes + optimizer_memory_bytes + activation_memory_bytes
#     total_memory_gb = total_memory_bytes / (1024 ** 3)
#     print(f"Total Estimated Memory: {total_memory_gb} GB")
#     return total_memory_gb

# def estimate_lora_memory(model, lora_model, batch_size, seq_length):
#     """
#     Estimate the memory required for LoRA fine-tuning with additional factors.

#     Args:
#     model: The hugging face model.
#     batch_size (int): The batch size used in training.
#     seq_length (int): The sequence length of the input.

#     Returns:
#     float: Estimated memory required in gigabytes.
#     """
#     print("\nLoRA training model parameters")
#     model_params = calculate_model_parameters(model)
#     lora_params = calculate_lora_parameters(lora_model)

#     # Memory for model parameters (INT4) and LoRA parameters (INT4)
#     model_memory_bytes = model_params / 2  # INT4: 4 bits per parameter
#     lora_memory_bytes = lora_params / 2  # INT4: 4 bits per parameter
#     print(f"Model Memory: {model_memory_bytes / (1024 ** 3)} GB")
#     print(f"LoRA Memory: {lora_memory_bytes / (1024 ** 3)} GB")

#     # Memory for gradients (FP16)
#     gradient_memory_bytes = 2 * lora_params * 2  # FP16: 16 bits per parameter
#     print(f"Gradient Memory: {gradient_memory_bytes / (1024 ** 3)} GB")

#     # Optimizer state memory (FP16, assuming Adam optimizer)
#     optimizer_memory_bytes = 2 * gradient_memory_bytes
#     print(f"Optimizer State Memory: {optimizer_memory_bytes / (1024 ** 3)} GB")

#     # Activation memory (FP16, estimate)
#     activation_memory_bytes = 4 * batch_size * seq_length * 2  # FP16: 16 bits per activation
#     print(f"Activation Memory: {activation_memory_bytes / (1024 ** 3)} GB")

#     # Total memory
#     total_memory_bytes = model_memory_bytes + lora_memory_bytes + gradient_memory_bytes + optimizer_memory_bytes + activation_memory_bytes
#     total_memory_gb = total_memory_bytes / (1024 ** 3)
#     print(f"Total Estimated Memory: {total_memory_gb} GB")
#     return total_memory_gb

In [11]:
### Set tokenizer, model

# Alternative checkpoints tried previously:
# model_name = 'distilbert-base-uncased'  # Example tiny model
# model_name = 'xlm-roberta-large'  # Used in medium article https://towardsdatascience.com/fine-tune-your-llm-without-maxing-out-your-gpu-db2278603d78

## mistral specific
model_name = 'mistralai/Mistral-7B-Instruct-v0.1'  # Example robust model
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Reuse the end-of-sequence token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# tokenizer = AutoTokenizer.from_pretrained(model_name) # Will not work for mistral

# Output name: last path component of the checkpoint id, '_asm_', and the first
# 8 characters of a fresh UUID so every run gets a distinct name.
new_model = f"{model_name.split('/')[-1]}_asm_{str(uuid4())[:8]}"
print(new_model)

# Maximum sequence length as reported by the tokenizer.
model_max_length = tokenizer.model_max_length
print(f"Model Max Length: {model_max_length}")

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [12]:
### Read data for training/validation, determine max length

# dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset_name = 'ai-aerospace/ams_data_train_mistral_v0.1_100'
dataset = load_dataset(dataset_name)

# Write dataset files into data directory
# data_directory = './fine_tune_data/'

# Create the data directory if it doesn't exist
# os.makedirs(data_directory, exist_ok=True)

# Write the train data to a CSV file
# train_data='train_data'
# train_filename = os.path.join(data_directory, train_data)
# dataset['train'].to_pandas().to_csv(train_filename+'.csv', columns=['text'], index=False)
max_token_length_train=max_token_len(dataset['train'],tokenizer)
print('Max token length train: '+str(max_token_length_train))

# Write the validation data to a CSV file
# validation_data='validation_data'
# validation_filename = os.path.join(data_directory, validation_data)
# dataset['validation'].to_pandas().to_csv(validation_filename+'.csv', columns=['text'], index=False)
max_token_length_validation=max_token_len(dataset['validation'],tokenizer)
print('Max token length validation: '+str(max_token_length_validation))

# Longest example across both splits, capped at what the tokenizer reports as the
# model limit. The previous version clamped with min() first and then tested
# `max_token_length > model_max_length`, which could never be true; the over-limit
# case is now reported explicitly and the resulting value is unchanged.
max_token_length=max(max_token_length_train,max_token_length_validation)
# max_token_length=max_token_length_train
if max_token_length > model_max_length:
    print('Warning: longest example ('+str(max_token_length)+' tokens) exceeds the model limit ('
          +str(model_max_length)+'); capping max token length at the model limit.')
    max_token_length=model_max_length
print('Max token length to use: '+str(max_token_length))
# Block size is set to twice the chosen maximum token length.
block_size=2*max_token_length
print('Block size: '+str(block_size))

# Define project parameters
# project_name and repo_name each get their own freshly generated UUID suffix.
username='ai-aerospace'
project_name='./llms/'+'ams_data_train-100_'+str(uuid4())
repo_name='ams-data-train-100-'+str(uuid4())

Downloading readme:   0%|          | 0.00/686 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/220k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/146 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [13]:
### Set model parameters
# Single dictionary holding the project identifiers and every training hyperparameter.
model_params={
  "project_name": project_name,
  "model_name": model_name,
  "repo_id": f"{username}/{repo_name}",
  "block_size": block_size,
  "model_max_length": max_token_length,
  "logging_steps": -1,
  "evaluation_strategy": "epoch",
  "save_total_limit": 1,
  "save_strategy": "epoch",
  "mixed_precision": "fp16",
  "lr": 3e-5,
  "epochs": 3,
  "batch_size": 1,
  "warmup_ratio": 0.1,
  "gradient_accumulation": 1,
  "optimizer": "adamw_torch",
  "scheduler": "linear",
  "weight_decay": 0,
  "max_grad_norm": 1,
  "seed": 42,
  "quantization": "int4",
  "lora_r": 16,
  "lora_alpha": 32,
  "lora_dropout": 0.05
}

# Mirror every setting into the process environment, stringified, under its dict key.
os.environ.update({name: str(setting) for name, setting in model_params.items()})

print(model_params)

In [14]:
# Training arguments for the fine-tuning run; every tunable value is read from model_params.
args_custom=transformers.TrainingArguments(
    per_device_train_batch_size=model_params['batch_size'],
    per_device_eval_batch_size=model_params['batch_size'],
    gradient_accumulation_steps=model_params['gradient_accumulation'],
    warmup_ratio=model_params['warmup_ratio'],
    num_train_epochs=model_params['epochs'],
    learning_rate=model_params['lr'],
    # Derived from the declared precision setting instead of a separate hard-coded True.
    fp16=model_params['mixed_precision']=='fp16',
    logging_steps=model_params['logging_steps'],
    save_total_limit=model_params['save_total_limit'],
    evaluation_strategy=model_params['evaluation_strategy'],
    # metric_for_best_model="f1" was dropped: no compute_metrics function is supplied
    # anywhere in this notebook, so no "f1" value is ever produced for the Trainer to
    # look up, and load_best_model_at_end is not enabled here.
    output_dir='model_outputs',
    logging_dir='model_outputs',
    optim=model_params['optimizer'],
    max_grad_norm=model_params['max_grad_norm'],
    weight_decay=model_params['weight_decay'],
    lr_scheduler_type=model_params['scheduler'],
    # The seed was declared in model_params but never passed through.
    seed=model_params['seed'],
    # NOTE(review): model_params['save_strategy'] is still not passed; wiring it in
    # would start writing per-epoch checkpoints - confirm that is wanted first.
    remove_unused_columns=False,
)

# Args from medium article
# Reference configuration kept for comparison; the trainer cell in this notebook
# is built with args_custom, so these arguments are not used for the run.
# NOTE(review): load_best_model_at_end=True is paired with metric_for_best_model="f1",
# but nothing in this notebook computes an f1 metric - confirm before switching to these args.
args_medium=transformers.TrainingArguments(
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=100,
    max_steps=12276,
    learning_rate=2e-4,
    fp16=True,
    # Evaluate, log and save on the same 1000-step cadence.
    eval_steps= 1000,
    logging_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    do_eval=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    output_dir='model_outputs',
    logging_dir='model_outputs',
    remove_unused_columns =False,
    # report_to='wandb',  # enable logging to W&B
)

In [15]:
# LoRA adapter configuration; rank, scaling factor and dropout come from model_params.
peft_config = LoraConfig(
    r=model_params['lora_r'],
    lora_alpha=model_params['lora_alpha'],
    lora_dropout=model_params['lora_dropout'],
    # The base model is loaded with AutoModelForCausalLM, so declare the task explicitly.
    # This lets PEFT pick its causal-LM wrapper and records the task type in the saved
    # adapter config instead of leaving it unset.
    task_type="CAUSAL_LM",
)

In [16]:
# Load the model
# The base weights are quantized to 4-bit while loading. The setting is passed through
# a BitsAndBytesConfig object (via the already-imported `transformers` module) rather
# than the bare `load_in_4bit=True` keyword, which newer transformers releases deprecate.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=transformers.BitsAndBytesConfig(load_in_4bit=True),
)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [18]:
# Wrap the loaded model with the LoRA configuration and print how many parameters are trainable.
# NOTE(review): the trainer cell later passes base_model together with peft_config rather
# than lora_model. Confirm whether get_peft_model also modifies base_model in place; if it
# does, the trainer receives a model that already carries adapter layers.
lora_model = get_peft_model(base_model, peft_config)
lora_model.print_trainable_parameters()

In [22]:
# Not accurate, need to find a better way of estimating this. Use this example as one way to find out.
# full_model_memory = estimate_full_model_memory(base_model,
#                                                model_params['batch_size'],
#                                                model_params['model_max_length'])
# lora_model_memory = estimate_lora_memory(base_model,
#                                          lora_model,
#                                          model_params['batch_size'],
#                                          model_params['model_max_length'])

# Run supervised fine-tuning (SFT)

## Calculate memory available

In [24]:
import subprocess as sp
import os

def get_gpu_memory():
    """Return the free-memory figure nvidia-smi reports for each GPU, as a list of ints.

    Runs `nvidia-smi --query-gpu=memory.free --format=csv`, discards the first
    (header) line and the final piece left after the last newline, and parses the
    leading integer of every remaining line. Units are whatever nvidia-smi prints
    (presumably MiB - TODO confirm).
    """
    query = "nvidia-smi --query-gpu=memory.free --format=csv"
    raw_output = sp.check_output(query.split()).decode('ascii')
    # Index 0 is the CSV header; the last element is the text after the final newline.
    data_rows = raw_output.split('\n')[1:-1]
    free_values = []
    for row in data_rows:
        # Each row looks like "<number> <unit>"; keep only the number.
        free_values.append(int(row.split()[0]))
    return free_values

get_gpu_memory()

[43172]

## Run supervised fine-tuning trainer

In [25]:
# Build the supervised fine-tuning trainer: the loaded model plus the LoRA config,
# trained on the 'text' field of the train split and evaluated on the validation split.
trainer = SFTTrainer(
    model=base_model,
    peft_config=peft_config,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    dataset_text_field='text',
    # Sequence length cap is the value computed from the data in the earlier cell.
    max_seq_length=model_params['model_max_length'],
    tokenizer=tokenizer,
    args=args_custom,
    packing=False
)

# Time the training run with a monotonic high-resolution clock.
start_time = time.perf_counter()  # Start the clock
trainer.train()
end_time = time.perf_counter()  # End the clock
# Message spelling fixed ("Elaped" -> "Elapsed"); the value is the perf_counter difference.
print('Elapsed time for training: '+str(end_time-start_time))

# Save the trained model under the run name; the merge cell loads it back from this path.
trainer.model.save_pretrained(new_model)


Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,2.599000,No log


In [26]:
# Merge the model with LoRA weights
# Reload the base checkpoint in float16 on device 0, attach the adapter saved under
# `new_model`, and fold the adapter into the base weights in one chained step.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": 0},
    low_cpu_mem_usage=True,
    return_dict=True,
)
merged_model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Save the merged model
# Weights are written in safetensors format; the tokenizer files go in the same
# directory. The tokenizer call stays last so its return value is the cell output.
merged_model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('merged_model/tokenizer_config.json',
 'merged_model/special_tokens_map.json',
 'merged_model/tokenizer.json')

In [27]:
# Upload the merged model to the Hub under <username>/<new_model>. The namespace comes
# from the `username` variable defined with the project parameters (the same one used
# to build repo_id) instead of repeating the 'ai-aerospace' literal here.
merged_model.push_to_hub(username+'/'+new_model)

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ai-aerospace/Mistral-7B-Instruct-v0.1_asm_e0575fd7/commit/b84dc128482064c073f5522082894302b82ff308', commit_message='Upload MistralForCausalLM', commit_description='', oid='b84dc128482064c073f5522082894302b82ff308', pr_url=None, pr_revision=None, pr_num=None)