## Tuning mistral using sagemaker jobs

From: https://www.philschmid.de/sagemaker-train-evalaute-llms-2024

Using Hugging Face TRL and SageMaker to fine-tune an LLM

Uses a training script contained in scripts/ (be mindful also of the requirements.txt contained within)

In [None]:
#!pip install transformers "datasets[s3]==2.18.0" "sagemaker>=2.190.0" "huggingface_hub[cli]" --upgrade --quiet


In [126]:
import os
from huggingface_hub import login
from dotenv import load_dotenv, find_dotenv
import json 
import random
from datetime import datetime

import sagemaker
import boto3

In [205]:
# Load global settings.
# Secrets and region come from a local .env file; override=True makes the
# values in .env take precedence over variables already set in the environment.
_ = load_dotenv(find_dotenv(), override=True)
WANDB_API = os.getenv('WANDB_API')    # Weights & Biases API key (None if not set)
HF_TOKEN = os.getenv('HF_TOKEN')      # Hugging Face access token (None if not set)
AWS_REGION = os.getenv('AWS_REGION')  # AWS region used for the boto3/SageMaker sessions

# Fail early with a readable message: assigning None into os.environ below
# would otherwise raise an opaque "str expected, not NoneType" TypeError.
if not AWS_REGION:
    raise RuntimeError("AWS_REGION is not set; define it in the environment or in the .env file")

os.environ['AWS_DEFAULT_REGION'] = AWS_REGION
login(token=HF_TOKEN)  # log in to the Hugging Face Hub

# Model, dataset field and file locations used throughout the notebook
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.1"
DATASET_TEXT_FIELD="text"
TUNING_DATA_FILE="data/tuning_entries_all.json"
LOCAL_TRAIN_FILE="data/training/train_dataset.json"
LOCAL_EVAL_FILE="data/validation/eval_dataset.json"
S3_FOLDER='mistral-tuning-sagemaker'

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/dpradilla/.cache/huggingface/token
Login successful


In [178]:
# Create a boto3 session in the configured region and a SageMaker session on top of it
boto_session = boto3.Session(region_name=AWS_REGION)
sess = sagemaker.Session(boto_session=boto_session)

# SageMaker session bucket -> used for uploading data, models and logs.
# Set a bucket name here to override the session's default bucket.
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # fall back to the session's default bucket when no bucket name is given
    sagemaker_session_bucket = sess.default_bucket()
# Report the bucket that will actually be used (not unconditionally the default one)
print(f"sagemaker bucket: {sagemaker_session_bucket}")

try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError; look the role up by name instead,
    # using the same boto3 session as the rest of the notebook.
    iam = boto_session.client('iam')
    role = iam.get_role(RoleName='aws-sagemaker-execution-role')['Role']['Arn']

# Re-create the SageMaker session so that it uses the selected bucket as its default
sess = sagemaker.Session(boto_session=boto_session, default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


sagemaker bucket: sagemaker-us-east-1-510357808667




sagemaker role arn: arn:aws:iam::510357808667:role/aws-sagemaker-execution-role
sagemaker session region: us-east-1


## Prepare the data

In [180]:
# Read the raw tuning data: one JSON object per line, each with 'prompt' and
# 'completion' keys. Blank lines are skipped so a trailing newline or an empty
# line does not crash json.loads.
with open(TUNING_DATA_FILE, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Wrap every prompt/completion pair in the Mistral instruction format
tuning_entries = [
    {DATASET_TEXT_FIELD: f"<s>[INST]{row['prompt']}[/INST]{row['completion']}</s>"}
    for row in data
]

# Fraction of the data held out for validation (0.2 -> 20%)
validation_ratio = 0.2

# Number of validation samples implied by the ratio (rounded down)
num_validation_samples = int(len(tuning_entries) * validation_ratio)

# Shuffle with a fixed seed so the train/validation split is the same on every
# run; a dedicated Random instance leaves the global random state untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(tuning_entries)

# The first num_validation_samples entries form the validation set, the rest the training set
training_data = tuning_entries[num_validation_samples:]
validation_data = tuning_entries[:num_validation_samples]

input_file_dir = os.path.dirname(TUNING_DATA_FILE)

# Write each split as JSON lines, creating the target directory if it is missing
for output_path, rows in ((LOCAL_TRAIN_FILE, training_data), (LOCAL_EVAL_FILE, validation_data)):
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for item in rows:
            out_file.write(json.dumps(item) + '\n')

print(f"Split {len(training_data)} samples for training and {len(validation_data)} samples for validation.")

Split 2143 samples for training and 535 samples for validation.


### save datasets to S3

In [188]:
# Create an S3 client from the notebook's boto3 session
s3_client = boto_session.client('s3', region_name=AWS_REGION)

# A single list_objects_v2 call returns at most 1000 keys, so iterate with a
# paginator to see every object under the prefix.
paginator = s3_client.get_paginator('list_objects_v2')

# Delete existing .jsonl objects under the prefix.
# NOTE(review): the datasets uploaded below end in ".json", not ".jsonl", so this
# cleanup never removes previously uploaded datasets -- confirm the intended suffix.
for page in paginator.paginate(Bucket=sagemaker_session_bucket, Prefix=S3_FOLDER):
    for obj in page.get('Contents', []):
        if obj['Key'].endswith('.jsonl'):
            s3_client.delete_object(Bucket=sagemaker_session_bucket, Key=obj['Key'])
            print(f"Deleted {obj['Key']}")

# S3 locations of the two dataset folders (key prefix and full s3:// URI)
s3_training_folder_path = f"{S3_FOLDER}/data/training"
s3_training_folder_uri = f"s3://{sagemaker_session_bucket}/{s3_training_folder_path}"
s3_evaluation_folder_path = f"{S3_FOLDER}/data/validation"
s3_evaluation_folder_uri = f"s3://{sagemaker_session_bucket}/{s3_evaluation_folder_path}"
print("uploading files")
s3_client.upload_file(LOCAL_TRAIN_FILE, sagemaker_session_bucket, f"{s3_training_folder_path}/train_dataset.json")
s3_client.upload_file(LOCAL_EVAL_FILE, sagemaker_session_bucket, f"{s3_evaluation_folder_path}/eval_dataset.json")

# Print every key under the prefix to verify the upload
print(f"checking {sagemaker_session_bucket}")
found_any = False
for page in paginator.paginate(Bucket=sagemaker_session_bucket, Prefix=S3_FOLDER):
    for obj in page.get('Contents', []):
        found_any = True
        print(obj['Key'])
if not found_any:
    print("No files found in the specified folder.")
    

uploading files
checking sagemaker-us-east-1-510357808667
mistral-tuning-sagemaker/data/
mistral-tuning-sagemaker/data/training/
mistral-tuning-sagemaker/data/training/train_dataset.json
mistral-tuning-sagemaker/data/validation/
mistral-tuning-sagemaker/data/validation/eval_dataset.json
mistral-tuning-sagemaker/training/train_dataset.json
mistral-tuning-sagemaker/validation/eval_dataset.json


## Fine-tune LLM using trl on Amazon SageMaker

We will use the SFTTrainer (supervised fine-tuning) from trl to fine-tune our model. The SFTTrainer is a subclass of the Trainer from the transformers library and supports all the same features, including logging, evaluation, and checkpointing, but adds additional quality-of-life features.

We will use the dataset formatting, packing and PEFT features in our example. As the PEFT method we will use QLoRA, a technique that uses quantization to reduce the memory footprint of large language models during fine-tuning without sacrificing performance.

There is a run_sft.py script, which uses trl with all of the features described above. The script is reusable, but still hackable if you want to make changes. Parameters are provided via CLI arguments using the HfArgumentParser, which can parse any CLI argument from the TrainingArguments or from our ScriptArguments.

This means you can easily adjust the hyperparameters below and change the model_id. The parameters we selected should work for any 7B model, but you can adjust them to your needs.

In [219]:
# Names and hyperparameters handed to the training job.

# Translation table that drops '-', '/' and '.' from a string
_name_cleanup = str.maketrans('', '', '-/.')

# Training job name: the base model id with '-', '/' and '.' removed
job_name = BASE_MODEL_ID.translate(_name_cleanup)
print(f"Job Name: {job_name}")


# Compact timestamp without separators, e.g. 20240422153025
timestamp_format = '%Y%m%d%H%M%S'
current_timestamp = datetime.now().strftime(timestamp_format)
# Run name: model id plus timestamp, cleaned the same way as the job name
run_name = f"{BASE_MODEL_ID}-{current_timestamp}".translate(_name_cleanup)


hyperparameters = dict(
  # --- script parameters ---
  train_dataset_path='/opt/ml/input/data/training/train_dataset.json',   # training dataset location inside the container
  eval_dataset_path='/opt/ml/input/data/validation/eval_dataset.json',   # validation dataset location inside the container
  model_id=BASE_MODEL_ID,
  dataset_text_field=DATASET_TEXT_FIELD,
  max_seq_len=3072,                        # max sequence length for the model and for dataset packing
  use_qlora=True,                          # train with QLoRA
  # --- training parameters ---
  num_train_epochs=3,                      # number of training epochs
  per_device_train_batch_size=1,           # batch size per device during training
  gradient_accumulation_steps=4,           # steps accumulated before an update pass
  gradient_checkpointing=True,             # trade compute for memory
  optim="adamw_torch_fused",               # fused AdamW optimizer
  logging_steps=10,                        # log every 10 steps
  save_strategy="epoch",                   # checkpoint once per epoch
  learning_rate=2e-4,                      # learning rate, based on the QLoRA paper
  bf16=True,                               # bfloat16 precision
  tf32=True,                               # tf32 precision
  max_grad_norm=0.3,                       # max gradient norm, based on the QLoRA paper
  warmup_ratio=0.03,                       # warmup ratio, based on the QLoRA paper
  lr_scheduler_type="constant",            # constant learning-rate schedule
  output_dir='/tmp/tun',                   # temporary directory for model checkpoints
  merge_adapters=True,                     # merge LoRA adapters into the model for easier deployment
  report_to="wandb",                       # report metrics to wandb (alternative: "tensorboard")
  run_name=run_name,
)


Job Name: mistralaiMistral7BInstructv01


In order to create a SageMaker training job we need a HuggingFace Estimator. The Estimator handles end-to-end Amazon SageMaker training and deployment tasks and manages the infrastructure use. Amazon SageMaker takes care of starting and managing all the required EC2 instances for us, provides the correct Hugging Face container, uploads the provided scripts and downloads the data from our S3 bucket into the container at /opt/ml/input/data. Then, it starts the training job by running the entry-point script (run_sft.py) with the hyperparameters passed as CLI arguments.



In [220]:
from sagemaker.huggingface import HuggingFace


# Environment variables passed to the training job.
# NOTE(review): the Weights & Biases client conventionally reads WANDB_API_KEY;
# confirm that the training script maps WANDB_API onto it.
training_environment = {
    "AWS_REGION": AWS_REGION,
    "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",  # cache downloaded models under /tmp
    "HF_TOKEN": HF_TOKEN,                    # Hugging Face token for gated models, e.g. llama 2
    "WANDB_API": WANDB_API,
    "WANDB_PROJECT": 'mistral-tuning-sagemaker',
    "WANDB_NOTEBOOK_NAME": 'zmistral-tuning-sagemaker',
}

# Estimator describing the training job: code, infrastructure, framework versions and settings
huggingface_estimator = HuggingFace(
    # training code
    entry_point='run_sft.py',            # train script
    source_dir='scripts',                # directory with all files needed for training
    # infrastructure
    instance_type='ml.g5.2xlarge',       # instance type used for the training job
    instance_count=1,                    # number of instances used for training
    volume_size=300,                     # EBS volume size in GB
    max_run=2*24*60*60,                  # maximum runtime in seconds (days * hours * minutes * seconds)
    base_job_name=job_name,              # base name of the training job
    role=role,                           # IAM role the job uses to access AWS resources, e.g. S3
    # framework versions
    transformers_version='4.36',
    pytorch_version='2.1',
    py_version='py310',
    # job configuration
    hyperparameters=hyperparameters,     # hyperparameters passed to the training job
    disable_output_compression=True,     # skip output compression to save training time and cost
    environment=training_environment,
)


## Launch training

The "data" object indicates the channels where the data will be uploaded
https://docs.aws.amazon.com/sagemaker/latest/dg/model-train-storage.html


In [221]:
# Input channels for the job: each key names a channel and points at the S3
# folder holding that channel's dataset.
data = {
    'training': s3_training_folder_uri,
    'validation': s3_evaluation_folder_uri,
}

# Start the training job on the uploaded datasets and block until it finishes
huggingface_estimator.fit(data, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: mistralaiMistral7BInstructv01-2024-04-24-15-43-52-111


2024-04-24 15:43:55 Starting - Starting the training job...
2024-04-24 15:44:20 Pending - Preparing the instances for training.
2024-04-24 15:45:13 Downloading - Downloading the training image......
2024-04-24 15:48:09 Training - Training image download completed. Training in progress....bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-04-24 15:48:47,457 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-04-24 15:48:47,475 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-04-24 15:48:47,485 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-04-24 15:48:47,487 sagemaker_pytorch_container.training INFO     Invoking user training script.
2024-04-24 15:48:48,935 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
/opt/conda/bin/python3.10 -m pip install -r requir

## Get trained model location

In [222]:
# Turn the S3 URI of the trained model into a link to the S3 web console
trained_model_s3_uri = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]
model_data = trained_model_s3_uri.replace("s3://", "https://s3.console.aws.amazon.com/s3/buckets/")
model_data

'https://s3.console.aws.amazon.com/s3/buckets/sagemaker-us-east-1-510357808667/mistralaiMistral7BInstructv01-2024-04-24-15-43-52-111/output/model/'

## Deploy model

We are going to use the Hugging Face LLM Inference DLC, a purpose-built inference container, to easily deploy LLMs in a secure and managed environment. The DLC is powered by Text Generation Inference (TGI), a solution for deploying and serving Large Language Models (LLMs).

In [223]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the container image URI of the Hugging Face LLM backend, version 1.4.0,
# for the current SageMaker session, then show it.
llm_image = get_huggingface_llm_image_uri("huggingface", version="1.4.0", session=sess)
print(f"llm image uri: {llm_image}")

INFO:sagemaker.image_uris:Defaulting to only available Python version: py310
INFO:sagemaker.image_uris:Defaulting to only supported image scope: gpu.


llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi1.4.0-gpu-py310-cu121-ubuntu20.04


In [224]:
import json
from sagemaker.huggingface import HuggingFaceModel

# S3 location of the trained model.
# To deploy the model at a later time, put its S3 path here instead.
model_s3_path = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]

# Endpoint sizing
instance_type = "ml.g5.2xlarge"
number_of_gpu = 1
health_check_timeout = 300

# Container environment for the model / endpoint (all values are strings)
config = dict(
  HF_MODEL_ID="/opt/ml/model",               # path where SageMaker stores the model
  SM_NUM_GPUS=json.dumps(number_of_gpu),     # number of GPUs used per replica
  MAX_INPUT_LENGTH=json.dumps(1024),         # max length of the input text
  MAX_TOTAL_TOKENS=json.dumps(2048),         # max length of the generation (including input text)
)

# Uncompressed model artifacts addressed by S3 prefix
model_source = {
  'S3DataSource': {
    'S3Uri': model_s3_path,
    'S3DataType': 'S3Prefix',
    'CompressionType': 'None',
  }
}

# Model object bound to the LLM container image
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  model_data=model_source,
  env=config,
)

In [225]:
# Deploy the model to a real-time endpoint and keep the returned object in `llm`
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # startup health-check timeout; health_check_timeout is 300, i.e. 5 minutes (not 10) for SageMaker to download the model
)

INFO:sagemaker:Creating model with name: huggingface-pytorch-tgi-inference-2024-04-24-20-48-14-158
INFO:sagemaker:Creating endpoint-config with name huggingface-pytorch-tgi-inference-2024-04-24-20-48-15-276
INFO:sagemaker:Creating endpoint with name huggingface-pytorch-tgi-inference-2024-04-24-20-48-15-276


----------!

## Test model

Original training data was diary entries for an author at a certain age

In [286]:
import random
import json
import re

# Read all lines of the evaluation file (one JSON record per line).
# The file was written as UTF-8, so read it with the same encoding.
with open(LOCAL_EVAL_FILE, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Pick one random record and parse it
random_line = random.choice(lines)
data = json.loads(random_line)

# Extract the instruction part of the record. group(0) keeps the [INST]...[/INST]
# tags so the result can be sent to the model as-is. re.DOTALL lets '.' span
# newlines, so prompts containing line breaks are matched too.
match = re.search(r'\[INST\](.*?)\[/INST\]', data['text'], re.DOTALL)
if match:
    inst_content = match.group(0)
else:
    # Placeholder only; it is not a usable prompt
    inst_content = "No match found"

print(inst_content)



[INST]Escribe una entrada media de diario por ROBELLO CHEBELO MAMELO cuando tenía 36 años[/INST]


In [301]:
from transformers import AutoTokenizer
from sagemaker.s3 import S3Downloader

# Tokenizer of the base model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

def request(prompt):
    """Send `prompt` to the deployed endpoint and return the generated text.

    The prompt is posted through `llm.predict` together with fixed generation
    parameters; the 'generated_text' field of the first returned item is returned.
    """
    # NOTE(review): the stop sequence is "<|im_end|>", while the training samples
    # in this notebook end with "</s>" -- confirm this is the intended stop string.
    generation_parameters = {
        "max_new_tokens": 1024,
        "do_sample": False,
        "return_full_text": False,
        "stop": ["<|im_end|>"],
        "temperature": 0.9
    }
    payload = {"inputs": prompt, "parameters": generation_parameters}
    predictions = llm.predict(payload)
    first_prediction = predictions[0]
    return first_prediction['generated_text']

# Smoke test: show the sampled prompt, then the model's answer to it
print(inst_content)
request(inst_content)

[INST]Escribe una entrada media de diario por ROBELLO CHEBELO MAMELO cuando tenía 36 años[/INST]


'Ayer en la noche, mi línea de fuego de árboles se trocó y el tráfico se detuvo durante un largo rato en la Carretera de la Playa, pelo en medio de la carretera y dos autos subiendo nuevamente el camino de tierra que desemboca en la autopista. La gente creó brevemente una árgilla, todos los que pudimos, quisimos, tuvimos que seguir caminando.\n\nDupá valió la pena, a pesar de que en realidad Dupá no es tan especial, nada de lo que pasa en Dupá puede suceder en Caracas; a pesar de que ninguna señora puede acordar ir ahí y disfrazarse de una víbora en los próximos 50 años. Dupá está muy lejos, prehistórica, un eco de cuando Chacao era el cuarto de Caracas (en vez de el cuarto menos importante, un cuarto que sólo se mantuvo quizás por su proximidad a Dupá). Dupá tiene el desaforo natural que da la distancia, el caos organizado del pasado.\n\nQue lástima. De lo que pasa en Dupá yo a pesar de mi egoísmo paternal, después de todo río que me ha oxidado, fisicamente y espiritualmente, quiero d

In [292]:
# Number of lines read from the evaluation file
len(lines)

535

In [305]:
import pandas as pd
import random

# Prompt template; {length} and {age} are filled in per record
base_prompt = "[INST]Escribe una entrada {length} de diario por ROBELLO CHEBELO MAMELO cuando tenía {age} años[/INST]"

# Allowed entry lengths and ages (19 through 45)
lengths = ['corta', 'media', 'larga']
ages = range(19, 46) 

# Build 150 prompts, each with a random length and a random age
records = [
    {'prompt': base_prompt.format(length=random.choice(lengths), age=random.choice(ages))}
    for _ in range(150)
]

# Query the endpoint once per prompt and store the answer next to it
total = len(records)
for position, record in enumerate(records, start=1):
    record['response'] = request(record['prompt'])
    print(f"Processed {position}/{total} requests")  # progress indicator

# Collect prompts and responses in a DataFrame and write it to CSV
df = pd.DataFrame(records)
df.to_csv('data/generated.csv', index=False)


Processed 1/150 requests
Processed 2/150 requests
Processed 3/150 requests
Processed 4/150 requests
Processed 5/150 requests
Processed 6/150 requests
Processed 7/150 requests
Processed 8/150 requests
Processed 9/150 requests
Processed 10/150 requests
Processed 11/150 requests
Processed 12/150 requests
Processed 13/150 requests
Processed 14/150 requests
Processed 15/150 requests
Processed 16/150 requests
Processed 17/150 requests
Processed 18/150 requests
Processed 19/150 requests
Processed 20/150 requests
Processed 21/150 requests
Processed 22/150 requests
Processed 23/150 requests
Processed 24/150 requests
Processed 25/150 requests
Processed 26/150 requests
Processed 27/150 requests
Processed 28/150 requests
Processed 29/150 requests
Processed 30/150 requests
Processed 31/150 requests
Processed 32/150 requests
Processed 33/150 requests
Processed 34/150 requests
Processed 35/150 requests
Processed 36/150 requests
Processed 37/150 requests
Processed 38/150 requests
Processed 39/150 requ

In [306]:
# Clean up the deployed resources: delete the model first, then the endpoint
llm.delete_model()
llm.delete_endpoint()

INFO:sagemaker:Deleting model with name: huggingface-pytorch-tgi-inference-2024-04-24-20-48-14-158
INFO:sagemaker:Deleting endpoint configuration with name: huggingface-pytorch-tgi-inference-2024-04-24-20-48-15-276
INFO:sagemaker:Deleting endpoint with name: huggingface-pytorch-tgi-inference-2024-04-24-20-48-15-276
