# Fine-Tuning and Evaluating LLMs with SageMaker Pipelines and MLflow

In [1]:
# Install exact, pinned versions of the libraries this notebook uses so a fresh
# kernel reproduces the same environment; --quiet keeps pip's output short.
%pip install sagemaker==2.225.0  datasets==2.18.0 transformers==4.40.0 mlflow==2.13.2 sagemaker-mlflow==0.1.0 --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Reload imported modules automatically before each cell runs, so edits to the
# local `steps` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sagemaker
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.function_step import step
from steps.finetune_llama8b_hf import finetune_llama8b
from steps.preprocess_llama3 import preprocess
from steps.eval_mlflow import evaluation
from steps.utils import create_training_job_name
import os

# Point the SageMaker SDK at the current working directory for its user-level
# defaults file (the session log further down reports defaults fetched from this folder).
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/gilbertyoung/Library/Application Support/sagemaker/config.yaml


  from .autonotebook import tqdm as notebook_tqdm


## 1. SageMaker Session & IAM Role

In [4]:


import boto3

# Resolve the IAM role the pipeline steps run under.
# get_execution_role() is tried first; the original author runs this notebook
# locally (to reduce AWS costs), where that lookup raises ValueError.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # Local fallback: take the role ARN from SAGEMAKER_EXECUTION_ROLE_ARN when it
    # is set, otherwise use the author's execution role. This keeps the previous
    # behaviour by default while letting other users override it without editing code.
    role = os.environ.get(
        "SAGEMAKER_EXECUTION_ROLE_ARN",
        "arn:aws:iam::891612587330:role/service-role/AmazonSageMaker-ExecutionRole-20241230T123665",
    )
else:
    # Only the automatically discovered role is echoed, as before.
    print(role)

# Build the SageMaker session on top of the "default" AWS CLI profile so every
# later call (bucket lookup, pipeline upsert) uses the same credentials.
boto3_session = boto3.Session(profile_name="default")
sess = sagemaker.Session(boto_session=boto3_session)
print(sess.account_id())

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


sagemaker.config INFO - Fetched defaults config from location: /Users/gilbertyoung/Documents/llm-finetuning-sagemaker


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


891612587330


In [5]:
# Show the SageMaker base Python 3.11 image URI for the region the session is
# bound to, instead of a hard-coded region, so the value always matches the
# region the pipeline is created in.
sagemaker.image_uris.get_base_python_image_uri(sess.boto_region_name, py_version="311")

'429704687514.dkr.ecr.us-east-2.amazonaws.com/sagemaker-base-python-311:1.0'

## 2. Training Configurations 

In [6]:
# Settings handed to the fine-tuning and evaluation steps further down.
train_config = dict(
    # Model identity
    experiment_name="all_target_modules_1K",
    model_id="meta-llama/Meta-Llama-3-8B",
    model_version="3.0.2",
    model_name="llama-3-8b",
    endpoint_name="llama-3-8b",
    # Compute
    finetune_instance_type="ml.g5.2xlarge",
    finetune_num_instances=1,
    instance_type="ml.g5.2xlarge",
    num_instances=1,
    # Training hyperparameters
    epoch=1,
    per_device_train_batch_size=4,
)

### LoRA Parameters

In [7]:
# LoRA hyperparameters; these are JSON-encoded into the `lora_config` pipeline parameter.
lora_params = dict(
    lora_r=8,
    lora_alpha=16,
    lora_dropout=0.05,
)

### MLflow Setup

In [8]:
# ARN of the MLflow tracking server that the pipeline steps log to.
# Replace it with the ARN of your own tracking server.
mlflow_arn = (
    "arn:aws:sagemaker:us-east-2:891612587330:"
    "mlflow-tracking-server/llm-finetuning-experiment"
)
# MLflow experiment name passed to every pipeline step.
experiment_name = "llm-finetuning-experiment"

### Dataset Configurations 

In [9]:
# Dataset identifier handed to the preprocess step (Hugging Face style "<org>/<name>").
dataset_name = "HuggingFaceH4/no_robots"

### Setting up Pipeline Steps

In [10]:
from sagemaker.workflow.parameters import ParameterString
import json

In [11]:
# Expose the LoRA settings as a string pipeline parameter; the dict is
# JSON-encoded because ParameterString carries a string default value.
lora_config = ParameterString(
    name="lora_config",
    default_value=json.dumps(lora_params),
)

#### Preprocessing step

In [12]:

# NOTE(review): "evaulation" is misspelled, but this string names the pipeline
# and is embedded in the output path below, so it is kept as-is.
pipeline_name = "training-evaulation-pipeline-mlflow"

# Resolve the bucket through the profile-backed session created earlier rather
# than a fresh default Session, so the same credentials are used throughout.
default_bucket = sess.default_bucket()
print(default_bucket)

# S3 locations derived from the default bucket.
main_data_path = f"s3://{default_bucket}"
evaluation_data_path = (
    f"{main_data_path}/datasets/hf_no_robots/evaluation/automatic_small/"
    "dataset_evaluation_small.jsonl"
)
output_data_path = f"{main_data_path}/datasets/hf_no_robots/output_{pipeline_name}"

# Wrap `preprocess` as a pipeline step, then bind its arguments.
# You can add your own evaluation dataset code into this step.
preprocess_step = step(preprocess, name="preprocess")
preprocess_step_ret = preprocess_step(
    default_bucket,
    dataset_name,
    train_sample=100,
    eval_sample=100,
    mlflow_arn=mlflow_arn,
    experiment_name=experiment_name,
    # The pipeline execution id is used as the run name.
    run_name=ExecutionVariables.PIPELINE_EXECUTION_ID,
)

print("The pipeline name is " + pipeline_name)
# Note this location: the artifacts generated by the pipeline can be reviewed
# there once the execution has finished.
print("Output S3 bucket: " + output_data_path)

sagemaker-us-east-2-891612587330
The pipeline name is training-evaulation-pipeline-mlflow
Output S3 bucket: s3://sagemaker-us-east-2-891612587330/datasets/hf_no_robots/output_training-evaulation-pipeline-mlflow


#### Fine-tuning step

In [13]:
# Wrap the fine-tuning function as a pipeline step, then bind its arguments.
# Passing `preprocess_step_ret` makes this step consume the preprocess output.
finetune_step = step(finetune_llama8b, name="finetune_llama8b_instruction")
finetune_ret = finetune_step(
    preprocess_step_ret,
    train_config,
    lora_config,
    role,
    mlflow_arn,
    experiment_name,
    ExecutionVariables.PIPELINE_EXECUTION_ID,
)

#### Evaluation Step

In [14]:
# Wrap the evaluation function as a pipeline step with its own instance type
# and volume size, then bind its arguments.
evaluation_step = step(
    evaluation,
    name="evaluate_finetuned_llama8b_instr",
    # keep_alive_period_in_seconds=1200,
    instance_type="ml.g5.12xlarge",
    volume_size=100,
)

# NOTE(review): the variable name says "llama7b" while the step name and
# train_config refer to the 8B model; the name is referenced by the Pipeline
# definition, so it is left unchanged here.
# NOTE(review): the last positional argument is an empty string - confirm
# against the signature of `evaluation` what it is meant to carry.
evaluate_finetuned_llama7b_instruction_mlflow = evaluation_step(
    train_config,
    preprocess_step_ret,
    finetune_ret,
    mlflow_arn,
    experiment_name,
    "",
)

#### Pipeline Creation & Execution

In [15]:


from sagemaker import get_execution_role

# Only the final (evaluation) step is listed explicitly. The upsert log below
# shows the preprocess and fine-tune steps being packaged as well, since the
# evaluation step takes their outputs as arguments.
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[lora_config],
    steps=[evaluate_finetuned_llama7b_instruction_mlflow],
    sagemaker_session=sess,
)

In [16]:
# Create the pipeline, or update it if it already exists, using the role resolved above.
# The service response is shown as the cell output.
pipeline.upsert(role_arn=role)

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.ImageUri
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.RoleArn


2025-02-26 22:27:43,747 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-2-891612587330/training-evaulation-pipeline-mlflow/evaluate_finetuned_llama8b_instr/2025-02-26-22-27-42-930/function
2025-02-26 22:27:44,122 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-us-east-2-891612587330/training-evaulation-pipeline-mlflow/evaluate_finetuned_llama8b_instr/2025-02-26-22-27-42-930/arguments
2025-02-26 22:27:45,176 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/var/folders/87/9dqc4s4d7vv8wl84d0yy35tw0000gn/T/tmpdpn6n5b9/requirements.txt'
2025-02-26 22:27:45,383 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-us-east-2-891612587330/training-evaulation-pipeline-mlflow/evaluate_finetuned_llama8b_instr/2025-02-26-22-27-42-930/pre_exec_script_and_dependencies'
2025-02-26 22:27:45,722 sagemaker.remote

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.ImageUri
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.InstanceType
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.RoleArn


2025-02-26 22:27:49,256 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-2-891612587330/training-evaulation-pipeline-mlflow/preprocess/2025-02-26-22-27-42-930/function
2025-02-26 22:27:49,667 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-us-east-2-891612587330/training-evaulation-pipeline-mlflow/preprocess/2025-02-26-22-27-42-930/arguments
2025-02-26 22:27:50,093 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/var/folders/87/9dqc4s4d7vv8wl84d0yy35tw0000gn/T/tmp2kakkpq_/requirements.txt'
2025-02-26 22:27:50,301 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-us-east-2-891612587330/training-evaulation-pipeline-mlflow/preprocess/2025-02-26-22-27-42-930/pre_exec_script_and_dependencies'


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.ImageUri
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.InstanceType
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.RoleArn


2025-02-26 22:27:51,162 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-2-891612587330/training-evaulation-pipeline-mlflow/finetune_llama8b_instruction/2025-02-26-22-27-42-930/function
2025-02-26 22:27:51,534 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-us-east-2-891612587330/training-evaulation-pipeline-mlflow/finetune_llama8b_instruction/2025-02-26-22-27-42-930/arguments
2025-02-26 22:27:51,934 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/var/folders/87/9dqc4s4d7vv8wl84d0yy35tw0000gn/T/tmph8aj7f85/requirements.txt'
2025-02-26 22:27:52,130 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-us-east-2-891612587330/training-evaulation-pipeline-mlflow/finetune_llama8b_instruction/2025-02-26-22-27-42-930/pre_exec_script_and_dependencies'
2025-02-26 22:27:53,077 sagemaker.remote_function IN

{'PipelineArn': 'arn:aws:sagemaker:us-east-2:891612587330:pipeline/training-evaulation-pipeline-mlflow',
 'ResponseMetadata': {'RequestId': '0bad6e8a-0ab1-4f70-9db5-14f9a3c7d656',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '0bad6e8a-0ab1-4f70-9db5-14f9a3c7d656',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '103',
   'date': 'Wed, 26 Feb 2025 22:27:59 GMT'},
  'RetryAttempts': 0}}

The following cell will execute the pipeline.

In [17]:
# Start an execution of the upserted pipeline and keep the returned handle in `execution1`.
execution1 = pipeline.start()