# SageMaker 에서 Llama 3 파인 튜닝

## 1. 환경 설정

In [3]:
from dotenv import load_dotenv

import os

HF_TOKEN = os.getenv('HF_TOKEN')
!huggingface-cli login --token {HF_TOKEN}


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/ec2-user/.cache/huggingface/token
Login successful


### 저장된 데이터 불러오기

In [4]:
%store -r data_folder
%store -r train_data_json 
%store -r validation_data_json 
%store -r test_data_json 

print("data_folder: ", data_folder)
print("train_data_json: ", train_data_json)
print("validation_data_json: ", validation_data_json)
print("test_data_json: ", test_data_json)

data_folder:  ../data/naver-news-summarization-ko
train_data_json:  ../data/naver-news-summarization-ko/train_dataset.json
validation_data_json:  ../data/naver-news-summarization-ko/validation_dataset.json
test_data_json:  ../data/naver-news-summarization-ko/test_dataset.json


### SageMaker 기본 변수 가져오기

In [5]:
import sagemaker
import boto3
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/SageMaker/.xdg/config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::057716757052:role/gen_ai_gsmoon
sagemaker bucket: sagemaker-us-east-1-057716757052
sagemaker session region: us-east-1


## 2. 데이터 준비

### S3 데이터 셋 경로 생성

In [6]:
dataset_name = data_folder.split('/')[-1]
# save train_dataset to s3 using our SageMaker session
input_path = f's3://{sess.default_bucket()}/datasets/{dataset_name}'
print("input_path: \n", input_path)

trian_file_name = train_data_json.split('/')[-1]
validation_file_name = validation_data_json.split('/')[-1]
test_file_name = test_data_json.split('/')[-1]

train_dataset_s3_path = f"{input_path}/train/{trian_file_name}"
validation_dataset_s3_path = f"{input_path}/test/{validation_file_name}"
test_dataset_s3_path = f"{input_path}/test/{test_file_name}"

print("train_dataset_s3_path: \n", train_dataset_s3_path)
print("validation_dataset_s3_path: \n", validation_dataset_s3_path)
print("test_dataset_s3_path: \n", test_dataset_s3_path)




input_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko
train_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train/train_dataset.json
validation_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test/validation_dataset.json
test_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test/test_dataset.json


### 데이타를 S3 에 업로딩

In [7]:
! aws s3 cp {train_data_json} {train_dataset_s3_path}
! aws s3 cp {validation_data_json} {validation_dataset_s3_path}
! aws s3 cp {test_data_json} {test_dataset_s3_path}


upload: ../data/naver-news-summarization-ko/train_dataset.json to s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train/train_dataset.json
upload: ../data/naver-news-summarization-ko/validation_dataset.json to s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test/validation_dataset.json
upload: ../data/naver-news-summarization-ko/test_dataset.json to s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test/test_dataset.json


In [8]:
! aws s3 ls {input_path} --recursive

2024-06-28 07:32:13       1744 datasets/naver-news-summarization-ko/config/llama_3_8b_fsdp_qlora.yaml
2024-06-28 07:54:52       1929 datasets/naver-news-summarization-ko/config/sm_llama_3_70b_fsdp_qlora.yaml
2024-06-29 07:54:06       1929 datasets/naver-news-summarization-ko/config/sm_llama_3_8b_fsdp_qlora.yaml
2024-06-29 10:21:54      29089 datasets/naver-news-summarization-ko/test/test_dataset.json
2024-06-29 10:21:53      29089 datasets/naver-news-summarization-ko/test/validation_dataset.json
2024-06-29 10:21:53      28761 datasets/naver-news-summarization-ko/train/train_dataset.json


In [9]:
# ! aws s3 rm {input_path} --recursive

## 3. 훈련 준비

### 훈련 설정 파일 준비

In [10]:
%%writefile sm_llama_3_8b_fsdp_qlora.yaml
# script parameters
model_id:  "meta-llama/Meta-Llama-3-8B" # Hugging Face model id
max_seq_len:  2048              # max sequence length for model and packing of the dataset
# sagemaker specific parameters
train_dataset_path: "/opt/ml/input/data/train/" # path to where SageMaker saves train dataset
test_dataset_path: "/opt/ml/input/data/test/"   # path to where SageMaker saves test dataset
# output_dir: "/opt/ml/model"            # path to where SageMaker will upload the model 
output_dir: "/tmp/llama3"            # path to where SageMaker will upload the model 
# training parameters
report_to: "tensorboard"               # report metrics to tensorboard
learning_rate: 0.0002                  # learning rate 2e-4
lr_scheduler_type: "constant"          # learning rate scheduler
num_train_epochs: 1                    # number of training epochs
per_device_train_batch_size: 1         # batch size per device during training
per_device_eval_batch_size: 1          # batch size for evaluation
gradient_accumulation_steps: 2         # number of steps before performing a backward/update pass
optim: adamw_torch                     # use torch adamw optimizer
logging_steps: 10                      # log every 10 steps
save_strategy: epoch                   # save checkpoint every epoch
evaluation_strategy: epoch             # evaluate every epoch
max_grad_norm: 0.3                     # max gradient norm
warmup_ratio: 0.03                     # warmup ratio
bf16: true                             # use bfloat16 precision
tf32: true                             # use tf32 precision
gradient_checkpointing: true           # use gradient checkpointing to save memory
# FSDP parameters: https://huggingface.co/docs/transformers/main/en/fsdp
fsdp: "full_shard auto_wrap offload" # remove offload if enough GPU memory
fsdp_config:
  backward_prefetch: "backward_pre"
  forward_prefetch: "false"
  use_orig_params: "false"

Overwriting sm_llama_3_8b_fsdp_qlora.yaml


### 설정 파일을 S3 에 업로드



In [11]:
from sagemaker.s3 import S3Uploader

# upload the model yaml file to s3
model_yaml = "sm_llama_3_8b_fsdp_qlora.yaml"
train_config_s3_path = S3Uploader.upload(local_path=model_yaml, desired_s3_uri=f"{input_path}/config")

print(f"Training config uploaded to:")
print(train_config_s3_path)

Training config uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/config/sm_llama_3_8b_fsdp_qlora.yaml


### 데이터 입력 구성

In [12]:

local_data = {
  'train': f'file://{train_data_json}',
  'test': f'file://{test_data_json}',
  'config': f'file://{model_yaml}'
  }

s3_data = {
  'train': train_dataset_s3_path,
  'test': test_dataset_s3_path,
  'config': train_config_s3_path
  }

### Clolud 모드 및 Local 사용
- 현재 로컬 모드는 에러 발행. 확인 중 임

In [13]:
# USE_LOCAL_MODE = True
USE_LOCAL_MODE = False

import torch

if USE_LOCAL_MODE:
    instance_type = 'local_gpu' if torch.cuda.is_available() else 'local'
    instance_count = 1
    from sagemaker.local import LocalSession
    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}
    # data = local_data 
    data = s3_data
    nKeepAliveSeconds = None # Warmpool feature
    print("## Local mode is set")
else:
    instance_type = 'ml.g5.12xlarge'
    # instance_type = 'ml.p4d.24xlarge'
    instance_count = 1
    sagemaker_session = sagemaker.session.Session()
    data = s3_data
    nKeepAliveSeconds = 3600 # Warmpool feature, 1 hour
    print(f"## Cloud mode is set with {instance_type} and {instance_count} of instance_count")
print("dataset: \n", data)

## Cloud mode is set with ml.g5.12xlarge and 1 of instance_count
dataset: 
 {'train': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train/train_dataset.json', 'test': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test/test_dataset.json', 'config': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/config/sm_llama_3_8b_fsdp_qlora.yaml'}


### 훈련 Estimator 생성

In [14]:
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

import time
# define Training Job Name 
job_name = f'llama3-8b-naver-news-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'
# chkpt_s3_path = f's3://{sess.default_bucket()}/{s3_prefix}/native/checkpoints'

# create the Estimator
os.environ['USE_SHORT_LIVED_CREDENTIALS']="1" 
huggingface_estimator = HuggingFace(
    entry_point          = 'sm_run_fsdp_qlora_llama3.py',      # train script
    source_dir           = '../scripts',  # directory which includes all the files needed for training
    instance_type        = instance_type,  # instances type used for the training job
    instance_count       = instance_count,                 # the number of instances used for training
    max_run              = 2*24*60*60,        # maximum runtime in seconds (days * hours * minutes * seconds)
    base_job_name        = job_name,          # the name of the training job
    role                 = role,              # Iam role used in training job to access AWS ressources, e.g. S3
    volume_size          = 500,               # the size of the EBS volume in GB
    transformers_version = '4.36.0',          # the transformers version used in the training job
    pytorch_version      = '2.1.0',           # the pytorch_version version used in the training job
    py_version           = 'py310',           # the python version used in the training job
    hyperparameters      =  {
        "config": "/opt/ml/input/data/config/sm_llama_3_8b_fsdp_qlora.yaml" # path to TRL config which was uploaded to s3
    },
    disable_output_compression = True,        # not compress output to save training time and cost
    # keep_alive_period_in_seconds = nKeepAliveSeconds,     # warm pool 
    distribution={"torch_distributed": {"enabled": True}},   # enables torchrun
    environment  = {
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache", # set env variable to cache models in /tmp
        "HF_TOKEN": HF_TOKEN,       # huggingface token to access gated models, e.g. llama 3
        "ACCELERATE_USE_FSDP": "1",             # enable FSDP
        "FSDP_CPU_RAM_EFFICIENT_LOADING": "1"   # enable CPU RAM efficient loading
    }, 
)

  from .autonotebook import tqdm as notebook_tqdm


## 4. 훈련 실행

In [15]:
huggingface_estimator.fit(data, wait=False)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: llama3-8b-naver-news-2024-06-29-10-22-0-2024-06-29-10-22-00-994


In [16]:
huggingface_estimator.logs()

2024-06-29 10:22:02 Starting - Starting the training job...
2024-06-29 10:22:27 Starting - Preparing the instances for training...
2024-06-29 10:23:04 Downloading - Downloading input data...
2024-06-29 10:23:20 Downloading - Downloading the training image..................
2024-06-29 10:26:30 Training - Training image download completed. Training in progress.....bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-06-29 10:27:02,637 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-06-29 10:27:02,672 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-06-29 10:27:02,683 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-06-29 10:27:02,685 sagemaker_pytorch_container.training INFO     Invoking TorchDistributed...
2024-06-29 10:27:02,685 sagemaker_pytorch_container.training INFO     Invoking user

In our example the training Llama 3 70B with Flash Attention for 2 epochs with a dataset of 10k samples takes 5052 seconds (~84minutes) on a `ml.p4d.24xlarge` or ~$50.

## 5. 모델 경로 저장

In [17]:
model_s3_path = huggingface_estimator.model_data
print("model_s3_path: \n", model_s3_path)

%store model_s3_path

model_s3_path: 
 {'S3DataSource': {'S3Uri': 's3://sagemaker-us-east-1-057716757052/llama3-8b-naver-news-2024-06-29-10-22-0-2024-06-29-10-22-00-994/output/model/', 'S3DataType': 'S3Prefix', 'CompressionType': 'None'}}
Stored 'model_s3_path' (dict)
