# Llama 3 Fine-tuning with PyTorch FSDP and Q-Lora on Amazon SageMaker (Local Mode)

## 1. Setup Development Environment

In [None]:
!pip install transformers "datasets[s3]==2.18.0" "sagemaker>=2.190.0" "huggingface_hub[cli]" --upgrade --quiet

# Hugging Face 로그인 (실제 토큰으로 교체 필요)
os.environ['TRANSFORMERS_CACHE'] = '/home/ec2-user/SageMaker/.cache/huggingface'
os.environ['HF_HOME'] = '/home/ec2-user/SageMaker/.cache/huggingface'
!huggingface-cli login --token YOUR_TOKEN

In [None]:
import sagemaker
import boto3
from sagemaker.local import LocalSession

# Configure a SageMaker local-mode session with the `local_code` option enabled.
local_mode_settings = {'local': {'local_code': True}}
sagemaker_session = LocalSession()
sagemaker_session.config = local_mode_settings

# IAM execution role ARN that is handed to the estimator and the model.
role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

# Echo the role and the region the session resolved to.
print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sagemaker_session.boto_region_name}")

## 2. Create and prepare the dataset

In [None]:
from datasets import load_dataset

system_message = """You are Llama, an AI assistant created by Philipp to be helpful and honest. Your knowledge spans a wide range of topics, allowing you to engage in substantive conversations and provide analysis on complex subjects."""


def create_conversation(sample):
    """Make sure the conversation in ``sample`` starts with a system message.

    Args:
        sample: Mapping with a ``"messages"`` list of ``{"role", "content"}``
            dicts. The list must contain at least one message.

    Returns:
        The same ``sample`` object. If its first message does not have the
        ``"system"`` role, ``sample["messages"]`` is replaced by a new list
        with the default ``system_message`` prepended; otherwise the sample
        is returned untouched.
    """
    conversation = sample["messages"]
    if conversation[0]["role"] != "system":
        system_turn = {"role": "system", "content": system_message}
        sample["messages"] = [system_turn] + conversation
    return sample

import os

# Load the no_robots dataset.
dataset = load_dataset("HuggingFaceH4/no_robots")

# Drop every column except "messages" while prepending the system message.
columns_to_remove = [*dataset["train"].features]
columns_to_remove.remove("messages")
dataset = dataset.map(create_conversation, remove_columns=columns_to_remove, batched=False)


def has_even_turns(sample):
    """Return True when the number of messages after the first one is even."""
    return len(sample["messages"][1:]) % 2 == 0


for split in ("train", "test"):
    dataset[split] = dataset[split].filter(has_even_turns)

# Save the dataset to the local file system
local_data_path = '/tmp/llama3_data'
for split in ("train", "test"):
    os.makedirs(f"{local_data_path}/{split}", exist_ok=True)

for split in ("train", "test"):
    dataset[split].to_json(f"{local_data_path}/{split}/dataset.json", orient="records")

print(f"Training data saved to: {local_data_path}")

#### Template would look like this:

You are a helpful Assistant. 

Human: What is 2+2? 

Assistant: 2+2 equals 4.

## 3. Fine-tune Llama 3 on Amazon SageMaker (Local Mode)

In [None]:
%%writefile llama_3_8b_fsdp_qlora.yaml
# Training configuration; this cell writes it to llama_3_8b_fsdp_qlora.yaml.
# The estimator cell passes the file to the training job through the
# `config` hyperparameter and the `config` input channel.

# presumably a Hugging Face Hub model id — confirm against the training script
model_id: "meta-llama/Meta-Llama-3-8b"
max_seq_len: 3072
# presumably where the `train` / `test` input channels appear inside the
# training container — confirm
train_dataset_path: "/opt/ml/input/data/train/"
test_dataset_path: "/opt/ml/input/data/test/"
output_dir: "/tmp/llama3"
report_to: "tensorboard"
learning_rate: 0.0002
# NOTE(review): a "constant" scheduler presumably ignores warmup_ratio below;
# "constant_with_warmup" may have been intended — confirm
lr_scheduler_type: "constant"
num_train_epochs: 1
per_device_train_batch_size: 1
per_device_eval_batch_size: 1
gradient_accumulation_steps: 2
optim: adamw_torch
logging_steps: 10
save_strategy: epoch
evaluation_strategy: epoch
max_grad_norm: 0.3
warmup_ratio: 0.03
# Mixed precision is configured as fp16, with bf16 switched off.
bf16: false
fp16: true
gradient_checkpointing: true
fsdp: "full_shard auto_wrap offload" # remove offload if enough GPU memory
fsdp_config:
    backward_prefetch: "backward_pre"
    # NOTE(review): the two flags below are quoted strings, not YAML booleans —
    # confirm the training script interprets them as intended
    forward_prefetch: "false"
    use_orig_params: "false"

In [None]:
# NCCL / CUDA related environment variables.
# ... existing environment variables go here ...
# NOTE(review): the estimator cell in this notebook passes its own inline
# `environment` dict and does not reference this variable — confirm whether
# this cell is still needed.
environment = dict(
    NCCL_P2P_DISABLE="1",
    NCCL_IB_DISABLE="1",
    NCCL_SOCKET_IFNAME="lo",
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
)

In [None]:
import os
from sagemaker.huggingface import HuggingFace

# Base name for the training job; passed to the estimator below so the job is
# identifiable (previously this variable was defined but never used).
job_name = 'llama3-8b-exp1-local'

# Hugging Face token: read from the HF_TOKEN environment variable when it is
# set; otherwise the placeholder must be replaced with a real token.
hf_token = os.environ.get("HF_TOKEN", "YOUR_TOKEN")

huggingface_estimator = HuggingFace(
    entry_point='run_fsdp_qlora.py',
    source_dir='./scripts/fsdp',
    instance_type='local_gpu',
    instance_count=1,
    base_job_name=job_name,
    role=role,
    transformers_version='4.36.0',
    pytorch_version='2.1.0',
    py_version='py310',
    hyperparameters={
        # Path of the YAML config as seen from inside the training container
        # (supplied through the `config` input channel defined below).
        "config": "/opt/ml/input/data/config/llama_3_8b_fsdp_qlora.yaml"
    },
    environment={
        # NOTE(review): these are host-style paths handed to the training
        # container — confirm they exist / are writable inside it.
        "HUGGINGFACE_HUB_CACHE": "/home/ec2-user/SageMaker/.cache/huggingface",
        "TRANSFORMERS_CACHE": "/home/ec2-user/SageMaker/.cache/huggingface",
        "HF_HOME": "/home/ec2-user/SageMaker/.cache/huggingface",
        "HF_TOKEN": hf_token,
        "ACCELERATE_USE_FSDP": "1",
        "FSDP_CPU_RAM_EFFICIENT_LOADING": "1",
        "NCCL_DEBUG": "INFO", # Only for Local Instance
        "NCCL_SOCKET_IFNAME": "lo",
        "NCCL_IB_DISABLE": "1",
        "NCCL_P2P_DISABLE": "1",
        "TMPDIR": "/home/ec2-user/SageMaker/tmp",
        "TEMP": "/home/ec2-user/SageMaker/tmp",
        "TMP": "/home/ec2-user/SageMaker/tmp",
    },
    distribution={"torch_distributed": {"enabled": True}},
    sagemaker_session=sagemaker_session,
)

# Start training

# Use absolute paths for the local input channels
local_data_path = os.path.abspath('/tmp/llama3_data')

# One file:// input channel per dataset split, plus one for the YAML config.
data = {
    'train': f"file://{local_data_path}/train",
    'test': f"file://{local_data_path}/test",
    'config': f"file://{os.path.abspath('llama_3_8b_fsdp_qlora.yaml')}"
}

huggingface_estimator.fit(data, wait=True)

## 4. Deploy & Test fine-tuned Llama 3 on Amazon SageMaker (Local Mode)

In [None]:
from sagemaker.huggingface import HuggingFaceModel

# Environment variables handed to the serving container.
# NOTE(review): MAX_INPUT_LENGTH / MAX_TOTAL_TOKENS / MAX_BATCH_PREFILL_TOKENS /
# MESSAGES_API_ENABLED look like text-generation-inference settings, while the
# model below is built from transformers/pytorch versions rather than an
# explicit LLM image — confirm the selected container honours them.
config = {
    'HF_MODEL_ID': "/opt/ml/model",
    'SM_NUM_GPUS': "1",
    'MAX_INPUT_LENGTH': "8000",
    'MAX_TOTAL_TOKENS': "8096",
    'MAX_BATCH_PREFILL_TOKENS': "16182",
    'MESSAGES_API_ENABLED': "true",
}

llm_model = HuggingFaceModel(
    model_data=huggingface_estimator.model_data,
    role=role,
    transformers_version="4.36.0",
    pytorch_version="2.1.0",
    py_version="py310",
    env=config,
    sagemaker_session=sagemaker_session
)

# Deploy the model to a local endpoint
llm = llm_model.deploy(initial_instance_count=1, instance_type='local')

try:
    # Test the model
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me something about Amazon SageMaker?"}
    ]

    parameters = {
        "model": "meta-llama-3-fine-tuned",
        "top_p": 0.6,
        "temperature": 0.9,
        "max_tokens": 512,
        "stop": ["<|eot_id|>"],
    }

    chat = llm.predict({"messages": messages, **parameters})

    print(chat["choices"][0]["message"]["content"].strip())
finally:
    # Clean up: always remove the endpoint, even if the request or the
    # response parsing above raised, so it is not left running.
    llm.delete_endpoint()