In [3]:
# Training options intended for the training entry point.
# NOTE(review): the estimator cell in this notebook currently has
# `hyperparameters=hyperparameters` commented out, so these values are not
# sent to the job unless that line is re-enabled.
# NOTE(review): bf16/tf32 are generally documented as needing Ampere-or-newer
# GPUs, while the estimator requests ml.p3.2xlarge — confirm compatibility
# before enabling.
hyperparameters = dict(
    # Precision / schedule
    bf16=True,
    num_train_epochs=1,
    # Batch sizing
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    # Evaluation and checkpointing
    evaluation_strategy='no',
    save_steps=2000,
    # Optimizer and learning-rate schedule
    learning_rate=2e-5,
    weight_decay=0.,
    warmup_ratio=0.03,
    lr_scheduler_type='cosine',
    # Sharding configuration
    fsdp='full_shard auto_wrap',
    fsdp_transformer_layer_cls_to_wrap='OPTDecoderLayer',
    tf32=True,
)

In [4]:
# Distribution settings for the estimator: turns on the `torch_distributed`
# launcher option.
# NOTE(review): the estimator cell currently has `distribution=distribution`
# commented out, so this is unused unless that line is re-enabled.
torch_distributed_options = {'enabled': True}
distribution = {'torch_distributed': torch_distributed_options}

In [2]:
# Regexes used to pull metric values out of the training log, where the
# metrics appear as dict-style entries such as
#   {'loss': 1.2345, 'learning_rate': 1.9e-05, 'epoch': 0.5}
#
# The value pattern accepts an integer part, an optional fractional part and
# an optional exponent, so "1", "0.5", "2e-05" and "1.9e-05" are each captured
# in full. The previous pattern `[0-9]+(.|e\-)[0-9]+` used an unescaped `.`
# (matching any character), truncated "1.9e-05" to "1.9", and rejected plain
# integers. Group 1 is always the complete number; the inner groups only
# delimit the optional parts.
_METRIC_VALUE = r"([0-9]+(\.[0-9]+)?(e[-+]?[0-9]+)?)"

metric_definitions = [
    {'Name': name, 'Regex': rf"'{name}': {_METRIC_VALUE},?"}
    for name in ('loss', 'learning_rate', 'epoch')
]

In [3]:
import sagemaker
import boto3

# SageMaker session and the default S3 bucket it reports.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
print(f"AWS bucket: {bucket}")

# Account ID of the calling identity, looked up through STS.
client = boto3.client("sts")
caller_identity = client.get_caller_identity()
account = caller_identity["Account"]
print(f"AWS account: {account}")

# Region configured on a fresh default boto3 session.
session = boto3.Session()
region = session.region_name
print(f"AWS region: {region}")

AWS bucket: sagemaker-us-east-1-457411337639
AWS account: 457411337639
AWS region: us-east-1


In [4]:
# Input channels handed to `fit()`. The dict keys are the channel names; the
# job log lists them under "channel_input_dirs" (e.g. "data" ->
# /opt/ml/input/data/data).
model_channel = sagemaker.inputs.TrainingInput(
    s3_data='s3://nlp-sagemakers/baiyang',
    content_type='application/x-sagemaker-model',
)
data_channel = sagemaker.inputs.TrainingInput(
    s3_data='s3://nlp-sagemakers/data/alpaca_data.json',
    content_type='application/json',
)
train_s3 = {'model': model_channel, 'data': data_channel}

In [5]:
 from sagemaker.pytorch import PyTorch

In [6]:
from sagemaker import get_execution_role
# IAM execution role passed to the estimator as `role`.
role = sagemaker.get_execution_role()

In [7]:
# Custom training image, assembled as <registry>/<repository>:<tag>.
ecr_registry = '457411337639.dkr.ecr.us-east-1.amazonaws.com'
ecr_repository = 'sagemaker-studio-d-kqffqpxpikw8'
image_tag = 'zhuojunjie'
image_uri = f'{ecr_registry}/{ecr_repository}:{image_tag}'

# Prefix for generated training-job names.
# NOTE(review): 'alapca' looks like a transposition of 'alpaca'; left as-is
# because existing job names already carry this prefix.
job_name = 'nlp-alapca-125m'

In [13]:
# Keyword options for the PyTorch estimator, collected in one place.
estimator_options = {
    'base_job_name': job_name,
    'source_dir': "./stanford_alpaca",
    # Shell entry point that launches the training script with its options.
    'entry_point': "train-opt125m.sh",
    'role': role,
    'image_uri': image_uri,
    'instance_type': 'ml.p3.2xlarge',
    'instance_count': 1,
    'metric_definitions': metric_definitions,
    # Currently disabled; uncomment to pass them to the job:
    # 'hyperparameters': hyperparameters,
    # 'distribution': distribution,
}

pytorch_estimator = PyTorch(**estimator_options)

In [14]:
# Launch the training job with the S3 input channels defined in `train_s3`.
# (Calling `pytorch_estimator.fit()` with no inputs is the channel-less
# alternative.)
pytorch_estimator.fit(inputs=train_s3)

INFO:sagemaker:Creating training-job with name: nlp-alapca-125m-2023-03-31-15-07-58-009


2023-03-31 15:08:05 Starting - Starting the training job...
2023-03-31 15:08:39 Starting - Preparing the instances for training......
2023-03-31 15:09:40 Downloading - Downloading input data......
2023-03-31 15:10:15 Training - Downloading the training image.....................
2023-03-31 15:14:07 Training - Training image download completed. Training in progress....[34m2023-03-31 15:14:33,386 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-03-31 15:14:33,413 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-03-31 15:14:33,439 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-03-31 15:14:33,451 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "data": "/opt/ml/input/data/data",
        "model": "/opt/ml/inpu

In [16]:
# Low-level SageMaker client used by `stop_training_job`.
sm = boto3.client(service_name="sagemaker")


def stop_training_job(name):
    """Request a stop for training job `name` if it is still in progress.

    Looks up the job through the module-level `sm` client and issues a stop
    request only when its `TrainingJobStatus` is exactly "InProgress"; any
    other status leaves the job untouched. Returns None.
    """
    description = sm.describe_training_job(TrainingJobName=name)
    if description["TrainingJobStatus"] != "InProgress":
        return
    sm.stop_training_job(TrainingJobName=name)

# Show the most recent job launched by this estimator, then stop it if it is
# still running.
latest_job_name = pytorch_estimator.latest_training_job.name
print(latest_job_name)
stop_training_job(latest_job_name)

nlp-alapca-125m-2023-03-24-04-03-18-624
