### Install/Update the environment

In [1]:
!pip install --upgrade awscli botocore sagemaker -q

### Import the libraries

In [2]:
import time
import boto3
import sagemaker
from sagemaker import get_execution_role, Session, image_uris
from sagemaker.pytorch import PyTorch
from sagemaker.huggingface import HuggingFace

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:

# Create a single SageMaker session and reuse it for the region and the
# execution-role lookups instead of constructing a separate session for each.
sagemaker_session = Session()
region = sagemaker_session.boto_region_name
execution_role = get_execution_role(sagemaker_session=sagemaker_session)


### Define Data Location

In [4]:
# S3 URI of the training data (a CSV object). It is passed to the estimator as
# the 'train' input channel; change it to point at your own dataset in S3.
s3_data_location = "s3://amplify-models-aws/data/uniref50/uniref50_sample_100.csv"


### Define the instance type 

In [8]:
# SageMaker instance type requested for the training job.
instance_type = "ml.g5.12xlarge"

The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.


### Define the container

In [16]:
# Resolve the Hugging Face training image for the session's region instead of
# hard-coding the us-west-2 registry, so the notebook is not tied to one region.
# The arguments mirror the previously hard-coded tag (PyTorch 2.1.0,
# Transformers 4.36.0, Python 3.10); the GPU/CPU variant is derived from
# instance_type. NOTE(review): confirm the resolved URI matches the old tag.
image_uri = image_uris.retrieve(
    framework="huggingface",
    region=region,
    version="4.36.0",
    base_framework_version="pytorch2.1.0",
    py_version="py310",
    image_scope="training",
    instance_type=instance_type,
)


### Define the estimator

In [17]:
# Hugging Face estimator that runs code/train.py inside the training container
# selected above, on a single instance.
estimator = HuggingFace(
    py_version="py310",  # SageMaker SDK format is "py310", not "3.10"
    entry_point="train.py",
    source_dir="code",
    role=execution_role,
    image_uri=image_uri,
    instance_count=1,
    instance_type=instance_type,
    # Keep the provisioned instance around for 30 minutes after the job ends
    # so a follow-up job can reuse it.
    keep_alive_period_in_seconds=1800,
    # Reuse the session created earlier instead of letting the estimator
    # build its own.
    sagemaker_session=sagemaker_session,
)

### Start Training

In [19]:
# A Unix-timestamp suffix keeps the job name distinct from earlier runs.
launch_timestamp = int(time.time())
training_job_name = f"AMPLIFY-hf-training-job-{launch_timestamp}"

# Start the training job; the S3 data is supplied as the 'train' input channel.
estimator.fit(
    inputs={"train": s3_data_location},
    job_name=training_job_name,
)

INFO:sagemaker:Creating training-job with name: AMPLIFY-hf-training-job-1728946972


2024-10-14 23:02:52 Starting - Starting the training job...
2024-10-14 23:03:23 Downloading - Downloading the training image
2024-10-14 23:03:23 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2024-10-14 23:03:24,077 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-10-14 23:03:24,114 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-10-14 23:03:24,126 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-10-14 23:03:24,128 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-10-14 23:03:25,652 sagemaker-training-toolkit INFO     Installing dependencies from requiremen

### Get the trained model data

In [20]:
# Re-bind `estimator` to the training job by name so its results can be read back.
estimator = HuggingFace.attach(training_job_name=training_job_name)


2024-10-14 23:06:15 Starting - Found matching resource for reuse
2024-10-14 23:06:15 Downloading - Downloading the training image
2024-10-14 23:06:15 Training - Training image download completed. Training in progress.
2024-10-14 23:06:15 Uploading - Uploading generated training model
2024-10-14 23:06:15 Completed - Resource retained for reuse


In [21]:
# Display the S3 URI of the model artifact (model.tar.gz) produced by the job.
estimator.model_data

's3://sagemaker-us-west-2-111918798052/AMPLIFY-hf-training-job-1728946972/output/model.tar.gz'