Get data path

In [41]:
# S3 location of the training data. The bucket name, its region and the object
# key are declared once and both addresses below are derived from them, so the
# HTTPS URL and the s3:// URI cannot drift apart.
bucket = 'cmajorsolo-autoformer-data'
bucket_region = 'eu-west-1'
file_name = 'exchange_rate_short.csv'
s3_data_distribution_type = 'FullyReplicated'
data_url = 'https://{}.s3.{}.amazonaws.com/{}'.format(bucket, bucket_region, file_name)
data_s3_uri = 's3://{}/{}'.format(bucket, file_name)

Train model with SageMaker

# Build the Docker image for the training job

In [36]:
import boto3
# Resolve the AWS account and region from the ambient credentials/configuration
# and build the ECR repository URI that the image is tagged with and pushed to.
account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name
if region is None:
    # Without a configured default region the URI below would contain the
    # literal text "None" and the mistake would only surface at `docker push`.
    raise ValueError(
        'No default AWS region is configured; set AWS_DEFAULT_REGION or run `aws configure`.'
    )
ecr_repository = 'hz_first_test'
# Not part of the URI below: `docker tag` / `docker push` fall back to the
# default tag ("latest") when none is given.
tag = 'latest'
processing_repository_uri = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(account_id, region, ecr_repository)

In [37]:
!docker build -t $ecr_repository .

[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                                         
[?25h[1A[0G[?25l[+] Building 0.1s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 37B                                        0.0s
[0m[34m => [internal] load .dockerignore                                          0.0s
[0m[34m => => transferring context: 2B                                            0.0s
[0m => [internal] load metadata for docker.io/pytorch/pytorch:2.0.1-cuda11.7  0.0s
[?25h[1A[1A[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 37B                                        0.0s
[0m[34m => [internal] load .dockerignore                           

# Upload image to AWS ECR

In [38]:
# !aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com
!docker tag {ecr_repository} $processing_repository_uri
!docker push $processing_repository_uri

Using default tag: latest
The push refers to repository [291287855072.dkr.ecr.eu-west-1.amazonaws.com/hz_first_test]

[1B38bb0a16: Preparing 
[1Bc34122fd: Preparing 
[1B732c9258: Preparing 
[1Bbf18a086: Preparing 
[1B6af2f4ef: Preparing 
[1B6e8c217d: Preparing 
[1Bca73c74f: Waiting g denied: Your authorization token has expired. Reauthenticate and try again.


# Train the model (the estimator below currently uses the default SageMaker PyTorch image; uncomment `image_uri` to use the custom image created earlier)

In [42]:
import os
import sagemaker

# Hard-coded URI of the custom training image in ECR
# (repository `hz_first_test`, tag `latest`).
custom_image_uri = (
    "291287855072.dkr.ecr.eu-west-1.amazonaws.com"
    "/hz_first_test:latest"
)
# IAM role handed to the estimator as `role`.
execution_role = "AmazonSageMaker-ExecutionRole-20210905T154857"
# source_dirs = ["data_provider", "dataset", "exp", "layers", "models", "utils"]

In [49]:
#Create the estimator object for PyTorch
import os
from sagemaker.pytorch.estimator import PyTorch # import PyTorch Estimator class 

# Configure the training job. While `image_uri` stays commented out, SageMaker
# picks its own PyTorch image from `framework_version`, `py_version` and
# `instance_type`.
estimator = PyTorch(
    # Uncomment to train with the custom image built and pushed in the previous steps
    # image_uri=custom_image_uri, #our custom pytorch image URI
    entry_point = "run_aws.py", # training script
    # The two params below select the default PyTorch image built by SageMaker
    framework_version = "1.8.1", #PyTorch version
    py_version = "py3", # Compatible Python version to use
    instance_count = 1, #number of EC2 instances needed for training
    # instance_type = "ml.c5.xlarge", #Type of EC2 instance/s needed for training
    instance_type = "ml.p3.2xlarge", #Type of EC2 instance with GPU needed for training
    disable_profiler = True, #Disable profiler, as it's not needed
    role = execution_role, #Execution role used by training job
    source_dir = "./", #Directory where training script is located
    base_job_name='autoformer-training-job', #Prefix of the training job name on AWS
    hyperparameters={}
)

# Expose the CSV in S3 as the "train" channel. The distribution type comes from
# the data-path cell so that the setting declared there is actually applied.
s3_input_train = sagemaker.TrainingInput(
    s3_data=data_s3_uri,
    distribution=s3_data_distribution_type,
    content_type='csv',
)

inputs = {"train": s3_input_train}

# Start the training in the ephemeral remote compute; wait=True keeps the cell
# blocked until the job has ended.
estimator.fit(inputs, wait=True)

Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: autoformer-training-job-2023-08-29-22-00-35-785


2023-08-29 22:01:59 Starting - Starting the training job...
2023-08-29 22:02:25 Starting - Preparing the instances for training.........
2023-08-29 22:03:57 Downloading - Downloading input data
2023-08-29 22:03:57 Training - Downloading the training image.....................
2023-08-29 22:07:29 Training - Training image download completed. Training in progress....bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2023-08-29 22:07:50,192 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2023-08-29 22:07:50,225 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2023-08-29 22:07:50,228 sagemaker_pytorch_container.training INFO     Invoking user training script.
2023-08-29 22:07:52,579 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
/opt/conda/bin/python3.6 -m pip install -r requirements.txt
Collecting reformer_pytor

ToDos: 
1. Save the trained model to the S3 bucket  
    - Done by adding the sm-model-dir parameter in run.py
2. Use a GPU to train 
    - Done by changing the estimator attribute to: instance_type = "ml.p3.2xlarge", #Type of EC2 instance with GPU needed for training
3. Print out the test charts
    - Done by updating the folder_path params in exp_main.py
4. Get BTC data running with Autoformer
5. Set up early stopping on epochs

In [None]:
# deploy model with SageMaker
# test model with SageMaker

# Clean up

In [None]:
import boto3

# Initialize the boto3 handles used by the cleanup cells below.
s3 = boto3.resource('s3')
# Deliberately NOT named `sagemaker`: that would rebind the `sagemaker` SDK
# module imported earlier in this notebook, and re-running the training cell
# afterwards would fail on `sagemaker.TrainingInput(...)`. Cleanup code should
# use `sagemaker_client`.
sagemaker_client = boto3.client('sagemaker')
logs = boto3.client('logs')
ecr = boto3.client('ecr')

In [None]:
# # Remove S3 artifacts
# # Destructive: double-check bucket_name and prefix before uncommenting.
# bucket_name = 'sagemaker-eu-west-1-291287855072'
# prefix = 'autoformer-training-job'  # Prefix to narrow down to specific files/directories
# # Named artifact_bucket so the `bucket` string from the data-path cell is not overwritten.
# artifact_bucket = s3.Bucket(bucket_name)
# for obj in artifact_bucket.objects.filter(Prefix=prefix):
#     print("Deleting S3 object: "+obj.key)
#     obj.delete()

In [None]:
# # Stop SageMaker notebook instances
# # Replace the placeholder with the real instance name before uncommenting.
# notebook_instance_name = 'YOUR_NOTEBOOK_INSTANCE_NAME'
# # A client is created inline so this line does not depend on what the name `sagemaker` is bound to.
# boto3.client('sagemaker').stop_notebook_instance(NotebookInstanceName=notebook_instance_name)

In [None]:
## Delete CloudWatch logs
# # Deletes the log streams of this notebook's training jobs (names starting with 'autoformer-training-job').
# # NOTE(review): only the streams returned by this single describe_log_streams call are examined;
# # if the API paginates its results, older streams would be missed - confirm and paginate if needed.
# log_group_name = '/aws/sagemaker/TrainingJobs'
# response = logs.describe_log_streams(logGroupName=log_group_name, orderBy='LastEventTime', descending=True)
# for log_stream in response['logStreams']:
#     if(log_stream['logStreamName'].startswith('autoformer-training-job')):
#         print("Deleting log: "+log_stream['logStreamName'])
#         logs.delete_log_stream(logGroupName=log_group_name, logStreamName=log_stream['logStreamName'])

In [None]:
## Delete Docker images from ECR
# # Dry run by default: the batch_delete_image call is itself commented out, so matching images are only printed.
# repository_name = 'hz_first_test'
# images = ecr.list_images(repositoryName=repository_name)
# for image in images.get('imageIds', []):
#     # The tag is read from the image dict; a bare `imageTag` name is not defined anywhere.
#     if image.get('imageTag') == "latest":
#         # ecr.batch_delete_image(repositoryName=repository_name, imageIds=[image])
#         print(image)