In [1]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker.session import s3_input, Session
# import uuid

## Create bucket & Validation Region for S3

In [2]:
bucket_name = 'aps360project' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
my_region = boto3.session.Session().region_name # set the region of the instance
print(my_region)

us-east-2


## Create Paths to S3 Buckets for storage of model data

In [3]:
# Prefix for files in bucket
prefix = 'chest_x_ray_classification'

# Dataset directory
dataset = 'Xray_Dataset'

# Model output folder name
output_dir_name = 'trial_13'

# S3 Path bucket to get the data for training (Train, Test, Validation)
dataset_dir = 's3://{}/{}/{}'.format(bucket_name, prefix, dataset)

# output path for SageMaker to dump all model artifacts and graphs etc
output_dir = 's3://{}/{}/{}'.format(bucket_name, prefix, output_dir_name)

# # checkpoints for spot training
# checkpoint_suffix = str(uuid.uuid4())[:8]
# checkpoint_s3_path = 's3://{}/{}/{}/checkpoint-{}'.format(bucket_name, prefix, output_dir_name, checkpoint_suffix)

# sanity check for output path for model data
print('Dataset directory <dataset_dir>: ', dataset_dir)
print('Model Output directory <output_dir>: ', output_dir)
# print('Checkpointing Path: <checkpoint_s3_path>: {}'.format(checkpoint_s3_path))


Dataset directory <dataset_dir>:  s3://aps360project/chest_x_ray_classification/Xray_Dataset
Model Output directory <output_dir>:  s3://aps360project/chest_x_ray_classification/trial_13


## Manage Spot Training

In [4]:
# use_spot_instances = True
# max_run=24*60*60
# max_wait = 24*60*60

In [5]:
# initialize hyperparamters
hyperparameters = {
    'epochs': 8,
    'batch-size': 64,
    'learning-rate': 0.0005 
}

# Training instance
training_instance = 'ml.g4dn.xlarge'

# Create the current role to use sagemaker 
role = sagemaker.get_execution_role()

In [6]:
from sagemaker.pytorch import PyTorch

# Create a Pytorch estimator to run the training script on AWS Sagemaker
estimator = PyTorch(
                entry_point='trial13xray.py',
                role=role,
                framework_version='1.8.0',
                py_version='py3',
                output_path=output_dir,
                train_instance_count=1,
                script_mode=True,
                train_instance_type=training_instance,
                hyperparameters= hyperparameters,
                base_job_name='trial-13-MobileNetV3-bs-64-lr-0005-epoch-7',
            )

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
estimator.fit({'training': dataset_dir})

2021-04-03 23:29:59 Starting - Starting the training job...
2021-04-03 23:30:21 Starting - Launching requested ML instancesProfilerReport-1617492598: InProgress
......
2021-04-03 23:31:22 Starting - Preparing the instances for training...
2021-04-03 23:32:00 Downloading - Downloading input data......
2021-04-03 23:32:57 Training - Downloading the training image...............
2021-04-03 23:35:23 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-04-03 23:35:14,572 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-04-03 23:35:14,593 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-04-03 23:35:14,598 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-04-03 23:35:14,963 sagemaker-training-