### Set up

#### 1. Set up accounts and role

In [1]:
import sagemaker
import boto3

# One SageMaker session object is created here and reused by later cells
# (for example to look up the bucket).
sagemaker_session = sagemaker.Session()

# Region comes from the default boto3 session; the account id is read from
# the STS caller identity.
region = boto3.session.Session().region_name
sts_client = boto3.client('sts')
account_id = sts_client.get_caller_identity().get('Account')

# The role ARN is built from the account id and a fixed role name.
# Disabled alternative kept for reference: role = sagemaker.get_execution_role()
role_arn_template = "arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
role = role_arn_template.format(account_id)


#### 2. Set up image and instance type

In [2]:
# Instance type requested for the training job (passed to the estimator).
instance_type = "ml.p3.2xlarge"

# "<repository>:<tag>" of the custom training image; the registry host is
# prepended in the next cell.
pytorch_custom_image_name = "character-embedding:gpu-1.0.0-201908270722"

In [3]:
# Full image URI: <account_id>.dkr.ecr.<region>.amazonaws.com/<image name:tag>
ecr_image_uri_template = "{}.dkr.ecr.{}.amazonaws.com/{}"
docker_repo = ecr_image_uri_template.format(account_id, region, pytorch_custom_image_name)

#### 3. Configure train, test and validation datasets

In [4]:
# Bucket supplied by the SageMaker session; the dataset and model-output
# S3 URIs in the next cell are all built under this bucket.
bucket = sagemaker_session.default_bucket()

In [5]:
# S3 prefixes under `bucket`, in order: training data, test data,
# validation data, and the location where the model output is written.
# Note: `test` is defined here, but the `inputs` dict below only passes
# `train` and `val` to the training job.
train, test, val, s3_output_path = (
    uri_template.format(bucket)
    for uri_template in (
        "s3://{}/email_mock_train/",
        "s3://{}/email_mock_test/",
        "s3://{}/email_mock_val/",
        "s3://{}/email_mock_model/",
    )
)

### Start training

In [6]:
# Named data inputs handed to estimator.fit(): key -> S3 prefix.
inputs = dict(train=train, val=val)

In [7]:
# Hyperparameters forwarded to the training script; every value is a string.
hyperparameters = dict(batchsize="32", epochs="1000")
# "log-level" contains a hyphen, so it cannot be given as a keyword argument.
hyperparameters["log-level"] = "INFO"

In [8]:
# Metric definitions passed to the estimator: each entry pairs a metric name
# with a regex whose first capture group is the numeric value. The patterns
# match the "###score: <metric>### <value>" lines the training script logs.
#
# The patterns are raw strings so that "\d" reaches the regex engine as-is;
# in a normal string literal "\d" is an invalid escape sequence and triggers
# a DeprecationWarning/SyntaxWarning on recent Python versions.
metric_definitions = [
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    {"Name": "TrainAccuracy",
     "Regex": r"###score: train_accuracy### (\d*[.]?\d*)"},
    {"Name": "ValidationAccuracy",
     "Regex": r"###score: val_accuracy### (\d*[.]?\d*)"},
]

In [12]:
from sagemaker.pytorch import PyTorch

# Gather the estimator settings in one place, then build the estimator.
estimator_settings = dict(
    # Training script and source directory handed to the estimator.
    entry_point='experiment_email.py',
    source_dir='src',
    dependencies=['src'],
    role=role,
    framework_version="1.0.0",
    py_version='py3',
    # Custom image URI assembled in the image set-up cell.
    image_name=docker_repo,
    train_instance_count=1,
    train_instance_type=instance_type,
    hyperparameters=hyperparameters,
    output_path=s3_output_path,
    metric_definitions=metric_definitions,
    # Spot training is currently switched off: train_use_spot_instances=True
    base_job_name="Character-embedding-adam",
)
estimator = PyTorch(**estimator_settings)

In [None]:
# Start the training job with the train/val inputs defined above; the job's
# log output is streamed below this cell.
estimator.fit(inputs)

2019-08-28 03:51:07 Starting - Starting the training job...
2019-08-28 03:51:09 Starting - Launching requested ML instances...
2019-08-28 03:52:05 Starting - Preparing the instances for training...
2019-08-28 03:52:44 Downloading - Downloading input data...
2019-08-28 03:52:58 Training - Downloading the training image.......
[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-08-28 03:54:35,150 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-08-28 03:54:35,175 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-08-28 03:54:36,587 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-08-28 03:54:36,878 sagemaker-containers INFO     Installing module with the following command:[0m
[31m/usr/bin/python -m pip install -U . [0m
[31mProcessing /opt/ml/code[0m
[31m

[31mValidation  Confusion matrix,  [0m
[31m[[ 4940    19]
 [   60 49534]][0m
[31m###score: train_loss### 23.86510725133121[0m
[31m###score: val_loss### 9.231972955167294[0m
[31m###score: train_accuracy### 99.90101623535156[0m
[31m###score: val_accuracy### 99.85519409179688[0m
[31m2019-08-28 03:57:41,015 - train - INFO - epoch: 1, train_loss 23.86510725133121, val_loss 9.231972955167294, train_accuracy 99.90101623535156, val_accuracy 99.85519409179688[0m
[31mTrain Confusion matrix,  [0m
[31m[[ 19724    114]
 [    60 198314]][0m
[31mValidation  Confusion matrix,  [0m
[31m[[ 4916    43]
 [   10 49584]][0m
[31m2019-08-28 03:59:05,805 - model_snapshotter - INFO - Snappshotting model to /opt/ml/model/snapshot_lowest_loss_model.pt [0m
[31m###score: train_loss### 17.75984643213451[0m
[31m###score: val_loss### 5.942366420291364[0m
[31m###score: train_accuracy### 99.9202651977539[0m
[31m###score: val_accuracy### 99.9028549194336[0m
[31m2019-08-28 03:59:05,826 - t