# Training and evaluating the models

In [1]:
!\cp *.py aws

In [2]:
!pip install -r requirements.txt --quiet

[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## Setup

In [3]:
import torch 
from dl_utils import *
from models import *
from utils import *

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Running on {device}!')

IMG_SIZE = 224

Running on cpu!


In [4]:
# sagemaker
import boto3
import sagemaker
from sagemaker import get_execution_role

# import a PyTorch wrapper
from sagemaker.pytorch import PyTorch

# importing PyTorchModel
from sagemaker.pytorch import PyTorchModel

In [5]:
# SageMaker session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# default S3 bucket
bucket = sagemaker_session.default_bucket()

## Data

In [6]:
# iterate through S3 objects and print contents
counter = 0
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    if counter == 10:
        break
    print(obj.key)
    counter += 1

images/train/wario/wario_land_3_img_0.jpg
images/train/wario/wario_land_3_img_1.jpg
images/train/wario/wario_land_3_img_10.jpg
images/train/wario/wario_land_3_img_1000.jpg
images/train/wario/wario_land_3_img_1001.jpg
images/train/wario/wario_land_3_img_1002.jpg
images/train/wario/wario_land_3_img_1003.jpg
images/train/wario/wario_land_3_img_1004.jpg
images/train/wario/wario_land_3_img_1005.jpg
images/train/wario/wario_land_3_img_1006.jpg


In [7]:
input_data = 's3://sagemaker-eu-west-1-873555039102/images'

## Training

### V0 - lab version 1

In [10]:
# specify an output path
prefix = 'model_v0'
output_path = 's3://{}/{}'.format(bucket, prefix)

# instantiate a pytorch estimator
estimator = PyTorch(entry_point='train_v0.py',
                    source_dir='aws', 
                    role=role,
                    framework_version='1.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    output_path=output_path,
                    sagemaker_session=sagemaker_session,
                    hyperparameters={
                        'epochs': 50,
                        'lab-version': 1
                    })

In [None]:
%%time 
# train the estimator on S3 training data
estimator.fit({'train': input_data})

In [25]:
FROM_PATH = 'model_v0/pytorch-training-2020-04-10-18-22-38-005/output/model.tar.gz'
TO_PATH = 'results/model_v0_lab1/model.tar.gz'
boto3.resource('s3').Bucket(bucket).download_file(FROM_PATH, TO_PATH)

In [27]:
shutil.unpack_archive(TO_PATH, 'results/model_v0_lab1/')

In [None]:
def extract_all(archives, extract_path):
    for filename in archives:
        shutil.unpack_archive(filename, extract_path)

In [26]:
import gzip
import shutil
with gzip.open(TO_PATH, 'rb') as f_in:
    with open('model', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [None]:
import boto3
import botocore

BUCKET_NAME = 'my-bucket' # replace with your bucket name
KEY = 'my_image_in_s3.jpg' # replace with your object key

s3 = boto3.resource('s3')

try:
    s3.Bucket(BUCKET_NAME).download_file(KEY, 'my_local_image.jpg')
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == "404":
        print("The object does not exist.")
    else:
        raise

In [23]:
PATH='s3://sagemaker-eu-west-1-873555039102/model_v0/pytorch-training-2020-04-10-18-22-38-005/output/model.tar.gz'
downloader = S3Downloader()
downloader.download(PATH, 'results/model_v0_lab1', sagemaker_session)

ValueError: Invalid extra_args key 'SSEKMSKeyId', must be one of: VersionId, SSECustomerAlgorithm, SSECustomerKey, SSECustomerKeyMD5, RequestPayer

In [None]:
checkpoint = torch.load(PATH, map_location=torch.device('cpu'))

In [None]:
model = ColorCNN_v0(lab_version=1).to(device)
model.load_state_dict(checkpoint['state_dict'])

In [None]:
train_losses = checkpoint['train_losses']
valid_losses = checkpoint['valid_losses']
plot_losses(train_losses, valid_losses)

In [None]:
show_model_results(model, 
                   model_name='Model V0', 
                   lab_version=1, 
                   path=get_random_file('images/valid/wario/', 'jpg'), 
                   img_size=IMG_SIZE, 
                   device=device)

### V0 - lab version 2

In [8]:
# specify an output path
prefix = 'model_v0_lab2'
output_path = 's3://{}/{}'.format(bucket, prefix)

# instantiate a pytorch estimator
estimator = PyTorch(entry_point='train_v0.py',
                    source_dir='aws', 
                    role=role,
                    framework_version='1.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    output_path=output_path,
                    sagemaker_session=sagemaker_session,
                    hyperparameters={
                        'epochs': 50,
                        'lab-version': 2
                    })

In [None]:
%%time 
# train the estimator on S3 training data
estimator.fit({'train': input_data})

2020-04-10 21:58:34 Starting - Starting the training job...
2020-04-10 21:58:36 Starting - Launching requested ML instances...
2020-04-10 21:59:33 Starting - Preparing the instances for training.........
2020-04-10 22:00:55 Downloading - Downloading input data......
2020-04-10 22:02:01 Training - Downloading the training image.........
2020-04-10 22:03:31 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-04-10 22:03:32,358 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-04-10 22:03:32,385 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-04-10 22:03:32,389 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-04-10 22:03:32,708 sagemaker-containers INFO     Module default_user_module_name doe

## Deploying the model for inference

In [16]:
estimator.model_data

's3://sagemaker-eu-west-1-873555039102/model_v0/pytorch-training-2020-04-09-22-55-08-370/output/model.tar.gz'

In [None]:



# Create a model from the trained estimator data
# And point to the prediction script
model = PyTorchModel(model_data=estimator.model_data,
                     role = role,
                     framework_version='1.0',
                     entry_point='predict.py',
                     source_dir='source_solution')

In [None]:
%%time
# deploy and create a predictor
predictor = model.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

In [None]:
s3://bucketname/image_folder