In [20]:
import boto3
import numpy 
import sagemaker
from sagemaker.pytorch import PyTorch
import torch
import os

### RESNEXT_29_4x64D

#### Without Containers

In [3]:
# Sagemaker Notebook must be of type, conda_pytorch_p36

!pip install -r '/home/ec2-user/SageMaker/w210-capstone/models/pytorch_imageclass/requirements.txt'

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
# Need to add this to requirements.txt
!pip install tensorboard

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
# Train the model per the settings specified for ResNext 29_4x64d in the original paper
os.chdir('/home/ec2-user/SageMaker/w210-capstone/models/pytorch_imageclass/')
!python train.py --config configs/cifar/resnext.yaml \
    model.resnext.cardinality 4 \
    train.batch_size 128 \
    train.base_lr 0.1 \
    train.output_dir /home/ec2-user/SageMaker/experiments/resnext_29_4x64d/exp00 \
    scheduler.epochs 300

# Number of epochs should be 300!

[32m[2020-06-08 19:33:15] __main__ INFO: [0mdevice: cuda
cudnn:
  benchmark: True
  deterministic: False
dataset:
  name: CIFAR10
  dataset_dir: ~/.torch/datasets/CIFAR10
  image_size: 32
  n_channels: 3
  n_classes: 10
model:
  type: cifar
  name: resnext
  init_mode: kaiming_fan_out
  vgg:
    n_channels: [64, 128, 256, 512, 512]
    n_layers: [2, 2, 3, 3, 3]
    use_bn: True
  resnet:
    depth: 110
    n_blocks: [2, 2, 2, 2]
    block_type: basic
    initial_channels: 16
  resnet_preact:
    depth: 110
    n_blocks: [2, 2, 2, 2]
    block_type: basic
    initial_channels: 16
    remove_first_relu: False
    add_last_bn: False
    preact_stage: [True, True, True]
  wrn:
    depth: 28
    initial_channels: 16
    widening_factor: 10
    drop_rate: 0.0
  densenet:
    depth: 100
    n_blocks: [6, 12, 24, 16]
    block_type: bottleneck
    growth_rate: 12
    drop_rate: 0.0
    compression_rate: 0.5
  pyramidnet:
    depth: 272
    n_blocks: [3, 24, 36, 3]
    initial_channels: 16
  

In [27]:
## Evaluate the trained, saved model using the CIFAR 10 test dataset 
# Right the results to the test output directory specified.
!python evaluate.py --config configs/cifar/resnext.yaml \
   model.resnext.cardinality 4 \
   test.batch_size 128 \
   test.checkpoint /home/ec2-user/SageMaker/experiments/resnext_29_4x64d/exp00/checkpoint_00300.pth \
   test.output_dir /home/ec2-user/SageMaker/experiments/resnext_29_4x64d/exp00/test_results_0300

[32m[2020-06-11 01:08:55] fvcore.common.checkpoint INFO: [0mLoading checkpoint from /home/ec2-user/SageMaker/experiments/resnext_29_4x64d/exp00/checkpoint_00300.pth
Files already downloaded and verified
100%|███████████████████████████████████████████| 79/79 [00:34<00:00,  2.29it/s]
[32m[2020-06-11 01:09:30] __main__ INFO: [0mElapsed 34.44
[32m[2020-06-11 01:09:30] __main__ INFO: [0mLoss 0.1517 Accuracy 0.9535


In [44]:
!python evaluate.py --config configs/cifar/resnext.yaml \
   model.resnext.cardinality 4 \
   test.batch_size 128 \
   test.checkpoint /home/ec2-user/SageMaker/experiments/resnext_29_4x64d/exp00/checkpoint_00200.pth \
   test.output_dir /home/ec2-user/SageMaker/experiments/resnext_29_4x64d/exp00/test_results_0200

[32m[2020-06-11 01:50:14] fvcore.common.checkpoint INFO: [0mLoading checkpoint from /home/ec2-user/SageMaker/experiments/resnext_29_4x64d/exp00/checkpoint_00200.pth
Files already downloaded and verified
100%|███████████████████████████████████████████| 79/79 [00:34<00:00,  2.30it/s]
[32m[2020-06-11 01:50:49] __main__ INFO: [0mElapsed 34.36
[32m[2020-06-11 01:50:49] __main__ INFO: [0mLoss 0.2311 Accuracy 0.9321


In [45]:
!python evaluate.py --config configs/cifar/resnext.yaml \
   model.resnext.cardinality 4 \
   test.batch_size 128 \
   test.checkpoint /home/ec2-user/SageMaker/experiments/resnext_29_4x64d/exp00/checkpoint_00100.pth \
   test.output_dir /home/ec2-user/SageMaker/experiments/resnext_29_4x64d/exp00/test_results_0100

[32m[2020-06-11 01:51:05] fvcore.common.checkpoint INFO: [0mLoading checkpoint from /home/ec2-user/SageMaker/experiments/resnext_29_4x64d/exp00/checkpoint_00100.pth
Files already downloaded and verified
100%|███████████████████████████████████████████| 79/79 [00:34<00:00,  2.28it/s]
[32m[2020-06-11 01:51:41] __main__ INFO: [0mElapsed 34.69
[32m[2020-06-11 01:51:41] __main__ INFO: [0mLoss 0.6746 Accuracy 0.8019


In [53]:
# Write the results to a CSV file so that we can analyze later.
import pandas as pd

results = {'Model': ['resnext_29_4x64d', 'resnext_29_4x64d', 'resnext_29_4x64d'],
           'Testset': ['cifar10', 'cifar10', 'cifar10'],
           'Epoch': [100, 200, 300],
           'Loss': [0.6746, 0.2311, 0.1517],
           'Accuracy': [0.8019, 0.9321, 0.9535],
           'Original_Accuracy': [96.4, 96.4, 96.4],
           'Original_CI': [(96.0, 96.7), (96.0, 96.7), (96.0, 96.7)]
           }

df = pd.DataFrame(results, columns = ['Model', 'Testset', 'Epoch', 'Loss', 'Accuracy', 
                                      'Original_Accuracy', 'Original_CI'])


df.to_csv('/home/ec2-user/SageMaker/experiments/resnext_29_4x64d/exp00/results.csv')
df.head()

Unnamed: 0,Model,Testset,Epoch,Loss,Accuracy,Original_Accuracy,Original_CI
0,resnext_29_4x64d,cifar10,100,0.6746,0.8019,96.4,"(96.0, 96.7)"
1,resnext_29_4x64d,cifar10,200,0.2311,0.9321,96.4,"(96.0, 96.7)"
2,resnext_29_4x64d,cifar10,300,0.1517,0.9535,96.4,"(96.0, 96.7)"


In [60]:
# Peak inside the output file for predictions
import numpy as np
output = '/home/ec2-user/SageMaker/experiments/resnext_29_4x64d/exp00/test_results_0300/predictions.npz'
npzfile = np.load(output)
print(npzfile.files)
npzfile['preds']

['preds', 'probs', 'labels', 'loss', 'acc']


array([[-1.2041907 , -1.8999833 , -0.24285015, ..., -1.5008752 ,
        -1.8426697 , -2.8560946 ],
       [ 0.5460079 ,  2.220384  , -1.9393705 , ..., -2.6070693 ,
        11.327686  , -1.2085156 ],
       [-1.3446747 ,  2.1730833 , -1.1615647 , ..., -2.2299995 ,
        10.984515  , -0.75660706],
       ...,
       [-2.4790986 , -1.3337001 ,  0.61669415, ..., -0.83421385,
        -1.8529658 , -1.7280097 ],
       [-0.90489024,  9.350766  ,  1.0618937 , ..., -2.3210623 ,
        -0.9061641 , -1.8115013 ],
       [-1.4560711 , -1.0518838 , -1.4613396 , ..., 12.668192  ,
        -2.1191459 , -0.8881919 ]], dtype=float32)

In [54]:
# Upload the model checkpoints, configs, and results to S3 
bucket='sagemaker-may29'
prefix = 'sagemaker/results/original-models/'
path = '/home/ec2-user/SageMaker/experiments/'

s3_resource = boto3.resource("s3", region_name="us-east-2")

def uploadDirectory(local_path,bucket_name,s3_prefix):

    my_bucket = s3_resource.Bucket(bucket_name)
    
    for path, subdirs, files in os.walk(local_path):
        path = path.replace("\\","/")
        directory_name = path.replace(local_path,"")
        for file in files:
            #print("Local File:", os.path.join(path, file))
            #print("      Dest:", s3_prefix+directory_name+'/'+file)
            my_bucket.upload_file(os.path.join(path, file), s3_prefix+directory_name+'/'+file)
    
uploadDirectory(path,bucket,prefix)

In [43]:
os.getcwd()


'/home/ec2-user/SageMaker/w210-capstone/models/pytorch_imageclass'

## Everything Below Is In-Work

#### SageMaker Way

In [2]:
# S3 Variable Definitions 
bucket='sagemaker-may29'
prefix = '/sagemaker-may29/sagemaker/cifar102/data/cifar10_10k'
model_path = '/home/ec2-user/SageMaker/w210-capstone/models/pytorch_imageclass/'

role = sagemaker.get_execution_role()
#bucket = sagemaker.Session().default_bucket(
#sagemaker_session = sagemaker.Session()
bucket='sagemaker-may29'
sagemaker_session = sagemaker.Session(default_bucket=bucket)


# Set S3 dataset path 
#inputs = 's3://' + bucket + '/sagemaker/{}'.format(args.train_data)
inputs = 's3://' + bucket + '/sagemaker-may29/sagemaker/cifar102/data/cifar10_10k/cifar10_10k_test'
inputs = 's3://' + bucket + '/sagemaker-may29/sagemaker/cifar102/data/cifar10_10k/'

In [3]:
model_path = '/home/ec2-user/SageMaker/w210-capstone/models/pytorch_imageclass/'
container_data_dir = '/opt/ml/input/data/training'
container_model_dir = '/opt/ml/model'

parameters = {
    'config': 'resnext.yaml',
    'resnext.depth': 29,
    'train.batch_size': 128,
    'train.base_lr': 0.1,
    #'data_dir': container_data_dir,
    #'dataset.dataset_dir': container_data_dir
    #'output_dir': container_model_dir,
    #'num_train_epochs': 3,
    #'per_gpu_train_batch_size': 64,
    #'per_gpu_eval_batch_size': 64,
    #'save_steps': 150,
    #'logging_steps': 150
} 

In [4]:
# Create an instance of the PyTorch class that enables the model script to run as a 
# training job on the SageMaker distributed, managed training infrastructure
estimator = PyTorch(entry_point= model_path + 'train.py',
                    role=role,
                    framework_version='1.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    hyperparameters=parameters)

In [5]:
# Train the model
#inputs = 's3://' + bucket + '/sagemaker-may29/sagemaker/cifar102/data/cifar10_10k/cifar10_10k_test'
inputs = 's3://' + bucket + '/sagemaker-may29/sagemaker/cifar102/data/cifar10_10k/'
#estimator.fit({'training': inputs})
estimator.fit()

'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-06-08 14:49:34 Starting - Starting the training job...
2020-06-08 14:49:36 Starting - Launching requested ML instances.........
2020-06-08 14:51:06 Starting - Preparing the instances for training......
2020-06-08 14:52:21 Downloading - Downloading input data...
2020-06-08 14:53:01 Training - Downloading the training image...........[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-06-08 14:54:49,375 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-06-08 14:54:49,397 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-06-08 14:54:49,403 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-06-08 14:54:49,684 sagemaker-containers INFO     Module default_user_module_name does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-06-08 14:54:49,68

UnexpectedStatusException: Error for Training job pytorch-training-2020-06-08-14-49-34-014: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python train.py --config resnext.yaml --resnext.depth 29 --train.base_lr 0.1 --train.batch_size 128"
Traceback (most recent call last):
  File "train.py", line 7, in <module>
    import apex
ModuleNotFoundError: No module named 'apex'

In [8]:
os.getcwd()

'/home/ec2-user/SageMaker'

### Image Augmentation 
https://imgaug.readthedocs.io/en/latest/source/overview/collections.html

In [5]:
!pip install imgaug

[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [6]:
import imgaug.augmenters as iaa
aug = iaa.RandAugment(n=2, m=(0, 9))  
  # n is the number of transformations to apply per image
  # m is magnitude -> specifying a tuple will randomly select values between the min and max (max is 30)

In [26]:
import boto3 
from PIL import Image

cifar10 = "sagemaker/cifar-10/cifar/"
#cifar10_test_rec = "s3://sagemaker-may29/sagemaker/cifar-10/cifar/test.rec"

def download_all_objects_in_folder(b, p):
    s3_resource = boto3.resource('s3')
    my_bucket = s3_resource.Bucket(b)
    objects = my_bucket.objects.filter(Prefix=p)
    for obj in objects:
        path, filename = os.path.split(obj.key)
        my_bucket.download_file(obj.key, filename)    

def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

# Download the unzipped data from the cifar10 folder
download_all_objects_in_folder(bucket, cifar10)



test.rec: data


### RandAugment 
https://arxiv.org/abs/1909.13719
https://pypi.org/project/randaugment/

In [2]:
!pip install randaugment

Collecting randaugment
  Using cached https://files.pythonhosted.org/packages/fb/ea/e24549f459800dc3bed21cd4e9c0d49d5b8deed65214b2444bd3e5a49f30/randaugment-1.0.2-py3-none-any.whl
[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
Installing collected packages: randaugment
Successfully installed randaugment-1.0.2
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [4]:
from randaugment import RandAugment, ImageNetPolicy
data = ImageFolder('/home/ec2-user/SageMaker/w210-capstone/data/cifar10/subset300', transform=transforms.Compose(
                        [
                            transforms.RandomCrop(32, padding=4, fill=128), # fill parameter needs torchvision installed from source
                            transforms.RandomHorizontalFlip(), 
                            RandAugment(),
                            #ImageNetPolicy(),
                            transforms.ToTensor(), 
                            Cutout(size=16), # (https://github.com/uoguelph-mlrg/Cutout/blob/master/util/cutout.py)
                            transforms.Normalize(...)
                        ])
)

NameError: name 'ImageFolder' is not defined

### Deep Augment
https://blog.insightdatascience.com/automl-for-data-augmentation-e87cf692c366
https://colab.research.google.com/drive/1KCAv2i_F3E3m_PKh56nbbZY8WnaASvgl#scrollTo=SuhR6Q3AMFy3

In [5]:
!pip install deepaugment

Collecting deepaugment
  Downloading https://files.pythonhosted.org/packages/99/f9/40211d827039df475091639c6aded9a1786849f898b9c619e24c15efc82a/deepaugment-1.1.2-py2.py3-none-any.whl
Collecting keras-applications==1.0.6 (from deepaugment)
[?25l  Downloading https://files.pythonhosted.org/packages/3f/c4/2ff40221029f7098d58f8d7fb99b97e8100f3293f9856f0fb5834bef100b/Keras_Applications-1.0.6-py2.py3-none-any.whl (44kB)
[K    100% |████████████████████████████████| 51kB 4.5MB/s eta 0:00:01
[?25hCollecting pandas==0.23.4 (from deepaugment)
[?25l  Downloading https://files.pythonhosted.org/packages/e1/d8/feeb346d41f181e83fba45224ab14a8d8af019b48af742e047f3845d8cff/pandas-0.23.4-cp36-cp36m-manylinux1_x86_64.whl (8.9MB)
[K    100% |████████████████████████████████| 8.9MB 5.1MB/s eta 0:00:01
[?25hCollecting scikit-optimize==0.5.2 (from deepaugment)
[?25l  Downloading https://files.pythonhosted.org/packages/f4/44/60f82c97d1caa98752c7da2c1681cab5c7a390a0fdd3a55fac672b321cac/scikit_optimize-0

In [8]:
from deepaugment.deepaugment import DeepAugment

deepaug = DeepAugment('/home/ec2-user/SageMaker/w210-capstone/data/cifar10/subset300', 
                      '/home/ec2-user/SageMaker/w210-capstone/data/cifar10/trainLabels.csv')

best_policies = deepaug.optimize()

TypeError: __new__() got an unexpected keyword argument 'serialized_options'

In [6]:
!ls /home/ec2-user/SageMaker/w210-capstone/data/cifar10/subset300

/home/ec2-user/SageMaker


In [1]:
# Must be conda_pytroch_p36 notebook
import argparse
import numpy as np
import os
import sagemaker
import torch
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch 

In [2]:
!pip install -r ./w210-capstone/models/pytorch_imageclass/requirements.txt

# Do we need NVIDIA installed?

[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [5]:
# Must be conda_pytroch_p36 notebook
!python w210-capstone/models/pytorch_imageclass/train.py --config w210-capstone/models/pytorch_imageclass/configs/cifar/resnext.yaml \
   model.resnext.depth 29 \
   model.resnext.cardinality 4 \
   train.batch_size 128 \
   train.base_lr 0.1 \
   train.output_dir experiments/resnext_29_4x64d/exp00

[32m[2020-06-04 00:41:31] __main__ INFO: [0mdevice: cpu
cudnn:
  benchmark: True
  deterministic: False
dataset:
  name: CIFAR10
  dataset_dir: ~/.torch/datasets/CIFAR10
  image_size: 32
  n_channels: 3
  n_classes: 10
model:
  type: cifar
  name: resnext
  init_mode: kaiming_fan_out
  vgg:
    n_channels: [64, 128, 256, 512, 512]
    n_layers: [2, 2, 3, 3, 3]
    use_bn: True
  resnet:
    depth: 110
    n_blocks: [2, 2, 2, 2]
    block_type: basic
    initial_channels: 16
  resnet_preact:
    depth: 110
    n_blocks: [2, 2, 2, 2]
    block_type: basic
    initial_channels: 16
    remove_first_relu: False
    add_last_bn: False
    preact_stage: [True, True, True]
  wrn:
    depth: 28
    initial_channels: 16
    widening_factor: 10
    drop_rate: 0.0
  densenet:
    depth: 100
    n_blocks: [6, 12, 24, 16]
    block_type: bottleneck
    growth_rate: 12
    drop_rate: 0.0
    compression_rate: 0.5
  pyramidnet:
    depth: 272
    n_blocks: [3, 24, 36, 3]
    initial_channels: 16
   

## SCRATCH

In [None]:
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Data and model checkpoints directories
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=100, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--backend', type=str, default=None,
                        help='backend for distributed training (tcp, gloo on cpu and gloo, nccl on gpu)')

    
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--learning-rate', type=float, default=0.1)
    
    parser.add_argument('--model_name', dest='model_name', type=str, help='model to train')
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'],
                        help='directory to save trained model to')
    parser.add_argument('--train_data', dest='train_data', type=str, help='dataset for model training') 
    parser.add_argument('--workers', dest='workers', type=int, help='number of V100 worker instances; 1 indicates non-distributed training')

    args = parser.parse_args()
    
    
    
    # Container environment
    env = sagemaker_containers.training_env()
    parser.add_argument('--hosts', type=list, default=env.hosts)
    parser.add_argument('--current-host', type=str, default=env.current_host)
    parser.add_argument('--model-dir', type=str, default=env.model_dir)
    parser.add_argument('--data-dir', type=str,
                        default=env.channel_input_dirs['training'])
    parser.add_argument('--num-gpus', type=int, default=env.num_gpus)

    train(parser.parse_args())

In [9]:
#torch.cuda.get_device_name(0)
echo $CUDA_PATH

SyntaxError: invalid syntax (<ipython-input-9-3319f1f978a5>, line 2)

In [16]:
!pip install -r ./w210-capstone/models/pytorch_/requirements.txt

# Do we need NVIDIA installed?

[31mCould not open requirements file: [Errno 2] No such file or directory: './w210-capstone/models/pytorch/requirements.txt'[0m
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [19]:
import subprocess

instance_type = 'local'

if subprocess.call('nvidia-smi') == 0:
    ## Set type to GPU if one is present
    instance_type = 'local_gpu'
    
print("Instance type = " + instance_type)

Instance type = local


In [10]:
#!nvidia-smi

In [33]:
# ResNext-29_4x64
!python w210-capstone/models/pytorch/train.py --config w210-capstone/models/pytorch/configs/cifar/resnext.yaml \
   model.resnext.depth 29 \
   model.resnext.cardinality 4 \
   train.batch_size 128 \
   train.base_lr 0.1 \
   train.output_dir experiments/resnext_29_4x64d/exp00

# ResNeXt-29 4x64d with a single GPU, batch size 32 and initial learning rate 0.025 
# (8 GPUs, batch size 128 and initial learning rate 0.1 in paper).

[32m[2020-06-03 20:54:56] __main__ INFO: [0mdevice: cpu
cudnn:
  benchmark: True
  deterministic: False
dataset:
  name: CIFAR10
  dataset_dir: ~/.torch/datasets/CIFAR10
  image_size: 32
  n_channels: 3
  n_classes: 10
model:
  type: cifar
  name: resnext
  init_mode: kaiming_fan_out
  vgg:
    n_channels: [64, 128, 256, 512, 512]
    n_layers: [2, 2, 3, 3, 3]
    use_bn: True
  resnet:
    depth: 110
    n_blocks: [2, 2, 2, 2]
    block_type: basic
    initial_channels: 16
  resnet_preact:
    depth: 110
    n_blocks: [2, 2, 2, 2]
    block_type: basic
    initial_channels: 16
    remove_first_relu: False
    add_last_bn: False
    preact_stage: [True, True, True]
  wrn:
    depth: 28
    initial_channels: 16
    widening_factor: 10
    drop_rate: 0.0
  densenet:
    depth: 100
    n_blocks: [6, 12, 24, 16]
    block_type: bottleneck
    growth_rate: 12
    drop_rate: 0.0
    compression_rate: 0.5
  pyramidnet:
    depth: 272
    n_blocks: [3, 24, 36, 3]
    initial_channels: 16
   

In [None]:

# initialize command line argument parsing
if __name__=='__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--learning-rate', type=float, default=0.1)
    
    parser.add_argument('--model_name', dest='model_name', type=str, help='model to train')
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'],
                        help='directory to save trained model to')
    parser.add_argument('--train_data', dest='train_data', type=str, help='dataset for model training') 
    parser.add_argument('--workers', dest='workers', type=int, help='number of V100 worker instances; 1 indicates non-distributed training')

    args = parser.parse_args()

    # Check quality of arguments
    valid_args = {'datasets': ['cifar10', 'cifar10_10k', 'cifar10_30k', 'cifar102', 'cifar102_30k'],
                  'model_names': ['wrn', 'shake_shake_32', 'shake_shake_96', 'shake_shake_112', 'pyramid_net']}

    if args.train_data not in valid_args['datasets']:
        parser.error('Invalid train_data parameter')

    if args.model_name not in valid_args['model_names']:
        parser.error('Invalid model_name parameter')
    
    if args.workers < 1:
        parser.error('Invalid number of workers')

    if not args.model_name:
        parser.error('--model_name parameter is required')
    elif not args.train_data:
        parser.error('--train_data parameter is required')
    elif not args.workers:
        parser.error('--workers parameter is required')

    # Set SageMaker session & execution role
    bucket='sagemaker-may29'
    sagemaker_session = sagemaker.Session(default_bucket=bucket)
    role = get_execution_role()


    # Set S3 path for data batches
    inputs = 's3://' + bucket + '/sagemaker/{}'.format(args.train_data)

    # Create the sagemaker estimator
    pytorch_estimator = PyTorch('./w210-capstone/models/pytorch/train.py',
                                train_instance_type='ml.p3.2xlarge',
                                train_instance_count=1,
                                framework_version='1.0.0',
                                hyperparameters = {'epochs': 20, 'batch-size': 64, 'learning-rate': 0.1})

    --config w210-capstone/models/pytorch/configs/cifar/resnext.yaml \
   model.resnext.depth 29 \
   model.resnext.cardinality 4 \
   train.batch_size 128 \
   train.base_lr 0.1 \
   train.output_dir experiments/resnext_29_4x64d/exp00
    
    
    # Train the Model
    pytorch_estimator.fit({'train': 's3://my-data-bucket/path/to/my/training/data',
                           'test': 's3://my-data-bucket/path/to/my/test/data'})


    # After training, save the model to `model_dir`
    with open(os.path.join(args.model_dir, 'model.pth'), 'wb') as f:
        torch.save(model.state_dict(), f)
        