<h1>C4 Solution</h1>

Setup notebook
Notes about the instance size and kernel setup:

The Python 3 (Data Science) kernel
The ml.t3.medium SageMaker notebook instance

In [1]:
# Name of the S3 bucket used for the dogImages dataset (see the copy commands below).
s3_bucket = 'eli-udacitysolutions'

<h3>Install and import</h3>

In [2]:
%%capture
import sys
!{sys.executable} -m pip install smdebug torch torchvision tqdm

In [3]:
%%capture
!pip install smdebug torch torchvision tqdm

In [4]:
!pip install -Iv "numexpr==2.8.0"


Using pip 23.3.1 from /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pip (python 3.10)
Collecting numexpr==2.8.0
  Obtaining dependency information for numexpr==2.8.0 from https://files.pythonhosted.org/packages/28/17/3e7a34cf7acde4445545a69356ee1e769cfb2c2b6c74c43ae1dbdeebfa56/numexpr-2.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached numexpr-2.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.0 kB)
Collecting numpy>=1.7 (from numexpr==2.8.0)
  Obtaining dependency information for numpy>=1.7 from https://files.pythonhosted.org/packages/4b/d7/ecf66c1cd12dc28b4040b15ab4d17b773b87fa9d29ca16125de01adb36cd/numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numexpr-2.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (380 kB)
Using cached numpy-1.26.4-c

In [5]:
import os
import boto3
import sagemaker
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, ProfilerRule, rule_configs, ProfilerConfig, FrameworkProfile
from sagemaker.analytics import HyperparameterTuningJobAnalytics

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


<h3>Get the data and copy it to S3</h3>

In [6]:
# %%capture
# !wget https://s3-us-west-1.amazonaws.com/udacity-aind/dog-project/dogImages.zip
# !unzip dogImages.zip
# !aws s3 cp dogImages s3://eli-udacitysolutions/ --recursive

In [7]:
# !aws s3 cp dogImages s3://eli-udacitysolutions/ --recursive
# !aws s3 cp dogImages s3://{s3_bucket}/ --recursive

<h3>Set up parameters, estimator, and tuner</h3>

In [8]:
# Search space explored by the hyperparameter tuning job.
hyperparameter_ranges = dict(
    learning_rate=ContinuousParameter(0.001, 0.1),
    # Other batch sizes previously listed here: 32, 64, 128, 256, 512.
    batch_size=CategoricalParameter([2, 3]),
)

# Execution role of this notebook; reused by every estimator and model below.
role = sagemaker.get_execution_role()

# The tuner minimizes the value captured by the regex in the metric definition.
objective_metric_name = "Test Loss"
objective_type = "Minimize"
metric_definitions = [
    {
        "Name": "Test Loss",
        "Regex": "Testing Loss: ([0-9\\.]+)",
    }
]

In [9]:
# Estimator describing how each hyperparameter-tuning training job is launched.
estimator = PyTorch(
    role=role,
    entry_point="hpo.py",
    base_job_name="pytorch-dog-hpo",
    framework_version="1.4.0",
    py_version="py3",
    instance_type="ml.g4dn.xlarge",
    instance_count=1,
)

# Re-attach to an existing tuning job by name instead of launching a new one.
job_name = "pytorch-training-240301-1416"
tuner = HyperparameterTuner.attach(job_name)
# s3://sagemaker-us-east-1-339840815706/dog-pytorch-2024-03-01-15-13-15-028/

# To launch a fresh tuning job instead, build the tuner from the estimator:
# tuner = HyperparameterTuner(
#     estimator,
#     objective_metric_name,
#     hyperparameter_ranges,
#     metric_definitions,
#     max_jobs=2,
#     max_parallel_jobs=1,  # you only have one ml.g4dn.xlarge instance available
#     objective_type=objective_type
# )

# Last expression of the cell: display the attached tuner.
tuner

<sagemaker.tuner.HyperparameterTuner at 0x7fea5200b670>

<h3>Fit the tuner</h3>

# When you call fit, the training runs on a separate instance. You can run your notebook on an ml.t3.medium instance while the training job you initiate runs on an ml.g4dn.xlarge instance.

In [10]:
# Derive every S3 location from `s3_bucket` (defined in the setup cell) so the
# bucket name lives in exactly one place.
os.environ['SM_CHANNEL_TRAINING'] = f"s3://{s3_bucket}/"
os.environ['SM_MODEL_DIR'] = f"s3://{s3_bucket}/model/"
os.environ['SM_OUTPUT_DATA_DIR'] = f"s3://{s3_bucket}/output/"
# Uncomment to launch the tuning job (only needed when the tuner was built, not attached):
# tuner.fit({"training": f"s3://{s3_bucket}/"})

<h3>Describe the tuning results</h3>


In [11]:
# Load the per-training-job results of the tuning job into a DataFrame.
# `job_name` is the same tuning job the tuner was attached to, so the two
# cells cannot drift apart.
exp = HyperparameterTuningJobAnalytics(hyperparameter_tuning_job_name=job_name)

jobs = exp.dataframe()

# Descending order: the highest objective value is listed first, so with a
# "Minimize" objective the best job is the LAST row.
jobs.sort_values('FinalObjectiveValue', ascending=False)

Unnamed: 0,batch_size,learning_rate,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
1,"""3""",0.011317,pytorch-training-240301-1416-001-bdb8915c,Completed,14.0,2024-03-01 14:17:40+00:00,2024-03-01 14:38:56+00:00,1276.0
0,"""2""",0.05788,pytorch-training-240301-1416-002-41985f56,Completed,9.0,2024-03-01 14:41:37+00:00,2024-03-01 15:03:28+00:00,1311.0


## Important: if the kernel dies, how to continue from a completed training job

In [12]:
#BetterTrainingJobName='pytorch-training-210623-2156-001-fdd5e081'
#my_estimator = sagemaker.estimator.Estimator.attach(BetterTrainingJobName)
#my_estimator.hyperparameters()
#best_estimator=my_estimator

<h3>Prepare to perform Training on Best Estimator</h3>

In [13]:
#inputs = "s3://eli-udacitysolutions/dogimages"
# Ask the attached tuner for the estimator of its best training job.
best_estimator=tuner.best_estimator()


2024-03-01 15:03:30 Starting - Found matching resource for reuse
2024-03-01 15:03:30 Downloading - Downloading the training image
2024-03-01 15:03:30 Training - Training image download completed. Training in progress.
2024-03-01 15:03:30 Uploading - Uploading generated training model
2024-03-01 15:03:30 Completed - Resource released due to keep alive period expiry


In [14]:
best_estimator.hyperparameters()

{'_tuning_objective_metric': '"Test Loss"',
 'batch_size': '"2"',
 'learning_rate': '0.05788007639163844',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"PyTorch"',
 'sagemaker_estimator_module': '"sagemaker.pytorch.estimator"',
 'sagemaker_job_name': '"pytorch_dog_hpo-2024-03-01-14-16-39-430"',
 'sagemaker_program': '"hpo.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-339840815706/pytorch_dog_hpo-2024-03-01-14-16-39-430/source/sourcedir.tar.gz"'}

In [15]:
# Read the tuned values once and rebuild the dict passed to the final training jobs.
best_hyperparameters = best_estimator.hyperparameters()

# batch_size is stored wrapped in double quotes (e.g. '"2"'); strip them before casting.
batch_size_text = best_hyperparameters['batch_size'].replace('"', '')

hyperparameters = {
    "batch_size": int(batch_size_text),
    "learning_rate": best_hyperparameters['learning_rate'],
}
hyperparameters

{'batch_size': 2, 'learning_rate': '0.05788007639163844'}

In [16]:
# Built-in Debugger rule configurations, in the order they are attached.
debugger_rule_factories = (
    rule_configs.vanishing_gradient,
    rule_configs.overfit,
    rule_configs.overtraining,
    rule_configs.poor_weight_initialization,
)

rules = [Rule.sagemaker(make_config()) for make_config in debugger_rule_factories]
# The profiler report uses ProfilerRule rather than Rule, so it is appended last.
rules.append(ProfilerRule.sagemaker(rule_configs.ProfilerReport()))

In [17]:
# Debugger hook: a save interval of "1" for both the train and eval modes.
hook_parameters = {"train.save_interval": "1", "eval.save_interval": "1"}
hook_config = DebuggerHookConfig(hook_parameters=hook_parameters)

# Profiler: 500 ms system-monitor interval, framework profiling with num_steps=1.
framework_profile = FrameworkProfile(num_steps=1)
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=framework_profile,
)

Framework profiling will be deprecated from tensorflow 2.12 and pytorch 2.0 in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


<h2>Creating an Estimator</h2>

In [18]:
# Single-instance training job using the tuned hyperparameters, with the
# Debugger rules, hook configuration and profiler configuration attached.
estimator = PyTorch(
    base_job_name='dog-pytorch',
    entry_point='hpo.py',
    role=role,
    framework_version='1.4.0',
    py_version='py3',
    instance_type='ml.m5.2xlarge',
    instance_count=1,
    hyperparameters=hyperparameters,
    # Debugger and Profiler parameters
    rules=rules,
    debugger_hook_config=hook_config,
    profiler_config=profiler_config,
)

In [19]:
# Start the single-instance training job on the data in `s3_bucket`.
# wait=False: the call is made without waiting for the job to finish.
estimator.fit({"training": f"s3://{s3_bucket}/"}, wait=False)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: dog-pytorch-2024-03-04-13-17-59-793


In [20]:
# Same training configuration as the single-instance estimator, but spread
# over four instances (instance_count=4).
estimator_multi_instance = PyTorch(
    base_job_name='dog-pytorch-multi-instance',
    entry_point='hpo.py',
    role=role,
    framework_version='1.4.0',
    py_version='py3',
    instance_type='ml.m5.2xlarge',
    instance_count=4,
    hyperparameters=hyperparameters,
    # Debugger and Profiler parameters
    rules=rules,
    debugger_hook_config=hook_config,
    profiler_config=profiler_config,
)

<h2>Creating an Estimator - Multi-Instance Training</h2>

In [21]:
# Fit the multi-instance estimator on the data in `s3_bucket`.
# wait=False: the call is made without waiting for the job to finish.
estimator_multi_instance.fit({"training": f"s3://{s3_bucket}/"}, wait=False)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: dog-pytorch-multi-instance-2024-03-04-13-18-01-110


In [22]:
#estimator_multi_instance.model_data

<h2>Deployment</h2>

In [23]:
#model_location='s3://sagemaker-us-east-1-339840815706/dog-pytorch-2024-03-01-15-13-15-028/source/sourcedir.tar.gz' # estimator.model_data


In [26]:
# S3 URI of the model artifact produced by the single-instance training job.
# NOTE(review): fit() was called with wait=False; presumably the job must have
# completed before this artifact exists — confirm before deploying.
model_location=estimator.model_data
model_location

's3://sagemaker-us-east-1-339840815706/dog-pytorch-2024-03-04-13-17-59-793/output/model.tar.gz'

In [27]:
# S3 URI of the model artifact produced by the multi-instance training job.
# NOTE(review): fit() was called with wait=False; presumably the job must have
# completed before this artifact exists — confirm before deploying.
multi_instance_model_location=estimator_multi_instance.model_data
multi_instance_model_location

's3://sagemaker-us-east-1-339840815706/dog-pytorch-multi-instance-2024-03-04-13-18-01-110/output/model.tar.gz'

In [28]:
import sagemaker
import boto3
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, ProfilerRule, rule_configs
from sagemaker.debugger import ProfilerConfig, FrameworkProfile

from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import Predictor


In [30]:
# Serializer configured with the "image/jpeg" content type, and a JSON
# deserializer; both are shared by every ImagePredictor instance.
jpeg_serializer = sagemaker.serializers.IdentitySerializer("image/jpeg")
json_deserializer = sagemaker.deserializers.JSONDeserializer()


class ImagePredictor(Predictor):
    """Predictor preconfigured with `jpeg_serializer` and `json_deserializer`."""

    def __init__(self, endpoint_name, sagemaker_session):
        """Bind the predictor to `endpoint_name` using `sagemaker_session`."""
        predictor_options = {
            "sagemaker_session": sagemaker_session,
            "serializer": jpeg_serializer,
            "deserializer": json_deserializer,
        }
        super().__init__(endpoint_name, **predictor_options)

In [31]:
# Inference model built from the artifact of the multi-instance training job.
multi_instance_pytorch_model = PyTorchModel(
    model_data=multi_instance_model_location,
    role=role,
    entry_point='infernce2.py',
    framework_version='1.4',
    py_version='py3',
    predictor_cls=ImagePredictor,
)

In [None]:
# Inference model built from the artifact of the single-instance training job.
pytorch_model = PyTorchModel(
    model_data=model_location,
    role=role,
    entry_point='infernce2.py',
    framework_version='1.4',
    py_version='py3',
    predictor_cls=ImagePredictor,
)
# pytorch-inference-2024-03-02-20-17-42-241

In [32]:
# Deploy the multi-instance-trained model to one ml.m5.large endpoint instance.
multi_instance_predictor = multi_instance_pytorch_model.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
)

INFO:sagemaker:Repacking model artifact (s3://sagemaker-us-east-1-339840815706/dog-pytorch-multi-instance-2024-03-04-13-18-01-110/output/model.tar.gz), script artifact (None), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-339840815706/pytorch-inference-2024-03-04-13-40-26-674/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-03-04-13-40-39-193
INFO:sagemaker:Creating endpoint-config with name pytorch-inference-2024-03-04-13-40-39-863
INFO:sagemaker:Creating endpoint with name pytorch-inference-2024-03-04-13-40-39-863


----!

In [None]:
# https://stackoverflow.com/questions/56467434/making-a-prediction-sagemaker-pytorch

In [None]:
# Gets inference from the model hosted at the specified endpoint:
# response = sagemaker_runtime.invoke_endpoint(
#     EndpointName=endpoint_name, 
#     Body=bytes('{"features": ["This is great!"]}', 'utf-8')
#     )

In [None]:
# Deploy the single-instance-trained model to one ml.m5.large endpoint instance.
predictor = pytorch_model.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
)


In [None]:
# Endpoint serving the model trained by the multi-instance job.
multi_instance_predictor.endpoint_name

In [None]:
# Endpoint serving the model trained by the single-instance job
# (`predictor` is deployed from `pytorch_model`, not the multi-instance model).
predictor.endpoint_name

In [None]:
import requests
#request_dict={ "url": "https://cdn1-www.cattime.com/assets/uploads/2011/12/file_2744_british-shorthair-460x290-460x290.jpg" }
request_dict={ "url": "https://s3.amazonaws.com/cdn-origin-etr.akc.org/wp-content/uploads/2017/11/20113314/Carolina-Dog-standing-outdoors.jpg" }

# Bound the download time and fail loudly on an HTTP error, so an error page
# is never forwarded to the endpoint as if it were JPEG bytes.
image_response = requests.get(request_dict['url'], timeout=30)
image_response.raise_for_status()

img_bytes = image_response.content
type(img_bytes)

In [None]:
import io
from PIL import Image

# Wrap the downloaded bytes in a file-like buffer and display the decoded image.
image_buffer = io.BytesIO(img_bytes)
Image.open(image_buffer)

In [None]:
# Send the raw image bytes to the endpoint with an explicit image/jpeg content type.
response = predictor.predict(
    img_bytes,
    initial_args={"ContentType": "image/jpeg"},
)

In [None]:
import json

# Alternative request: send the image URL as a JSON document instead of the bytes.
json_payload = json.dumps(request_dict)
response2 = predictor.predict(
    json_payload,
    initial_args={"ContentType": "application/json"},
)

In [None]:
type(response2[0][0])

In [None]:
response2[0]

In [None]:
import numpy as np
import torch

# Index of the largest value along axis 1 of the endpoint response.
np.argmax(response, axis=1)

In [None]:
def softmax(x):
    """Return the softmax of `x`, normalized over ALL of its elements.

    Parameters
    ----------
    x : array-like of numbers
        Raw scores (logits). Lists are accepted and converted with
        ``np.asarray``.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `x` whose entries sum to 1. An empty
        input yields an empty array.
    """
    scores = np.asarray(x)
    if scores.size == 0:
        # np.max is undefined on an empty array; keep the old "empty in, empty out".
        return np.exp(scores)
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps np.exp from overflowing to inf (which produced nan) on large logits.
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / np.sum(exp_scores)

In [None]:
# Breed names indexed by predicted class; the final cell looks up
# dog_names[argmax(logits)].
# NOTE(review): presumably this order matches the label order the model was
# trained with — verify against the training data folders.
dog_names = ['Affenpinscher', 'Afghan_hound', 'Airedale_terrier', 'Akita',
 'Alaskan_malamute', 'American_eskimo_dog', 'American_foxhound',
 'American_staffordshire_terrier', 'American_water_spaniel',
 'Anatolian_shepherd_dog', 'Australian_cattle_dog', 'Australian_shepherd',
 'Australian_terrier', 'Basenji', 'Basset_hound', 'Beagle', 'Bearded_collie',
 'Beauceron', 'Bedlington_terrier', 'Belgian_malinois', 'Belgian_sheepdog',
 'Belgian_tervuren', 'Bernese_mountain_dog', 'Bichon_frise',
 'Black_and_tan_coonhound', 'Black_russian_terrier', 'Bloodhound',
 'Bluetick_coonhound', 'Border_collie', 'Border_terrier', 'Borzoi',
 'Boston_terrier', 'Bouvier_des_flandres', 'Boxer', 'Boykin_spaniel', 'Briard',
 'Brittany', 'Brussels_griffon', 'Bull_terrier', 'Bulldog', 'Bullmastiff',
 'Cairn_terrier', 'Canaan_dog', 'Cane_corso', 'Cardigan_welsh_corgi',
 'Cavalier_king_charles_spaniel', 'Chesapeake_bay_retriever', 'Chihuahua',
 'Chinese_crested', 'Chinese_shar-pei', 'Chow_chow', 'Clumber_spaniel',
 'Cocker_spaniel', 'Collie', 'Curly-coated_retriever', 'Dachshund',
 'Dalmatian', 'Dandie_dinmont_terrier', 'Doberman_pinscher',
 'Dogue_de_bordeaux', 'English_cocker_spaniel', 'English_setter',
 'English_springer_spaniel', 'English_toy_spaniel',
 'Entlebucher_mountain_dog', 'Field_spaniel', 'Finnish_spitz',
 'Flat-coated_retriever', 'French_bulldog', 'German_pinscher',
 'German_shepherd_dog', 'German_shorthaired_pointer',
 'German_wirehaired_pointer', 'Giant_schnauzer', 'Glen_of_imaal_terrier',
 'Golden_retriever', 'Gordon_setter', 'Great_dane', 'Great_pyrenees',
 'Greater_swiss_mountain_dog', 'Greyhound', 'Havanese', 'Ibizan_hound',
 'Icelandic_sheepdog', 'Irish_red_and_white_setter', 'Irish_setter',
 'Irish_terrier', 'Irish_water_spaniel', 'Irish_wolfhound',
 'Italian_greyhound', 'Japanese_chin', 'Keeshond', 'Kerry_blue_terrier',
 'Komondor', 'Kuvasz', 'Labrador_retriever', 'Lakeland_terrier', 'Leonberger',
 'Lhasa_apso', 'Lowchen', 'Maltese', 'Manchester_terrier', 'Mastiff',
 'Miniature_schnauzer', 'Neapolitan_mastiff', 'Newfoundland',
 'Norfolk_terrier', 'Norwegian_buhund', 'Norwegian_elkhound',
 'Norwegian_lundehund', 'Norwich_terrier',
 'Nova_scotia_duck_tolling_retriever', 'Old_english_sheepdog', 'Otterhound',
 'Papillon', 'Parson_russell_terrier', 'Pekingese', 'Pembroke_welsh_corgi',
 'Petit_basset_griffon_vendeen', 'Pharaoh_hound', 'Plott', 'Pointer',
 'Pomeranian', 'Poodle', 'Portuguese_water_dog', 'Saint_bernard',
 'Silky_terrier', 'Smooth_fox_terrier', 'Tibetan_mastiff',
 'Welsh_springer_spaniel', 'Wirehaired_pointing_griffon', 'Xoloitzcuintli',
 'Yorkshire_terrier',]

In [None]:
# Map the first row of the response to a breed name and its softmax probability.
logits = response[0]
predicted_index = np.argmax(logits)
predicted_probability = softmax(logits)[predicted_index]
print(f"Predicted breed is {dog_names[predicted_index]} with probability of {predicted_probability}")