In [40]:
import sys                                                                             # Python system library needed to load custom functions
import numpy as np                                                                     # for performing calculations on numerical arrays
import pandas as pd                                                                    # home of the DataFrame construct, _the_ most important object for Data Science
import seaborn as sns                                                                  # additional plotting library
import matplotlib.pyplot as plt                                                        # allows creation of insightful plots
import os                                                                              # for changing the directory

import sagemaker                                                                       # dedicated sagemaker library to execute training jobs
from sagemaker.pytorch.estimator import PyTorch
import boto3                                                                           # for interacting with S3 buckets

from sklearn.metrics import precision_recall_fscore_support, accuracy_score             # tools to understand how our model is performing

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import torchvision.models as models


sys.path.append('../src')                                                               # Add the source directory to the PYTHONPATH. This allows to import local functions and modules.
from img_class_util import CatDogDS




In [7]:
# Training script that SageMaker will run; the experiment name is derived from it.
entry_point = 'img_class_util.py'

# Keep everything before the first '.', then swap underscores for hyphens,
# because AWS does not allow '.' or '_' in experiment names.
script_stem = entry_point.partition('.')[0]
exp_name = script_stem.replace('_', '-')
exp_name

'img-class-util'

In [8]:
# Resolve the AWS context this notebook runs in: session + region, account id and IAM role.
my_session = boto3.session.Session()
region = my_session.region_name

# STS reports which account the current credentials belong to.
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')

# Role handed to the training job later on.
role = sagemaker.get_execution_role()


In [9]:
# Prefix inside the bucket that holds the image data.
folder = 'dog_cat_images'

# Build the bucket name from the region and account id resolved above instead of
# hard-coding one account's bucket, so it matches how `sagemaker_bucket` is built
# for the output location.
input_channels = {
    # Original author's note: the directory has to be 'data' because that is the
    # input in SM_Channel (presumably SM_CHANNEL_DATA - verify against the entry point).
    "data": f"s3://sagemaker-{region}-{account_id}/{folder}/data"
}


input_channels

{'data': 's3://sagemaker-us-east-1-175218665739/dog_cat_images/data'}

In [10]:
# Bucket name assembled from the region and account id resolved earlier.
sagemaker_bucket = "sagemaker-{}-{}".format(region, account_id)

# Training output goes under a prefix named after the experiment.
s3_output_location = "s3://{}/{}".format(sagemaker_bucket, exp_name)
s3_output_location

's3://sagemaker-us-east-1-175218665739/img-class-util'

In [7]:
# These values are handed to the training script, where its argument parser reads them.

num_epochs = 1

hyperparameters = dict(
    epochs=num_epochs,     # number of training epochs
    train_dir="train",     # folder name with training data
    val_dir="val",         # folder name with validation data
    test_dir="test",       # folder name with test data
)

In [8]:
# Regexes used to pull metric values out of the training logs; each one looks for
# "'<name>': <number>" and captures the number in its first group.
#
# The value pattern is a raw string so the backslashes reach the regex engine
# untouched ('\-' inside a plain string literal is an invalid escape sequence), and
# the dot is escaped so it only matches a decimal point rather than any character.
# Accepted forms are "<digits>.<digits>" and "<digits>e-<digits>".
metric_value_pattern = r"([0-9]+(\.|e\-)[0-9]+),?"

metric_names = (
    'loss',
    'learning_rate',
    'eval_loss',
    'eval_accuracy',
    'eval_f1',
    'eval_precision',
    'eval_recall',
    'epoch',
)

# One definition per metric, all sharing the same value pattern.
metric_definitions = [
    {'Name': name, 'Regex': f"'{name}': {metric_value_pattern}"}
    for name in metric_names
]

In [9]:
# Custom training image with the required libraries.
image_uri = '954362353459.dkr.ecr.us-east-1.amazonaws.com/sm-training-custom:latest'

# Collect the estimator configuration in one place, then build the estimator from it.
estimator_settings = dict(
    entry_point=entry_point,                 # fine-tuning script to run in the training job
    source_dir="../src",                     # directory holding the script; it is downloaded to the training instance
    instance_type="ml.g4dn.xlarge",          # GPU instance, so training is faster; only 1 is allotted
    output_path=s3_output_location,          # S3 location where the model is stored after training
    instance_count=1,                        # number of instances; we are limited to 1
    role=role,                               # IAM role the training job uses to access AWS resources (S3)
    image_uri=image_uri,                     # our custom image
    py_version="py310",                      # Python version
    hyperparameters=hyperparameters,         # hyperparameters for the training job
    metric_definitions=metric_definitions,   # metrics to extract from the logs; visible in the SageMaker training job UI
)

pytorch_estimator = PyTorch(**estimator_settings)

In [10]:
# Start the training job on the configured input channels; wait=True keeps this cell
# attached to the job (the log stream below is its output).
pytorch_estimator.fit(inputs=input_channels, wait=True)

INFO:sagemaker:Creating training-job with name: sm-training-custom-2023-07-15-16-45-49-730


Using provided s3_resource
2023-07-15 16:45:50 Starting - Starting the training job...
2023-07-15 16:46:04 Starting - Preparing the instances for training......
2023-07-15 16:47:16 Downloading - Downloading input data.........
2023-07-15 16:48:31 Training - Downloading the training image......
2023-07-15 16:49:32 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-07-15 16:49:41,550 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-07-15 16:49:41,567 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-07-15 16:49:41,576 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-07-15 16:49:41,579 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-

## Load Model

In [42]:
# Rebuild the network architecture: VGG16 with its last classifier layer replaced by
# a 2-output linear layer (4096 -> 2).
# `weights=True` is the deprecated legacy spelling; name the weights enum it maps to
# explicitly so the call stays valid on current torchvision releases.
model = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
model.classifier[-1] = nn.Linear(in_features=4096, out_features=2)

# Use the first GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = model.to(device)



In [43]:

# Where the trained artifact ends up locally, and where the training job wrote it in S3.
local_model_dir = "../models"

# Object key of the archive inside the bucket.
key = "img-class-util/sm-training-custom-2023-07-15-16-45-49-730/output/model.tar.gz"

# Output folder of the training job as a full S3 URL.
model_url = "s3://sagemaker-us-east-1-175218665739/img-class-util/sm-training-custom-2023-07-15-16-45-49-730/output/"
    


In [44]:
# Name of the archive as it will be stored locally.
file_name = "model.tar.gz"

# S3 resource handle bound to the notebook's region.
s3 = boto3.resource(service_name="s3", region_name=region)

In [14]:
# Make sure the destination folder exists so the download does not fail on a fresh checkout.
os.makedirs(local_model_dir, exist_ok=True)

# Fetch the training artifact, reusing the `key` and `file_name` defined above instead
# of repeating the same literals.
s3.Bucket(sagemaker_bucket).download_file(key, f"{local_model_dir}/{file_name}")

# NOTE: this stores the compressed archive (model.tar.gz). The cell that loads the
# weights reads f"{local_model_dir}/model.pth", so the archive has to be unpacked
# into `local_model_dir` before that cell is run.

In [45]:
# Read the saved state dict onto the CPU, then copy it into the model built above.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
state_dict_path = f"{local_model_dir}/model.pth"
trained_state = torch.load(state_dict_path, map_location=torch.device('cpu'))
model.load_state_dict(trained_state)

<All keys matched successfully>

## Make Inferences

In [47]:
# Inputs for the test dataset. Presumably a metadata CSV plus the folder with the
# images; the meaning of the trailing `None, True` arguments is defined by CatDogDS
# in ../src - confirm there.
test_meta_path = "../data/test_meta.csv"
test_data_path = "../data/test1"
test_ds = CatDogDS(test_meta_path, test_data_path, None, True)

# Batches of 16 in dataset order (no shuffling).
test_dl = DataLoader(dataset=test_ds, batch_size=16, shuffle=False)

In [50]:
def make_predictions(model, test_dl):
    """Run ``model`` over every batch of ``test_dl`` and collect the predicted class ids.

    Args:
        model: ``torch.nn.Module`` producing one score per class for each input row.
        test_dl: iterable of batches where ``batch[0]`` is the input tensor and
            ``batch[1]`` is a sequence with the file path of each input.

    Returns:
        tuple ``(file_names, predictions)`` of two lists in batch order:
        ``file_names`` holds the last ``/``-separated component of every path and
        ``predictions`` holds the index of the highest score as a Python int.
    """
    # Run on whatever device the model lives on, instead of relying on a notebook
    # global; a model without parameters falls back to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    file_names = []
    predictions = []

    # Switch to evaluation mode so layers such as dropout behave deterministically
    # during inference; the previous mode is restored before returning.
    was_training = model.training
    model.eval()
    try:
        # Disable gradient tracking; nothing is being trained here.
        with torch.no_grad():
            for data in test_dl:
                # Put the inputs on the model's device; the paths stay as they are.
                inputs, paths = data[0].to(target_device), data[1]

                # Keep only the file name of every path.
                file_names.extend(path.split('/')[-1] for path in paths)

                # Normalize the inputs with the mean/std of the whole batch.
                inputs_m, inputs_s = inputs.mean(), inputs.std()
                inputs = (inputs - inputs_m) / inputs_s

                # Get the scores and pick the class with the highest one per row.
                outputs = model(inputs)
                _, prediction = torch.max(outputs, 1)

                predictions.extend(prediction.tolist())
    finally:
        model.train(was_training)

    return file_names, predictions

In [51]:

# Predict on the whole test loader.
file_names, predictions = make_predictions(model, test_dl)

# Pair each file name with its predicted class id, one row per pair.
prediction_rows = list(zip(file_names, predictions))
test_predictions_df = pd.DataFrame(prediction_rows, columns=['file_name', 'predicted_class_id'])

In [52]:
# Preview the first five predictions.
test_predictions_df.head(n=5)

Unnamed: 0,file_name,predicted_class_id
0,10592.jpg,0
1,7217.jpg,0
2,3653.jpg,1
3,4382.jpg,0
4,2924.jpg,1


## References

Discussions
1. https://stackoverflow.com/questions/37514810/how-to-get-the-region-of-the-current-user-from-boto

Articles
1. https://www.analyticsvidhya.com/blog/2021/06/transfer-learning-using-vgg16-in-pytorch/
Documentation