# Image Similarity Modeling and Analysis with PyTorch

In [4]:
import os
import sagemaker

In [None]:
# Key prefix (no leading slash) under which this demo keeps its training data.
prefix = "sagemaker/DEMO-pytorch-siamese-network/data"

# A single session is reused by every upload / describe call in the notebook.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Execution role handed to the training job and to the hosted models.
role = sagemaker.get_execution_role()

## Data

### Getting the data

http://vision.cs.utexas.edu/projects/finegrained/utzap50k/
    
For non-profit use only. 

In [None]:
%%bash
# Fetch the UT Zappos50K square-image archive into the working directory and unpack it.
ARCHIVE=ut-zap50k-images-square.zip
aws s3 cp "s3://reinvent-2018-sagemaker-pytorch/${ARCHIVE}" .
# -n: never overwrite files that are already extracted, so the cell can be re-run.
unzip -n "${ARCHIVE}"

Download the ground-truth data for training and the scripts for SageMaker training and inference.

In [None]:
%%bash
# Stop at the first failed copy; without this the cell only reports the exit
# status of the last command and a missing script goes unnoticed.
set -e

S3_ROOT=s3://reinvent-2018-sagemaker-pytorch

# The ground-truth labels stay in the working directory; they are uploaded from there later.
aws s3 cp "${S3_ROOT}/ground_truth.csv" .

# The estimator and both models are created with source_dir='source', so the
# entry-point scripts must live in ./source rather than in the working directory.
mkdir -p source
for script in requirements.txt batch_inference.py inference.py sim_model.py cnn_siamese_network.py; do
    aws s3 cp "${S3_ROOT}/source/${script}" "source/${script}"
done

[OPTIONAL] A pre-trained model is available at: s3://reinvent-2018-sagemaker-pytorch/models/v1

In [5]:
# Brand folders of the Zappos50K dataset used by this demo. Each entry starts
# with "/" because it is appended both to "." (local path) and to the S3 key prefix.
IMG_PATHS = [
    "/ut-zap50k-images-square/Boots/Knee High/Anne Klein",
    "/ut-zap50k-images-square/Boots/Knee High/Ariat",
    "/ut-zap50k-images-square/Boots/Mid-Calf/UGG",
    "/ut-zap50k-images-square/Sandals/Athletic/Keen Kids",
    "/ut-zap50k-images-square/Sandals/Heel/Annie",
    "/ut-zap50k-images-square/Sandals/Heel/Fly Flot",
    "/ut-zap50k-images-square/Sandals/Heel/Onex",
    "/ut-zap50k-images-square/Shoes/Oxfords/Calvin Klein",
    "/ut-zap50k-images-square/Shoes/Oxfords/Rockport",
]

# Local directory holding the training / inference scripts, and the notebook's cwd.
SOURCE_DIR = 'source'
WORKING_DIR = os.getcwd()

# Command-line argument names (presumably those accepted by the training script -- TODO confirm).
ARGS = [
    '--batch_size',
    '--epochs',
    '--learning-rate',
    '--similarity_dims',
]

# Hyperparameter values.
PARAM_EPOCHS = 8
PARAM_BATCH_SIZE = 64
PARAM_LR = 1e-4
PARAM_SIMILARITY_DIMS = 64

In [None]:
# Mirror every selected brand folder to S3 under the same relative path.
# `inputs` ends up holding the return value of the last upload only.
for img_dir in IMG_PATHS:
    inputs = sagemaker_session.upload_data(
        path="." + img_dir,
        bucket=bucket,
        key_prefix=prefix + img_dir,
    )

In [None]:
# Upload the ground-truth labels next to the image folders under the data prefix.
sagemaker_session.upload_data(path='ground_truth.csv', bucket=bucket, key_prefix=prefix)
#sagemaker_session.upload_data(path='requirements.txt', bucket=bucket, key_prefix=prefix+"/"+SOURCE_DIR)
# NOTE: a third, unfinished call -- upload_data(path=, bucket=bucket, key_prefix=prefix) --
# was removed here: it had no `path` value, so the whole cell failed with a SyntaxError.

## Train

In [None]:
from sagemaker.pytorch import PyTorch

# Training job definition: one ml.p3.2xlarge instance running the siamese-network
# script from SOURCE_DIR. Only the epoch count is passed as a hyperparameter.
estimator = PyTorch(
    entry_point="cnn_siamese_network.py",
    source_dir=SOURCE_DIR,
    role=role,
    framework_version='0.4.0',
    train_instance_type='ml.p3.2xlarge',
    train_instance_count=1,
    hyperparameters={'epochs': PARAM_EPOCHS},
)

In [None]:
# `prefix` has no leading slash, so the separator between bucket and key must be
# added explicitly; plain concatenation produced 's3://<bucket>sagemaker/...'.
training_data_uri = 's3://{}/{}'.format(bucket, prefix)
estimator.fit({'train': training_data_uri})

## Optional: Real-time Inference

In [None]:
from torchvision import transforms
from PIL import Image

# Preprocessing applied to every image before it is sent to the model:
# resize to 224, convert to a tensor, then normalise each channel with the
# mean / std values listed below.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

TRANSFORMATIONS = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
])

def getImageTensor(img_path, transform):
    """Load an image from disk and run it through ``transform``.

    Args:
        img_path: Path of the image file to open.
        transform: Callable applied to the decoded image (e.g. TRANSFORMATIONS).

    Returns:
        Whatever ``transform`` returns for the image.
    """
    # The context manager releases the file handle once the pixels are decoded.
    with Image.open(img_path) as image:
        # Force three channels: the normalisation used in this notebook is
        # configured with three mean/std values, which grayscale, palette or
        # CMYK files would not match.
        image_tensor = transform(image.convert('RGB'))

    return image_tensor

In [None]:
from sagemaker.predictor import RealTimePredictor, npy_serializer, json_deserializer

# Content types for requests / responses (not referenced by the predictor below).
INPUT_CONTENT_TYPE = 'application/npy'
OUTPUT_CONTENT_TYPE = 'text/csv'


class CustomPredictor(RealTimePredictor):
    """Predictor that serializes requests as NPY and deserializes responses as JSON."""

    def __init__(self, endpoint_name, sagemaker_session):
        super(CustomPredictor, self).__init__(
            endpoint_name,
            sagemaker_session,
            serializer=npy_serializer,
            deserializer=json_deserializer,
        )

In [None]:
# PyTorchModel was previously imported only in the batch-inference section further
# down, so this cell raised NameError when the notebook was run top to bottom.
from sagemaker.pytorch import PyTorchModel

# Look up where the most recent training job stored its model artifacts.
training_job_name = estimator.latest_training_job.name
print('last training job: '+training_job_name)
desc = sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=training_job_name)
trained_model_location = desc['ModelArtifacts']['S3ModelArtifacts']
print('model location: '+trained_model_location)

# Model for real-time inference: inference.py is the entry point and
# CustomPredictor is the class used for the predictor returned by deploy().
model = PyTorchModel(model_data=trained_model_location,
                     role=role,
                     framework_version='0.4.0',
                     entry_point='inference.py',
                     source_dir=SOURCE_DIR,
                     predictor_cls = CustomPredictor)

In [None]:
# Deploy the real-time model on a single ml.m4.xlarge instance.
similiarity_calculator = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

# The following snippets are utilities for converting images to a usable format for the batch inference engine. It shouldn't be included in the published notebook

The code block below defines a dataset class that PyTorch uses to traverse our image dataset. It relies on
an index file that provides the location of the images on the local file system.

In [None]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class Zappos50kDataset(Dataset):
    """Dataset over Zappos50K images listed in a CSV index file.

    Only the first two columns of the index are read: column 0 is the image
    path relative to ``root_dir`` and column 1 is the label.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        # The index file has no header row; extra columns are ignored.
        self.index = pd.read_csv(csv_file, header=None, usecols = [0,1])
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the index file."""
        return self.index.shape[0]

    def __getitem__(self, idx):
        """Return ``{'name', 'tensor', 'label'}`` for the image at row ``idx``.

        ``'tensor'`` is the output of ``transform``, or the RGB image itself
        when no transform was supplied.
        """
        rel_path = self.index.iloc[idx, 0]
        label = self.index.iloc[idx, 1]

        # Decode inside a context manager so the file handle is released, and
        # force three channels so non-RGB files yield the same layout as the rest.
        with Image.open(os.path.join(self.root_dir, rel_path)) as img_file:
            image = img_file.convert('RGB')

        # `transform` is optional: calling None used to raise TypeError.
        if self.transform is not None:
            image_tensor = self.transform(image)
        else:
            image_tensor = image

        return {'name': rel_path, 'tensor': image_tensor, 'label':label}

The snippet below generates an index file used by the custom zappos dataset object to load specific files from the
Zappos50k dataset stored on a local file system.

In [None]:
import hashlib
import csv
import os

# Location of the generated index file.
IMG_IDX = WORKING_DIR+"/index.csv"

# (brand folder relative to WORKING_DIR, label) pairs.
# In the labels used here 0 = Boots, 1 = Sandals, 2 = Shoes.
# NOTE: this rebinds IMG_PATHS, which earlier in the notebook is a plain list of
# "/"-prefixed strings used by the S3 upload loop.
IMG_PATHS = [
    ("ut-zap50k-images-square/Boots/Knee High/Anne Klein", 0),
    ("ut-zap50k-images-square/Boots/Knee High/Ariat", 0),
    ("ut-zap50k-images-square/Boots/Mid-Calf/UGG", 0),
    ("ut-zap50k-images-square/Sandals/Athletic/Keen Kids", 1),
    ("ut-zap50k-images-square/Sandals/Heel/Annie", 1),
    ("ut-zap50k-images-square/Sandals/Heel/Fly Flot", 1),
    ("ut-zap50k-images-square/Sandals/Heel/Onex", 1),
    ("ut-zap50k-images-square/Shoes/Oxfords/Calvin Klein", 2),
    ("ut-zap50k-images-square/Shoes/Oxfords/Rockport", 2),
]

def get_categories(img_loc) :
    """Extract the category and sub-category from a dataset-relative path.

    The last path component is dropped; of what remains, component 1 is the
    category and component 2 the sub-category (component 0 is the dataset root).

    Returns:
        dict with the keys ``'category'`` and ``'sub'``.
    """
    parent_parts = os.path.dirname(img_loc).split(os.sep)
    return {'category': parent_parts[1], 'sub': parent_parts[2]}
    
def generate_index_file() :
    """Write the CSV index consumed by Zappos50kDataset to IMG_IDX.

    For every (folder, label) pair in IMG_PATHS, one row is written per
    directory entry: ``relative path, label, category id, sub-category id``.
    The two ids are the SHA-256 of the category / sub-category name reduced
    modulo 10**9. Entries are written in sorted order so the index is
    identical from run to run (os.listdir order is arbitrary).

    csv.Error is reported with print() rather than raised.
    """
    # The with-block closes the file; no explicit close() is required.
    with open(IMG_IDX, 'w') as csvfile:

        try:

            csvwriter = csv.writer(csvfile)
            for (paths,label) in IMG_PATHS:

                c = get_categories(paths)
                cid = int(hashlib.sha256(c['category'].encode('utf-8')).hexdigest(), 16) % 10**9
                scid = int(hashlib.sha256(c['sub'].encode('utf-8')).hexdigest(), 16) % 10**9

                files = sorted(os.listdir(os.path.join(WORKING_DIR,paths)))

                for f in files:
                    csvwriter.writerow([os.path.join(paths,f),int(label),cid,scid])

        except csv.Error as e:
            print(e)

In [None]:
# Write the index file to IMG_IDX; the dataset class reads it back from there.
generate_index_file()

The script below generates input files in the NPY format required by the batch inference implementation. 

Each file contains images that have been converted to NumPy arrays, serialized with pickle, and compressed with gzip.

Each file contains an array consisting of 4 dimensions: 
    1. Batch size
    2. Channels. The tensors have 3 representing RGB
    3. The last two dimensions are 224x224 representing the pixel values for each image and channel.
    
The first array represents the image that will be compared against the other images. For instance, a file that contains
a tensor with the dimensions [53,3,224,224] represents 53 vectorized images. The first index into the first dimension is an image of shape [1,3,224,224] that will be compared against the remaining slices, which represent 52 images
of the same shape.

In [None]:
from io import BytesIO
import pickle, gzip

# S3 key prefixes for batch-transform input and output.
BATCH_INPUT_PREFIX = 'sagemaker/DEMO-pytorch-siamese-network/batch/in'
BATCH_OUTPUT_PREFIX = 'sagemaker/DEMO-pytorch-siamese-network/batch/out'

# Reference image that every other image is paired with.
img_loc = WORKING_DIR+'/ut-zap50k-images-square/Boots/Knee High/Anne Klein/8059298.310.jpg'
#img_loc = WORKING_DIR+'/ut-zap50k-images-square/Boots/Over the Knee/Calvin Klein Collection/8005712.365488.jpg'

# Local output layout: IMG_TENSOR_ROOT + FILE_PREFIX + BATCH_INPUT_FILENAME + <n>.npy.gz
IMG_TENSOR_ROOT = WORKING_DIR+'/tensors'
FILE_PREFIX = '/Boots/Knee High/Anne Klein/8059298.310'
BATCH_INPUT_FILENAME = '/tensors'

# NOTE: rebinds PARAM_BATCH_SIZE (64 in the configuration cell); here it is the
# DataLoader batch size, i.e. one candidate image per record.
PARAM_BATCH_SIZE = 1

def batch_image_to_tensor(img_loc, dataloader, file_prefix, s3_prefix_out, batch_size=26) :
    """Pair a reference image with every loader batch and write gzip/pickle files.

    Each record is ``np.vstack((reference, candidates))`` with the reference
    image in row 0. Up to ``batch_size`` records are pickled (protocol 2) back
    to back into one gzip file, which is then uploaded to S3.

    Args:
        img_loc: Path of the reference image.
        dataloader: Iterable yielding dicts whose ``'tensor'`` entry has a
            ``.numpy()`` method (e.g. a DataLoader over Zappos50kDataset).
        file_prefix: Sub-directory (starting with "/") under IMG_TENSOR_ROOT
            where the files are written.
        s3_prefix_out: S3 key prefix the files are uploaded under.
        batch_size: Number of records per output file.
    """
    # numpy is not imported anywhere else in the notebook; without this the
    # np.vstack call below raised NameError.
    import numpy as np

    # Reference image with a leading batch axis added, as a numpy array.
    img1 = getImageTensor(img_loc, TRANSFORMATIONS)
    img1.unsqueeze_(0)
    img1 = img1.numpy()

    out_dir = IMG_TENSOR_ROOT+file_prefix
    npy_f = out_dir+BATCH_INPUT_FILENAME

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    i = 0        # records written so far
    nbatch = 1   # 1-based number of the output file currently being written
    npy = None

    try:

        for data in dataloader:

            img2 = data.get('tensor').numpy()
            batch = np.vstack((img1,img2))

            # Start a new output file every `batch_size` records.
            if (i%batch_size) == 0 :
                npy = gzip.open(npy_f+str(nbatch)+'.npy.gz', 'wb')

            pickle.dump(batch, npy, 2)

            i+=1
            # The file is full: close it, upload it, move on to the next one.
            if (i%batch_size) == 0 :
                npy.close()
                sagemaker_session.upload_data(path=npy_f+str(nbatch)+'.npy.gz', bucket=bucket, key_prefix=s3_prefix_out)
                nbatch+=1
    finally:
        # Flush and upload the last, partially filled file (also on error).
        if npy is not None and not npy.closed:
            npy.close()
            sagemaker_session.upload_data(path=npy_f+str(nbatch)+'.npy.gz', bucket=bucket, key_prefix=s3_prefix_out)

# `torch` itself is never imported in this notebook, so torch.utils.data.DataLoader
# raised NameError; import DataLoader by name and use it directly.
from torch.utils.data import DataLoader

zapposDS = Zappos50kDataset(IMG_IDX,WORKING_DIR, TRANSFORMATIONS)
zapposDL = DataLoader(dataset=zapposDS, batch_size= PARAM_BATCH_SIZE, shuffle=False)
batch_image_to_tensor(img_loc, zapposDL, FILE_PREFIX, BATCH_INPUT_PREFIX+FILE_PREFIX)

## Batch Inference

In [None]:
from sagemaker.pytorch import PyTorchModel

# The job name used to be hard-coded to a training job that exists only in the
# original author's account. Set the override to reuse an existing job of your
# own; leave it as None to use the job trained earlier in this notebook.
TRAINING_JOB_NAME_OVERRIDE = None
training_job_name = TRAINING_JOB_NAME_OVERRIDE or estimator.latest_training_job.name
print('last training job: '+training_job_name)
desc = sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=training_job_name)
trained_model_location = desc['ModelArtifacts']['S3ModelArtifacts']
print('model location: '+trained_model_location)

# Model for batch inference: same artifacts, batch_inference.py as the entry
# point, and a fixed model name.
model = PyTorchModel(model_data=trained_model_location,
                     role=role,
                     framework_version='0.4.0',
                     entry_point='batch_inference.py',
                     source_dir=SOURCE_DIR,
                     name = 'sim-model-batch')

Temp hack... it's not clear how to deploy a model without an endpoint at this time. 

In [None]:
# Deploy the batch model on a single ml.m4.xlarge instance (the temporary
# workaround described in the note above).
similiarity_calculator = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

In [None]:
from sagemaker.transformer import Transformer

# S3 locations the batch transform job reads from and writes to.
batch_input_uri = 's3://'+bucket+'/'+BATCH_INPUT_PREFIX
batch_output_uri = 's3://'+bucket+'/'+BATCH_OUTPUT_PREFIX

# One ml.m4.xlarge instance, CSV responses, NPY requests.
transformer = Transformer(
    model_name=model.name,
    instance_type='ml.m4.xlarge',
    instance_count=1,
    accept='text/csv',
    output_path=batch_output_uri,
)
transformer.transform(batch_input_uri, content_type='application/x-npy')