# Document Identification with Amazon SageMaker Image Classification

## Introduction

Train a document identification model using the Amazon SageMaker Image Classification built-in algorithm.

## Setup



### Training data folder structure
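You need to prepare your training data in the following folder structure: one root folder containing one sub-folder per document class, each holding that class's `.jpg` images (for example, with the names used later in this notebook: `KTP/KK/*.jpg`, `KTP/KTP/*.jpg`, `KTP/PASPOR/*.jpg`, `KTP/SIM/*.jpg`).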


Then create a zip file of your root folder (this notebook expects it to be named `docs.zip`).

In [None]:
# Run this cell only once: it installs the OpenCV package that provides the cv2 module imported below
!pip install opencv-python-headless

In [None]:
%%time
import sagemaker
import cv2
from sagemaker import get_execution_role

# Execution role of this notebook; passed to the Estimator later on
role = get_execution_role()
print(role)
# SageMaker session and its default S3 bucket, used for the data upload and the training output
sess = sagemaker.Session()
bucket=sess.default_bucket()
print(bucket)

In [None]:
# Make sure the current working directory is the directory that contains this notebook
import os
print(os.getcwd())

### Change this!
Change the value below to suit your preferences.

In [None]:
# S3 key prefix used for the uploaded data and the training output;
# put your data source folder prefix here.
prefix = "document-identification"

### Unzip and process your training data
DO THIS FIRST: upload your zip file to the same directory as your notebook file.

In [None]:
# remove existing folder, in this example KTP is our root folder
!rm -R KTP

In [None]:
# Extract the uploaded archive (docs.zip) into the current directory
!unzip docs.zip

### Delete existing processing folder
The following cell deletes the processing folders left over from a previous run. You don't need to execute it if you are using this notebook for the first time.

In [None]:
%%bash

# Remove the outputs of a previous run.
# -f makes every command succeed when its target is missing, so the cell is
# safe on a first run too; without it a failing final rm gives the script a
# non-zero exit status and the %%bash cell reports an error.
rm -rf document_dataset
rm -rf document_dataset_augmented
rm -rf document_dataset_augmented_val
rm -rf data_recordio
rm -f document-train.lst
rm -f document-val.lst



#### CHANGE THIS!
Change these values to match your class names and your root folder name.

In [None]:
# CHANGE THIS: one entry per class sub-folder, listed in ALPHABETICAL order.
# NOTE(review): presumably the order must match the label indices assigned
# when the .lst files are generated — confirm before reordering.
document_type = [
    "KK",
    "KTP",
    "PASPOR",
    "SIM",
]

# CHANGE THIS: name of the root folder that holds the class sub-folders.
inputBasePath = 'KTP'

### Create temporary dataset for image augmentation

In [None]:
%%bash -s "$inputBasePath"

# $1 is the root folder; each of its sub-folders is treated as one class.
# A random sample of up to 25 .jpg files per class is MOVED (not copied) into
# document_dataset/<class>/, so the files left behind in the root folder do not
# overlap with the sample.
mkdir -p document_dataset
for class_dir in "$1"/*/; do
    # Skip the unexpanded pattern when the root folder has no sub-folders.
    [ -d "$class_dir" ] || continue
    class_name=$(basename "$class_dir")
    mkdir -p "document_dataset/$class_name"
    # NUL-delimited names keep files with spaces or special characters intact;
    # xargs -r runs nothing when the class folder contains no .jpg file.
    find "$class_dir" -maxdepth 1 -name '*.jpg' ! -name '.*' -print0 \
        | shuf -z -n 25 \
        | xargs -0 -r mv -t "document_dataset/$class_name/"
done


## Images augmentation
We will perform image augmentation to enrich our training dataset.

In [None]:
# Import libraries
from PIL import Image, ImageEnhance
from pathlib import Path
import matplotlib.pyplot as plt
import uuid
import random
import os

In [None]:
# transformation functions

def sharpen(img, factor):
    """Return the result of PIL's sharpness enhancer applied to ``img`` with ``factor``."""
    return ImageEnhance.Sharpness(img).enhance(factor)

def contrast(img, factor):
    """Return the result of PIL's contrast enhancer applied to ``img`` with ``factor``."""
    return ImageEnhance.Contrast(img).enhance(factor)

def rotate(img, degrees, expand=True):
    """Return ``img`` rotated by ``degrees``.

    Args:
        img: Image object exposing ``rotate(degrees, expand=...)``.
        degrees: Rotation angle, passed straight to ``img.rotate``.
        expand: Forwarded to ``img.rotate``. Defaults to True so the output
            canvas grows to hold the whole rotated image; with a fixed canvas
            a 90 or 270 degree turn of a non-square image loses content.

    Returns:
        Whatever ``img.rotate`` returns.
    """
    return img.rotate(degrees, expand=expand)

def save(img, path):
    """Write ``img`` to ``path`` as a JPEG and hand back the result of ``img.save``."""
    outcome = img.save(path, "JPEG")
    return outcome

### for training data

In [None]:
# Training augmentation: read from the sampled dataset folder and write the
# augmented images to a separate folder.
inputBasePath = "./document_dataset/"
outputBasePath = "./document_dataset_augmented/"



In [None]:
# Augmentation settings for the training set
rotations = [0, 90, 270]  # angles handed to rotate()
# Bounds of the random contrast factor
randContrastMin = 0.8
randContrastMax = 1.2
# Bounds of the random sharpness factor
randSharpenMin = 0.8
randSharpenMax = 1.2
# Number of augmented images written per source image and rotation
multiplier = 20

In [None]:
# transform
# Every augmented image is derived from the untouched source image. Feeding the
# previous output back into the next iteration would compound the rotation,
# contrast and sharpness changes from one copy to the next.
for f in document_type:
    outpath = outputBasePath + f + '/'
    os.makedirs(outpath, exist_ok=True)

    for path in Path(inputBasePath + f + '/').glob('*.jpg'):
        # The with-block closes the source file once all its variants are written.
        with Image.open(path) as source:
            for r in rotations:
                # The rotation is the same for every copy of this angle, so do it once.
                rotated = rotate(source, r)

                for m in range(multiplier):
                    randContrast = random.uniform(randContrastMin, randContrastMax)
                    randSharpen = random.uniform(randSharpenMin, randSharpenMax)

                    augmented = contrast(rotated, randContrast)
                    augmented = sharpen(augmented, randSharpen)

                    # A random UUID file name avoids collisions between variants.
                    save(augmented, outpath + str(uuid.uuid4()) + '.jpg')
                    print('.', end='')


### for validation data

In [None]:
# Validation augmentation: read from the root folder (change inputBasePath to
# your own root folder) and write to the validation output folder.
inputBasePath = "./KTP/"
outputBasePath = "./document_dataset_augmented_val/"



In [None]:
# Augmentation settings for the validation set
rotations = [0, 90, 270]  # angles handed to rotate()
# Bounds of the random contrast factor
randContrastMin = 0.8
randContrastMax = 1.2
# Bounds of the random sharpness factor
randSharpenMin = 0.8
randSharpenMax = 1.2
# Number of augmented images written per source image and rotation
multiplier = 5

In [None]:
# transform
# Every augmented image is derived from the untouched source image. Feeding the
# previous output back into the next iteration would compound the rotation,
# contrast and sharpness changes from one copy to the next.
for f in document_type:
    outpath = outputBasePath + f + '/'
    os.makedirs(outpath, exist_ok=True)

    for path in Path(inputBasePath + f + '/').glob('*.jpg'):
        # The with-block closes the source file once all its variants are written.
        with Image.open(path) as source:
            for r in rotations:
                # The rotation is the same for every copy of this angle, so do it once.
                rotated = rotate(source, r)

                for m in range(multiplier):
                    randContrast = random.uniform(randContrastMin, randContrastMax)
                    randSharpen = random.uniform(randSharpenMin, randSharpenMax)

                    augmented = contrast(rotated, randContrast)
                    augmented = sharpen(augmented, randSharpen)

                    # A random UUID file name avoids collisions between variants.
                    save(augmented, outpath + str(uuid.uuid4()) + '.jpg')
                    print('.', end='')


## Training Data Preparation

### Download im2rec

In [None]:
import os
import urllib.request

def download(url):
    """Download ``url`` into the current directory unless the file already exists.

    The local file name is the last "/"-separated segment of ``url``.

    Args:
        url: Address of the file to fetch.

    Raises:
        ValueError: If ``url`` ends with "/" and so yields no file name.
    """
    filename = url.split("/")[-1]
    if not filename:
        raise ValueError(f"cannot derive a file name from URL: {url!r}")
    if os.path.exists(filename):
        return

    # Download under a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated file that later calls would skip.
    partial = filename + ".part"
    try:
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, filename)
    finally:
        if os.path.exists(partial):
            os.remove(partial)
        
# Fetch im2rec.py, the tool run in the following cells to create the .lst files and the RecordIO files
download('https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/im2rec.py')

### Create lst file

In [None]:
# Install MXNet before running im2rec.py (presumably a dependency of that script — confirm)
!pip install mxnet

In [None]:
%%bash

# Generate document-train.lst and document-val.lst (both read by later cells)
# from the two augmented image folders.
python im2rec.py --list --recursive document-train document_dataset_augmented/
python im2rec.py --list --recursive document-val document_dataset_augmented_val/

In [None]:
# number of training samples = number of lines in the training .lst file
# (counted inside a with-block so the file handle is closed, and without
# loading the whole file into memory)
with open('document-train.lst') as lst_file:
    training_count = sum(1 for _ in lst_file)
print(training_count)

### convert to RecordIO format

In [None]:
import shutil
import pathlib

# Create the RecordIO working folder (if needed) and copy both .lst files into it
recordio_dir = pathlib.Path('./data_recordio')
recordio_dir.mkdir(exist_ok=True)
for lst_name in ('document-train.lst', 'document-val.lst'):
    shutil.copy(lst_name, 'data_recordio/')

In [None]:
# Build the RecordIO files under data_recordio/ (document-train.rec and document-val.rec are uploaded later)
# from the .lst files copied there and the augmented image folders
!python im2rec.py --resize 224 --quality 90 --num-thread 16 data_recordio/document-train document_dataset_augmented/
!python im2rec.py --resize 224 --quality 90 --num-thread 16 data_recordio/document-val document_dataset_augmented_val/

### Upload data to S3
Upload the data to the S3 bucket. We do this in multiple channels. Channels are simply directories in the bucket that differentiate between training and validation data.
Create the following folders in S3!

In [None]:
s3_uploader = sagemaker.s3.S3Uploader()

# Local RecordIO file holding the training set
data_path = recordio_dir / 'document-train.rec'

# Upload it under the 'train' channel folder of the bucket/prefix
data_s3_uri = s3_uploader.upload(
    local_path=data_path.as_posix(), 
    desired_s3_uri=f's3://{bucket}/{prefix}/data/train')

In [None]:
# Local RecordIO file holding the validation set
data_path = recordio_dir / 'document-val.rec'

# Upload it under the 'val' channel folder of the bucket/prefix
data_s3_uri = s3_uploader.upload(
    local_path=data_path.as_posix(), 
    desired_s3_uri=f's3://{bucket}/{prefix}/data/val')

In [None]:
# Training channel: RecordIO data under the 'train' S3 prefix, read in Pipe input mode
train_data = sagemaker.inputs.TrainingInput( 
    s3_data=f's3://{bucket}/{prefix}/data/train',
    content_type='application/x-recordio',
    s3_data_type='S3Prefix',
    input_mode='Pipe')

# Validation channel: same settings, pointing at the 'val' S3 prefix
val_data = sagemaker.inputs.TrainingInput( 
    s3_data=f's3://{bucket}/{prefix}/data/val',
    content_type='application/x-recordio',
    s3_data_type='S3Prefix',
    input_mode='Pipe')

# Channel name -> input definition
data_channels = {'train': train_data, 'validation': val_data}

## Train the Image Classification Model


In [None]:
# Values passed to the estimator's hyperparameters in the next section
num_classes = len(document_type)
num_training_samples = training_count

# Container image URI of the 'image-classification' algorithm for the session's region
training_image = sagemaker.image_uris.retrieve('image-classification', sagemaker.Session().boto_region_name)

## Hyperparameter Tuning
Since we don't know the optimal hyperparameters in advance, we will go straight to hyperparameter optimization (HPO).

In [None]:
# Estimator for the image-classification container: a single ml.p3.8xlarge instance,
# with output written under the prefix's data/output folder
sess = sagemaker.Session()
imageclassification = sagemaker.estimator.Estimator(training_image, 
                                                    role, 
                                                    instance_count=1,
                                                    instance_type='ml.p3.8xlarge',
                                                    output_path=f's3://{bucket}/{prefix}/data/output', 
                                                    sagemaker_session=sess)

# Fixed hyperparameters; learning_rate, mini_batch_size and optimizer are
# not set here because the tuning job searches over them
imageclassification.set_hyperparameters(num_layers=18, 
                                        image_shape='3,224,224',
                                        num_classes=num_classes,
                                        epochs=30, 
                                        top_k='2',
                                        num_training_samples=num_training_samples,
                                        precision_dtype='float32',
                                        augmentation_type='crop')

### set hyperparameter optimization jobs

In [None]:
from time import gmtime, strftime 
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# Job name made unique by a day-hour-minute-second UTC timestamp
tuning_job_name = "document-tuning-job-{}".format(strftime("%d-%H-%M-%S", gmtime()))

# Search space explored by the tuning job
hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.00001, 1.0),
                         'mini_batch_size': IntegerParameter(16, 64),
                         'optimizer': CategoricalParameter(['sgd', 'adam', 'rmsprop', 'nag'])}

# Metric the tuner maximizes
objective_metric_name = 'validation:accuracy'

# Up to 20 training jobs in total, at most 2 running at the same time
tuner = HyperparameterTuner(imageclassification, 
                            objective_metric_name, 
                            hyperparameter_ranges,
                            objective_type='Maximize', 
                            max_jobs=20, 
                            max_parallel_jobs=2,
                            early_stopping_type='Auto')

### Execute training jobs

In [None]:
# Start the tuning job on the train/validation channels (data_channels holds
# exactly these two inputs), then wait for it to finish.
tuner.fit(
    data_channels,
    job_name=tuning_job_name,
    include_cls_metadata=False,
)
tuner.wait()

In [None]:
# Analytics for the tuning job; show the five training jobs with the highest final objective value
tuner_metrics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)
tuner_metrics.dataframe().sort_values(['FinalObjectiveValue'], ascending=False).head(5)

In [None]:
# Sum the elapsed training time of all jobs and convert seconds to hours
total_time = tuner_metrics.dataframe()['TrainingElapsedTimeSeconds'].sum() / 3600
print("The total training time is {:.2f} hours".format(total_time))
# Number of training jobs per final status
tuner_metrics.dataframe()['TrainingJobStatus'].value_counts()

## Model Inference


#### Deploy best model from hyperparameter tuning job

In [None]:
# Deploy the tuner's model to an endpoint backed by one ml.t2.medium instance
ic_classifier = tuner.deploy(initial_instance_count = 1,
                                          instance_type = 'ml.t2.medium')

## Inference Test
You can upload your test images to the same directory as this notebook.

In [None]:
%matplotlib inline  
    
import cv2


#file_name = 'kk1.png'
file_name = 'ktp1.png'
#file_name = 'ktp2.jpg'
#file_name = 'sim.jpg'

#resize
im = cv2.imread(file_name)
im = cv2.resize(im, (600, 400))
cv2.imwrite(file_name, im)


# display test image
from IPython.display import Image, display
img = Image(file_name) 
display(img)

In [None]:
import json

# Read the test image as raw bytes
with open(file_name, 'rb') as image:
    b = bytearray(image.read())

# 'application/x-image' covers both PNG and JPEG payloads; the default test
# file is a PNG, so labelling the request 'image/jpeg' did not match the data.
results = ic_classifier.predict(b, initial_args={'ContentType': 'application/x-image'})
prob = json.loads(results)

# Print each class name next to its score, in document_type order
classes = document_type
for class_name, score in zip(classes, prob):
    print('%s:%f '%(class_name, score), end='')

In [None]:
# Report the class with the highest score.
# NOTE: document_type must list the classes in the same order as the label
# indices used for training — change it if your training data differs.
import numpy as np

index = np.argmax(prob)
print("Result: label - " + document_type[index] + ", probability - " + str(prob[index]))

#### Publish your endpoint using Lambda + API Gateway