### Get a Role

We need to set up the role used to access our data in S3.

In [None]:
# Dataset is from kaggle: https://www.kaggle.com/pranavraikokte/covid19-image-dataset. Upload it on your S3 bucket

import sagemaker
from sagemaker import get_execution_role
import boto3
import os
import numpy
import pandas as pd
import matplotlib.pyplot as plt

# Execution role returned by the SageMaker SDK; it is passed to the estimator later on.
role = get_execution_role()
print(role)

# SageMaker session; later cells read its region name and hand it to the estimator.
sess = sagemaker.Session()

# Replace this with the name of your own S3 bucket (the one holding the dataset).
BUCKET_NAME = "fnandito-covid19-data"
# Key prefix under which the generated .lst files are uploaded.
prefix = 'lstFile'

print('using bucket %s'%BUCKET_NAME)

### Define the Label of Image

In [None]:
# Accumulators filled by GetNameDirFromS3: one dict per image with the keys
# 'Name', 'LabelCovid' and 'LabelPneumonia'.
trainFileArr = []
testFileArr = []

In [None]:
def isCovid(DirName):
    """Return 1 when ``DirName`` contains the substring "Covid", otherwise 0.

    The match is case-sensitive.
    """
    return 1 if "Covid" in DirName else 0

In [None]:
def isPneumonia(DirName):
    """Return 1 when ``DirName`` contains the substring "Pneumonia", otherwise 0.

    The match is case-sensitive.
    """
    has_pneumonia = "Pneumonia" in DirName
    return int(has_pneumonia)

### Read Image Files from S3

In [None]:
def GetNameDirFromS3(bucketName, remoteDirectoryName):
    """List the images under an S3 prefix and record their labels.

    Keys are expected to look like ``<dataset>/<split>/<label>/<file>``. For
    each such key a dict with 'Name' (``<label>/<file>``), 'LabelCovid' and
    'LabelPneumonia' is appended to the module-level ``trainFileArr`` when
    ``remoteDirectoryName`` is "Covid19-dataset/train/", and to
    ``testFileArr`` for any other prefix.

    Keys that do not name a file at that depth (for example "folder" style
    keys ending in "/") are skipped, instead of raising ``IndexError`` or
    producing an entry with an empty file name.

    Args:
        bucketName: Name of the S3 bucket to list.
        remoteDirectoryName: Key prefix to list, e.g. "Covid19-dataset/train/".
    """
    s3_resource = boto3.resource('s3')
    bucket = s3_resource.Bucket(bucketName)
    # The destination list only depends on the prefix, so pick it once.
    if remoteDirectoryName == "Covid19-dataset/train/":
        target = trainFileArr
    else:
        target = testFileArr
    for obj in bucket.objects.filter(Prefix=remoteDirectoryName):
        parts = obj.key.split("/")
        # Need a non-empty label (index 2) and a non-empty file name (index 3).
        if len(parts) < 4 or not parts[2] or not parts[3]:
            continue
        Label = parts[2]
        Fn = Label + "/" + parts[3]
        target.append({'Name': Fn,
                       'LabelCovid': isCovid(Label),
                       'LabelPneumonia': isPneumonia(Label)})

In [None]:
# Start from empty lists so that re-running this cell does not append the same
# files a second time.
trainFileArr.clear()
testFileArr.clear()

# Collect the file names and labels of the train and test splits.
GetNameDirFromS3(BUCKET_NAME, "Covid19-dataset/train/")
GetNameDirFromS3(BUCKET_NAME, "Covid19-dataset/test/")

In [None]:
# Build one DataFrame per split from the collected records, with an explicit
# column order.
trainFile = pd.DataFrame(trainFileArr, columns=['Name','LabelCovid','LabelPneumonia'])
testFile = pd.DataFrame(testFileArr, columns=['Name','LabelCovid','LabelPneumonia'])

In [None]:
# Display the training file list as the cell output.
trainFile

### Data Train Exploration

In this step we analyse how much data we have, to see the ratio between the classes (Covid, Pneumonia, or Normal).

In [None]:
# Per-class row counts, kept for inspecting the class balance.
CovidLabel = trainFile.groupby('LabelCovid').count()
PneumoniaLabel = trainFile.groupby('LabelPneumonia').count()

# The label columns hold 0/1, so summing a column counts its positive rows.
# Unlike indexing the groupby result with [0] / [1], this does not raise
# KeyError when one of the classes has no samples.
TotalData = len(trainFile)
TotalTrainCovid = int(trainFile["LabelCovid"].sum())
TotalTrainPneumonia = int(trainFile["LabelPneumonia"].sum())
# Every row that is neither covid nor pneumonia is a normal sample.
TotalTrainNormal = TotalData - TotalTrainCovid - TotalTrainPneumonia
print("Total Train Data : ",TotalData)
print("Total Train Covid Data : ",TotalTrainCovid)
print("Total Train Pneumonia Data : ",TotalTrainPneumonia)
print("Total Train Normal Data : ",TotalTrainNormal)

In [None]:
# Bar heights and their category names, in matching order.
AxisYBar = [TotalTrainNormal, TotalTrainCovid, TotalTrainPneumonia]
AxisXBar = ["TotalNormal", "TotalCovid", "TotalPneumonia"]

# Plot the number of training images per class.
plt.bar(AxisXBar, AxisYBar)
plt.show()

### Create LST File

The LST file is needed to label the images and acts as the reference your machine learning job uses to find the data. Since this is handled as a multi-label classification problem, we need 2 labels that represent the existence of covid or pneumonia.

There will be 3 combinations available:
- `1 0` label for a positive covid patient
- `0 1` label for a positive pneumonia patient
- `0 0` label for a normal patient

In [None]:
# Write one line per training image:
#   <index> TAB <LabelCovid> TAB <LabelPneumonia> TAB <Name>
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('TrainImageList.lst', 'w') as fp:
    for row in trainFile.itertuples():
        fields = [str(row.Index), str(row.LabelCovid), str(row.LabelPneumonia), row.Name]
        fp.write('\t'.join(fields) + '\n')

In [None]:
# Write one line per test image:
#   <index> TAB <LabelCovid> TAB <LabelPneumonia> TAB <Name>
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('TestImageList.lst', 'w') as fp:
    for row in testFile.itertuples():
        fields = [str(row.Index), str(row.LabelCovid), str(row.LabelPneumonia), row.Name]
        fp.write('\t'.join(fields) + '\n')

## Upload LST file to S3

In [None]:
# S3 destinations for the two .lst files, under the configured bucket and prefix.
s3train_lst = 's3://{}/{}/train_lst/'.format(BUCKET_NAME, prefix)
s3validation_lst = 's3://{}/{}/validation_lst/'.format(BUCKET_NAME, prefix)

# upload the lst files to train_lst and validation_lst channels
# (the test-set list is the one copied to the validation location)
!aws s3 cp TrainImageList.lst $s3train_lst --quiet
!aws s3 cp TestImageList.lst $s3validation_lst --quiet

## Create estimator

To create the estimator, we first need to look up the container image of the algorithm that will train our model. The estimator is then used to train the model; it provisions a dedicated instance for the training process.

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the container image URI of the 'image-classification' algorithm for
# the region of the current session.
# NOTE(review): get_image_uri comes from the SageMaker Python SDK v1 API used
# throughout this notebook; confirm the installed SDK version still provides it.
training_image = get_image_uri(sess.boto_region_name, 'image-classification', repo_version="latest")
print (training_image)

In [None]:
# Training output is written under s3://<bucket>/output.
s3_output_location = 's3://{}/output'.format(BUCKET_NAME)
# Estimator for the training image looked up above: a single ml.p2.xlarge
# instance, File input mode, using the role and session from the first cell.
# NOTE(review): train_volume_size is presumably in GB and train_max_run in
# seconds - confirm against the SDK documentation.
multilabel_ic = sagemaker.estimator.Estimator(training_image,
                                         role, 
                                         train_instance_count=1, 
                                         train_instance_type='ml.p2.xlarge',
                                         train_volume_size = 50,
                                         train_max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

### Set the data Train and test

In [None]:
# S3 prefixes holding the images; the test split is used as the validation data.
s3train = "s3://{}/{}/train/".format(BUCKET_NAME, "Covid19-dataset")
s3validation = "s3://{}/{}/test/".format(BUCKET_NAME, "Covid19-dataset")

# Image channels.
train_data = sagemaker.session.s3_input(s3train, distribution='FullyReplicated', 
                        content_type='application/x-image', s3_data_type='S3Prefix')
validation_data = sagemaker.session.s3_input(s3validation, distribution='FullyReplicated', 
                             content_type='application/x-image', s3_data_type='S3Prefix')
                             
# Channels pointing at the uploaded .lst files.
train_data_lst = sagemaker.session.s3_input(s3train_lst, distribution='FullyReplicated', 
                        content_type='application/x-image', s3_data_type='S3Prefix')
validation_data_lst = sagemaker.session.s3_input(s3validation_lst, distribution='FullyReplicated', 
                             content_type='application/x-image', s3_data_type='S3Prefix')
# Channel name -> input definition; this dict is what fit() receives.
data_channels = {'train': train_data, 'validation': validation_data, 'train_lst': train_data_lst, 
                 'validation_lst': validation_data_lst}

## Set hyperparameters

Hyperparameters are used to determine the characteristics of your model. Think of them as the configuration of your model: how you want to set it up. Several of these settings need to be tuned to get a better result from the model.

There is also a technique called "Hyperparameter Optimization (HPO)": an automated search that looks for the best configuration for a model, i.e. the one giving the highest accuracy and the lowest loss (the difference between the predicted and actual labels).

In [None]:
# Hyperparameters for the multi-label image-classification job.
# num_training_samples is taken from the training file list (one row per
# training image) instead of a hard-coded number, so it always matches the
# data that is actually listed in TrainImageList.lst.
multilabel_ic.set_hyperparameters(num_layers=18,
                             use_pretrained_model=1,
                             image_shape = "3,224,224",
                             num_classes=2,  # two labels: covid and pneumonia
                             mini_batch_size=16,
                             resize=256,
                             epochs=5,
                             learning_rate=0.001,
                             num_training_samples=len(trainFile),
                             use_weighted_loss=1,
                             augmentation_type = 'crop_color_transform',
                             precision_dtype='float32',
                             multi_label=1)

### Train the model

In [None]:
# Start the training job on the four channels defined above, with logs enabled.
multilabel_ic.fit(inputs=data_channels, logs=True)

Near the top of the blue logs you should see Epoch[0] through Epoch[4]. This means the model is trained for 5 passes over the same data, as set by the hyperparameter epochs=5 in the code above. Check the Train-accuracy and Validation-accuracy values and see what you get.

### Create Inference

Once the model is trained, we need to deploy it. This creates a separate instance of type ml.m4.xlarge, dedicated to serving your model.

In [None]:
# Deploy the trained estimator on one ml.m4.xlarge instance; the object
# returned by deploy() is kept in ic_classifier.
ic_classifier = multilabel_ic.deploy(initial_instance_count = 1,
                                          instance_type = 'ml.m4.xlarge')

In [None]:
# Show the object returned by deploy().
ic_classifier

### Create Endpoint Configuration

An endpoint configuration describes how your model is deployed. Before running this code, go to the SageMaker console, open Inference -> Models, and copy the name of your model. Then replace the value of 'ModelName' in the ProductionVariants array with that name.

You can also change the endpoint_config_name in the code.

In [None]:
# boto3 client for the SageMaker API, used to create the endpoint configuration.
sage = boto3.Session().client(service_name='sagemaker') 

endpoint_config_name = "COVID19-ML-Config"
# NOTE: the 'ModelName' value below is a hard-coded example; replace it with
# the name of the model that exists in your own account before running.
endpoint_config_response = sage.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'InstanceType':'ml.m4.xlarge',
        'InitialInstanceCount':1,
        'ModelName':"image-classification-2021-04-12-14-48-45-751",
        'VariantName':'AllTraffic'}])

print('Endpoint configuration name: {}'.format(endpoint_config_name))
print('Endpoint configuration arn:  {}'.format(endpoint_config_response['EndpointConfigArn']))

### Create Endpoint

Once the endpoint config is created, we need to create the endpoint. You can change the endpoint_name on the code.

In [None]:
endpoint_name = "COVID19-ML-Endpoint"

# Request parameters: the endpoint name and the configuration it is created from.
endpoint_params = {
    'EndpointName': endpoint_name,
    'EndpointConfigName': endpoint_config_name,
}
# NOTE(review): this assignment rebinds the name `sagemaker`, which the first
# cell imported as the SageMaker SDK module. After this cell has run, earlier
# cells that call sagemaker.Session() or sagemaker.estimator.Estimator fail
# until `import sagemaker` is executed again.
sagemaker = boto3.client(service_name='sagemaker')
endpoint_response = sagemaker.create_endpoint(**endpoint_params)
print('EndpointArn = {}'.format(endpoint_response['EndpointArn']))

### Check Endpoint Status

Once the code above has been executed, we can check the status of the endpoint, since creation might take a while.

In [None]:
# Use a dedicated boto3 client in this cell instead of relying on a variable
# named `sagemaker`, which is also the name of the imported SageMaker SDK
# module. The cell then works no matter which of the two the name refers to.
sm_client = boto3.client(service_name='sagemaker')

# get the status of the endpoint
response = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = response['EndpointStatus']
print('EndpointStatus = {}'.format(status))

# wait until the status has changed
sm_client.get_waiter('endpoint_in_service').wait(EndpointName=endpoint_name)

# print the status of the endpoint
endpoint_response = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = endpoint_response['EndpointStatus']
print('Endpoint creation ended with EndpointStatus = {}'.format(status))

if status != 'InService':
    raise Exception('Endpoint creation failed.')

### Evaluate

To evaluate, we need to download the test images and take several samples.

### Download Dataset from S3 to local

In [None]:
def DownloadfromS3(bucketName, remoteDirectoryName):
    """Download every object under an S3 prefix to the same relative local path.

    Each object key is used as the local file path, relative to the current
    working directory; missing parent directories are created first.

    Keys ending in "/" are skipped, because they name a directory rather than
    a file and cannot be written as a local file. Keys without a directory
    part are downloaded straight into the working directory.

    Args:
        bucketName: Name of the S3 bucket to read from.
        remoteDirectoryName: Key prefix selecting the objects to download.
    """
    s3_resource = boto3.resource('s3')
    bucket = s3_resource.Bucket(bucketName)
    for obj in bucket.objects.filter(Prefix=remoteDirectoryName):
        if obj.key.endswith("/"):
            continue
        local_dir = os.path.dirname(obj.key)
        # dirname is '' for root-level keys, and os.makedirs('') would raise.
        if local_dir:
            os.makedirs(local_dir, exist_ok=True)
        bucket.download_file(obj.key, obj.key)  # save to same path

In [None]:
DownloadfromS3(BUCKET_NAME, 'Covid19-dataset/')

Take image sample and evaluate the result

In [None]:
import json

# One sample image per class, taken from the downloaded test split.
file_name_covid = './Covid19-dataset/test/Covid/0100.jpeg'
file_name_normal = './Covid19-dataset/test/Normal/0101.jpeg'
file_name_pneumonia = './Covid19-dataset/test/Viral Pneumonia/0101.jpeg'

# Read the raw bytes of each image; they are sent unchanged as the request body.
with open(file_name_covid, 'rb') as image:
    f_covid = image.read()
    
with open(file_name_normal, 'rb') as image:
    f_normal = image.read()
    
with open(file_name_pneumonia, 'rb') as image:
    f_pneumonia = image.read()

In [None]:
import numpy as np

# Runtime client used to send inference requests to the endpoint.
runtime = boto3.Session().client(service_name='runtime.sagemaker')
# Send each sample image to the endpoint as raw image bytes.
response_covid = runtime.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType='application/x-image', 
                                   Body=f_covid)

response_normal = runtime.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType='application/x-image', 
                                   Body=f_normal)

response_pneumonia = runtime.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType='application/x-image', 
                                   Body=f_pneumonia)

# Read the raw response payloads.
result_covid = response_covid['Body'].read()
result_normal = response_normal['Body'].read()
result_pneumonia = response_pneumonia['Body'].read()
# the payload is JSON; decode it into plain Python objects
result_covid = json.loads(result_covid)
result_normal = json.loads(result_normal)
result_pneumonia = json.loads(result_pneumonia)
# presumably one score per label (covid, pneumonia) - confirm against the printed output

# position of the largest score in each result
# NOTE(review): the model is trained with multi_label=1 and a normal image is
# labelled "0 0", so the position of the largest score cannot express the
# "normal" case; a per-label threshold is probably what is wanted - confirm.
index_covid = np.argmax(result_covid)
index_normal = np.argmax(result_normal)
index_pneumonia = np.argmax(result_pneumonia)

In [None]:
# Print the decoded endpoint responses for the three sample images.
print(result_covid)
print(result_normal)
print(result_pneumonia)