In [1]:
import os
import time
import boto3

In [2]:
# Identifiers for this pipeline run. The run number is appended to the
# pipeline name to form the prefix used for the resources created below.
pipeline_name = 'lstm-gender-classifier'
run = '6'
# Kept as a string: it is passed as-is as the 'num_epochs' hyperparameter value.
epochs = '30'
bucket_name = 'sagemaker-us-east-1-741855114961'
run_name = '{}-{}'.format(pipeline_name, run)

In [3]:
# Install Docker with yum (non-interactive, -y) and start the Docker service.
# Both commands run through sudo on the host that runs this notebook.
!sudo yum install -y docker
!sudo service docker start

Loaded plugins: dkms-build-requires, priorities, update-motd, upgrade-helper
amzn-main                                                | 2.1 kB     00:00     
amzn-updates                                             | 2.5 kB     00:00     
epel/x86_64/metalink                                     |  17 kB     00:00     
epel                                                     | 4.7 kB     00:00     
(1/2): epel/x86_64/updateinfo                              | 776 kB   00:00     
(2/2): epel/x86_64/primary_db                              | 6.0 MB   00:00     
1045 packages excluded due to repository priority protections
Package docker-17.12.1ce-1.135.amzn1.x86_64 already installed and latest version
Nothing to do


In [None]:
# Print Docker's system information.
!sudo docker info

In [None]:
# Delete any existing Sagemaker_BYOA-LSTM_Keras directory so that the git clone
# further down starts from a clean location.
!sudo rm -rf Sagemaker_BYOA-LSTM_Keras

In [None]:
# NOTE(review): this cell repeats the exact command of the preceding cell; once the
# directory has been removed it has nothing left to delete.
!sudo rm -rf Sagemaker_BYOA-LSTM_Keras

In [None]:
# Clone the repository; the later cells work inside its `container` subdirectory.
!git clone https://github.com/dbinoy/Sagemaker_BYOA-LSTM_Keras.git

In [4]:
# Move into the container build directory of the cloned repository.
# The path is relative, so only change directory when we are not already inside
# it; this keeps the cell safe to re-run (a second relative chdir would raise
# FileNotFoundError). A missing checkout still raises on the first run.
container_dir = os.path.join('Sagemaker_BYOA-LSTM_Keras', 'container')
if not os.getcwd().endswith(container_dir):
    os.chdir(container_dir)
os.getcwd()

'/home/ec2-user/SageMaker/unicornML/Sagemaker_BYOA-LSTM_Keras/container'

In [5]:
%%writefile lstm/train
#!/usr/bin/env python3

from __future__ import print_function

import os
import json
import pickle
import sys
import traceback

import numpy as np
import pandas as pd
from numpy import genfromtxt
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.models import load_model
from sklearn.utils import shuffle

from os import listdir, sep
from os.path import abspath, basename, isdir
from sys import argv

# These are the paths to where SageMaker mounts interesting things in your container.

prefix = '/opt/ml/'

input_path = os.path.join(prefix, 'input/data')
output_path = prefix + 'output'
model_path = prefix + 'model'
param_path = prefix + 'input/config/hyperparameters.json'

# The input channel is expected to be named 'train'; when no directory of that
# name exists under the input path, fall back to a channel named 'training'.
# Since we run in File mode, the input files are copied to the directory specified here.
channel_name = 'train'
_preferred_path = os.path.join(input_path, channel_name)
training_path = _preferred_path if os.path.exists(_preferred_path) else os.path.join(input_path, 'training')


# The function to execute the training.
def train():
    """Train the stacked LSTM name/gender classifier and save its artifacts.

    Steps:
      1. Read the hyperparameters JSON at ``param_path`` (missing keys fall back
         to the defaults listed below).
      2. Load name/gender records from the first file (in sorted order) under
         ``training_path``, or from the default public data set when no
         training file is available.
      3. One-hot encode every lower-cased name per character and encode the
         label as 0 for 'F' and 1 for anything else.
      4. Fit two stacked LSTM layers followed by a single-unit dense layer.
      5. Save the Keras model and the character-index dictionary (including
         the key ``'max_name_length'``) under ``model_path``.

    On any exception the message and traceback are written to
    ``<output_path>/failure`` and to stderr, and the process exits with
    status 255.
    """
    print('Starting the training.')
    try:
        # Read in any hyperparameters that the user passed with the training job
        with open(param_path, 'r') as tc:
            trainingParams = json.load(tc)
        print("Hyperparameters file : " + json.dumps(trainingParams))
        # Extract the supported hyperparameters; numeric values are converted
        # explicitly, and the defaults are given in the same string form.
        batch_records = int(trainingParams.get('batch_size', '1000'))
        num_epochs = int(trainingParams.get('num_epochs', '5'))
        dropout_ratio = float(trainingParams.get('dropout_ratio', '0.2'))
        split_ratio = float(trainingParams.get('split_ratio', '0.2'))
        sequence_size = int(trainingParams.get('sequence_size', '512'))
        activation_function = trainingParams.get('activation_function', 'sigmoid')
        loss_function = trainingParams.get('loss_function', 'binary_crossentropy')
        optimizer_function = trainingParams.get('optimizer_function', 'rmsprop')
        metrics_measure = trainingParams.get('metrics_measure', 'accuracy')
        print("Hyperparameters initialized")

        # Original source of training data, which the trainer defaults to if no train channel is specified
        data_filename = "https://s3.amazonaws.com/name-gender/allnames.txt"
        if os.path.exists(training_path):
            # Sorted so that the file picked does not depend on directory listing order.
            input_files = sorted(os.path.join(training_path, entry) for entry in os.listdir(training_path))
            if len(input_files) == 0:
                print('There are no files in {}.\nUsing default training data set available at {}'.format(training_path, data_filename))
            else:
                data_filename = input_files[0]
        else:
            print('No training folder {}.\nUsing default training data set available at {}'.format(training_path, data_filename))
        print("Loading data from : {}".format(data_filename))

        # Read training data from CSV and load into a data frame
        df = pd.read_csv(data_filename, sep=',', names=["Name", "Gender", "Count"])
        print("Training data loaded")

        # Remove unnecessary attributes from training data
        df = df.drop(['Count'], axis=1)

        # Rows without a name cannot be encoded (len() and lower() fail on NaN)
        df = df.dropna(subset=['Name'])

        # Remove duplicate rows from training data
        df = df.drop_duplicates()

        # Shuffle training data
        df = shuffle(df)

        # Number of names
        num_names = df.shape[0]
        if num_names == 0:
            raise ValueError('No training records found in {}'.format(data_filename))

        # Separate data and label. Names are lower-cased once and reused below.
        lowered_names = [n.lower() for n in df['Name'].values]
        is_female = (df['Gender'] == 'F').values

        # Length of the longest name, measured on the lower-cased form because
        # that is the form that gets encoded (lower-casing can change the
        # length of some Unicode strings).
        max_name_length = max(len(n) for n in lowered_names)

        # Alphabet: sorted unique characters over all names
        chars = sorted(set(''.join(lowered_names)))
        alphabet_size = len(chars)

        # Assign index values to each symbol in the alphabet
        char_indices = dict((c, i) for i, c in enumerate(chars))

        # One hot encoding to create training-X
        X = np.zeros((num_names, max_name_length, alphabet_size))
        for i, name in enumerate(lowered_names):
            for t, char in enumerate(name):
                X[i, t, char_indices[char]] = 1

        # Encode training-Y with 'F' as 0 and every other label as 1
        Y = np.ones((num_names, 1))
        Y[is_female, 0] = 0

        # Shape of one-hot encoded array is equal to length of longest input string by size of alphabet
        data_dim = alphabet_size
        timesteps = max_name_length
        print("Training data prepared")

        # Consider this as a problem to recognize just one class of output from the rest, effectively same as binary classification
        num_classes = 1

        # Initiate a sequential model
        model = Sequential()

        # Add an LSTM layer that returns a sequence of vectors of dimension sequence size (512 by default)
        model.add(LSTM(sequence_size, return_sequences=True, input_shape=(timesteps, data_dim)))

        # Drop out certain percentage (20% by default) to prevent over fitting
        if dropout_ratio > 0 and dropout_ratio < 1:
            model.add(Dropout(dropout_ratio))

        # Stack another LSTM layer that returns a single vector of dimension sequence size (512 by default)
        model.add(LSTM(sequence_size, return_sequences=False))

        # Drop out certain percentage (20% by default) to prevent over fitting
        if dropout_ratio > 0 and dropout_ratio < 1:
            model.add(Dropout(dropout_ratio))

        # Finally add a dense output layer with a chosen activation function (sigmoid by default)
        model.add(Dense(num_classes, activation=activation_function))

        # Compile the stacked LSTM model with a loss function (binary_crossentropy by default),
        # optimizer function (rmsprop by default) and a metric for measuring model effectiveness (accuracy by default)
        model.compile(loss=loss_function, optimizer=optimizer_function, metrics=[metrics_measure])
        print("Model compiled")

        # Train the model for a number of epochs (5 by default), with a batch size (1000 by default)
        # Split a portion of training data (20% by default) to be used as validation data
        model.fit(X, Y, validation_split=split_ratio, epochs=num_epochs, batch_size=batch_records)
        print("Model trained")

        # Save the model artifacts and character indices under /opt/ml/model.
        # The longest-name length is stored in the same dictionary under the
        # key 'max_name_length' so that it is saved together with the indices.
        model_type = 'lstm-gender-classifier'
        model.save(os.path.join(model_path, '{}-model.h5'.format(model_type)))
        saved_indices = dict(char_indices)
        saved_indices['max_name_length'] = max_name_length
        np.save(os.path.join(model_path, '{}-indices.npy'.format(model_type)), saved_indices)

        print('Training complete.')
    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during training: ' + str(e) + '\n' + trc)
        # Printing this causes the exception to be in the training job logs, as well.
        print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr)
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)

if __name__ == '__main__':
    # Script entry point: run the training routine.
    train()

    # train() returned normally, so finish with exit code zero
    # (a zero exit code causes the job to be marked as Succeeded).
    sys.exit(0)

Overwriting lstm/train


In [6]:
%%writefile lstm/predictor.py
# This is the file that implements a flask server to do inferences. It's the file that you will modify to
# implement the scoring for your own algorithm.

from __future__ import print_function

import os
import json
import pickle
from io import StringIO
import sys
import signal
import traceback

import numpy as np

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.models import load_model
import flask

import tensorflow as tf

import pandas as pd

from os import listdir, sep
from os.path import abspath, basename, isdir
from sys import argv

# Base directory of the container layout; the model artifacts are read from
# the 'model' directory beneath it.
prefix = '/opt/ml/'
model_path = prefix + 'model'

# A singleton for holding the model. This simply loads the model and holds it.
# It has a predict function that does a prediction based on the model and the input data.

class ScoringService(object):
    """Class-level holder for the trained model and its character indices.

    Both artifacts are loaded from ``model_path`` on first use and cached on
    the class, so every request served by this process reuses them.
    """
    model_type = None           # Where we keep the model type, qualified by hyperparameters used during training
    model = None                # Where we keep the model when it's loaded
    graph = None                # TensorFlow graph captured when the model was loaded
    indices = None              # Where we keep the indices of Alphabet when it's loaded

    @classmethod
    def get_indices(cls):
        """Return the character-index dictionary, loading it if it's not already loaded.

        Returns:
            dict or None: the cached dictionary, or None (after printing a
            message) when the index file does not exist.
        """
        if cls.indices is None:
            model_type = 'lstm-gender-classifier'
            index_path = os.path.join(model_path, '{}-indices.npy'.format(model_type))
            if os.path.exists(index_path):
                # The file holds a pickled dict, so pickle loading must be
                # enabled explicitly (newer NumPy releases default it to off).
                # NOTE: unpickling can execute arbitrary code; only load
                # artifacts produced by a trusted training job.
                cls.indices = np.load(index_path, allow_pickle=True).item()
            else:
                print("Character Indices not found.")
        return cls.indices

    @classmethod
    def get_model(cls):
        """Return the Keras model, loading it if it's not already loaded.

        Returns:
            The cached model, or None (after printing a message) when the
            model file does not exist.
        """
        if cls.model is None:
            model_type = 'lstm-gender-classifier'
            mod_path = os.path.join(model_path, '{}-model.h5'.format(model_type))
            if os.path.exists(mod_path):
                cls.model = load_model(mod_path)
                cls.model._make_predict_function()
                # Remember the graph the model was loaded into; predict()
                # re-enters it before calling the model.
                cls.graph = tf.get_default_graph()
            else:
                print("LSTM Model not found.")
        return cls.model

    @classmethod
    def predict(cls, input):
        """For the input, do the predictions and return them.

        Args:
            input (str): comma-separated names; leading/trailing newlines are
                stripped.

        Returns:
            str: a JSON object mapping each name, as received, to 'M'
            (score > 0.5) or 'F'. An empty JSON object is returned when the
            model or the character indices are not available.

        Characters missing from the indices are ignored, and characters
        beyond the stored maximum name length are dropped, so that unusual
        input cannot index outside the encoded array.
        """
        mod = cls.get_model()
        ind = cls.get_indices()

        result = {}

        if mod is None:
            print("Model not loaded.")
            return json.dumps(result)
        if ind is None:
            print("Character Indices not loaded.")
            return json.dumps(result)

        # Read the stored sequence length WITHOUT removing it from the cached
        # dictionary: the dictionary is shared by all later calls, and removing
        # the key would make them fall back to the hard-coded shape below.
        if 'max_name_length' in ind:
            max_name_length = ind['max_name_length']
            alphabet_size = len(ind) - 1    # every key except 'max_name_length' is a character
        else:
            max_name_length = 15
            alphabet_size = 26

        inputs_list = input.strip('\n').split(",")
        num_inputs = len(inputs_list)

        X_test = np.zeros((num_inputs, max_name_length, alphabet_size))

        for i, name in enumerate(inputs_list):
            name = name.lower().strip('\n')
            for t, char in enumerate(name[:max_name_length]):
                char_index = ind.get(char)
                if char_index is not None and char_index < alphabet_size:
                    X_test[i, t, char_index] = 1

        with cls.graph.as_default():
            predictions = mod.predict(X_test)

        # One score per input row, whatever the exact output shape is.
        scores = np.asarray(predictions).reshape(num_inputs, -1)[:, 0]
        for name, score in zip(inputs_list, scores):
            gender = 'M' if score > 0.5 else 'F'
            result[name] = gender
            print("{} ({})".format(name, gender))

        return json.dumps(result)

# The flask app for serving predictions
app = flask.Flask(__name__)

@app.route('/ping', methods=['GET'])
def ping():
    """Health check endpoint.

    Responds with status 200 when both the model and the character indices
    can be obtained from ScoringService, and with 404 otherwise. The indices
    are only requested when the model is available.
    """
    model_ready = ScoringService.get_model() is not None
    healthy = model_ready and ScoringService.get_indices() is not None
    return flask.Response(response='\n', status=200 if healthy else 404, mimetype='application/json')

@app.route('/invocations', methods=['POST'])
def transformation():
    """Do an inference on a single batch of data.

    The request body must be ``text/csv``: a comma-separated list of names.
    Any other content type is answered with status 415.

    Returns:
        flask.Response: status 200 with the JSON string produced by
        ``ScoringService.predict`` as the body.
    """
    # Compare the parsed MIME type rather than the raw header, so that a
    # parameter such as "; charset=utf-8" does not cause the request to be rejected.
    if flask.request.mimetype != 'text/csv':
        return flask.Response(response='This predictor only supports CSV data', status=415, mimetype='text/plain')

    data = flask.request.data.decode('utf-8')

    print('Invoked with {} records'.format(data.count(",")+1))

    # Do the prediction; predict() already returns the complete response text.
    result = ScoringService.predict(data)

    # NOTE(review): the body is JSON but the response is labelled text/csv;
    # the label is kept unchanged for existing clients.
    return flask.Response(response=result, status=200, mimetype='text/csv')


Overwriting lstm/predictor.py


In [7]:
# Run the repository's build_and_push.sh with the run name as its only argument.
# Per the captured output, this logs in to the registry, builds the Docker
# image and pushes it.
!sh build_and_push.sh $run_name

Login Succeeded
Sending build context to Docker daemon  50.17MB
Step 1/14 : FROM nvidia/cuda:9.0-base-ubuntu16.04
 ---> 73cb54e0c584
Step 2/14 : MAINTAINER Binoy Das <binoyd@amazon.com>
 ---> Using cache
 ---> 193c62a42bf2
Step 3/14 : RUN apt-get update && apt-get install -y --no-install-recommends         apt-utils         build-essential         cuda-command-line-tools-9-0         cuda-cublas-9-0         cuda-cufft-9-0         cuda-curand-9-0         cuda-cusolver-9-0         cuda-cusparse-9-0         curl         libcudnn7=7.0.5.15-1+cuda9.0         libfreetype6-dev         libpng12-dev         libzmq3-dev         libhdf5-dev         libcurl3-dev         libgtk2.0-0         pkg-config         python3-dev         python3-pip         rsync         software-properties-common         unzip         gzip         wget         vim         git         nginx         ca-certificates         &&     apt-get clean &&     rm -rf /var/lib/apt/lists/*
 ---> Using cache
 ---> e4288ab0dcc7
Step 4/14 :

[12B091c570: Pushing  782.7MB/1.137GB[16A[1K[K[14A[1K[K[15A[1K[K[12A[1K[K[15A[1K[K[12A[1K[K[15A[1K[K[12A[1K[K[15A[1K[K[12A[1K[K[15A[1K[K[15A[1K[K[16A[1K[K[1K[K[13A[1K[K[12A[1K[K[13A[1K[K[12A[1K[K[13A[1K[K[11A[1K[K[13A[1K[K[11A[1K[K[12A[1K[K[13A[1K[K[10A[1K[K[13A[1K[K[10A[1K[K[13A[1K[K[10A[1K[K[12A[1K[K[13A[1K[K[12A[1K[K[13A[1K[K[12A[1K[K[13A[1K[K[12A[1K[K[13A[1K[K[12A[1K[K[13A[1K[K[9A[1K[K[8A[1K[K[9A[1K[K[12A[1K[K[13A[1K[K[9A[1K[K[13A[1K[K[9A[1K[K[8A[1K[K[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[7A[1K[K[13A[1K[K[7A[1K[K[13A[1K[K[9A[1K[K[13A[1K[K[12A[1K[K[9A[1K[K[6A[1K[K[9A[1K[K[5A[1K[K[13A[1K[K[12A[1K[K[13A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[13A[1K[K[13A[1K[K[9A[1K[K[13A[1K[K[9A[1K[K[13A[1K[K[12A[1K[K[13A[1K[K[12A[1K[K[9A[1K[K[3A[1K

[9B70584976: Pushed   1.338GB/1.326GB[9A[1K[K[12A[1K[K[12A[1K[K[12A[1K[K[12A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[12A[1K[KPushing  647.8MB/1.326GB[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[9A[1K[K[9A[1K[K[12A[1K[K[12A[1K[K[12A[1K[K[12A[1K[K[12A[1K[K[12A[1K[K[12A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[12A[1K[K[9A[1K[K[9A[1K[K[12A[1K[K[K[12A[1K

In [None]:
# Optional local clean-up:
#   1. remove every container whose status is "exited";
#   2. force-remove every image returned by `docker images -a -q`.
# CAUTION: step 2 is not limited to the images built by this notebook.
!docker rm $(docker ps -a -q -f status=exited)
!docker rmi -f $(docker images -a -q)

In [8]:
# SageMaker API client shared by the job, model and endpoint cells below.
sagemaker = boto3.client(service_name='sagemaker')

In [9]:
# Launch the training job for this run, using the image pushed above, and
# pass the epoch count as the only hyperparameter.
training_job_name = run_name+'-training'
response = sagemaker.create_training_job(
    TrainingJobName=training_job_name,
    HyperParameters={
        'num_epochs': epochs
    },
    AlgorithmSpecification={
        'TrainingImage': '741855114961.dkr.ecr.us-east-1.amazonaws.com/'+run_name+':latest',
        'TrainingInputMode': 'File'
    },
    RoleArn='arn:aws:iam::741855114961:role/service-role/AmazonSageMaker-ExecutionRole-20180102T134989',
    InputDataConfig=[
        {
            'ChannelName': 'train',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    'S3Uri': 's3://'+bucket_name+'/'+pipeline_name,
                    'S3DataDistributionType': 'FullyReplicated'
                }
            },
            'CompressionType': 'None',
            'RecordWrapperType': 'None'
        },
    ],
    OutputDataConfig={
        'S3OutputPath': 's3://'+bucket_name+'/output'
    },
    ResourceConfig={
        'InstanceType': 'ml.p3.16xlarge',
        'InstanceCount': 1,
        'VolumeSizeInGB': 10
    },
    StoppingCondition={
        'MaxRuntimeInSeconds': 86400
    },
    Tags=[
        {
            'Key': 'Name',
            'Value': training_job_name
        }
    ]
)

# Poll every 30 seconds until the job reaches a terminal state. 'Stopped' is
# included so that a stopped job ends the loop instead of polling forever.
terminal_statuses = ('Completed', 'Failed', 'Stopped')
status = 'InProgress'
while True:
    response = sagemaker.describe_training_job(
        TrainingJobName=training_job_name
    )
    status = response['TrainingJobStatus']
    print(status)
    if status in terminal_statuses:
        break
    time.sleep(30)

# Surface the reason when the job did not succeed; the model cell below only
# proceeds when `status` is 'Completed'.
if status != 'Completed':
    print('Training job ended with status {}: {}'.format(
        status, response.get('FailureReason', 'no failure reason reported')))

InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


In [11]:
# Register the trained artifacts as a SageMaker model, but only when the
# training job finished successfully.
if status == 'Completed':
    response = sagemaker.create_model(
        ModelName=run_name+'-model',
        PrimaryContainer={
            'Image': '741855114961.dkr.ecr.us-east-1.amazonaws.com/'+run_name+':latest',
            # Built from the same bucket and output prefix that the training
            # job was given as its S3OutputPath.
            'ModelDataUrl': 's3://'+bucket_name+'/output/'+run_name+'-training/output/model.tar.gz'
            # No 'Environment' entry: the previous {'string': 'string'} value
            # was a documentation placeholder, not a real setting.
        },
        ExecutionRoleArn='arn:aws:iam::741855114961:role/service-role/AmazonSageMaker-ExecutionRole-20180102T134989',
        Tags=[
            {
                'Key': 'Name',
                'Value': run_name+'-model'
            }
        ]
    )
else:
    # Make the skip visible instead of silently doing nothing.
    print('Model not created: training job status is {}'.format(status))

In [12]:
# Endpoint configuration with a single production variant that serves the
# model created above on one ml.p3.2xlarge instance.
endpoint_config_name = run_name + '-endpoint-config'
default_variant = {
    'VariantName': 'default',
    'ModelName': run_name + '-model',
    'InitialInstanceCount': 1,
    'InstanceType': 'ml.p3.2xlarge',
    'InitialVariantWeight': 1,
}
response = sagemaker.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[default_variant],
    Tags=[{'Key': 'Name', 'Value': endpoint_config_name}],
)

In [13]:
# Create the endpoint from the endpoint configuration defined above.
endpoint_name = run_name+'-endpoint'
response = sagemaker.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=run_name+'-endpoint-config',
    Tags=[
        {
            # 'Name' matches the tag key used for the training job, model and
            # endpoint configuration (the previous key 'string' was a placeholder).
            'Key': 'Name',
            'Value': endpoint_name
        }
    ]
)

# Poll every 30 seconds until the endpoint is in service or has ended in a
# state this loop treats as final; no extra sleep once that state is seen.
status = 'Creating'
while True:
    response = sagemaker.describe_endpoint(
        EndpointName=endpoint_name
    )
    status = response['EndpointStatus']
    print(status)
    if status in ('InService', 'Failed', 'OutOfService'):
        break
    time.sleep(30)

Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
InService


In [20]:
# Send a comma-separated list of names to the endpoint as text/csv using the AWS CLI.
# The CLI writes the response body to the file `outfile`, which is printed afterwards.
!aws sagemaker-runtime invoke-endpoint --endpoint-name "$run_name-endpoint" --body 'Alyse,Hannah,Carter,Soren,Vihaan,Samantha,Drew,Mica,Talie,Abhiram,Zunairah,Humairah,Tate,Dawson,Finn,Cavan,Cade,Karenna,Emmett,Zada,Ethan' --content-type text/csv outfile
!cat outfile

{
    "ContentType": "text/csv; charset=utf-8",
    "InvokedProductionVariant": "default"
}
{"Vihaan": "M", "Tate": "F", "Cavan": "M", "Mica": "F", "Zunairah": "F", "Alyse": "F", "Samantha": "F", "Carter": "M", "Karenna": "F", "Soren": "F", "Finn": "M", "Zada": "F", "Abhiram": "M", "Cade": "M", "Drew": "M", "Talie": "F", "Humairah": "F", "Hannah": "F", "Dawson": "M", "Emmett": "M", "Ethan": "M"}

In [19]:
# Show the contents of `outfile`, the file the invoke-endpoint cell wrote its response to.
%cat outfile

{"Vihaan": "M", "Tate": "F", "Cavan": "M", "Mica": "F", "Zunairah": "F", "Alyse": "F", "Samantha": "F", "Carter": "M", "Karenna": "F", "Soren": "F", "Finn": "M", "Zada": "F", "Abhiram": "M", "Cade": "M", "Drew": "M", "Talie": "F", "Humairah": "F", "Hannah": "F", "Dawson": "M", "Emmett": "M", "Ethan": "M"}