In [54]:
! pip install --upgrade sagemaker


In [2]:
# Print the Dockerfile that defines the training/inference image built below.
!cat container/Dockerfile

# Build an image that can do training and inference in SageMaker
# This is a Python 3.6 image that uses the nginx, gunicorn, flask stack
# for serving inferences in a stable way.

FROM ubuntu:18.04

MAINTAINER Amazon AI <sage-learner@amazon.com>

RUN apt-get -y update && apt-get install -y --no-install-recommends \
         wget \
         python \
         python3.6 \
         nginx \
         ca-certificates \
         libgcc-5-dev \
         build-essential \
         python3-dev \
    && rm -rf /var/lib/apt/lists/*


# Symlink /usr/bin/python to the python version we're building for.
RUN rm /usr/bin/python && ln -s /usr/bin/python3.6 /usr/bin/python

# Here we get all python packages.
# There's substantial overlap between scipy and numpy that we eliminate by
# linking them together. Likewise, pip leaves the install caches populated which uses
# a significant amount of space. These optimizations save a fair amount of space in the
# image, which reduces start up time.
RUN wget https://

In [3]:
# Imports, followed by the SageMaker session, IAM execution role, and S3 key prefix.
import ast
import os
import re
from io import StringIO
from time import gmtime, strftime

import boto3
import numpy as np
import pandas as pd
import sagemaker as sage
from sagemaker import get_execution_role
from sagemaker.serializers import CSVSerializer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# S3 key prefix under which this notebook stores its data.
prefix = 'lgb-model'

# IAM role of the notebook, handed to SageMaker for training and hosting.
role = get_execution_role()

# SageMaker session used for uploads, training, and deployment.
sess = sage.Session()


In [4]:
%%sh
# Build the algorithm image from ./container and push it to this account's ECR.
# Abort on the first failing command so a broken build is never tagged/pushed.
set -e

# The name of our algorithm
algorithm_name=lgb-model

cd container

# The container entry points must be executable inside the image.
chmod +x lgb/train
chmod +x lgb/serve

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined).
# `|| true` keeps `set -e` from aborting when no region is configured.
region=$(aws configure get region || true)
region=${region:-us-west-2}

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
if ! aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Log in to ECR. `get-login-password` replaces the removed `aws ecr get-login`
# and passes the token on stdin instead of on the docker command line.
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "${registry}"

# Build the docker image locally with the image name and then push it to ECR
# with the full name.
docker build -t "${algorithm_name}" .
docker tag "${algorithm_name}" "${fullname}"

docker push "${fullname}"

Login Succeeded
Sending build context to Docker daemon  10.41MB
Step 1/11 : FROM ubuntu:18.04
 ---> 56def654ec22
Step 2/11 : MAINTAINER Amazon AI <sage-learner@amazon.com>
 ---> Using cache
 ---> d72bea90662d
Step 3/11 : RUN apt-get -y update && apt-get install -y --no-install-recommends          wget          python          python3.6          nginx          ca-certificates          libgcc-5-dev          build-essential          python3-dev     && rm -rf /var/lib/apt/lists/*
 ---> Using cache
 ---> 988e8e6e5e79
Step 4/11 : RUN rm /usr/bin/python && ln -s /usr/bin/python3.6 /usr/bin/python
 ---> Using cache
 ---> ef7de9e576eb
Step 5/11 : RUN wget https://bootstrap.pypa.io/3.3/get-pip.py && python3.6 get-pip.py
 ---> Using cache
 ---> 8a669529e160
Step 6/11 : RUN pip install --upgrade pip &&  pip3 install lightgbm==3.1.0 pandas==1.0.5 scikit-learn==0.23.1 flask  gunicorn &&  pip3 install gevent --pre &&  rm -rf /root/.cache
 ---> Using cache
 ---> 24014bd7a783
Step 7/11 : ENV PYTHONUNBU

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [5]:
# Load the pipe-delimited Titanic data and drop the columns that are not used
# as model inputs.
df = (
    pd.read_csv('titanic.csv', sep='|')
    .drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'])
)

# Hold out 20% for evaluation. The fixed seed makes the split (and therefore the
# AUC reported at the end of the notebook) reproducible across runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
# Make sure the output directory exists so the writes below cannot fail on a
# fresh checkout.
os.makedirs('data', exist_ok=True)

# Persist both splits without the pandas index column.
df_train.to_csv('data/train.csv', index=False)
df_test.to_csv('data/test.csv', index=False)

In [7]:
# Upload the training split to the session's default bucket; the returned S3 URI
# is displayed as the cell output.
sess.upload_data(
    path='data/train.csv',
    key_prefix=prefix + '/training',
)

's3://sagemaker-us-east-1-452432741922/lgb-model/training/train.csv'

In [8]:
# S3 location of the training channel: the '<prefix>/training' folder in the
# session's default bucket.
data_location = f's3://{sess.default_bucket()}/{prefix}/training'

In [9]:
# Channel name -> S3 location mapping handed to `fit`.
s3_input = {'training': data_location}

In [10]:
# Resolve the URI of the image pushed to this account's ECR registry.
boto_sess = sess.boto_session
caller_identity = boto_sess.client('sts').get_caller_identity()
account = caller_identity['Account']
region = boto_sess.region_name
image = '{}.dkr.ecr.{}.amazonaws.com/lgb-model:latest'.format(account, region)

# Train on a single ml.c4.2xlarge instance; model artifacts are written under
# the default bucket's /output prefix.
lgb = sage.estimator.Estimator(
    image,
    role,
    1,
    'ml.c4.2xlarge',
    output_path="s3://{}/output".format(sess.default_bucket()),
    sagemaker_session=sess,
)

# Start the training job and stream its logs into the cell output.
lgb.fit(s3_input)

2020-11-18 13:51:43 Starting - Starting the training job...
2020-11-18 13:51:45 Starting - Launching requested ML instances......
2020-11-18 13:53:01 Starting - Preparing the instances for training......
2020-11-18 13:54:07 Downloading - Downloading input data
2020-11-18 13:54:07 Training - Downloading the training image...
2020-11-18 13:54:40 Training - Training image download completed. Training in progress..[34mStarting the training.[0m
[34m['/opt/ml/input/data/training/train.csv'][0m
[34mcsv parsed[0m
[34mmodel defined[0m
[34mCross validation AUC 0.8451[0m
[34mTraining complete.[0m

2020-11-18 13:54:52 Uploading - Uploading generated training model
2020-11-18 13:54:52 Completed - Training job completed
Training seconds: 67
Billable seconds: 67


In [11]:
# Host the trained model on one ml.m4.xlarge instance. `CSVSerializer` is the
# SDK 2.x replacement for the deprecated `sagemaker.predictor.csv_serializer`.
predictor = lgb.deploy(1, 'ml.m4.xlarge', serializer=CSVSerializer())



-----------!

In [43]:
# Reload the held-out split from data/test.csv.
test_data = pd.read_csv("data/test.csv")

In [44]:
# Everything except the first column (the label column scored against below)
# is written out as the request payload for the endpoint.
x_test = test_data.iloc[:, 1:]
x_test.to_csv('data/x_test.csv', index=False)

In [45]:
# Low-level runtime client used to call the endpoint directly.
client = boto3.client('sagemaker-runtime')

# Name of the endpoint created by the deploy step.
endpoint_name = predictor.endpoint_name
# MIME type of the input data in the request body.
content_type = "text/csv"

In [46]:
# Send the feature CSV to the endpoint. The context manager guarantees the file
# handle is closed once the request has been made (the original left it open).
with open('data/x_test.csv', 'rb') as payload:
    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType=content_type,
        Body=payload,
    )

In [49]:
# NOTE(review): `preds` is not defined in any cell visible in this notebook (the
# execution counts jump from 46 to 49). Presumably it was parsed from
# `response['Body']` in a since-deleted cell; restore that cell so the notebook
# survives Restart & Run All.
#
# Each prediction is a textual pair of class scores; element [1] is taken as the
# score passed to roc_auc_score. `ast.literal_eval` parses the same literals as
# `eval` did, but cannot execute arbitrary code returned by the endpoint.
prob_scores = [ast.literal_eval(pred)[1] for pred in preds]

In [50]:
# Preview the first rows of the held-out data (label column first).
test_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,,0,0,7.225,C
1,1,1,female,35.0,1,0,83.475,S
2,1,2,female,30.0,0,0,12.35,Q
3,1,1,female,14.0,1,2,120.0,S
4,1,1,female,38.0,0,0,80.0,


In [52]:
# ROC AUC of the endpoint's scores against the labels in the first column.
roc_auc_score(
    y_true=test_data.iloc[:, 0],
    y_score=prob_scores,
)

0.8846657929226736

### Optional cleanup
When you're done with the endpoint, delete it so that it stops running and no longer incurs charges.

In [53]:
# Tear down the endpoint that `lgb.deploy(...)` created.
predictor.delete_endpoint()