## Create the required Docker Image

In [None]:
# Repository name and version tag given to the Docker image built below.
tag_name = "scikit:1.0"

In [None]:
# Text of the Dockerfile for the image that app.py runs in.
# The `find ... -exec` command must end with a literal "\;". The backslash is
# doubled here because "\;" is not a valid escape sequence in a regular Python
# string (it triggers an invalid-escape warning); the text written to disk is
# unchanged.
dockerfile = """
FROM python:3.6-jessie

RUN pip install flask opencv-python pandas numpy scipy scikit-learn

RUN find / -name "*.pyc" -exec rm -f {} \\;

RUN mkdir -p /opt/program
RUN mkdir -p /opt/ml/model

ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
ENV PATH="/opt/program:${PATH}"

COPY app.py /opt/program
WORKDIR /opt/program

EXPOSE 8080
ENTRYPOINT ["python", "app.py"]
"""

In [None]:
# Source of app.py, the container entry point. It defines a Flask app with the
# /ping and /invocations routes and, when run as a script, dispatches on the
# first command-line argument: 'train', 'test' or 'serve'.
app = """
import json
import pickle
import sys
import model

from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/ping')
def ping():
    # Health check: an empty 200 response.
    return ("", 200)

@app.route('/invocations', methods=["POST"])
def invoke():
    # Read the raw POST body and decode it as JSON.
    data = request.stream.read()

    try:
        payload = json.loads(data)
    except ValueError as e:
        # Malformed JSON is a client error; answer 400 instead of crashing with a 500.
        return (jsonify(error="Invalid JSON payload: " + str(e)), 400)

    print(payload, model)

    # A Flask view has to return a response; falling off the end (returning None)
    # makes Flask raise on every request. Prediction is not wired in here yet,
    # so the decoded payload is echoed back.
    return jsonify(payload)


if __name__ == '__main__':
    if len(sys.argv) < 2 or sys.argv[1] not in ("serve", "train", "test"):
        raise Exception("Invalid argument: you must inform 'train' for training mode, 'test' for a single prediction or 'serve' for predicting mode")

    mode = sys.argv[1]

    if mode == "train":
        model.train()

    elif mode == "test":
        # Everything after the mode is handed to the model module untouched.
        model.test(sys.argv[2:])

    else:
        model.serve()

        app.run(port=8080, host="0.0.0.0")

"""

In [None]:
# Write both generated sources into the working directory, which is the build
# context of the `docker build` cell. The `with` block flushes and closes each
# file on exit.
for path, content in (('app.py', app), ('Dockerfile', dockerfile)):
    with open(path, 'w') as out:
        out.write(content)

### Build the image

In [None]:
# Build the image from the Dockerfile written above, using the current directory
# as build context. IPython replaces $tag_name with the value of the Python
# variable before the line is handed to the shell.
!sudo docker build -f Dockerfile -t $tag_name .

## Write the program module

In [None]:
# Source of model.py, the module app.py imports. It provides train(), serve()
# and test(), all working on the fixed directory layout under /opt/ml/.
model = """
import numpy as np
import json
import os
import sys
import traceback
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn import model_selection

try:
    # joblib is a standalone package; newer scikit-learn releases no longer
    # ship the sklearn.externals.joblib alias.
    import joblib
except ImportError:
    from sklearn.externals import joblib

prefix = '/opt/ml/'

input_path = os.path.join(prefix, 'input/data')
output_path = os.path.join(prefix, 'output')
model_path = os.path.join(prefix, 'model')
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')

model_filename = os.path.join(model_path, 'model.pkl')

def train():
    # Fit a LogisticRegression on every CSV found in the 'training' channel and
    # save it to model_filename. On any failure the reason is written to
    # <output_path>/failure and the process exits with status 255.
    print("Training mode")

    try:
        # This algorithm has a single channel of input data called 'training'. Since we run in
        # File mode, the input files are copied to the directory specified here.
        channel_name='training'
        training_path = os.path.join(input_path, channel_name)

        # Read in any hyperparameters that the user passed with the training job
        with open(param_path, 'r') as tc:
            hyperparameters = json.load(tc)

        # Take the set of files and read them all into a single pandas dataframe
        input_files = [ os.path.join(training_path, name) for name in os.listdir(training_path) ]
        if len(input_files) == 0:
            raise ValueError(('There are no files in {}.\\n' +
                              'This usually indicates that the channel ({}) was incorrectly specified,\\n' +
                              'the data specification in S3 was incorrectly specified or the role specified\\n' +
                              'does not have permission to access the data.').format(training_path, channel_name))
        raw_data = [ pd.read_csv(path, sep=',', header=None ) for path in input_files ]
        train_data = pd.concat(raw_data)

        # labels are in the first column, features in the remaining ones
        # (.iloc replaces .ix, which was removed from pandas)
        Y = train_data.iloc[:,0]
        X = train_data.iloc[:,1:]

        X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.33, random_state=7)

        # TODO: choose the estimator from hyperparameters['algorithm']
        # ('logistic' / 'random_forest'); the loaded hyperparameters are not used yet.
        model = LogisticRegression()
        model.fit(X_train, Y_train)

        # Passing the path lets joblib open and close the file itself.
        joblib.dump(model, model_filename)

    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        # The output directory is not guaranteed to exist; create it so that
        # reporting the failure cannot itself fail.
        os.makedirs(output_path, exist_ok=True)
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during training: ' + str(e) + '\\n' + trc)

        # Printing this causes the exception to be in the training job logs, as well.
        print('Exception during training: ' + str(e) + '\\n' + trc, file=sys.stderr)

        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)

def serve():
    # Load the saved model and print one prediction for a random sample.
    print("Predicting mode")

    model = joblib.load(model_filename)
    # The random sample must have exactly as many features as the model was
    # fitted on; a fixed width of 10 makes predict() fail for any other model.
    n_features = model.coef_.shape[1]
    X = np.random.rand(1, n_features)
    print( model.predict( X ))
    return

def test(payload):
    # payload: list of command-line arguments; the first one is the text of a
    # Python list of feature values. Prints the prediction for that sample.
    if not payload:
        raise ValueError('test mode needs a feature list argument, e.g. "[4.6, 3.1, 1.5, 0.2]"')
    # NOTE(review): eval() executes whatever expression is passed on the command
    # line. Consider ast.literal_eval or json.loads if this argument can ever
    # come from an untrusted source.
    X = eval(payload[0])
    model = joblib.load(model_filename)
    print( model.predict( [X] ))

"""

In [None]:
# Contents of hyperparameters.json. train() in the model module loads this file
# but does not read any of its values.
hyperparameters = """
{
  "epochs": 10,
  "learning_rate": 0.1
}
"""

In [None]:
!mkdir -p input/config
!mkdir -p program

with open('program/model.py', 'w') as f:
    f.write(model)
    f.flush()
    f.close()
    
with open('input/config/hyperparameters.json', 'w') as f:
    f.write(hyperparameters)
    f.flush()
    f.close()

## Create the dataset CSV

In [None]:
!mkdir -p input/data/training

import pandas as pd
import numpy as np

from sklearn import datasets
iris = datasets.load_iris()

dataset = np.insert(iris.data, 0, iris.target,axis=1)

pd = pd.DataFrame(data=dataset, columns=['iris_id'] + iris.feature_names)
pd.to_csv('input/data/training/iris.csv', header=None, index=False, sep=',', encoding='utf-8')

pd.head()

## Run the training locally

In [None]:
# Train inside the container. The host's model/ and input/ directories are mounted
# at /opt/ml/model and /opt/ml/input, and program/model.py is mounted into
# /opt/program, the directory that holds app.py. The trailing `train` is the
# argument passed to app.py.
!docker run --rm --name 'my_model' \
    -v "${PWD}/model:/opt/ml/model" \
    -v "${PWD}/input:/opt/ml/input" \
    -v "${PWD}/program/model.py:/opt/program/model.py" scikit:1.0 train

In [None]:
# Run a single prediction with the same mounts as the training run. Everything
# after the image name is passed to app.py, which forwards the arguments that
# follow `test` (here the quoted feature list) to model.test().
!docker run --rm --name 'my_model' \
    -v "${PWD}/model:/opt/ml/model" \
    -v "${PWD}/input:/opt/ml/input" \
    -v "${PWD}/program/model.py:/opt/program/model.py" scikit:1.0 test "[4.6, 3.1, 1.5, 0.2]"

## OK, now it's time to push everything to the correct repo
This notebook does not do the push for you: open a terminal and push from there.