### Run Processing pipeline

In [None]:
import os
import pandas as pd
import boto3

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn import SKLearnModel
from sagemaker.processing import ProcessingInput, ProcessingOutput

#### Load Environment Variables

In [None]:
%load_ext dotenv
%dotenv

# Notebook settings are read from environment variables (the %dotenv magic
# above loads them); each value is None when its variable is not set.
script_path = os.environ.get("PREPROCESSING_SCRIPT_PATH")  # passed as `code=` to the preprocessing job
role = os.environ.get("ROLE")  # role handed to the processor and the estimator
# NOTE(review): presumably the location of the raw input data — confirm.
preprocessing_source_path = os.environ.get("PREPROCESSING_SOURCE_PATH")
preprocessing_output_path = os.environ.get("PREPROCESSING_OUTPUT_PATH")  # destination of the processed CSVs

## Preprocessing

#### Develop preprocessing script

In [None]:
%%writefile preprocessing.py

import argparse
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer

# Columns kept from the raw CSV; the frame is restricted to exactly these.
input_columns = [
    "species",
    "island",
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
    "sex",
]

# Label column that is split off from the features.
target = "sex"

if __name__ == "__main__":
    # --- Command-line arguments ------------------------------------------
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--train-test-split-ratio", type=float, default=0.3)
    # parse_known_args: arguments this script does not declare are ignored.
    args, _ = arg_parser.parse_known_args()
    split_ratio = args.train_test_split_ratio
    print("Received arguments {}".format(args))

    # --- Load and clean ---------------------------------------------------
    input_dir = "/opt/ml/processing/input"
    input_data_path = os.path.join(input_dir, "penguins.csv")
    print("Reading input data from {}".format(input_data_path))
    raw = pd.read_csv(input_data_path)
    # Restrict to the expected columns, then drop incomplete and repeated rows.
    df = pd.DataFrame(data=raw, columns=input_columns).dropna().drop_duplicates()

    # --- Feature transformation -------------------------------------------
    numeric_columns = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm"]
    categorical_columns = ["species", "island"]
    # Tuples are given as (columns, transformer), exactly as before.
    # NOTE(review): presumably this order matches the scikit-learn version of
    # the processing container — verify before changing framework_version.
    # NOTE(review): body_mass_g is not listed for either transformer, so it
    # appears to be left out of the transformed features — confirm intended.
    preprocess = make_column_transformer(
        (numeric_columns, StandardScaler()),
        (categorical_columns, OneHotEncoder(sparse=False)),
    )
    features = preprocess.fit_transform(df.drop(columns="sex"))

    # --- Train / test split -----------------------------------------------
    print("Splitting data into train and test sets with ratio {}".format(split_ratio))
    X_train, X_test, y_train, y_test = train_test_split(
        pd.DataFrame(features), df[target], test_size=split_ratio, random_state=42
    )

    # --- Persist the four CSV files ---------------------------------------
    train_dir = "/opt/ml/processing/train"
    test_dir = "/opt/ml/processing/test"
    train_features_output_path = os.path.join(train_dir, "train_features.csv")
    train_labels_output_path = os.path.join(train_dir, "train_labels.csv")
    test_features_output_path = os.path.join(test_dir, "test_features.csv")
    test_labels_output_path = os.path.join(test_dir, "test_labels.csv")

    # Same write order as before; files carry neither header nor index.
    outputs = [
        ("training features", X_train, train_features_output_path),
        ("test features", X_test, test_features_output_path),
        ("training labels", y_train, train_labels_output_path),
        ("test labels", y_test, test_labels_output_path),
    ]
    for description, frame, destination in outputs:
        print("Saving {} to {}".format(description, destination))
        frame.to_csv(destination, header=False, index=False)


#### Define & Run SKLearn Preprocessor

In [None]:
# Processor that runs Python scripts in a managed scikit-learn container.
sklearn_processor = SKLearnProcessor(
    framework_version="0.20.0",
    base_job_name="preprocessing",
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

sklearn_processor.run(
    code=script_path,
    inputs=[
        # The script reads /opt/ml/processing/input/penguins.csv, so the input
        # must be the raw data location rather than the script file itself.
        ProcessingInput(
            source=preprocessing_source_path,
            destination="/opt/ml/processing/input",
        ),
    ],
    outputs=[
        # Both splits are uploaded to the same destination; the training and
        # inspection cells read their files from that location.
        ProcessingOutput(
            destination=preprocessing_output_path,
            output_name="train_data",
            source="/opt/ml/processing/train",
        ),
        ProcessingOutput(
            destination=preprocessing_output_path,
            output_name="test_data",
            source="/opt/ml/processing/test",
        ),
    ],
)
# Description of the job that was just run (most recent entry in `jobs`).
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

#### Inspect generated training data

In [None]:
# Preview the generated training features (first 10 rows only).
# Normalise the prefix so the URI is valid whether or not the configured
# output path ends with "/".
train_features_uri = preprocessing_output_path.rstrip("/") + "/train_features.csv"
training_features = pd.read_csv(train_features_uri, nrows=10, header=None)
print("Training features shape: {}".format(training_features.shape))
training_features.head(n=3)

## Model Training

#### Create SKLearn training job 

In [None]:
%%writefile train_and_deploy.py

import os

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib


""" 
Define model serving functions. More about these functions at:
https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/using_sklearn.html#load-a-model
"""
def model_fn(model_dir):
    """Load the serialized model stored as ``model.joblib`` in ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

def input_fn(request_body, content_type):
    """Parse a CSV request payload into a 2-D float array.

    Samples are separated by ``|`` and the values of one sample by ``,``,
    for example ``"1.0,2.0|3.0,4.0"``.

    Args:
        request_body: Request payload as ``str`` or UTF-8 encoded ``bytes``.
        content_type: MIME type of the request; only ``text/csv`` is accepted.

    Returns:
        ``numpy.ndarray`` with one row per sample.

    Raises:
        ValueError: If ``content_type`` is not ``text/csv`` or a value cannot
            be converted to ``float``.
    """
    if content_type != 'text/csv':
        raise ValueError("Thie model only supports text/csv input")

    # A bytes payload would make str.split raise TypeError; decode it first.
    if isinstance(request_body, (bytes, bytearray)):
        request_body = request_body.decode("utf-8")

    return np.array(
        [[float(value) for value in row.split(',')] for row in request_body.split('|')]
    )

def predict_fn(input_data, model):
    """Run ``model.predict`` on the parsed request data.

    Args:
        input_data: Samples to score (the value produced by ``input_fn``).
        model: Object exposing a ``predict`` method (the value produced by
            ``model_fn``).

    Returns:
        Whatever ``model.predict(input_data)`` returns.
    """
    predictions = model.predict(input_data)
    return predictions

def output_fn(prediction, content_type):
    """Serialize a prediction as its ``str()`` representation.

    Args:
        prediction: Value returned by ``predict_fn``.
        content_type: Requested response type; not consulted by this function.

    Returns:
        ``str(prediction)``.
    """
    serialized = str(prediction)
    return serialized


if __name__ == "__main__":
    # Directory the training files are read from.
    # NOTE(review): presumably where the "train" channel passed to fit() is
    # mounted inside the training container — confirm.
    training_data_directory = "/opt/ml/input/data/train"
    train_features_data = os.path.join(training_data_directory, "train_features.csv")
    train_labels_data = os.path.join(training_data_directory, "train_labels.csv")
    print("Reading input data")
    X_train = pd.read_csv(train_features_data, header=None)
    # The labels file is a single-column CSV, which pandas loads as an (n, 1)
    # frame. Flatten it to 1-D, the shape expected for y by fit(); this
    # avoids the column-vector conversion warning.
    y_train = pd.read_csv(train_labels_data, header=None).values.ravel()

    model = LogisticRegression(class_weight="balanced", solver="lbfgs")
    print("Training LR model")
    model.fit(X_train, y_train)

    # Persist the fitted model under the file name that model_fn loads.
    model_output_directory = os.path.join("/opt/ml/model", "model.joblib")
    print("Saving model to {}".format(model_output_directory))
    joblib.dump(model, model_output_directory)

In [None]:
# Estimator that runs train_and_deploy.py as a training job.
sklearn = SKLearn(
    role=role,
    entry_point="train_and_deploy.py",
    framework_version="0.20.0",
    instance_type="ml.m5.xlarge",
)
# The "train" channel points at the location the preprocessing job wrote to.
sklearn.fit(inputs={"train": preprocessing_output_path})

In [None]:
# URI of the model archive: <output_path><training job name>/output/model.tar.gz
latest_job_name = sklearn.latest_training_job.name
model_data_s3_uri = f"{sklearn.output_path}{latest_job_name}/output/model.tar.gz"
model_data_s3_uri

#### Evaluate Model

In [None]:
%%writefile evaluate.py

import json
import os
import tarfile

import pandas as pd

from sklearn.externals import joblib
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

if __name__ == "__main__":
    # Container directories used by this script.
    model_dir = "/opt/ml/processing/model"
    test_dir = "/opt/ml/processing/test"
    evaluation_dir = "/opt/ml/processing/evaluation"

    # Unpack the model archive into the working directory and load it.
    model_path = os.path.join(model_dir, "model.tar.gz")
    print("Extracting model from path: {}".format(model_path))
    with tarfile.open(model_path) as archive:
        archive.extractall(path=".")
    print("Loading model")
    model = joblib.load("model.joblib")

    # Read the held-out features and labels (CSV files without header row).
    print("Loading test input data")
    test_features_data = os.path.join(test_dir, "test_features.csv")
    test_labels_data = os.path.join(test_dir, "test_labels.csv")
    X_test = pd.read_csv(test_features_data, header=None)
    y_test = pd.read_csv(test_labels_data, header=None)
    predictions = model.predict(X_test)

    # Per-class metrics as a dict, with overall accuracy stored under "accuracy".
    print("Creating classification evaluation report")
    report_dict = classification_report(y_test, predictions, output_dict=True)
    report_dict["accuracy"] = accuracy_score(y_test, predictions)
    # NOTE: roc_auc_score is imported but ROC AUC is not added to the report:
    # report_dict["roc_auc"] = roc_auc_score(y_test, predictions)

    print("Classification report:\n{}".format(report_dict))

    # Write the report as JSON into the evaluation output directory.
    evaluation_output_path = os.path.join(evaluation_dir, "evaluation.json")
    print("Saving classification report to {}".format(evaluation_output_path))
    with open(evaluation_output_path, "w") as report_file:
        json.dump(report_dict, report_file)

In [None]:
# Run evaluate.py as a processing job against the trained model.
sklearn_processor.run(
    code="evaluate.py",
    inputs=[
        # Model archive produced by the training job.
        ProcessingInput(
            source=model_data_s3_uri,
            destination="/opt/ml/processing/model",
        ),
        # Test split: read it from the configured preprocessing output
        # location (where the preprocessing job wrote it) instead of a
        # hard-coded bucket, so the notebook works for any environment.
        ProcessingInput(
            source=preprocessing_output_path,
            destination="/opt/ml/processing/test",
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name="evaluation",
            source="/opt/ml/processing/evaluation",
        ),
    ],
)
# Description of the job that was just run (most recent entry in `jobs`).
evaluation_job_description = sklearn_processor.jobs[-1].describe()

#### Inspect Evaluation result

In [None]:
# json is only imported inside the %%writefile scripts, not in this kernel,
# so import it here before parsing the report.
import json

# Download the evaluation report written by the evaluation job.
client = boto3.client('s3')
# S3 URI of the job's first (and only declared) output.
s3_path = evaluation_job_description["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
# Split "s3://<bucket>/<key prefix>" into bucket and key prefix.
bucket, key = s3_path.split("//")[1].split("/", 1)
result = client.get_object(Bucket=bucket, Key=key + '/evaluation.json')
json.loads(result['Body'].read().decode('utf-8'))

### Model Deployment

#### Deploy Estimator to Sagemaker Endpoint

In [None]:
# Deploy the trained estimator to an endpoint backed by one ml.m4.xlarge instance.
predictor = sklearn.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

#### Test Sagemaker Endpoint

In [None]:
# Use the first two rows of the inspected training features as test samples.
deploy_test = training_features.head(2).values.tolist()

# Serialize the samples in the format input_fn parses: values of one sample
# joined by ",", samples joined by "|".
request_body = "|".join(",".join(str(n) for n in sample) for sample in deploy_test)
print("*"*20)
print(f"Calling Sagemaker Endopint with the following request_body: {request_body}")

# Runtime client used to invoke the endpoint directly through boto3.
client = boto3.client('sagemaker-runtime')

# `endpoint_name` is the SDK v2 name of this attribute; `endpoint` is the
# older spelling.
endpoint_name = predictor.endpoint_name
content_type = 'text/csv'

# Invoke the endpoint and decode the streamed response body.
response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=request_body,
    ContentType=content_type,
)
response_from_endpoint = response['Body'].read().decode("utf-8")
print("*"*20)
print(f"Response from Endpoint: {response_from_endpoint}")

#### Delete Endpoint, if no longer in use

In [None]:
# predictor.delete_endpoint()

## Build REST API

#### Create Lambda Function for handling API <-> Sagemaker Endpoint traffic

In [None]:
%%writefile serving_lambda.py
import os
import boto3
import json

# Name of the endpoint to call; the function fails at import time if the
# ENDPOINT_NAME environment variable is missing.
endpoint_name = os.environ['ENDPOINT_NAME']
# Client is created once at module level and used by every handler call.
runtime = boto3.client('runtime.sagemaker')


def lambda_handler(event, context):
    """Forward the samples in ``event`` to the endpoint and return its label.

    Args:
        event: Mapping with a ``"data"`` entry holding the samples, either as
            a JSON-encoded string (``"[[0.1, 0.2], ...]"``) or as an already
            decoded list of lists.
        context: Lambda context object; not used.

    Returns:
        The endpoint's response text with surrounding ``[]`` and ``'``
        characters stripped.

    Raises:
        KeyError: If ``event`` has no ``"data"`` entry.
    """
    print("Received event: " + json.dumps(event, indent=2))

    # The event is already a parsed mapping, so no JSON round trip is needed.
    raw_samples = event['data']
    payload = json.loads(raw_samples) if isinstance(raw_samples, str) else raw_samples
    print(payload)

    # Serialize the samples for the endpoint: values of one sample joined by
    # ",", samples joined by "|".
    request_body = "|".join(",".join(str(n) for n in sample) for sample in payload)
    print("request_body: ", request_body)

    response = runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='text/csv',
        Body=request_body,
    )

    # The endpoint answers with the str() of the prediction; strip the
    # brackets and quotes around it.
    label = response['Body'].read().decode('utf-8').strip("[]").strip("'")

    return label

#### Go to API Gateway & Select Create new REST Endpoint

![REST API](img/REST.png)

#### Choose a name and create a new API

![REST API](img/CREATE_NEW.png)

#### Create a new method of type POST and choose your lambda as target

![REST API](img/POST.png)

#### Deploy API

![REST API](img/DEPLOY.png)

### Go to APIs --> Stages --> inspect your newly created stage and copy its Invoke URL (set it as `API_URL`)

#### Invoke Request against REST API

In [None]:
# json is only imported inside the %%writefile scripts, not in this kernel.
import json

import requests

# Invocation URL of the deployed API stage.
url = os.getenv("API_URL")
# One sample, sent as a JSON-encoded string under "data" (the form the
# serving lambda decodes).
payload = json.dumps({"data":"[[-0.6396528091784842, 0.3738717119645826, -0.9980179785096928, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0]]"})
print(f"Calling ML Api with the following payload {payload}")
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.post(url, data=payload, timeout=30)
print("*"*20)
print(f"Return Message. Status code: {response.status_code}, Message: {response.text}")