In [None]:
import os
from palmerpenguins import load_penguins
import pandas as pd
import numpy as np

## Load data

In [None]:
# Fetch the Palmer penguins dataset and preview its first three rows.
penguins = load_penguins()
penguins.iloc[:3]

## Drop nas, dups and train-test-split

In [None]:
# Rebind instead of mutating in place, so re-running this cell is idempotent.
penguins = penguins.dropna().drop_duplicates()

# Model inputs: three numeric measurements plus two categorical columns.
features = [
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "species",
    "island",
]

target = "sex"

# Expected fraction of rows assigned to the test set.
test_amount = 0.3

# Seeded generator: without a fixed seed every run yields a different split,
# so the exported CSVs and the trained model would not be reproducible.
SEED = 42
rng = np.random.default_rng(SEED)

# One uniform draw per row; rows drawing >= test_amount go to the training set.
train = rng.uniform(size=len(penguins)) >= test_amount
test = ~train

# Select rows (boolean mask) and columns (names) in a single .loc call
# rather than chained indexing.
X_train = penguins.loc[train, features]
y_train = penguins.loc[train, target]
X_test = penguins.loc[test, features]
y_test = penguins.loc[test, target]

In [None]:
# Speichern sowohl lokal als auch auf S3
# --> Nur relevant, wenn wir den lokalen Call behalten wollen ...

In [None]:
# Write every split as an index-free CSV into each target location.
for raw_data_bucket in ["./"]:
    split_files = [
        (X_train, "X_train.csv"),
        (y_train, "y_train.csv"),
        (X_test, "X_test.csv"),
        (y_test, "y_test.csv"),
    ]
    for split_data, file_name in split_files:
        split_data.to_csv(os.path.join(raw_data_bucket, file_name), index=False)
    print(f"Stored data in '{raw_data_bucket}' .")

## Train and deploy file


**Notizen Carsten**

2 Änderungen bei neuer sklearn Version:


```python
from sklearn.externals import joblib
```
deprecated since 0.23 --> import joblib directly


```python
preprocessor = make_column_transformer(
    (StandardScaler(), num_features),
    (OneHotEncoder(sparse=False), cat_features),
)
```
changed argument order: each tuple is now `(transformer, columns)` instead of `(columns, transformer)` ([see more](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_transformer.html))


**Pfade auf Docker**  
Die Defaults sind "/opt/ml/input/data/train" für den Training-Input und "/opt/ml/model" für das Model-Dir.  

Alternativ sind diese via os.environ.get("SM_MODEL_DIR") bzw os.environ.get("SM_CHANNEL_TRAIN") auf dem prebuilt sklearn docker schon gesetzt. 

Hierher werden bei Call des Wrappers potentiell Inputs/Outputs von s3 kopiert. 

In [None]:
%%writefile train_and_deploy.py

import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import joblib
import argparse
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline


def model_fn(model_dir):
    """Load and return the object stored as "model.joblib" inside ``model_dir``."""
    return joblib.load(os.path.join(model_dir, "model.joblib"))

def _parse_cell(cell):
    """Return ``cell`` as a float when it is numeric, otherwise as a stripped string."""
    text = cell.strip()
    try:
        return float(text)
    except ValueError:
        return text


def input_fn(request_body, content_type):
    """Parse a ``text/csv`` request body into a 2-D array of samples.

    Rows are separated by ``|`` and values by ``,``. Numeric values become
    floats; anything else (e.g. categorical values) is kept as a string
    instead of failing the float conversion.

    Args:
        request_body: Payload as ``str`` or UTF-8 encoded ``bytes``.
        content_type: MIME type of the payload; only ``text/csv`` is accepted.

    Returns:
        A float ``np.ndarray`` when every value is numeric (same result as
        before), otherwise an ``object`` array mixing floats and strings.

    Raises:
        ValueError: For an unsupported content type, an empty payload, or
            rows with differing numbers of values.
    """
    if content_type != 'text/csv':
        raise ValueError("This model only supports text/csv input")

    # The payload may arrive as raw bytes; str.split would fail on those.
    if isinstance(request_body, (bytes, bytearray)):
        request_body = request_body.decode("utf-8")

    # Blank rows (e.g. from a trailing '|') are skipped.
    samples = [
        [_parse_cell(cell) for cell in row.split(',')]
        for row in request_body.split('|')
        if row.strip()
    ]
    if not samples:
        raise ValueError("Request body contains no samples")
    if len({len(row) for row in samples}) != 1:
        raise ValueError("All samples must have the same number of values")

    all_numeric = all(isinstance(value, float) for row in samples for value in row)
    # dtype=object keeps floats as floats; numpy would otherwise coerce a
    # mixed float/str array to an all-string dtype.
    return np.array(samples) if all_numeric else np.array(samples, dtype=object)


def predict_fn(input_data, model):
    """Run ``model.predict`` on the parsed samples.

    When the model exposes ``feature_names_in_`` (i.e. it was fitted on a
    DataFrame) and ``input_data`` is not already a DataFrame, the samples are
    wrapped in a DataFrame carrying those column names, so that name-based
    column selection inside the model keeps working. Values must therefore be
    sent in the same column order as the training data.

    Raises:
        ValueError: If the number of values per sample does not match the
            number of feature names (raised by the DataFrame constructor).
    """
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is not None and not isinstance(input_data, pd.DataFrame):
        # infer_objects turns object columns that hold only floats back into float64.
        input_data = pd.DataFrame(input_data, columns=list(feature_names)).infer_objects()
    return model.predict(input_data)

def output_fn(prediction, content_type):
    """Return the ``str()`` form of ``prediction``; ``content_type`` is not used."""
    response_body = f"{prediction!s}"
    return response_body


if __name__ == "__main__":
    # Training entry point: fit preprocessing + logistic regression on the
    # CSVs found in --train and write the fitted pipeline to
    # <model-dir>/model.joblib.

    parser = argparse.ArgumentParser()

    # Path defaults honour the SM_* environment variables when they are set
    # and otherwise fall back to the fixed paths used before.
    parser.add_argument('--train', type=str,
                        default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
    # Both feature lists are whitespace-separated column names. They are
    # required: omitting them previously crashed with an AttributeError on None.
    parser.add_argument('--num_features', type=str, required=True)
    parser.add_argument('--cat_features', type=str, required=True)
    parser.add_argument('--model-dir', type=str,
                        default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
    # parse_known_args: unknown extra arguments are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    train_path = args.train
    num_features = args.num_features.split()
    cat_features = args.cat_features.split()
    model_dir = args.model_dir

    X_train = pd.read_csv(os.path.join(train_path, "X_train.csv"))
    # The target file has a single column; squeeze it to a 1-D Series so the
    # classifier does not receive a column-vector y.
    y_train = pd.read_csv(os.path.join(train_path, "y_train.csv")).squeeze("columns")

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and later
    # removed; try the new keyword first and fall back for older versions.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)

    preprocessor = make_column_transformer(
        (StandardScaler(), num_features),
        (encoder, cat_features),
    )

    model = LogisticRegression(class_weight="balanced", solver="lbfgs")

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])

    pipeline.fit(X_train, y_train)

    # Persist the whole pipeline (preprocessing + model) as one artifact.
    model_path = os.path.join(model_dir, "model.joblib")
    print("Model saving path {}".format(model_path))
    joblib.dump(pipeline, model_path)

### Local call

In [None]:
# Run the training script locally: read the CSVs from the current directory
# and write model.joblib back into it.
!python3 train_and_deploy.py --train ./  \
                             --num_features "bill_length_mm bill_depth_mm flipper_length_mm"  \
                             --cat_features "species island"  \
                             --model-dir ./  

### SKlearn API Call

In [None]:
# NOTE(review): `SKLearn`, `sagemaker_role` and `processed_data_bucket` are not
# defined in the visible part of this notebook - confirm they are imported /
# set before this cell runs on a fresh kernel.
sklearn = SKLearn(
    entry_point="train_and_deploy.py",
    # SageMaker scikit-learn versions use tags such as "0.23-1" or "1.0-1";
    # "1.0.1" is not one of them.
    framework_version="1.0-1",
    instance_type="ml.m5.xlarge",
    role=sagemaker_role,
    # Passed to the entry point as --num_features / --cat_features.
    hyperparameters={
        "num_features": "bill_length_mm bill_depth_mm flipper_length_mm",
        "cat_features": "species island"
    }
)
# The "train" key names the input channel handed to the training job.
sklearn.fit({"train": processed_data_bucket})

**TODO:** Beschreiben, was hier passiert! \
Wenn wir `train` mitgeben, wird der Inhalt von `processed_data_bucket` automatisch als Input-Channel nach "/opt/ml/input/data/train" im Docker-Container kopiert.

Wird kein Output-Pfad spezifiziert, wird das serialisierte Modell automatisch im Default-Bucket auf S3 abgelegt.

### Deploy

In [None]:
# A real-time endpoint needs an explicit instance count and instance type;
# calling deploy() without them fails.
# Keep the returned predictor so the endpoint can be invoked and cleaned up later.
predictor = sklearn.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
)
predictor

Hier könnte man die Evaluierung des Modells wieder im Notebook machen: \
`y_pred` via Endpunkt-Invocation berechnen und anschließend `metrics(y_pred, y_test)` auswerten.

## Outlook

siehe Word Dokument