# Carregar Features

In [None]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

In [None]:
# One boto3 session pinned to us-east-1, and two clients created from it:
# the 'sagemaker' service client and the 'sagemaker-featurestore-runtime' client.
boto_session = boto3.Session(region_name='us-east-1')
sagemaker_client = boto_session.client(service_name='sagemaker', region_name='us-east-1')
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name='us-east-1')

In [None]:
# SageMaker Session wired to the boto session and both clients, so every
# Feature Store call in this notebook goes through the same configuration.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

In [None]:
# Handle to the feature group named "titanic-features", bound to feature_store_session.
feature_group_name = "titanic-features"
titanic_feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)

In [None]:
# Show what describe() returns for the feature group (rendered as the cell output).
titanic_feature_group.describe()

In [None]:
# Athena query helper for the feature group, the table name it reports, and
# the S3 bucket that will receive the query results.
titanic_query = titanic_feature_group.athena_query()
titanic_table = titanic_query.table_name
output_bucket = 'martinig-athena-results-test'

In [None]:
# Select every row and column of the feature group's table in the
# "sagemaker_featurestore" database.
query_string = f'SELECT * FROM "sagemaker_featurestore"."{titanic_table}";'

In [None]:
# Run the Athena query, block until it completes, then load the result.
# df is assigned only from the finished query, so a failed run raises here
# instead of leaving a silently empty DataFrame behind for later cells.
titanic_query.run(query_string=query_string, output_location=f's3://{output_bucket}/query_results/')
titanic_query.wait()
df = titanic_query.as_dataframe()

In [None]:
# Preview the first rows of the query result.
df.head()

# Explorar

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Render figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# NOTE(review): matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8-*'; confirm 'seaborn-ticks' still resolves on the installed version.
plt.style.use('seaborn-ticks')

In [None]:
# Summary statistics of the loaded data (rendered as the cell output).
df.describe()

In [None]:
def _use_plot_style(style_name):
    """Activate a matplotlib style, tolerating the seaborn style rename.

    Matplotlib 3.6 renamed its bundled 'seaborn*' styles to 'seaborn-v0_8*'
    and later releases dropped the old names, so a legacy name that fails to
    load is retried once under the new prefix. Any other failure is re-raised.
    """
    try:
        plt.style.use(style_name)
    except OSError:
        is_legacy_seaborn = (
            isinstance(style_name, str)
            and style_name.startswith('seaborn')
            and not style_name.startswith('seaborn-v0_8')
        )
        if not is_legacy_seaborn:
            raise
        plt.style.use(style_name.replace('seaborn', 'seaborn-v0_8', 1))


def plot(table, legloc='upper right',
         plt_style='seaborn-ticks',
         color_palette="dark", sorter=None, stacked=False,
         kind='bar', percentage=True,
         custom_title=None, minimal=True, figsize=(19, 10), width=0.7):
    """Plot ``table`` with pandas and annotate every bar with its value.

    Args:
        table: DataFrame to plot. When ``percentage`` is true it must contain
            a 'Total' column; every row is divided by it. 'Total' itself is
            never plotted. The caller's DataFrame is not modified.
        legloc: Legend location passed to ``plt.legend``.
        plt_style: Matplotlib style name.
        color_palette: Anything accepted by ``sns.color_palette``.
        sorter: Optional list of column labels giving the column order.
        stacked: Passed to ``DataFrame.plot``.
        kind: Passed to ``DataFrame.plot``. The annotations use each patch's
            height and horizontal centre, i.e. they assume vertical bars.
        percentage: Convert rows to whole-number percentages of 'Total' and
            label the bars with a trailing '%'.
        custom_title: Title of the figure.
        minimal: Hide the y ticks, the x label and all spines but the bottom.
        figsize: Passed to ``DataFrame.plot``.
        width: Bar width passed to ``DataFrame.plot``.

    Raises:
        KeyError: If ``percentage`` is true and ``table`` has no 'Total' column.
    """
    grouped = table

    # Transform to percentages of the row total.
    if percentage:
        grouped = np.round(grouped.divide(grouped['Total'], axis=0) * 100, 0)

    # drop() returns a new frame, so the caller's table keeps its 'Total'
    # column; errors='ignore' covers tables that never had one.
    grouped = grouped.drop(columns='Total', errors='ignore')

    # Rearrange the columns.
    if sorter:
        grouped = grouped[sorter]

    _use_plot_style(plt_style)
    sns.set_palette(sns.color_palette(color_palette))
    ax = grouped.plot(kind=kind, stacked=stacked, figsize=figsize, width=width)
    plt.setp(ax.get_xticklabels(), rotation=0)  # keep the x tick labels horizontal
    plt.legend(loc=legloc)

    # Annotate each bar 10 points above its top, centred horizontally.
    for patch in ax.patches:
        height = patch.get_height()
        if percentage:
            label = '{}%'.format(int(np.round(height, decimals=2)))
        else:
            label = str(np.round(height, decimals=2))
        ax.annotate(label,
                    (patch.get_x() + patch.get_width() / 2., height),
                    ha='center', va='center',
                    xytext=(0, 10), textcoords='offset points')

    if minimal:
        ax.get_yaxis().set_ticks([])
        plt.xlabel('')
        sns.despine(top=True, right=True, left=True, bottom=False)

    # Set the custom title.
    plt.title(custom_title)

In [None]:
def Groupby_TwoCol_Plot(df, col1, col2, legloc='upper right',
                                    plt_style = 'ggplot',
                                    color_palette="dark",sorter=None, stacked=False,
                                    kind = 'bar', percentage = True,
                               custom_title=None, minimal=True, figsize=(14,6), width=0.6):
    """Count the rows of ``df`` per (col1, col2) pair and hand the table to ``plot``.

    The resulting table has one row per ``col1`` value and one column per
    ``col2`` value, plus a 'Total' column holding each row's sum, which
    ``plot`` uses as the percentage denominator. All remaining keyword
    arguments are forwarded to ``plot`` unchanged.
    """
    # Pair counts, then pivot col2 out of the index into the columns.
    pair_counts = df.groupby([col2, col1]).size()
    pivoted = pair_counts.unstack(col2)

    # Row totals across the col2 columns.
    pivoted['Total'] = pivoted.sum(axis=1)

    plot_options = dict(
        legloc=legloc,
        plt_style=plt_style,
        color_palette=color_palette,
        sorter=sorter,
        stacked=stacked,
        kind=kind,
        percentage=percentage,
        custom_title=custom_title,
        minimal=minimal,
        figsize=figsize,
        width=width,
    )
    plot(pivoted, **plot_options)

In [None]:
# One bar group per 'survived' value and one bar per 'sex'; with the default
# percentage=True each bar is that sex's share of its 'survived' group.
Groupby_TwoCol_Plot(df,
                    'survived',
                    'sex',
                    color_palette=('red','green'),
                    plt_style = 'seaborn-ticks',
                    custom_title='Proportion of Survived per Sex',
                    legloc='upper left')

# Preparar

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Model input columns followed by the label: later cells slice [:-1] for the
# features and take the last entry ('survived') as the target.
features_target = ['sex', 'age', 'survived']

In [None]:
# Keep only the columns named in features_target.
df = df.filter(features_target)

In [None]:
# Preview the reduced frame.
df.head()

In [None]:
# Hold out 20% of the rows for validation. The split is seeded so the metrics
# below and the train/test CSVs uploaded later are the same on every run.
train, test = train_test_split(df, test_size=0.2, random_state=0)

# Modelar

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 100 estimators and a fixed random_state; n_jobs=-1 is
# passed through to scikit-learn for parallelism.
model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)

In [None]:
# Feature columns: everything in features_target except the last entry.
X = train[features_target[:-1]]
# Select the target by label (not by a one-element list) so Y is a 1-D Series,
# which is the target shape scikit-learn's fit() expects.
Y = train[features_target[-1]]

In [None]:
# Preview the training features.
X.head()

In [None]:
# Fit the classifier on the training features and labels.
model.fit(X, Y)

# Validar

In [None]:
# Predict labels for the held-out rows from their feature columns only.
pred = model.predict(test[features_target[:-1]])

In [None]:
# Accuracy = share of held-out rows whose prediction equals the label.
# The target is selected by label so both sides are 1-D with shape (n,).
# A one-column frame would yield an (n, 1) array that broadcasts against the
# (n,) predictions into an (n, n) matrix, whose mean is not the accuracy.
acc = np.mean(test[features_target[-1]].values == pred)
display(acc)

# Criar artefatos

https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/using_sklearn.html

## Upload dos datasets de treino e teste

In [None]:
# Destination bucket and key prefix used by upload_to_s3 and by the training
# job's input channels.
bucket = 'mlopsstack-mlbucket12760f44-1n0o1haje306i'
prefix = 'datasets/titanic'

In [None]:
def upload_to_s3(channel, file):
    """Upload a local file to ``<prefix>/<channel>/<file>`` in ``bucket``.

    Args:
        channel: Sub-folder under the module-level ``prefix`` (the notebook
            uses 'train' and 'test').
        file: Path of the local file; it is also used verbatim as the last
            part of the object key.
    """
    key = f"{prefix}/{channel}/{file}"
    s3 = boto3.resource('s3')
    # The context manager closes the handle once the upload returns or raises.
    with open(file, "rb") as data:
        s3.Bucket(bucket).put_object(Key=key, Body=data)

In [None]:
# Write both splits to the working directory and upload each to its channel.
# to_csv is called without index=False, so the DataFrame index is written as
# an extra leading column; the training script selects columns by name.
train.to_csv('train.csv')
test.to_csv('test.csv')
upload_to_s3('train', 'train.csv')
upload_to_s3('test', 'test.csv')

## Criação do script de preparação ou usar SageMaker Data Wrangler

- https://docs.amazonaws.cn/en_us/sagemaker/latest/dg/use-scikit-learn-processing-container.html
- https://docs.amazonaws.cn/en_us/sagemaker/latest/dg/data-wrangler-getting-started.html

## Criação do script de treino e inferência

In [None]:
from sagemaker.sklearn.estimator import SKLearn

In [None]:
%%writefile ../train_inference/train_inference.py

import argparse
import joblib
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# inference function
def model_fn(model_dir):
    """Load and return the estimator stored as ``model.joblib`` in ``model_dir``."""
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

# Training entry point: parse arguments, read the train/test CSVs, fit a
# random forest, print its accuracy and persist it as model.joblib.
if __name__ == "__main__":
    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n-estimators", type=int, default=100)

    # Data, model, and output directories; each defaults to its SM_* environment variable.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train.csv")
    parser.add_argument("--test-file", type=str, default="test.csv")
    # Whitespace-separated feature column names and the label column name.
    # Both are mandatory: without them there is nothing to train on.
    parser.add_argument("--features", type=str, required=True)
    parser.add_argument("--target", type=str, required=True)

    # parse_known_args tolerates extra arguments this script does not declare.
    args, _ = parser.parse_known_args()

    # Fail with a usage error instead of a TypeError inside os.path.join when
    # a directory was neither passed nor provided through the environment.
    for option in ("model_dir", "train", "test"):
        if getattr(args, option) is None:
            parser.error(
                "--" + option.replace("_", "-")
                + " was not given and its SM_* environment variable is not set"
            )

    print("reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets")
    feature_names = args.features.split()
    X_train = train_df[feature_names]
    X_test = test_df[feature_names]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model")
    model = RandomForestClassifier(
        n_estimators=args.n_estimators, random_state=0, n_jobs=-1
    )

    model.fit(X_train, y_train)

    # print acc
    print("validating model")
    pred = model.predict(X_test)
    acc = np.mean(y_test == pred)
    print("Accuracy: " + str(acc))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)


In [None]:
# Run the training script locally against the train.csv/test.csv in the
# current directory; model.joblib is written to the current directory too.
! python ../train_inference/train_inference.py --n-estimators 100 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \
                   --features 'sex age' \
                   --target 'survived'

In [None]:
# We use the Estimator from the SageMaker Python SDK
FRAMEWORK_VERSION = "0.23-1"

# The entry point is the script written to ../train_inference above.
# The "Accuracy" metric regex matches the script's 'Accuracy: <value>' line,
# and the hyperparameter names match the script's command-line options.
sklearn_estimator = SKLearn(
    entry_point="train_inference.py",
    source_dir="../train_inference",
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.c5.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="titanic-scikit",
    metric_definitions=[{"Name": "Accuracy", "Regex": "Accuracy: ([0-9.]+).*$"}],
    hyperparameters={
        "n-estimators": 100,
        "features": "sex age",
        "target": "survived",
    },
)

In [None]:
# Launch the training job and wait for it to finish (wait=True).
# The "train" and "test" channels point at the S3 prefixes that
# upload_to_s3 wrote to.
sklearn_estimator.fit({"train": f"s3://{bucket}/{prefix}/train/", "test": f"s3://{bucket}/{prefix}/test/"}, wait=True)

In [None]:
# Deploy my estimator to a SageMaker Endpoint and get a Predictor
# (a single ml.m4.xlarge instance).
predictor = sklearn_estimator.deploy(instance_type='ml.m4.xlarge',
                                     initial_instance_count=1)

In [None]:
# Send the held-out feature columns to the endpoint.
response = predictor.predict(test[features_target[:-1]])

In [None]:
# Display what the endpoint returned.
response

In [None]:
import io
# Disabled code kept as a string literal (never executed): it would serialise
# the test features to an in-memory CSV without header or index.
'''from io import StringIO
test_file = io.StringIO()
test[features_target[:-1]].to_csv(test_file,header = None, index = None)'''

In [None]:
# Disabled code kept as a string literal (never executed): it would call the
# endpoint through the 'sagemaker-runtime' client with a text/csv body.
# The endpoint name inside it is hard-coded.
'''import boto3
client = boto3.client('sagemaker-runtime')
response = client.invoke_endpoint(
    EndpointName= "titanic-scikit-2022-11-04-11-50-57-391",
    Body= test_file.getvalue(),
    ContentType = 'text/csv')
import json
result = json.loads(response['Body'].read().decode())
print(result)'''

In [None]:
# Clean-up step, left commented out: uncomment to delete the endpoint together
# with its endpoint config once the predictions above are no longer needed.
# predictor.delete_endpoint(delete_endpoint_config=True)

## Criação do script de validação

- https://docs.amazonaws.cn/en_us/sagemaker/latest/dg/use-scikit-learn-processing-container.html