## Consumindo dados via Athena

In [None]:
!pip install awswrangler

In [None]:
import awswrangler as wr
import pandas as pd

In [None]:
# Read every column of the analytics_titanic table (database "demodb")
# through Athena, then preview the first rows of the result.
titanic_query = "SELECT * FROM analytics_titanic"
df = wr.athena.read_sql_query(titanic_query, database="demodb")
df.head()

## EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Summary statistics of the loaded data, as a first sanity check.
df.describe()

In [None]:
# Count plot of the "survived" column, with the bars split by "sex".
sns.catplot(
    x="survived",
    hue="sex",
    kind="count",
    data=df,
)

## Preparando o dataset

In [None]:
import sagemaker
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split

In [None]:
# SageMaker session, execution role and S3 location shared by the cells below.
session = sagemaker.Session()
role = get_execution_role()
bucket = session.default_bucket()
# Key prefix placed in front of every S3 path this notebook builds.
prefix = "titanic"

In [None]:
def transform_sex(value):
    """Encode a sex label as an integer flag.

    Returns 1 when ``value`` equals the string "female" and 0 for any
    other input, such as "male", a differently-cased label or None.
    """
    return 1 if value == "female" else 0

In [None]:
# Replace the textual "sex" column with the 0/1 encoding from transform_sex.
# NOTE: this cell is not idempotent. Running it a second time feeds the
# already-encoded integers back through transform_sex, which maps every
# value other than "female" to 0, so the whole column becomes 0.
df["sex"] = df["sex"].map(transform_sex)
# Show how many rows fall into each encoded class.
df["sex"].value_counts()

In [None]:
# First split: hold out 20% of the rows; the remaining 80% is the training set.
train, test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=22,
)
# Second split: the held-out rows are divided again with the same seed,
# 80% kept as the test set and 20% used as the validation set.
test, validation = train_test_split(
    test,
    test_size=0.2,
    shuffle=True,
    random_state=22,
)

In [None]:
# Upload to S3
def _export_and_upload(frame, local_name, s3_folder):
    """Write ``frame`` to a local CSV (header, no index) and upload it.

    The file is uploaded through the notebook's SageMaker session under
    the key prefix ``<prefix>/<s3_folder>``; the resulting S3 path is returned.
    """
    frame.to_csv(local_name, index=False, header=True)
    return session.upload_data(path=local_name, key_prefix=f"{prefix}/{s3_folder}")


train_file = "titanic_train.csv"
train_data_s3_path = _export_and_upload(train, train_file, "train")
print(f"Train data uploaded to: {train_data_s3_path}")

validation_file = "titanic_validation.csv"
validation_data_s3_path = _export_and_upload(validation, validation_file, "validation")
print(f"Validation data uploaded to: {validation_data_s3_path}")

In [None]:
# Persist the held-out labels separately so predictions can be scored later.
test_target_file = "titanic_test_target.csv"
test_y = test["survived"]
test_y.to_csv(test_target_file, index=False, header=True)
test_target_s3_path = session.upload_data(path=test_target_file, key_prefix=prefix + "/test_target")
print("Test target uploaded to: " + test_target_s3_path)

# Build the feature-only frame as a new object instead of dropping the column
# in place: `test` stays intact, so this cell can be re-run without a KeyError
# on "survived", and pandas has no reason to warn about modifying a slice.
test_file = "titanic_test.csv"
test_features = test.drop(columns="survived")
# NOTE(review): the file keeps its header row; confirm that the batch
# transform job consuming it is expected to receive a header line.
test_features.to_csv(test_file, index=False, header=True)
test_data_s3_path = session.upload_data(path=test_file, key_prefix=prefix + "/test")
print("Test data uploaded to: " + test_data_s3_path)

## Experimento com AutoML
- Referência da API AutoML (SageMaker Python SDK): https://sagemaker.readthedocs.io/en/stable/api/training/automl.html

In [None]:
from time import gmtime, strftime, sleep
from sagemaker import AutoML, AutoMLInput, Model

In [None]:
# Settings for the AutoML experiment.
target_feature = 'survived'
max_trainings = 2        # later passed to AutoML as max_candidates
max_training_time = 180  # later passed as max_runtime_per_training_job_in_seconds
output_path = f"s3://{bucket}/{prefix}/output"

# The job name carries a day-hour-minute-second suffix taken from the
# current UTC time (gmtime).
timestamp_suffix = strftime("%d-%H-%M-%S", gmtime())
auto_ml_job_name = f"automl-titanic-{timestamp_suffix}"

print(f"AutoMLJobName: {auto_ml_job_name}")

In [None]:
# Input channels for the AutoML job: one pointing at the uploaded training
# CSV prefix, one at the validation CSV prefix. Both declare the same target.
input_data_training = AutoMLInput(
    inputs=f"s3://{bucket}/{prefix}/train",
    channel_type='training',
    target_attribute_name=target_feature,
)

input_data_validation = AutoMLInput(
    inputs=f"s3://{bucket}/{prefix}/validation",
    channel_type='validation',
    target_attribute_name=target_feature,
)

In [None]:
# Configure the AutoML job: who runs it, what it predicts, how much it may
# train, and where its artifacts are written.
automl = AutoML(
    role=role,
    sagemaker_session=session,
    base_job_name=auto_ml_job_name,
    target_attribute_name=target_feature,
    max_candidates=max_trainings,
    max_runtime_per_training_job_in_seconds=max_training_time,
    output_path=output_path,
)

In [None]:
# Start the AutoML job without blocking the notebook and without streaming
# logs; progress is checked through describe_auto_ml_job in the next cells.
automl.fit(
    inputs=[input_data_training, input_data_validation],
    job_name=auto_ml_job_name,
    wait=False,
    logs=False,
)

In [None]:
# One-off status check; job_run_status seeds the polling loop that follows.
describe_response = automl.describe_auto_ml_job()
job_run_status = describe_response["AutoMLJobStatus"]
print(f"{job_run_status} - {describe_response['AutoMLJobSecondaryStatus']}")

In [None]:
# Poll the AutoML job until it reaches a terminal status.
# The wait happens *before* each status request: job_run_status was fetched
# just before this loop, and the loop now exits as soon as a terminal status
# is seen instead of sleeping another 30 seconds after the job has finished.
while job_run_status not in ("Failed", "Completed", "Stopped"):
    sleep(30)
    describe_response = automl.describe_auto_ml_job()
    job_run_status = describe_response["AutoMLJobStatus"]

    print(job_run_status + " - " + describe_response["AutoMLJobSecondaryStatus"])

In [None]:
# In case the notebook session ended: re-attach to an AutoML job that was
# started earlier.
# NOTE: this overwrites auto_ml_job_name with a hard-coded job name; set it
# to the job you actually want to resume before running this cell.
auto_ml_job_name = 'automl-titanic-11-10-03-07'
automl = AutoML.attach(auto_ml_job_name=auto_ml_job_name, sagemaker_session=session)

In [None]:
# Report the winning candidate and the objective metric it achieved.
best_candidate = automl.describe_auto_ml_job()["BestCandidate"]
best_candidate_name = best_candidate["CandidateName"]
objective_metric = best_candidate["FinalAutoMLJobObjectiveMetric"]

print(f"CandidateName: {best_candidate_name}")
print(f"FinalAutoMLJobObjectiveMetricName: {objective_metric['MetricName']}")
print(f"FinalAutoMLJobObjectiveMetricValue: {objective_metric['Value']}")

In [None]:
# Build a model object from the AutoML job, reusing the job name as the
# model name.
model = automl.create_model(
    name=auto_ml_job_name
)

# NOTE(review): 'CPU' is not a SageMaker instance type name (the transformer
# cell uses 'ml.m5.large'); confirm what value create() expects here.
model.create(
    instance_type='CPU'
)

## Teste de inferência em batch com o melhor modelo

In [None]:
from sagemaker.transformer import Transformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
# S3 location that receives the batch transform output.
inferences_path = f"s3://{bucket}/{prefix}/inferences"

In [None]:
# Batch transformer for the model: a single ml.m5.large instance writing its
# results to inferences_path.
transformer = model.transformer(
    instance_type='ml.m5.large',
    instance_count=1,
    output_path=inferences_path,
)

In [None]:
# Run batch inference over the uploaded test CSV, sending it to the model as
# text/csv split line by line, then wait for the transform job.
transformer.transform(test_data_s3_path, content_type="text/csv", split_type="Line")
transformer.wait()

In [None]:
# Inspect the Python type of the held-out labels before comparing them with
# the predictions.
type(test_y)

In [None]:
# Load the transform output from S3 and squeeze it down from a DataFrame.
# NOTE(review): read_csv runs with its default header handling, and the
# transform input CSV was written with a header row; use the shapes displayed
# below to verify that predictions and labels line up one-to-one.
pred_y = wr.s3.read_csv(transformer.output_path).squeeze()
pred_y.shape, test_y.shape

In [None]:
# Accuracy of the batch predictions against the held-out labels.
accuracy_score(
    y_true=test_y.tolist(),
    y_pred=pred_y.tolist(),
)

In [None]:
# Confusion matrix of held-out labels (y_true) versus predictions (y_pred).
cf_matrix = confusion_matrix(
    y_true=test_y.tolist(),
    y_pred=pred_y.tolist(),
)

In [None]:
# Plot the confusion matrix with each cell annotated by its count.
# fmt="d" prints the counts as plain integers; the default annotation format
# (".2g") would render any count of 100 or more in scientific notation.
sns.heatmap(cf_matrix, annot=True, fmt="d")

## Registro do modelo

In [None]:
import boto3

In [None]:
# Low-level SageMaker client, used below for the model package group calls.
sm_client = boto3.client("sagemaker")

# Name of the model package group the model is registered under.
model_package_group_name = "titanic"

In [None]:
# Create the model package group and report its ARN.
group_response = sm_client.create_model_package_group(
    ModelPackageGroupDescription="Titanic model package group",
    ModelPackageGroupName=model_package_group_name,
)
# Despite its name, this variable holds the ARN of the model package *group*.
model_package_arn = group_response["ModelPackageGroupArn"]
print(f"ModelPackageGroup Arn : {model_package_arn}")

In [None]:
# Look the best candidate up once and derive both report locations from it.
best_info = automl.best_candidate()
candidate_name = best_info['CandidateName']
insights_root = best_info['CandidateProperties']['CandidateArtifactLocations']['ModelInsights']

candidate_insights = f"{insights_root}/{candidate_name}/statistics.json"
# NOTE(review): the explainability report path is also built from the
# 'ModelInsights' location; confirm it should not come from a separate
# explainability artifact location instead.
candidate_explainability = f"{insights_root}/{candidate_name}/analysis.json"

In [None]:
# ModelMetrics and MetricsSource are not imported anywhere else in this
# notebook, so they are imported here; without this the cell fails with a
# NameError.
from sagemaker.model_metrics import MetricsSource, ModelMetrics

# Attach the candidate's statistics and explainability reports (JSON files
# in S3) to the model as its registered metrics.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=candidate_insights,
        content_type="application/json",
    ),
    explainability=MetricsSource(
        s3_uri=candidate_explainability,
        content_type="application/json",
    ),
)

In [None]:
# Register the model in the package group as a CSV-in / CSV-out model,
# together with the metrics reports assembled above.
model.register(
    content_types=['text/csv'],
    response_types=['text/csv'],
    model_metrics=model_metrics,
    model_package_group_name=model_package_group_name,
)

In [None]:
# List the packages registered in the group, using the same group-name
# variable as the create/register cells rather than repeating the literal.
sm_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name
)