# Criando a pipeline de treino
- https://sagemaker.readthedocs.io/en/stable/workflows/pipelines/index.html

In [None]:
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession

In [None]:
# Session object reused below for the default bucket lookup, the AutoML job,
# the best-model lookup and the Pipeline definition.
session = PipelineSession()

## Definindo parâmetros da pipe

In [None]:
# Execution context resolved at runtime: IAM role and the session's default bucket.
role = sagemaker.get_execution_role()
bucket = session.default_bucket()

# Naming and S3 layout shared by the steps below.
prefix = "titanic"
base_name = "titanic-pipeline"
model_package_group_name = "titanic"
output_path = f"s3://{bucket}/{prefix}/output"

# Preprocessing framework version and the label column.
sklearn_version = "0.23-1"
target_feature = "survived"

# AutoML limits: number of candidates and per-training-job runtime (seconds).
max_trainings = 2
max_training_time = 180

## Consultando o Athena

In [None]:
from sagemaker.dataset_definition.inputs import AthenaDatasetDefinition
from sagemaker.dataset_definition.inputs import DatasetDefinition

In [None]:
# Athena query definition: selects every row of analytics_titanic and writes
# the result in Parquet format under the project prefix in S3.
athena_dataset = AthenaDatasetDefinition(
    catalog="AwsDataCatalog",
    database="demodb",
    work_group="primary",
    query_string="SELECT * FROM analytics_titanic",
    output_format="PARQUET",
    output_s3_uri=f"s3://{bucket}/{prefix}/athena",
)

# Dataset wrapper handed to the processing input; local_path is where the
# query result is expected to be found by the processing job.
dataset = DatasetDefinition(
    local_path="/opt/ml/processing/input/dataset.parquet",
    athena_dataset_definition=athena_dataset,
)

## Pré-processamento

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

In [None]:
# Scikit-learn processor used by the preprocessing step.
# It is bound explicitly to the shared PipelineSession so that it uses the
# same session as the AutoML job, the best-model lookup and the Pipeline
# object, instead of silently creating its own default Session.
sklearn_processor = SKLearnProcessor(
    framework_version=sklearn_version,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    base_job_name=base_name,
    role=role,
    sagemaker_session=session,
)

In [None]:
# Input: the Athena-backed dataset definition.
process_inputs = [
    ProcessingInput(
        input_name="dataset",
        destination="/opt/ml/processing/input",
        dataset_definition=dataset,
    ),
]

# Outputs: one per split, each read from /opt/ml/processing/<split>.
process_outputs = [
    ProcessingOutput(output_name=split, source=f"/opt/ml/processing/{split}")
    for split in ("train", "validation")
]

# Preprocessing step: runs prepare.py with the processor defined above.
# NOTE(review): passing processor/inputs/outputs/code directly looks like the
# older ProcessingStep interface; newer SDK releases appear to favour
# step_args=<processor>.run(...) — confirm against the installed SDK version.
step_process = ProcessingStep(
    name="TitanicProcessDataset",
    processor=sklearn_processor,
    inputs=process_inputs,
    outputs=process_outputs,
    code="prepare.py",
)

## Treinamento com AutoML

In [None]:
from sagemaker import AutoML, AutoMLInput, Model
from sagemaker.workflow.automl_step import AutoMLStep

In [None]:
# Both AutoML channels read from the S3 URIs of the processing step outputs.
processed_outputs = step_process.properties.ProcessingOutputConfig.Outputs

# Training channel, fed by the "train" output.
input_data_training = AutoMLInput(
    channel_type="training",
    target_attribute_name=target_feature,
    inputs=processed_outputs["train"].S3Output.S3Uri,
)

# Validation channel, fed by the "validation" output.
input_data_validation = AutoMLInput(
    channel_type="validation",
    target_attribute_name=target_feature,
    inputs=processed_outputs["validation"].S3Output.S3Uri,
)

In [None]:
# AutoML job configuration in ENSEMBLING mode, bound to the pipeline session.
automl = AutoML(
    # Identity and session.
    role=role,
    sagemaker_session=session,
    base_job_name=base_name,
    # What to predict and where artifacts are written.
    target_attribute_name=target_feature,
    output_path=output_path,
    # Search mode and budget.
    mode="ENSEMBLING",
    max_candidates=max_trainings,
    max_runtime_per_training_job_in_seconds=max_training_time,
)

In [None]:
# Training and validation channels, in that order; the value returned by
# fit() is what the AutoML step receives as its step_args.
automl_channels = [input_data_training, input_data_validation]
train_args = automl.fit(inputs=automl_channels)

In [None]:
# Pipeline step wrapping the AutoML arguments produced by automl.fit().
step_auto_ml_training = AutoMLStep(name="AutoMLTrainingStep", step_args=train_args)

In [None]:
# Model object for the best candidate of the AutoML step; it is used below
# both to create the model and to register it.
best_auto_ml_model = step_auto_ml_training.get_best_auto_ml_model(role, sagemaker_session=session)

## Registro

In [None]:
from sagemaker.workflow.model_step import ModelStep
from sagemaker import ModelMetrics, MetricsSource

In [None]:
# Arguments for creating the best AutoML model on an ml.m5.xlarge instance.
step_args_create_model = best_auto_ml_model.create(instance_type="ml.m5.xlarge")

# Pipeline step that performs the model creation.
step_create_model = ModelStep(name="ModelCreationStep", step_args=step_args_create_model)

In [None]:
# Report locations exposed by the best candidate of the AutoML step.
best_candidate = step_auto_ml_training.properties.BestCandidateProperties

# Model insights report (JSON).
insights_source = MetricsSource(
    content_type="application/json",
    s3_uri=best_candidate.ModelInsightsJsonReportPath,
)

# Explainability report (JSON).
explainability_source = MetricsSource(
    content_type="application/json",
    s3_uri=best_candidate.ExplainabilityJsonReportPath,
)

# Metrics attached to the model package at registration time.
model_metrics = ModelMetrics(
    model_statistics=insights_source,
    explainability=explainability_source,
)

In [None]:
# Registration arguments: CSV in, CSV out, stored in the configured model
# package group together with the AutoML reports.
step_args_register_model = best_auto_ml_model.register(
    model_package_group_name=model_package_group_name,
    model_metrics=model_metrics,
    content_types=["text/csv"],
    response_types=["text/csv"],
)

In [None]:
# Pipeline step that performs the model registration.
step_register_model = ModelStep(name="ModelRegistrationStep", step_args=step_args_register_model)

## Definição da pipe

In [None]:
from sagemaker.workflow.pipeline import Pipeline

In [None]:
# Steps in the order they are listed in the pipeline definition:
# preprocessing, AutoML training, model creation, model registration.
pipeline_steps = [
    step_process,
    step_auto_ml_training,
    step_create_model,
    step_register_model,
]

# Pipeline definition bound to the shared pipeline session.
pipeline = Pipeline(
    name="TitanicTrainingPipeline",
    sagemaker_session=session,
    steps=pipeline_steps,
)

In [None]:
# Create or update (upsert) the pipeline definition, using the execution role.
pipeline.upsert(role_arn=role)

## Executando a pipe

In [None]:
# Left disabled on purpose: uncomment to start an execution of the pipeline
# and then inspect its description and step list.
# pipeline_execution = pipeline.start()
# pipeline_execution.describe()
# pipeline_execution.list_steps()