Automatización del Pipeline

Definición de parámetros del pipeline

In [None]:
# Pipeline parameters and their default values.
dataset_path = ParameterString(
    name='DatasetPath',
    default_value=f's3://{bucket}/{datasets_prefix}',
)
# Use "Approved" as the default when no manual approval is required.
model_approval_status = ParameterString(
    name='ModelApprovalStatus',
    default_value='PendingManualApproval',
)
minimum_precision = ParameterFloat(
    name='MinimumPrecision',
    default_value=0.85,
)

# Order matches the declarations above; this list is handed to the Pipeline later.
parameters_list = [dataset_path, model_approval_status, minimum_precision]


Agregar un paso al pipeline que ejecute un Processing Job para la preparación del dataset.

In [None]:
# Keyword arguments for the data-preparation ProcessingStep.
# Entries coming from data_prep_parameters override the name/processor set here.
data_prep_step_parameters = {
    'name': 'Preparacion-de-Datos',
    'processor': processor,
    **data_prep_parameters,
}

# The step receives the script arguments under 'job_arguments', so rename the key.
data_prep_step_parameters['job_arguments'] = data_prep_step_parameters.pop('arguments')

# Two inputs: the dataset (its location is the DatasetPath pipeline parameter)
# and the preparation script itself.
data_prep_step_parameters['inputs'] = [
    ProcessingInput(
        input_name='input',
        source=dataset_path,
        destination='/opt/ml/processing/input',
    ),
    ProcessingInput(
        input_name='code',
        source=data_prep_script_path,
        destination='/opt/ml/processing/input/code',
    ),
]

data_prep_step = ProcessingStep(**data_prep_step_parameters)

# The list of pipeline steps starts here; later cells append to it.
pipeline_steps = [data_prep_step]


Agregar paso al pipeline para entrenamiento de los modelos utilizando Training Jobs

In [None]:
# One TrainingStep per algorithm, keyed by algorithm name. Each step trains with the
# estimator returned by that algorithm's tuner and reads its two channels from the
# outputs of the data-preparation step.
training_steps = {
    algorithm: TrainingStep(
        name=f'Entrenamiento-con-{algorithm}',
        estimator=tuners[algorithm].best_estimator(),
        inputs={
            'train_data': TrainingInput(
                data_prep_step.properties.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri
            ),
            'train_target': TrainingInput(
                data_prep_step.properties.ProcessingOutputConfig.Outputs['train_target'].S3Output.S3Uri
            ),
        },
    )
    for algorithm in estimators
}

# Register the training steps in the pipeline, in the iteration order of `estimators`.
pipeline_steps.extend(training_steps.values())


Agregar paso al pipeline para evaluación de desempeño de los modelos, utilizando un Processing Job

In [None]:
# One PropertyFile per algorithm. Each points at '<algorithm>_metrics.json' inside the
# processing output named "eval", so later steps can read that algorithm's metrics.
property_files = {
    algorithm: PropertyFile(
        name=f'{algorithm}Metrics',
        output_name="eval",
        path=f'{algorithm}_metrics.json')
    for algorithm in estimators
}

# Keyword arguments for the evaluation ProcessingStep.
# Entries coming from eval_parameters override the ones set here.
eval_step_parameters = {
    'name': 'Evaluacion-de-modelos',
    'processor': evaluation_processor,
    'property_files': list(property_files.values())}

eval_step_parameters.update(eval_parameters)
# The step receives the script arguments under 'job_arguments', so rename the key.
eval_step_parameters['job_arguments'] = eval_step_parameters.pop('arguments')

eval_step_parameters['inputs'] = [
    # Evaluation script.
    ProcessingInput(
        input_name='code',
        source=evaluate_models_script_path,
        destination='/opt/ml/processing/input/code'),
    # Held-out target and features produced by the data-preparation step.
    ProcessingInput(
        source=data_prep_step.properties.ProcessingOutputConfig.Outputs['test_target'].S3Output.S3Uri,
        destination='/opt/ml/processing/input/target'),
    ProcessingInput(
        source=data_prep_step.properties.ProcessingOutputConfig.Outputs['test_data'].S3Output.S3Uri,
        destination='/opt/ml/processing/input/data'),
    # Model artifacts of every trained algorithm, each under a directory named after
    # the algorithm. Derived from `estimators` (instead of a fixed list of names) so
    # the inputs always match the property files and training steps.
    *(
        ProcessingInput(
            source=training_steps[algorithm].properties.ModelArtifacts.S3ModelArtifacts,
            destination=f'/opt/ml/processing/input/{algorithm}')
        for algorithm in estimators
    )]

eval_step = ProcessingStep(**eval_step_parameters)
pipeline_steps.append(eval_step)


Agregar condición para registrar modelo en el Model Registry.

In [None]:
package_group_name = f'{prefix}-PackageGroup'

# Outputs declared in the evaluation step's request. Read once: the list is the same
# for every algorithm, so there is no need to rebuild the step arguments per iteration.
eval_outputs = eval_step.arguments["ProcessingOutputConfig"]["Outputs"]

for algorithm in estimators:
    property_file = property_files[algorithm]

    # Find the output the property file refers to by name. Falls back to the first
    # output (the previous behaviour) when no output carries that name.
    metrics_output = next(
        (output for output in eval_outputs
         if output.get("OutputName") == property_file.output_name),
        eval_outputs[0])

    # Attach this algorithm's metrics file to the model package.
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/{}_metrics.json".format(
                metrics_output["S3Output"]["S3Uri"],
                algorithm),
            content_type="application/json"))

    # Registers the trained model in the package group; the initial approval status
    # is the ModelApprovalStatus pipeline parameter.
    register_step = RegisterModel(
        name=f"Registra{algorithm}",
        estimator=estimators[algorithm],
        model_data=training_steps[algorithm].properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=package_group_name,
        approval_status=model_approval_status,
        description=f'Churn prediction using {algorithm}',
        model_metrics=model_metrics,
        image_uri=docker_images['Inference']['image_uri'],
        entry_point=training_script_file
    )

    # Condition: precision reported in the property file >= MinimumPrecision parameter.
    # The step name is taken from eval_step itself so it cannot drift from the name
    # the evaluation step was actually created with.
    condition = ConditionGreaterThanOrEqualTo(
        left=JsonGet(
            step_name=eval_step.name,
            property_file=property_file,
            json_path='binary_classification_metrics.precision.value'),
        right=minimum_precision)

    # Register the model only when the condition holds; otherwise do nothing.
    condition_step = ConditionStep(
        name=f"{algorithm}Precision",
        conditions=[condition],
        if_steps=[register_step],
        else_steps=[])

    pipeline_steps.append(condition_step)

print(f'Package Group Name: {package_group_name}')


Esta celda debería dar como resultado: `Package Group Name: churn-clf-PackageGroup`

Ejecución del pipeline

In [None]:
# Assemble the pipeline from the parameters and steps collected in the cells above.
# The name suffix is the current UTC minute and second.
# NOTE(review): "%M-%S" repeats every hour, so two runs can produce the same name —
# confirm this is acceptable.
pipeline = Pipeline(
    name='{}-pipeline-{}'.format(prefix, strftime("%M-%S", gmtime())),
    parameters=parameters_list,
    steps=pipeline_steps,
)


Genera la definición del pipeline para verificar que no exista ningún problema. Si la ejecución de la siguiente celda no arroja ningún error, todo está bien.

In [None]:
# Render the pipeline definition and parse it as JSON; an exception raised here
# points to a problem in the pipeline assembled above.
definition = json.loads(pipeline.definition())


In [None]:
# Create the pipeline, or update it if it already exists, using the given role.
pipeline.upsert(role_arn=sagemaker_role)


In [None]:
# Start an execution of the pipeline; no parameter overrides are passed here.
execution = pipeline.start()


In [None]:
# Display the steps of the execution started above.
execution.list_steps()
