In [1]:
# Import libraries
import azureml.core
from azureml.core import (
    Workspace
    , Experiment
    , Environment
    , Dataset
    , RunConfiguration
    , ScriptRunConfig
)
from azureml.widgets import RunDetails

from azureml.core.compute import ComputeTarget
from azureml.core.runconfig import (
    RunConfiguration
    , DEFAULT_CPU_IMAGE
    , DockerConfiguration
)
from azureml.core.conda_dependencies import CondaDependencies

from azureml.pipeline.core import (
    Pipeline
    , PipelineRun
    , PipelineParameter
    , PipelineData
    , TrainingOutput
)
from azureml.pipeline.steps import (
    PythonScriptStep
    , HyperDriveStep
)
from azureml.train.hyperdrive import (
    BayesianParameterSampling
    , HyperDriveConfig
    , PrimaryMetricGoal
    , choice
    , uniform
    , quniform
)

# from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule

print("AML SDK version:", azureml.core.VERSION)

# Connect to the workspace described by the local configuration file
ws = Workspace.from_config()

# Handle to the experiment used by this notebook
exp = Experiment(workspace=ws, name='sklearn_pipeline_exp')

# Environment built from the conda specification file
env = Environment.from_conda_specification(
    name='test_env',
    file_path="conda_dependencies.yml",
)

# Look up both compute targets by name and wait on each one in turn:
# `cluster` is handed to the HyperDrive script config, `compute` to the script steps
cluster = ComputeTarget(workspace=ws, name='cpu-cluster')
cluster.wait_for_completion(show_output=True)
compute = ComputeTarget(workspace=ws, name='crcastillo841')
compute.wait_for_completion(show_output=True)

# Default datastore of the workspace; the pipeline's intermediate data is written to it
default_store = ws.get_default_datastore()

AML SDK version: 1.42.0
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned

Running


In [2]:
# Run configuration shared by the PythonScriptStep-based pipeline steps
run_config = RunConfiguration()

# Execute inside Docker, starting from the SDK's default CPU base image
run_config.docker = DockerConfiguration(use_docker=True)
run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE

# Dependencies are not user-managed: the conda environment for execution is
# created in the Docker image from conda_dependencies.yml
run_config.environment.python.user_managed_dependencies = False
run_config.environment.python.conda_dependencies = CondaDependencies(
    conda_dependencies_file_path="conda_dependencies.yml"
)

In [3]:
# Registered dataset consumed by the prep step, exposed under the name 'ds_input'
ds_input = Dataset.get_by_name(workspace=ws, name="1994_Adult_Data").as_named_input('ds_input')

# Intermediate outputs of the prep step, promoted to datasets on the default datastore
train_data = PipelineData(name='train_data', datastore=default_store).as_dataset()
test_data = PipelineData(name='test_data', datastore=default_store).as_dataset()

# Settings forwarded to the step scripts as command-line arguments
random_seed = 123
test_proportion = 0.2
target = 'income'

# Data-preparation step: runs prep/prep.py on `compute`, reading ds_input and
# producing train_data / test_data
prep_step = PythonScriptStep(
    script_name='prep.py',
    source_directory='./prep',
    arguments=[
        '--random_seed', random_seed,
        '--test_proportion', test_proportion,
        '--target', target,
        '--train_data', train_data,
        '--test_data', test_data,
    ],
    inputs=[ds_input],
    outputs=[train_data, test_data],
    compute_target=compute,
    runconfig=run_config,
    allow_reuse=True,
)

# Confirmation message
print('prep step is created')

prep step is created


In [4]:
# Outputs of the HyperDrive step, both stored on the default datastore.
# metrics_data is declared as a 'Metrics' training output, published as 'metrics_output'
metrics_data = PipelineData(
    name='metrics_data',
    datastore=default_store,
    pipeline_output_name='metrics_output',
    training_output=TrainingOutput(type='Metrics'),
)

# model_data is declared as a 'Model' training output pointing at
# outputs/model.pkl, published as 'model_output'
model_data = PipelineData(
    name='model_data',
    datastore=default_store,
    pipeline_output_name='model_output',
    training_output=TrainingOutput(type='Model', model_file="outputs/model.pkl"),
)

# Script configuration wrapped by the HyperDrive config: train/train.py on `cluster`
# using the `env` environment.
# NOTE(review): the '--train_data' argument is currently disabled; train_data is
# attached through the HyperDriveStep inputs instead - confirm how train.py finds it.
hd_src = ScriptRunConfig(
    source_directory='./train',
    script='train.py',
    arguments=['--random_seed', random_seed],
    compute_target=cluster,
    environment=env,
)

# Search space for Bayesian sampling: quniform for stepped ranges, uniform for continuous ones
param_sampling = BayesianParameterSampling({
    "n_estimators": quniform(min_value=100, max_value=1000, q=50),
    "learning_rate": uniform(min_value=0.01, max_value=0.8),
    "max_depth": quniform(min_value=3, max_value=20, q=1),
    "colsample_bytree": uniform(min_value=0.5, max_value=1.0),
    "gamma": uniform(min_value=0, max_value=5),
    "reg_lambda": uniform(min_value=0, max_value=10),
    "reg_alpha": uniform(min_value=0, max_value=1),
    # TODO: subsample is drawn from a continuous range for now; the original
    # note says quniform with a float q still needs to be researched
    "subsample": uniform(min_value=0.6, max_value=0.9),
    "max_bin": quniform(min_value=25, max_value=500, q=25),
})

# HyperDrive settings: maximise 'mean_cv_score' over the sampled space,
# no termination policy, at most 50 runs in total and 4 at a time
hd_config = HyperDriveConfig(
    run_config=hd_src,
    hyperparameter_sampling=param_sampling,
    policy=None,
    primary_metric_name='mean_cv_score',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=50,
    max_concurrent_runs=4,
)

# Pipeline step wrapping the sweep: takes train_data as input and exposes
# the metrics and model outputs
hyperdrive_step = HyperDriveStep(
    name='hyperdrive_step',
    hyperdrive_config=hd_config,
    inputs=[train_data],
    outputs=[metrics_data, model_data],
    allow_reuse=True,
)

# Confirmation message
print('hyperdrive step is created')

hyperdrive step is created


For best results with Bayesian Sampling we recommend using a maximum number of runs greater than or equal to 20 times the number of hyperparameters being tuned. Recommendend value:180.


In [5]:
# Registration step: runs train/register_model.py on `compute`, receiving
# model_data both as a step input and as the '--model_data' argument
register_model_step = PythonScriptStep(
    name="register_model_step",
    source_directory='./train',
    script_name='register_model.py',
    arguments=["--model_data", model_data],
    inputs=[model_data],
    runconfig=run_config,
    compute_target=compute,
    allow_reuse=True,
)

# NOTE(review): an explicit run_after() call was left disabled here, and it
# referenced `hd_step`, a name not defined in the visible cells. Ordering
# presumably follows from consuming model_data as an input - confirm.

# Confirmation message
print('register model step is created')

register model step is created


In [6]:
# Assemble the prep, hyperparameter-sweep and registration steps into one pipeline
pipeline = Pipeline(
    workspace=ws,
    steps=[prep_step, hyperdrive_step, register_model_step],
)

# validate() reports problems by returning a list rather than raising, so the
# result has to be inspected; stop here instead of submitting a pipeline that
# is already known to be invalid
validation_errors = pipeline.validate()
if validation_errors:
    raise RuntimeError(f"Pipeline validation failed: {validation_errors}")

# Submit under the 'sklearn_poc_pipeline' experiment name and block until the run finishes
pipeline_run = pipeline.submit('sklearn_poc_pipeline')
pipeline_run.wait_for_completion()

Step hyperdrive_step is ready to be created [aa45f64f]
Step register_model_step is ready to be created [64630266]
Created step prep.py [f0476b48][50ed25aa-7b8d-4cc3-9e26-976b2bd27938], (This step is eligible to reuse a previous run's output)Created step hyperdrive_step [aa45f64f][cedd61ac-069a-4c36-a251-13141339c402], (This step will run and generate new outputs)

Created step register_model_step [64630266][98514750-d1fa-4aa3-8fd7-7b1e1e5e783a], (This step will run and generate new outputs)
Submitted PipelineRun 0d478d65-2d8d-4498-baea-ee26d21c66d7
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/0d478d65-2d8d-4498-baea-ee26d21c66d7?wsid=/subscriptions/8841158c-7729-48e1-a4e3-b8125457e298/resourcegroups/azure_ml/workspaces/azure_ml_poc&tid=1dfa1b70-a4aa-42af-91e8-993521be798f
PipelineRunId: 0d478d65-2d8d-4498-baea-ee26d21c66d7
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/0d478d65-2d8d-4498-baea-ee26d21c66d7?wsid=/subscriptions/8841158c-7729-48e1-a4e3