In [21]:
# Import libraries
from azureml.core import (
    Workspace
    , Experiment
    , Environment
    , Dataset
    , RunConfiguration
)
from azureml.widgets import RunDetails

from azureml.core.compute import ComputeTarget
from azureml.core.runconfig import (
    RunConfiguration
    , DEFAULT_CPU_IMAGE
    , DockerConfiguration
)
from azureml.core.conda_dependencies import CondaDependencies
from azureml.data.data_reference import DataReference

from azureml.pipeline.core import (
    Pipeline
    , PipelineRun
    , PipelineParameter
    , PipelineData
    , TrainingOutput
)
from azureml.pipeline.steps import (
    PythonScriptStep
    , HyperDriveStep
)

# from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule

print("AML SDK version:", azureml.core.VERSION)

# Load the workspace from a configuration file
ws = Workspace.from_config()

# Get a reference to our experiment
exp = Experiment(ws, 'sklearn_pipeline_exp')

# Build environment
env = Environment.from_conda_specification(name='test_env', file_path="conda_dependencies.yml")

# Set the compute target  and compute cluster
cluster = ComputeTarget(workspace=ws, name='cpu-cluster')
cluster.wait_for_completion(show_output=True)
compute = ComputeTarget(workspace=ws, name='crcastillo841')
compute.wait_for_completion(show_output=True)

# Establish default datastore
default_store = ws.get_default_datastore()

AML SDK version: 1.42.0
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned

Running


In [22]:
# Define the run configuration
run_config = RunConfiguration()

run_config.docker = DockerConfiguration(use_docker=True)
run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
# use conda_dependencies.yml to create a conda environment in the Docker image for execution
run_config.environment.python.user_managed_dependencies = False

# Specify conda dependencies through .yml
run_config.environment.python.conda_dependencies = CondaDependencies(conda_dependencies_file_path="conda_dependencies.yml")

In [27]:
# Define dataset
ds_input = Dataset.get_by_name(
        workspace=ws
        , name="1994_Adult_Data"
    ).as_named_input('ds_input')

# Define outputs
train_data = PipelineData('train_data', datastore=default_store).as_dataset()
test_data = PipelineData('test_data', datastore=default_store).as_dataset()

# Define variables
random_seed = 123
test_proportion = 0.2
target = 'income'

# Create prep_step
prep_step = PythonScriptStep(
    source_directory='./prep'
    , script_name='prep.py'
    , arguments=[
        '--random_seed', random_seed
        , '--test_proportion', test_proportion
        , '--target', target
        , '--train_data', train_data
        , '--test_data', test_data
        ]
    , inputs=[ds_input]
    , outputs=[
        train_data
        , test_data
        ]
    , runconfig=run_config
    , compute_target=compute
    , allow_reuse=True
    )

# Print
print('prep step is created')

prep step is created


In [None]:
# Define outputs
metrics_data = PipelineData(
    name='metrics_data'
    , datastore=default_store
    , pipeline_output_name='metrics_output'
    , training_output=TrainingOutput(type='Metrics')
)
model_data = PipelineData(
    name='model_data'
    , datastore=default_store
    , pipeline_output_name='metrics_output'
    , training_output=TrainingOutput(
        type='Model'
        , model_file="outputs/model.pkl"
    )
)

# Define search space and parameter search
param_sampling = BayesianParameterSampling({
    "l1_ratio": uniform(
        min_value=0
        , max_value=1
        ),
    "C": choice(
        [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2]
        )
})

# Create hyperdrive config
hd_config = HyperDriveConfig(
    run_config=run_config
    , hyperparameter_sampling=param_sampling
    , policy=None
    , primary_metric_name='mean_cv_score'
    , primary_metric_goal=PrimaryMetricGoal.MAXIMIZE
    , max_total_runs=40
    , max_concurrent_runs=4
    )

# Create hyperdrive_step
hypedrive_step = HyperDriveStep(
    name='hyperdrive_step'
    , hyperdrive_config=
    , source_directory='./train'
    , script_name='train.py'
    , arguments=[
        '--random_seed', random_seed
        , '--train_data', train_data
        ]
    , inputs=[train_data]
    , outputs=[model]
    , runconfig=run_config
    , compute_target=compute
    , allow_reuse=True
    )

# Print
print('hyperdrive step is created')

In [28]:
pipeline = Pipeline(
    workspace=ws
    , steps=[
        prep_step
        , train_step
        # , score_step
        ]
)
pipeline.validate()
pipeline.submit('sklearn_poc_pipeline')

Step prep.py is ready to be created [b14bed48]
Created step prep.py [b14bed48][66d6224a-dda3-41b1-8eeb-41a6a5b7b901], (This step will run and generate new outputs)
Submitted PipelineRun 452481a2-bbb6-4e16-9500-7b3aa8a371b9
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/452481a2-bbb6-4e16-9500-7b3aa8a371b9?wsid=/subscriptions/8841158c-7729-48e1-a4e3-b8125457e298/resourcegroups/azure_ml/workspaces/azure_ml_poc&tid=1dfa1b70-a4aa-42af-91e8-993521be798f


Experiment,Id,Type,Status,Details Page,Docs Page
sklearn_poc_pipeline,452481a2-bbb6-4e16-9500-7b3aa8a371b9,azureml.PipelineRun,Preparing,Link to Azure Machine Learning studio,Link to Documentation
