In [1]:
import os
import azureml.core
from azureml.core import Workspace, Experiment, Datastore
from azureml.widgets import RunDetails
from azureml.core import Dataset
 
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.core import PipelineRun, StepRun, PortDataReference
from azureml.pipeline.steps import PythonScriptStep
 
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
 
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
 
from azureml.core.model import Model
from azureml.core.environment import CondaDependencies
from azureml.core import ScriptRunConfig, RunConfiguration, Experiment



# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.34.0


In [2]:
# connect to your workspace
ws = Workspace.from_config()

In [3]:
# Get the blob storage associated with the workspace
def_blob_store = ws.get_default_datastore()

In [4]:
def_blob_store.upload_files(['/mnt/batch/tasks/shared/LS_root/mounts/clusters/schen91/code/Users/schen9/data/cardio_train.csv'],target_path='data',overwrite=True, show_progress=True)

Uploading an estimated of 1 files
Uploading /mnt/batch/tasks/shared/LS_root/mounts/clusters/schen91/code/Users/schen9/data/cardio_train.csv
Uploaded /mnt/batch/tasks/shared/LS_root/mounts/clusters/schen91/code/Users/schen9/data/cardio_train.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_495a93d6ba8a4fb199195eb521ca0992

In [5]:
#create an Azure Machine Learning compute for running your steps

aml_compute_target = "demo-cluster"
try:
    aml_compute = AmlCompute(ws, aml_compute_target)
    print("found existing compute target.")
except ComputeTargetException:
    print("creating new compute target")
    
    provisioning_config = AmlCompute.provisioning_configuration(vm_size = "STANDARD_D2_V2",
                                                                min_nodes = 1, 
                                                                max_nodes = 4)    
    aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
    
print("Azure Machine Learning Compute attached")

found existing compute target.
Azure Machine Learning Compute attached


In [6]:
#Configure the training run's environment

aml_run_config = RunConfiguration()
 
aml_run_config.target = aml_compute
aml_run_config.environment.docker.enabled = True
aml_run_config.environment.docker.base_image = "mcr.microsoft.com/azureml/base:latest"
 
aml_run_config.environment.python.user_managed_dependencies = False
 
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn','numpy'], 
    pip_packages=['joblib','azureml-sdk','fusepy'], 
    pin_sdk_version=False)

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


In [7]:
# Import and register new dataset
df = Dataset.Tabular.from_delimited_files(def_blob_store.path('./data/cardio_train.csv'))


In [8]:
raw_data = df.as_named_input('raw_data')
train_data = PipelineData("train_data", datastore=def_blob_store).as_dataset()
test_data = PipelineData("test_data", datastore=def_blob_store).as_dataset()
model_file = PipelineData("model_file", datastore=def_blob_store)

In [9]:
#Contruct your pipelines steps

#initial pipeline step
from azureml.pipeline.steps import PythonScriptStep

entry_point="./prep.py"
source_directory="./prep"
data_prep_step = PythonScriptStep(name="prep_step",
                         script_name=entry_point, 
                         source_directory=source_directory,
                         arguments=["--train", train_data,"--test", test_data],
                         inputs=[raw_data],
                         outputs=[train_data,test_data],                         
                         compute_target=aml_compute, 
                         runconfig=aml_run_config,
                         allow_reuse=True)

In [10]:
#use a dataset
source_directory="./train"
train_point="./train.py"
data_consuming_step = PythonScriptStep(name="consuming_step",
                         script_name=train_point, 
                         source_directory=source_directory,
                         arguments=["--train", train_data,"--test", test_data,"--model",model_file],
                         inputs=[train_data,test_data],
                         outputs=[model_file],                         
                         compute_target=aml_compute, 
                         runconfig=aml_run_config,
                         allow_reuse=True)

In [11]:
steps = [data_prep_step,data_consuming_step]
pipeline1 = Pipeline(workspace=ws, steps=steps)
pipeline_run1 = Experiment(ws, 'cardio_prediction').submit(pipeline1, regenerate_outputs=False)

Created step prep_step [898eb088][39b5c2e3-9031-45eb-8094-608a378e5c3b], (This step is eligible to reuse a previous run's output)
Created step consuming_step [6e872326][5d7d9730-c5e1-4e04-91e4-c9dba21d3660], (This step will run and generate new outputs)
Submitted PipelineRun 4259da5e-81d6-48f8-b330-063b42f3c74a
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4259da5e-81d6-48f8-b330-063b42f3c74a?wsid=/subscriptions/03a06d66-fdfa-4280-a0d7-97ca9b1ba175/resourcegroups/machinelearningoperations/workspaces/machinelearningopsgroup3&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6
