## Referencing Workspace

In [32]:
import azureml.core
from azureml.core import Workspace, Dataset,Datastore, Experiment

# Connect to the Azure ML workspace; `ws` is the handle every later cell passes to the SDK.
# NOTE(review): from_config() presumably reads a local workspace config file — confirm one is present.
ws = Workspace.from_config()

## Assert compute

In [2]:
# Readiness flags: each entry flips to True once at least one compute target
# of that kind is found in the workspace.
assert_compute = {"compute Instance":False,"compute Cluster":False}

# Enumerate every compute target attached to the workspace.
from azureml.core import ComputeTarget
from azureml.core.compute import AmlCompute, ComputeInstance
ct_list = list(ComputeTarget.list(ws))

# Name/type summary of all compute targets, kept for interactive inspection.
ws_list_computes = [{"name":c.name,"computeType":str(type(c))} for c in ct_list]

# Classify by class with isinstance instead of comparing the repr of the type
# to a hard-coded string, which silently stops matching if the SDK moves a module.
ci_targets = [c.name for c in ct_list if isinstance(c, ComputeInstance)]
cc_targets = [c.name for c in ct_list if isinstance(c, AmlCompute)]

# Report a missing kind instead of raising, so the assessment keeps running.
if ci_targets:
    assert_compute["compute Instance"] = True
else:
    print("Missing Compute Instance")

if cc_targets:
    assert_compute["compute Cluster"] = True
else:
    print("Missing Compute Cluster")

assert_compute

Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{'compute Instance': True, 'compute Cluster': True}

##

## Select Compute Instance

In [3]:
# Display the compute instance names collected by the "Assert compute" cell.
ci_targets

['ciabb11f9d',
 're-compute-inst',
 'ezzatdemnati-v1',
 'ezzatdemnati-v1-ci2',
 'ezzatdemnati-v3']

In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute, ComputeInstance
# Select the compute instance used for the test run.
# `ci_instance` stays None and `compute_instance_selected` becomes False when
# no instance can be resolved; later cells check that flag before using it.
ci_instance = None
compute_instance_selected = True
ci_preferred_index = 1  # second listed instance is preferred when it exists
try:
    # Clamp the preferred index so a workspace with a single compute instance
    # still resolves one; an empty list still raises IndexError and is
    # reported by the handler below.
    ci_name = ci_targets[min(ci_preferred_index, len(ci_targets) - 1)]
    ci_instance = ComputeInstance(workspace=ws, name=ci_name)
    print(ci_instance)
except Exception as e:
    print(e)
    compute_instance_selected = False

{
  "Name": "re-compute-inst",
  "Id": "/subscriptions/e0d7a68e-191f-4f51-83ce-d93995cd5c09/resourceGroups/my_ml_tests/providers/Microsoft.MachineLearningServices/workspaces/myworkspace/computes/re-compute-inst",
  "Workspace": "myworkspace",
  "Location": "eastus2",
  "VmSize": "STANDARD_D3_V2",
  "State": "Stopped",
  "Tags": {}
}


## Select Compute Cluster

In [5]:
# Display the compute cluster names collected by the "Assert compute" cell.
cc_targets

['real-estate-compute-cluster', 're-compute-cl', 'cpu-cluster']

In [6]:
# Select compute cluster
from azureml.core.compute import ComputeTarget, AmlCompute, ComputeInstance
# Select the compute cluster.
# `cc_instance` stays None and `compute_cluster_selected` becomes False when
# no cluster can be resolved.
cc_instance = None
compute_cluster_selected = True
cc_preferred_index = 1  # second listed cluster is preferred when it exists
try:
    # Clamp the preferred index so a workspace with a single cluster still
    # resolves one; an empty list still raises IndexError and is reported
    # by the handler below.
    cc_name = cc_targets[min(cc_preferred_index, len(cc_targets) - 1)]
    cc_instance = AmlCompute(workspace=ws, name=cc_name)
    print(cc_instance)
except Exception as e:
    print(e)
    compute_cluster_selected = False

AmlCompute(workspace=Workspace.create(name='myworkspace', subscription_id='e0d7a68e-191f-4f51-83ce-d93995cd5c09', resource_group='my_ml_tests'), name=re-compute-cl, id=/subscriptions/e0d7a68e-191f-4f51-83ce-d93995cd5c09/resourceGroups/my_ml_tests/providers/Microsoft.MachineLearningServices/workspaces/myworkspace/computes/re-compute-cl, type=AmlCompute, provisioning_state=Succeeded, location=eastus2, tags={})


## Create and register environment

In [7]:
%%writefile env.yml

name: simple_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.8.3
  # scikit-learn and pandas are installed by conda only; they are not repeated
  # in the pip section so each package has a single installer.
- scikit-learn
- pandas
- pip
- pip:
  - azureml-defaults
  - azureml-mlflow
  - numpy
  - joblib
  - matplotlib
  - seaborn

Overwriting env.yml


In [8]:
from azureml.core import Environment
from azureml.core import Workspace

# Resolve the environment used by the pipeline: reuse the one already
# registered in the workspace under `name`, otherwise build it from the
# conda specification file and register it.
name = "experiment_env"
file_path = r"env.yml"
registered_env = None
environment_registered = True
try:
    # Environments currently known to the workspace.
    env_names = Environment.list(workspace=ws)
    current_env = name
    if current_env not in env_names:
        # Not registered yet: create it from the conda file, then register it.
        env = Environment.from_conda_specification(name=name, file_path=file_path)
        registered_env = env.register(workspace=ws)
    else:
        # Already registered: fetch the existing definition.
        registered_env = Environment.get(ws, current_env)
except Exception as e:
    # Record the failure in the readiness flag instead of stopping the notebook.
    print(e)
    environment_registered = False

## Create Dataset

### Download data file

In [9]:
# Download training dataset
# !wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/ml-basics/diabetes.csv

import requests
import os

# Download the diabetes CSV into ./data.
# `dataset_created` ends up False when the folder cannot be created, the
# download fails, or the file cannot be written.
dataset_created = True
data_dir = "./data"
url = "https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/ml-basics/diabetes.csv"
try:
    # Create the local data folder if it is not there yet.
    os.makedirs(data_dir, exist_ok=True)

    # Bound the request so an unreachable host cannot hang the notebook.
    r = requests.get(url, timeout=60)
    # Fail on 4xx/5xx so an HTTP error page is never saved as the CSV.
    r.raise_for_status()

    with open(os.path.join(data_dir, "diabetes.csv"), 'wb') as f:
        f.write(r.content)

except Exception as e:
    print(e)
    dataset_created = False


### Register Dataset

In [10]:
from azureml.core import Dataset
from azureml.data.datapath import DataPath
import os

# Datastore that receives the uploaded file.
default_ds = ws.get_default_datastore()

# Local source folder, datastore target folder and registration metadata.
src_dir = "./data"
tgt_dir = "/data"
file_format="csv"
dataset_name = "diabetes.csv"
data_file_mask = f"{tgt_dir}/diabetes.csv"
dataset_registered = True

# Upload and register only when the workspace has no dataset with this name.
if dataset_name in ws.datasets:
    print('Dataset already registered.')
else:
    try:
        # Push the local folder to the datastore.
        upload_target = DataPath(default_ds, tgt_dir)
        Dataset.File.upload_directory(src_dir=src_dir, target=upload_target)

        # Build a tabular dataset from the uploaded CSV (this may take a short while).
        tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds,data_file_mask))

        # Register it under `dataset_name`, tagged with the upper-cased file format.
        registration_tags = {'format':file_format.upper()}
        tab_data_set = tab_data_set.register(
            workspace=ws,
            name=dataset_name,
            description=dataset_name,
            tags=registration_tags,
            create_new_version=True,
        )
        print('Dataset registered.')
    except Exception as e:
        # Record the failure in the readiness flag instead of stopping the notebook.
        print(e)
        dataset_registered = False

Dataset already registered.


## Create Pipeline

#### Set pipeline configuration

In [11]:
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep
from azureml.data import OutputFileDatasetConfig
from azureml.core.runconfig import RunConfiguration

# Build the run configuration shared by both pipeline steps.
# The configuration targets `ci_instance` and uses `registered_env`, so the
# guard checks the flags for those two objects. The cell that builds the
# steps requires the same two flags (plus `dataset_registered`), so
# `pipeline_run_config` is always defined whenever that cell uses it.
if compute_instance_selected and environment_registered:
    # Create a new runconfig object for the pipeline
    pipeline_run_config = RunConfiguration()

    # Run on the selected compute instance.
    pipeline_run_config.target = ci_instance

    # Use the registered environment.
    pipeline_run_config.environment = registered_env


#### Create data prep file

In [27]:
%%writefile data_prep.py
import pandas as pd
import numpy as np
import math as mt
import argparse
from azureml.core import Run
import os

# Get parameters
parser = argparse.ArgumentParser()
# --input-data is passed by the pipeline step, but its value is not read here:
# the dataset is fetched below through the run's named input 'raw_data'.
parser.add_argument("--input-data", type=str, dest='raw_dataset_name', help='raw dataset')
parser.add_argument('--out-folder', type=str, dest='save_folder', default='prepped_data', help='Folder to output results')
args = parser.parse_args()
save_folder = args.save_folder

# Get the experiment run context
run = Run.get_context()

# Load the raw dataset attached to this run under the name 'raw_data'
diabetes = run.input_datasets['raw_data'].to_pandas_dataframe()

# Remove rows with missing data
diabetes = diabetes.dropna()

# Log how many rows remain after cleaning
row_count = len(diabetes)
run.log('processed_rows', row_count)

# Save the prepped data as prep_data.csv inside the output folder
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'prep_data.csv')
diabetes.to_csv(save_path, index=False, header=True)


Overwriting data_prep.py


#### Create train model file

In [28]:
%%writefile train_model.py

import pandas as pd
import numpy as np
import math as mt
import argparse
import json

from azureml.core import Run
import os

import joblib
from azureml.core import Run, Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-data", type=str, dest='training_data', help='training data')

args = parser.parse_args()
training_data = args.training_data

# Get the experiment run context
run = Run.get_context()

# Load the prepared data file from the training folder
print("Loading Data...")
file_path = os.path.join(training_data,'prep_data.csv')
diabetes = pd.read_csv(file_path)

# Separate features and labels
features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']
label = 'Diabetic'
X, y = diabetes[features].values, diabetes[label].values

# Split data 70%-30% into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

print ('Training cases: %d\nTest cases: %d' % (X_train.shape[0], X_test.shape[0]))

# Set regularization rate
reg = 0.01

# Train a logistic regression model on the training set
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)
print (model)

predictions = model.predict(X_test)
print('Predicted labels: ', predictions)
print('Actual labels:    ' ,y_test)

model_acc = accuracy_score(y_test, predictions)
print('Accuracy: ',model_acc )
run.log('train_model', "LogisticRegression")
run.log('Accuracy', model_acc)

#------------ SAVE MODEL -----------------#
# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'trained_model_.pkl')
joblib.dump(value=model, filename=model_file)

# Register the model
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'trained_model_Logistic',
               tags={'Training context':'Pipeline'},
               # Built-in float(): the np.float alias no longer exists in
               # current NumPy releases. The key is spelled 'Accuracy' to
               # match the metric logged above.
               properties={'Accuracy': float(model_acc)})

# Complete the run only after the model has been saved and registered
run.complete()

Overwriting train_model.py


In [29]:
from azureml.core import Workspace, Datastore, Dataset

# Registered datastore that receives the prepared data handed from step 1 to step 2.
prep_datastore_name = "ezmylake"

# Build the two pipeline steps only when compute, dataset and environment are all ready.
if compute_instance_selected and dataset_registered and environment_registered:
    # Get the registered training dataset
    raw_ds = ws.datasets.get(dataset_name)

    ds = Datastore(ws, prep_datastore_name)

    # Create an OutputFileDatasetConfig (temporary Data Reference) for data passed from step 1 to step 2
    prep_data = OutputFileDatasetConfig(destination=(ds,"prep_data"))

    # Step 1: run data_prep.py, reading the raw dataset as the named input
    # 'raw_data' and writing the prepared data to `prep_data`.
    step1 = PythonScriptStep(name = 'prepare data',
                            source_directory = './',
                            script_name = 'data_prep.py',
                            compute_target = ci_instance.name,
                            runconfig = pipeline_run_config,
                            runconfig_pipeline_params=None,
                            arguments = ['--input-data', raw_ds.as_named_input('raw_data'),
                                        '--out-folder', prep_data],
                            # Always re-run this step instead of reusing a previous output
                            allow_reuse = False)

    # Step 2: run train_model.py on the output of step 1.
    step2 = PythonScriptStep(name = 'train model',
                            source_directory = './',
                            script_name = 'train_model.py',
                            compute_target = ci_instance.name,
                            runconfig = pipeline_run_config,
                            runconfig_pipeline_params=None,
                            arguments=['--training-data', prep_data.as_input()])

    # Start the compute instance before the pipeline is submitted.
    ci_status = ci_instance.get_status()

    try:
        ci_instance.start(wait_for_completion=True, show_output=True)
    except Exception as e:
        # Best effort: report the failure and carry on. Catching Exception
        # rather than using a bare except keeps Ctrl-C working while waiting.
        print(f"Compute instance start skipped or failed: {e}")



In [30]:
# Submit the pipeline and wait for it to finish.
# `pipeline_executed` becomes True only after the run has completed, so an
# interrupted or failed run is never reported as a success.
pipeline_executed = False
try:
    train_pipeline = Pipeline(workspace = ws, steps = [step1,step2])

    # Create an experiment and run the pipeline.
    # To force every step to re-run, pass regenerate_outputs=True to experiment.submit().
    from azureml.core import Workspace, Experiment
    experiment = Experiment(workspace = ws, name = 'test-training-pipeline')
    pipeline_run = experiment.submit(train_pipeline)
    pipeline_run.wait_for_completion(show_output=True)
    pipeline_executed = True
except Exception as e:
    print(e)
finally:
    # Stopping the compute instance is left disabled:
    #ci_instance.stop(wait_for_completion=True, show_output=True)
    # Report the real outcome rather than an unconditional "OK".
    print("OK" if pipeline_executed else "FAILED")

Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Created step prepare data [5383bd31][0ab32ef1-a4a6-43d9-a877-a68e2091f686], (This step will run and generate new outputs)
Created step train model [1b2b74bb][0a1b706a-21fe-47b6-8aa1-3237a7cd8921], (This step will run and generate new outputs)
Submitted PipelineRun 44683fea-d875-4ee2-baf0-0f58fd7685dc
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/44683fea-d875-4ee2-baf0-0f58fd7685dc?wsid=/subscriptions/e0d7a68e-191f-4f51-83ce-d93995cd5c09/resourcegroups/my_ml_tests/workspaces/myworkspace&tid=16b3c013-d300-468d-ac64-7eda0820b6d3
PipelineRunId: 44683fea-d875-4ee2-baf0-0f58fd7685dc
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/44683fea-d875-4ee2-baf0-0f58fd7685dc?wsid=/subscriptions/e0d7a68e-191f-4f51-83ce-d93995cd5c09/resourcegroups/my_ml_tests/workspaces/myworkspace&tid=16b3c013-d300-468d-ac64-7eda0820b6d3
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 828a6a81-4afe-4bcb-9e5a-fa22dc7ef532
Link to Azure Machine Learning Portal

## Azure ML Readiness Assessment summary

In [31]:
# Print every readiness flag collected by the cells above, including the
# compute cluster selection flag.
print(F"""
       --- Azure ML Readiness Assessment summary ---
       assert_compute: {assert_compute}
       compute_instance_selected: {compute_instance_selected}
       compute_cluster_selected: {compute_cluster_selected}
       environment_registered: {environment_registered}
       dataset_created: {dataset_created}
       dataset_registered: {dataset_registered}
       pipeline_executed: {pipeline_executed}       
       """
       )


       --- Azure ML Readiness Assessment summary ---
       assert_compute: {'compute Instance': True, 'compute Cluster': True}
       compute_instance_selected: True
       environment_registered: True
       dataset_created: True
       dataset_registered: True
       pipeline_executed: True       
       
