# Use the Azure ML SDK to create a script-based experiment for training and registering a model

### Connect to the AML workspace

In [1]:
import azureml.core
from azureml.core import Workspace

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.48.0 to work with mlw-test


### Create a folder for the experiment files

In [2]:
import os
experiment_folder = 'diabetes-training'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder, 'folder created')

diabetes-training folder created


### Train a classification model by using a tabular dataset that is passed to it as an argument.

> **Note**: In the script, the dataset is passed as a parameter (or argument). In the case of a tabular dataset, this argument will contain the ID of the registered dataset; so you could write code in the script to get the experiment's workspace from the run context, and then get the dataset using its ID; like this:
>
> ```
> run = Run.get_context()
> ws = run.experiment.workspace
> dataset = Dataset.get_by_id(ws, id=args.training_dataset_id)
> diabetes = dataset.to_pandas_dataframe()
> ```
>
> However, Azure Machine Learning runs automatically identify arguments that reference named datasets and add them to the run's **input_datasets** collection, so you can also retrieve the dataset from this collection by specifying its "friendly name" (which as you'll see shortly, is specified in the argument definition in the script run configuration for the experiment). This is the approach taken in the script above.

Now you can run a script as an experiment, defining an argument for the training dataset, which is read by the script.

> **Note**: The **Dataset** class depends on some components in the **azureml-dataprep** package, so you need to include this package in the environment where the training experiment will be run. The **azureml-dataprep** package is included in the **azure-defaults** package.

In [3]:
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
import os
import argparse
from azureml.core import Run, Dataset
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Get the script arguments (regularization rate and training dataset ID)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
parser.add_argument("--input-data", type=str, dest='training_dataset_id', help='training dataset')
args = parser.parse_args()

# Set regularization hyperparameter (passed as an argument to the script)
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# Get the training dataset
print("Loading Data...")
diabetes = run.input_datasets['training_data'].to_pandas_dataframe()

# Log some data asset insights
run.log('observations', len(diabetes))

# Plot and log the count of diabetic vs non-diabetic patients
diabetic_counts = diabetes['Diabetic'].value_counts()
fig = plt.figure(figsize=(6,6))
ax = fig.gca()    
diabetic_counts.plot.bar(ax = ax) 
ax.set_title('Patients with Diabetes') 
ax.set_xlabel('Diagnosis') 
ax.set_ylabel('Patients')
run.log_image(name='label distribution', plot=fig)


# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Overwriting diabetes-training/diabetes_training.py


### Define a runtime environment for the experiment
You can write out a conda enviromment configuration here, or always use a pre-configured .yml file

In [4]:
%%writefile $experiment_folder/experiment_env.yml
name: diabetes-experiment-env
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- scikit-learn
- pandas
- pip
- pip:
  - azureml-defaults
  - azureml-mlflow
  - matplotlib

Overwriting diabetes-training/experiment_env.yml


### Create environment from the conda specification

In [5]:
from azureml.core import Environment

# Create a Python environment for the experiment (from a .yml file)
experiment_env = Environment.from_conda_specification("diabetes-experiment-env", experiment_folder + "/experiment_env.yml")

# Let Azure ML manage dependencies
experiment_env.python.user_managed_dependencies = False 

# Print the environment details
print(experiment_env.name, 'defined.')
print(experiment_env.python.conda_dependencies.serialize_to_string())

diabetes-experiment-env defined.
name: diabetes-experiment-env
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- scikit-learn
- pandas
- pip
- pip:
  - azureml-defaults
  - azureml-mlflow
  - matplotlib



### We can register the defined conda environment in the workspace

In [6]:
experiment_env.register(workspace=ws)

{
    "assetId": "azureml://locations/westeurope/workspaces/a43e1e7b-9417-4e7b-b067-1990df476d4c/environments/diabetes-experiment-env/versions/3",
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20221101.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "buildContext": null,
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "diabetes-e

In [7]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.widgets import RunDetails


# Create a Python environment for the experiment (from a .yml file)
#env = Environment.from_conda_specification("experiment_env", "environment.yml") 

# Create a Python environment from a registered environment
env = Environment.get(ws, 'diabetes-experiment-env')


# Get the training dataset
diabetes_ds = ws.datasets.get("Diabetes data asset")

# Create a script config
script_config = ScriptRunConfig(source_directory=experiment_folder,
                              script='diabetes_training.py',
                              arguments = ['--regularization', 0.1, # Regularizaton rate parameter
                                           '--input-data', diabetes_ds.as_named_input('training_data')], # Reference to dataset
                              environment=env,
                              docker_runtime_config=DockerConfiguration(use_docker=True)) 

# submit the experiment
experiment_name = 'train-diabetes'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'train-diabetes_1675181241_b981229b',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2023-01-31T16:07:23.180718Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '6641bf4a-2b15-4c72-b5ee-3c7eefb20ce8'},
 'inputDatasets': [{'dataset': {'id': 'c5c5a0cc-405e-44ff-a043-08bfb11d509c'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'training_data', 'mechanism': 'Direct'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'diabetes_training.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--regularization',
   '0.1',
   '--input-data',
   'DatasetConsumptionConfig:training_data'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {'training_data': {'dataLocation': {'dataset': {'id': 'c5c5a0cc-405e-44ff-a043-08bfb11d509c',
      'name': 'Diabetes data asset',
      'version': '1'},
     'dataPath': 

### Show list of metrics and output files generated

In [8]:
# Get logged metrics
metrics = run.get_metrics()
for key in metrics.keys():
        print(key, metrics.get(key))
print('\n')
for file in run.get_file_names():
    print(file)

observations 15000
label distribution aml://artifactId/ExperimentRun/dcid.train-diabetes_1675181241_b981229b/label distribution_1675181250.png
Regularization Rate 0.1
Accuracy 0.7893333333333333
AUC 0.8568650620553335


azureml-logs/60_control_log.txt
azureml-logs/70_driver_log.txt
label distribution_1675181250.png
logs/azureml/8_azureml.log
logs/azureml/dataprep/backgroundProcess.log
logs/azureml/dataprep/backgroundProcess_Telemetry.log
outputs/diabetes_model.pkl


### Register the model

In [9]:
from azureml.core import Model

run.register_model(model_path='outputs/diabetes_logistic_regression_model.pkl', 
                   model_name='diabetes-model',
                   tags={'Algorithm': 'Logistic Regression', 'Training context':'Tabular diabetes data asset'}, 
                   properties={'AUC': run.get_metrics()['AUC'], 'Accuracy': run.get_metrics()['Accuracy'], 'Regularization Rate': run.get_metrics()['Regularization Rate']})

ModelPathNotFoundException: ModelPathNotFoundException:
	Message: Could not locate the provided model_path outputs/diabetes_logistic_regression_model.pkl in the set of files uploaded to the run: ['azureml-logs/60_control_log.txt', 'azureml-logs/70_driver_log.txt', 'label distribution_1675181250.png', 'logs/azureml/8_azureml.log', 'logs/azureml/dataprep/backgroundProcess.log', 'logs/azureml/dataprep/backgroundProcess_Telemetry.log', 'outputs/diabetes_model.pkl']
                See https://aka.ms/run-logging for more details.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Could not locate the provided model_path outputs/diabetes_logistic_regression_model.pkl in the set of files uploaded to the run: ['azureml-logs/60_control_log.txt', 'azureml-logs/70_driver_log.txt', 'label distribution_1675181250.png', 'logs/azureml/8_azureml.log', 'logs/azureml/dataprep/backgroundProcess.log', 'logs/azureml/dataprep/backgroundProcess_Telemetry.log', 'outputs/diabetes_model.pkl']\n                See https://aka.ms/run-logging for more details."
    }
}