In [1]:
#pip install azure-ai-ml


In [2]:
# Import Libraries

from azureml.core import Dataset
from azureml.data.datapath import DataPath
#from azureml.core import train


In [3]:
import azureml.core
from azureml.core import Workspace

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.49.0 to work with testerinos


In [4]:
default_ds = ws.get_default_datastore()

if 'titanic dataset' not in ws.datasets:
    Dataset.File.upload_directory(src_dir='data',
                              target=DataPath(default_ds, 'titanic-data/')
                              )

    #Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'titanic-data/titanic.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws, 
                                name='titanic dataset',
                                description='titanic data',
                                tags = {'format':'CSV'},
                                create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')

In [5]:
import os

experiment_folder = 'titanic_training-hyperdrive'
os.makedirs(experiment_folder, exist_ok=True)

print('Folder ready.')

Folder ready.


In [6]:
%%writefile $experiment_folder/titanic_training.py
# Import libraries
import argparse, joblib, os
from azureml.core import Run
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve

# Get the experiment run context
run = Run.get_context()

# Get script arguments
parser = argparse.ArgumentParser()

# Input dataset
parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')

# Hyperparameters
parser.add_argument('--learning_rate', type=float, dest='learning_rate', default=0.1, help='learning rate')
parser.add_argument('--n_estimators', type=int, dest='n_estimators', default=100, help='number of estimators')

# Add arguments to args collection
args = parser.parse_args()

# Log Hyperparameter values
run.log('learning_rate',  np.float(args.learning_rate))
run.log('n_estimators',  np.int(args.n_estimators))

# load the titanic dataset
print("Loading Data...")
titanic = run.input_datasets['training_data'].to_pandas_dataframe() # Get the training data from the estimator input

# Separate features and labels
X, y = titanic[['Age','Sex','Fare']].values, titanic['Survived'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a Gradient Boosting classification model with the specified hyperparameters
print('Training a classification model')
model = GradientBoostingClassifier(learning_rate=args.learning_rate,
                                   n_estimators=args.n_estimators).fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

# Save the model in the run outputs
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/titanic_model.pkl')

run.complete()

Overwriting titanic_training-hyperdrive/titanic_training.py


In [7]:

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "sweetcluster"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)
    

Found existing cluster, use it.


In [8]:

%%writefile $experiment_folder/hyperdrive_env.yml
name: batch_environment
dependencies:
- python=3.6.2
- scikit-learn
- pandas
- numpy
- pip
- pip:
  - azureml-defaults

Overwriting titanic_training-hyperdrive/hyperdrive_env.yml


In [9]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice
from azureml.widgets import RunDetails

# Create a Python environment for the experiment
hyper_env = Environment.from_conda_specification("experiment_env", experiment_folder + "/hyperdrive_env.yml")

# Get the training dataset
titanic_ds = ws.datasets.get("titanic dataset")

# Create a script config
script_config = ScriptRunConfig(source_directory=experiment_folder,
                                script='titanic_training.py',
                                # Add non-hyperparameter arguments -in this case, the training dataset
                                arguments = ['--input-data', titanic_ds.as_named_input('training_data')],
                                environment=hyper_env,
                                compute_target = training_cluster)

# Sample a range of parameter values
params = GridParameterSampling(
    {
        # Hyperdrive will try 6 combinations, adding these as script arguments
        '--learning_rate': choice(0.01, 0.1, 1.0),
        '--n_estimators' : choice(10, 100)
    }
)

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(run_config=script_config, 
                          hyperparameter_sampling=params, 
                          policy=None, # No early stopping policy
                          primary_metric_name='AUC', # Find the highest AUC metric
                          primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, 
                          max_total_runs=6, # Restict the experiment to 6 iterations
                          max_concurrent_runs=2) # Run up to 2 iterations in parallel

# Run the experiment
experiment = Experiment(workspace=ws, name='mslearn-titanic-hyperdrive')
run = experiment.submit(config=hyperdrive)

# Show the status in the notebook as the experiment runs
RunDetails(run).show()
run.wait_for_completion()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "Execution failed. User process '/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/bin/python' exited with status code 1. Please check log file 'user_logs/std_log.txt' for error details. Error: Traceback (most recent call last):\n  File \"titanic_training.py\", line 43, in <module>\n    n_estimators=args.n_estimators).fit(X_train, y_train)\n  File \"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/sklearn/ensemble/_gb.py\", line 413, in fit\n    dtype=DTYPE, multi_output=True)\n  File \"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/sklearn/base.py\", line 433, in _validate_data\n    X, y = check_X_y(X, y, **check_params)\n  File \"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/sklearn/utils/validation.py\", line 63, in inner_f\n    return f(*args, **kwargs)\n  File \"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/sklearn/utils/validation.py\", line 878, in check_X_y\n    estimator=estimator)\n  File \"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/sklearn/utils/validation.py\", line 63, in inner_f\n    return f(*args, **kwargs)\n  File \"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/sklearn/utils/validation.py\", line 673, in check_array\n    array = np.asarray(array, order=order, dtype=dtype)\n  File \"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/numpy/core/_asarray.py\", line 83, in asarray\n    return array(a, dtype, copy=False, order=order)\nValueError: could not convert string to float: 'male'\n\n Marking the experiment as failed because initial child jobs have failed due to user error",
        "messageParameters": {},
        "details": []
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"Execution failed. User process '/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/bin/python' exited with status code 1. Please check log file 'user_logs/std_log.txt' for error details. Error: Traceback (most recent call last):\\n  File \\\"titanic_training.py\\\", line 43, in <module>\\n    n_estimators=args.n_estimators).fit(X_train, y_train)\\n  File \\\"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/sklearn/ensemble/_gb.py\\\", line 413, in fit\\n    dtype=DTYPE, multi_output=True)\\n  File \\\"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/sklearn/base.py\\\", line 433, in _validate_data\\n    X, y = check_X_y(X, y, **check_params)\\n  File \\\"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/sklearn/utils/validation.py\\\", line 63, in inner_f\\n    return f(*args, **kwargs)\\n  File \\\"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/sklearn/utils/validation.py\\\", line 878, in check_X_y\\n    estimator=estimator)\\n  File \\\"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/sklearn/utils/validation.py\\\", line 63, in inner_f\\n    return f(*args, **kwargs)\\n  File \\\"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/sklearn/utils/validation.py\\\", line 673, in check_array\\n    array = np.asarray(array, order=order, dtype=dtype)\\n  File \\\"/azureml-envs/azureml_cca64c8c3667dbc9fa4b9eb1c2f6cd0a/lib/python3.6/site-packages/numpy/core/_asarray.py\\\", line 83, in asarray\\n    return array(a, dtype, copy=False, order=order)\\nValueError: could not convert string to float: 'male'\\n\\n Marking the experiment as failed because initial child jobs have failed due to user error\",\n        \"messageParameters\": {},\n        \"details\": []\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}

KeyError: 'log_files'

In [None]:

# Print all child runs, sorted by the primary metric
for child_run in run.get_children_sorted_by_primary_metric():
    print(child_run)

# Get the best run, and its metrics and arguments
best_run = run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
script_arguments = best_run.get_details() ['runDefinition']['arguments']
print('Best Run Id: ', best_run.id)
print(' -AUC:', best_run_metrics['AUC'])
print(' -Accuracy:', best_run_metrics['Accuracy'])
print(' -Arguments:',script_arguments)

In [None]:
from azureml.core import Model

# Register model
best_run.register_model(model_path='outputs/titanic_model.pkl', model_name='titanic_model',
                        tags={'Training context':'Hyperdrive'},
                        properties={'AUC': best_run_metrics['AUC'], 'Accuracy': best_run_metrics['Accuracy']})

# List registered models
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')