In [40]:
# Check core SDK version number
import azureml.core

print("SDK version:", azureml.core.VERSION)

SDK version: 1.20.0


In [41]:

from azureml.telemetry import set_diagnostics_collection

set_diagnostics_collection(send_diagnostics=True)

Turning diagnostics collection on. 


In [42]:
from azureml.core.workspace import Workspace

ws = Workspace.from_config()
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: quick-starts-ws-139150
Azure region: southcentralus
Subscription id: 6b4af8be-9931-443e-90f6-c4c34a1f9737
Resource group: aml-quickstarts-139150


In [43]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# choose a name for your cluster
cluster_name = "compute-cluster"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', 
                                                           max_nodes=4)

    # create the cluster
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# can poll for a minimum number of nodes and for a specific timeout. 
# if no min node count is provided it uses the scale settings for the cluster
compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

# use get_status() to get a detailed status for the current cluster. 
print(compute_target.get_status().serialize())

Found existing compute target
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-02-18T22:12:17.082000+00:00', 'errors': None, 'creationTime': '2021-02-18T22:12:14.009612+00:00', 'modifiedTime': '2021-02-18T22:12:29.882838+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_DS3_V2'}


In [44]:

import os

project_folder = './capstone-project'
os.makedirs(project_folder, exist_ok=True)

In [45]:
import shutil

shutil.copy('train.py', project_folder)

'./capstone-project/train.py'

# Create an experiment

In [46]:
from azureml.core import Experiment

experiment_name = 'House_Price_Predication'
experiment = Experiment(ws, name=experiment_name)

## Create an environment
Define a conda environment YAML file with your training script dependencies and create an Azure ML environment.

In [47]:
%%writefile conda_dependencies.yml

# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2

- pip:
  - azureml-train-automl-runtime==1.21.0
  - inference-schema
  - azureml-interpret==1.21.0
  - azureml-defaults==1.21.0
- numpy>=1.16.0,<1.19.0
- pandas==0.25.1
- scikit-learn==0.22.1
- xgboost<=1.3.3
- psutil>=5.2.2,<6.0.0
channels:
- anaconda
- conda-forge

Overwriting conda_dependencies.yml


In [48]:
from azureml.core import Environment

sklearn_env = Environment.from_conda_specification(name = 'capstone-project-env', file_path = './conda_dependencies.yml')

In [49]:
from azureml.core import ScriptRunConfig

src = ScriptRunConfig(source_directory=project_folder,
                      script='train.py',
                      arguments=['--kernel', 'linear', '--penalty', 1.0],
                      compute_target=compute_target,
                      environment=sklearn_env)

In [50]:
run = experiment.submit(src)

In [51]:
#https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/scikit-learn/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb

# Monitor The Run

In [52]:
from azureml.widgets import RunDetails

RunDetails(run).show()
run.wait_for_completion(show_output=True)

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…

RunId: House_Price_Predication_1613688085_86008573
Web View: https://ml.azure.com/experiments/House_Price_Predication/runs/House_Price_Predication_1613688085_86008573?wsid=/subscriptions/6b4af8be-9931-443e-90f6-c4c34a1f9737/resourcegroups/aml-quickstarts-139150/workspaces/quick-starts-ws-139150

Streaming azureml-logs/20_image_build_log.txt

2021/02/18 22:41:30 Downloading source code...
2021/02/18 22:41:31 Finished downloading source code
2021/02/18 22:41:32 Creating Docker network: acb_default_network, driver: 'bridge'
2021/02/18 22:41:32 Successfully set up Docker network: acb_default_network
2021/02/18 22:41:32 Setting up Docker configuration...
2021/02/18 22:41:33 Successfully set up Docker configuration
2021/02/18 22:41:33 Logging in to registry: 8ee44115daec45588f306316a3c9ed45.azurecr.io
2021/02/18 22:41:34 Successfully logged into 8ee44115daec45588f306316a3c9ed45.azurecr.io
2021/02/18 22:41:34 Executing step ID: acb_step_0. Timeout(sec): 5400, Working directory: '', Network: '

{'runId': 'House_Price_Predication_1613688085_86008573',
 'target': 'compute-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-02-18T22:52:27.093993Z',
 'endTimeUtc': '2021-02-18T22:54:01.16583Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'd7a4b334-4353-4681-a488-b35e13e3d2a1',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'train.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--kernel', 'linear', '--penalty', '1'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'compute-cluster',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
  'nodeCount': 1,
  'priority': None,
  'credentialPassthrough': False,
  'identity': None,
  'environment': {'name': 'capstone-project-env

# Tune model hyperparameters

In [55]:
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.parameter_expressions import choice
    
booster=['gbtree','gblinear']
base_score=[0.25,0.5,0.75,1]

## Hyper Parameter Optimization
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster=['gbtree','gblinear']
learning_rate=[0.05,0.1,0.15,0.20]
min_child_weight=[1,2,3,4]

# Define the grid of hyperparameters to search
hyperparameter_grid = RandomParameterSampling({
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'min_child_weight':min_child_weight,
    'booster':booster,
    'base_score':base_score
    }
)
xgboost_model_regression = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=hyperparameter_grid, 
                                     primary_metric_name='mean_squared_error',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=50,
                                     max_concurrent_runs=4)

# hd_config = HyperDriveConfig(estimator=est,
#                                 hyperparameter_sampling=ps,
#                                 policy=early_termination_policy,
#                                 primary_metric_name='validation_acc',
#                                 primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
#                                 max_total_runs=4,
#                                 max_concurrent_runs=4)


# # Set up the random search with 4-fold cross validation
# random_cv = RandomizedSearchCV(estimator=regressor,
#             param_distributions=hyperparameter_grid,
#             cv=5, n_iter=50,
#             scoring = 'neg_mean_absolute_error',n_jobs = 4,
#             verbose = 5, 
#             return_train_score = True,
#             random_state=42)



In [56]:
# start the HyperDrive run
hyperdrive_run = experiment.submit(hyperdrive_config)

In [None]:
# Monitor HyperDrive runs

In [57]:

RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO',…

In [58]:

hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_609797ff-fa3d-4b85-bfde-f560216147ea
Web View: https://ml.azure.com/experiments/House_Price_Predication/runs/HD_609797ff-fa3d-4b85-bfde-f560216147ea?wsid=/subscriptions/6b4af8be-9931-443e-90f6-c4c34a1f9737/resourcegroups/aml-quickstarts-139150/workspaces/quick-starts-ws-139150

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-02-18T23:21:23.674941][API][INFO]Experiment created<END>\n""<START>[2021-02-18T23:21:24.281275][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-02-18T23:21:25.064358][GENERATOR][INFO]Successfully sampled '1' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-02-18T23:21:25.0353878Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2021-02-18T23:21:54.696534][GENERATOR][INFO]Trying to sample '3' jobs from the hyperparameter space<END>\n""<START>[2021-02-18T23:21:55.507023][GENERATOR][INFO]Successfully sample

{'runId': 'HD_609797ff-fa3d-4b85-bfde-f560216147ea',
 'target': 'compute-cluster',
 'status': 'Canceled',
 'startTimeUtc': '2021-02-18T23:21:23.416323Z',
 'endTimeUtc': '2021-02-18T23:27:41.406258Z',
 'error': {'error': {'code': 'UserError',
   'message': 'User errors were found in at least one of the child runs.',
   'messageParameters': {},
   'details': []},
  'time': '0001-01-01T00:00:00.000Z'},
 'properties': {'primary_metric_config': '{"name": "mean_squared_error", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'd7a4b334-4353-4681-a488-b35e13e3d2a1'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg139150.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_609797ff-fa3d-4b85-bfde-f560216147ea/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=bgZQLfFVG%2F4%2BCNn%2FwTAeVShn81m6m%2BZRQ7IljJ%2BwUDQ%3D&st=2021-02-18T2

In [59]:
assert(hyperdrive_run.get_status() == "Completed")

AssertionError: 

# Find and register best model

In [None]:
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run.get_details()['runDefinition']['arguments'])

In [None]:
print(best_run.get_file_names())

In [None]:
model = best_run.register_model(model_name='xgboost-house-price', model_path='outputs/model.joblib')