In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

import azureml.core
from azureml.core import Workspace

# Report which Azure ML core SDK version this kernel has loaded.
sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", sdk_version)

Azure ML SDK Version:  1.0.79


In [2]:
# Build the workspace handle from the config.json found by Workspace.from_config().
ws = Workspace.from_config()

# Show the workspace name, location and resource group as one tab-separated line.
workspace_summary = (ws.name, ws.location, ws.resource_group)
print(*workspace_summary, sep='\t')

globalAI-demo	northeurope	globalAI


In [3]:
from azureml.core import Experiment
# Name of the experiment that the run submitted later in this notebook belongs to.
experiment_name = 'mnist-cnn'

# Experiment handle bound to the workspace loaded above.
exp = Experiment(name=experiment_name, workspace=ws)

In [4]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Cluster settings; each one can be overridden through an environment variable.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "gpucluster")
# os.environ.get returns a str whenever the variable is set, while the
# fallbacks here are ints. Normalise both cases to int so the provisioning
# configuration always receives numeric node counts.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4))

vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_NC6")


# Read the workspace's compute targets once and reuse the result for both the
# membership test and the lookup.
existing_targets = ws.compute_targets

if compute_name in existing_targets:
    compute_target = existing_targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
    else:
        # A target with this name exists but is of a different kind. It is
        # still used (as before), but no longer silently.
        print('warning: compute target ' + compute_name +
              ' exists but is not an AmlCompute cluster (found ' +
              type(compute_target).__name__ + ')')
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=compute_min_nodes,
        max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())


creating a new compute target...
Creating
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-12-14T07:38:47.491000+00:00', 'errors': None, 'creationTime': '2019-12-14T07:38:45.498847+00:00', 'modifiedTime': '2019-12-14T07:39:00.996913+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}


In [5]:
import os

# Local directory used as the source directory of the training job.
project_folder = './model'

# Create the directory if needed; an already existing one is not an error.
os.makedirs(name=project_folder, exist_ok=True)

In [6]:
from azureml.train.dnn import PyTorch

# Command-line arguments handed to the entry script.
training_args = {'--output-dir': './outputs'}

# Estimator that runs mnist.py from the project folder on the compute target
# prepared above, with GPU support requested.
estimator = PyTorch(
    source_directory=project_folder,
    entry_script='mnist.py',
    script_params=training_args,
    compute_target=compute_target,
    use_gpu=True,
)

# Optional (left disabled): switch to the PyTorch 1.0 preview build, which has
# better ONNX support.
# estimator.conda_dependencies.remove_conda_package('pytorch=0.4.0')
# estimator.conda_dependencies.add_conda_package('pytorch-nightly')
# estimator.conda_dependencies.add_channel('pytorch')



In [7]:
# Submit the estimator to the experiment; this returns the run handle.
run = exp.submit(estimator)

# Print the details of the submitted run.
run_details = run.get_details()
print(run_details)

{'runId': 'mnist-cnn_1576309196_d8ccffea', 'target': 'gpucluster', 'status': 'Queued', 'properties': {'_azureml.ComputeTargetType': 'amlcompute', 'ContentSnapshotId': 'ced87461-4a32-4b9e-a416-37b3a3b34ab7', 'azureml.git.repository_uri': 'https://github.com/felixSchober/globalAI19.git', 'mlflow.source.git.repoURL': 'https://github.com/felixSchober/globalAI19.git', 'azureml.git.branch': 'master', 'mlflow.source.git.branch': 'master', 'azureml.git.commit': '3f3146a80c4a1a78460ef9ad2a7f21187554043c', 'mlflow.source.git.commit': '3f3146a80c4a1a78460ef9ad2a7f21187554043c', 'azureml.git.dirty': 'True', 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/process_status.json'}, 'inputDatasets': [], 'runDefinition': {'script': 'mnist.py', 'arguments': ['--output-dir', './outputs'], 'sourceDirectoryDataStore': None, 'framework': 'Python', 'communicator': 'None', 'target': 'gpucluster', 'dataReferences': {}, 'data': {}, 'jobName': None, 'maxRunDurationSeconds': 

In [8]:
from azureml.widgets import RunDetails
# Display the notebook widget for the submitted run.
run_widget = RunDetails(run)
run_widget.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'NOTSET',…

In [9]:
# Print the names of the files stored with the run.
run_file_names = run.get_file_names()
print(run_file_names)

['azureml-logs/55_azureml-execution-tvmps_5758e3bfb31d5de5ebebf1bfd48ed1e1e7ca2f0ed3be5c95b9e03c829b7f0699_d.txt', 'azureml-logs/65_job_prep-tvmps_5758e3bfb31d5de5ebebf1bfd48ed1e1e7ca2f0ed3be5c95b9e03c829b7f0699_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_5758e3bfb31d5de5ebebf1bfd48ed1e1e7ca2f0ed3be5c95b9e03c829b7f0699_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/128_azureml.log', 'logs/azureml/azureml.log', 'outputs/mnist.onnx']
