In [12]:
import os
import urllib
import shutil
import azureml

from azureml.core import Experiment
from azureml.core import Workspace, Run

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import TensorFlow
from azureml.core.runconfig import TensorflowConfiguration

In [13]:
from azureml.core import Workspace

# Coordinates of the Azure ML workspace this notebook runs against.
subscription_id = '61691a9f-545c-41c4-baf2-d901d60ef9aa'
resource_group  = 'cgmmlservicewest2'
workspace_name  = 'cgmmlservicewest2'

# Connect to the workspace and persist its details with write_config().
# The step stays best-effort (the notebook keeps running on failure), but:
#   * only Exception subclasses are caught, so KeyboardInterrupt / SystemExit
#     are no longer swallowed;
#   * the underlying error is printed, because a failure here is not
#     necessarily "not found" (it may come from the connection attempt or
#     from writing the config).
try:
    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    ws.write_config()
    print('Library configuration succeeded')
except Exception as err:
    print('Workspace not found')
    print('Details: {}: {}'.format(type(err).__name__, err))

Library configuration succeeded


In [14]:
# Rebind `ws` from the saved workspace configuration
# (presumably the file produced by ws.write_config() earlier in this notebook).
ws = Workspace.from_config()

In [15]:
# Local directory used as the estimator's source directory; create it when
# missing and leave it untouched when it already exists.
script_folder = './tf-mnist'
os.makedirs(name=script_folder, exist_ok=True)

# Experiment in the workspace under which the training run is submitted.
exp = Experiment(name='tf-mnist', workspace=ws)

In [16]:
from azureml.core.dataset import Dataset

# The four MNIST archives (train/test images and labels), all hosted under
# the same base URL.
web_paths = [
    'http://yann.lecun.com/exdb/mnist/' + archive_name
    for archive_name in (
        'train-images-idx3-ubyte.gz',
        'train-labels-idx1-ubyte.gz',
        't10k-images-idx3-ubyte.gz',
        't10k-labels-idx1-ubyte.gz',
    )
]

# File dataset that references the archives by URL.
dataset = Dataset.File.from_files(path=web_paths)

In [17]:
# Register the dataset in the workspace; create_new_version=True is passed
# so the cell can be re-run against an already-registered name.
registration_kwargs = {
    'workspace': ws,
    'name': 'mnist dataset',
    'description': 'training and test dataset',
    'create_new_version': True,
}
dataset = dataset.register(**registration_kwargs)

# Show the files referenced by the dataset (last expression = cell output).
dataset.to_path()

array(['/http/yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
       '/http/yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
       '/http/yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
       '/http/yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'],
      dtype=object)

In [18]:
cluster_name = "gpu-cluster"

# Reuse the cluster when it already exists in the workspace; otherwise
# provision it and wait for provisioning to finish.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=compute_config,
    )
    # Blocks for at most 20 minutes while streaming provisioning output.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )
else:
    print('Found existing compute target')





Found existing compute target


In [20]:
# The registered dataset is passed to the training script as a mounted input
# named 'mnist'.
mnist_mount = dataset.as_named_input('mnist').as_mount()

# Arguments for the entry script.
script_params = {
    '--data-folder': mnist_mount,
    '--batch-size': 50,
    '--first-layer-neurons': 300,
    '--second-layer-neurons': 100,
    '--learning-rate': 0.01,
}

# Single-node variant of the estimator, kept for reference:
# est = TensorFlow(source_directory=script_folder,
#                  entry_script='tf_mnist.py',
#                  script_params=script_params,
#                  compute_target=compute_target,
#                  use_gpu=True,
#                  pip_packages=['azureml-dataprep[pandas,fuse]'])

# Distributed-training settings: four workers.
distributed_training = TensorflowConfiguration()
distributed_training.worker_count = 4

# TensorFlow estimator for the distributed run: four nodes, one process per
# node.
est = TensorFlow(
    source_directory=script_folder,
    entry_script='tf_mnist.py',
    script_params=script_params,
    compute_target=compute_target,
    node_count=4,
    process_count_per_node=1,
    distributed_training=distributed_training,
    use_gpu=True,
    pip_packages=['azureml-dataprep[pandas,fuse]'],
)





In [21]:
# Submit the TensorFlow estimator to the experiment.
run = exp.submit(est)
# Wait for the run to finish, streaming its logs (show_output=True).
# As the last expression of the cell, the call's return value is displayed
# as the cell output.
run.wait_for_completion(show_output=True)

RunId: tf-mnist_1582821514_e15ee371
Web View: https://ml.azure.com/experiments/tf-mnist/runs/tf-mnist_1582821514_e15ee371?wsid=/subscriptions/61691a9f-545c-41c4-baf2-d901d60ef9aa/resourcegroups/cgmmlservicewest2/workspaces/cgmmlservicewest2

Streaming azureml-logs/55_azureml-execution-tvmps_5dcf74135a4d33e1a7abd45cd90b2e6bfb06bc6b5ab4d800ae9290faf77935fd_d.txt

2020-02-27T16:44:34Z Starting output-watcher...
2020-02-27T16:44:34Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_1d5e282518217ede461ec1726f1f4557
f7277927d38a: Pulling fs layer
8d3eac894db4: Pulling fs layer
edf72af6d627: Pulling fs layer
3e4f86211d23: Pulling fs layer
d6e9603ff777: Pulling fs layer
5cad422780e2: Pulling fs layer
8130687c8acb: Pulling fs layer
c11e9246d621: Pulling fs layer
0dfae24cbbd9: Pulling fs layer
0bb049a6d391: Pulling fs layer
22a53069998a: Pulling fs layer
d6e9603ff777: Waiting
db550b9db251: Pulling fs layer

(60000, 784)
(60000,)
(10000, 784)
(10000,)
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
2020-02-27 16:46:58.766816: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-02-27 16:46:58.775602: E tensorflow/stream_executor/cuda/cuda_driver.cc:300] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2020-02-27 16:46:58.775675: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:161] retrieving CUDA diagnostic information for host: ecfe0694d0ee423691464a704a0d641a000000
2020-02-27 16:46:58.775713: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:168] hostname: ecfe0694d0ee423691464a704a0d641a000000
2020-02-27 16:46:58.775806: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:192] libcuda reported version is: 440.33.1
2020-02-27 16:46:58.775857: I tensorflow/stream_execu

{'runId': 'tf-mnist_1582821514_e15ee371',
 'target': 'gpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-02-27T16:44:32.663347Z',
 'endTimeUtc': '2020-02-27T16:48:15.713402Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': '3f01f9d4-85fb-4549-a51a-1e56cb7ffeae',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '0ac8087b-beb3-4a17-abe9-9c0311b692b3'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'mnist', 'mechanism': 'Mount'}}],
 'runDefinition': {'script': 'tf_mnist.py',
  'useAbsolutePath': False,
  'arguments': ['--data-folder',
   'DatasetConsumptionConfig:mnist',
   '--batch-size',
   '50',
   '--first-layer-neurons',
   '300',
   '--second-layer-neurons',
   '100',
   '--learning-rate',
   '0.01'],
  'sourceDirectoryDataStore': 'workspaceblobstore',
  'framework': 'TensorFlow',
  'communicator': 'ParameterServer',
  'targe

In [22]:
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

# Resource configuration recorded with the model: 1 CPU, 0.5 GB of memory.
model_resources = ResourceConfiguration(cpu=1, memory_in_gb=0.5)

# Register the files under the run's 'outputs/model' path as the model
# 'tf-dnn-mnist', tagged with its framework and framework version.
model = run.register_model(
    model_name='tf-dnn-mnist',
    model_path='outputs/model',
    model_framework=Model.Framework.TENSORFLOW,
    model_framework_version='1.13.0',
    resource_configuration=model_resources,
)

In [23]:
# Create a model folder in the current directory
os.makedirs('./model', exist_ok=True)

for f in run.get_file_names():
    if f.startswith('outputs/model'):
        output_file_path = os.path.join('./model', f.split('/')[-1])
        print('Downloading from {} to {} ...'.format(f, output_file_path))
        run.download_file(name=f, output_file_path=output_file_path)

Downloading from outputs/model/checkpoint to ./model/checkpoint ...
Downloading from outputs/model/mnist-tf.model.data-00000-of-00001 to ./model/mnist-tf.model.data-00000-of-00001 ...
Downloading from outputs/model/mnist-tf.model.index to ./model/mnist-tf.model.index ...
Downloading from outputs/model/mnist-tf.model.meta to ./model/mnist-tf.model.meta ...
