In [None]:
# Report the version of the azureml-core SDK loaded in this kernel
import azureml.core

print(f"SDK version: {azureml.core.VERSION}")

In [None]:
from azureml.core import Workspace

# Obtain the workspace handle via Workspace.from_config()
ws = Workspace.from_config()

# Show the identifying attributes of the workspace, one per line
print(
    ws.name,
    ws.resource_group,
    ws.location,
    ws.subscription_id,
    sep="\n",
)

In [None]:
import os
from azureml.core import Experiment

# Name of the run history container in the workspace
experiment_name = "hack"
exp = Experiment(ws, name=experiment_name)

# The training scripts live in a folder named after the experiment,
# located under the current working directory
project_folder = os.path.join(os.getcwd(), experiment_name)
print(project_folder)

In [None]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

cluster_name = "hackcluster"

# Reuse the cluster if the lookup succeeds; provision it when the lookup
# raises ComputeTargetException.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print("Creating a new compute target...")

    # STANDARD_D2_V2 nodes, between 0 and 7 of them
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        min_nodes=0,
        max_nodes=7,
    )

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    # Block until provisioning is done, showing its output
    compute_target.wait_for_completion(show_output=True)
else:
    print("Found existing compute target")

print(compute_target.status.serialize())

In [None]:
from azureml.train.estimator import Estimator

# Three-node MPI job that runs launch.py from the project folder,
# with ray pinned to 0.6.5
est = Estimator(
    source_directory=project_folder,
    entry_script="launch.py",
    compute_target=compute_target,
    node_count=3,
    distributed_backend="mpi",
    pip_packages=["ray==0.6.5", "psutil", "setproctitle"],
)

In [None]:
# Submit the estimator to the experiment and show the resulting run
run = exp.submit(est)
print(run)

In [None]:
from azureml.widgets import RunDetails

# Display the RunDetails widget for the submitted run
RunDetails(run).show()

In [None]:
# Cancel the run submitted above
run.cancel()

## Create compute clusters in a virtual network (VNet)

In [None]:
# Report the version of the azureml-core SDK loaded in this kernel
import azureml.core

print(f"SDK version: {azureml.core.VERSION}")

In [None]:
from azureml.core import Workspace

# Obtain the workspace handle via Workspace.from_config()
ws = Workspace.from_config()

# Show the identifying attributes of the workspace, one per line
print(
    ws.name,
    ws.resource_group,
    ws.location,
    ws.subscription_id,
    sep="\n",
)

In [None]:
import os
from azureml.core import Experiment

# Name of the run history container in the workspace
experiment_name = "vnethack"
exp = Experiment(ws, name=experiment_name)

# The training scripts live in a folder named after the experiment,
# located under the current working directory
project_folder = os.path.join(os.getcwd(), experiment_name)
print(project_folder)

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Virtual network, subnet, and the resource group that holds the network
vnet_name = "mlvnet"
subnet_name = "default"
vnet_resourcegroup_name = "exmachina"

# Name of the CPU cluster
cpu_cluster_name = "vnetcpu"

# Reuse the cluster if the lookup succeeds; provision it when the lookup
# raises ComputeTargetException.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    print("Creating new cpucluster")

    # STANDARD_D2_V2 nodes, between 0 and 4 of them, placed in the subnet above
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        min_nodes=0,
        max_nodes=4,
        vnet_resourcegroup_name=vnet_resourcegroup_name,
        vnet_name=vnet_name,
        subnet_name=subnet_name,
    )

    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

    # Block until provisioning is done, showing its output
    cpu_cluster.wait_for_completion(show_output=True)
else:
    print("Found existing cpucluster")

print(cpu_cluster.status.serialize())

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Virtual network, subnet, and the resource group that holds the network
vnet_name = "mlvnet"
subnet_name = "default"
vnet_resourcegroup_name = "exmachina"

# Name of the GPU cluster
gpu_cluster_name = "vnetgpu"

# Reuse the cluster if the lookup succeeds; provision it when the lookup
# raises ComputeTargetException.
try:
    gpu_cluster = ComputeTarget(workspace=ws, name=gpu_cluster_name)
except ComputeTargetException:
    print("Creating new gpucluster")

    # STANDARD_NC6 nodes, between 0 and 2 of them, placed in the subnet above
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_NC6",
        min_nodes=0,
        max_nodes=2,
        vnet_resourcegroup_name=vnet_resourcegroup_name,
        vnet_name=vnet_name,
        subnet_name=subnet_name,
    )

    gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, compute_config)

    # Block until provisioning is done, showing its output
    gpu_cluster.wait_for_completion(show_output=True)
else:
    print("Found existing gpucluster")

print(gpu_cluster.status.serialize())

In [None]:
from azureml.train.estimator import Estimator

# Three-node MPI job on the CPU cluster that runs launch_cpu.py from the
# project folder, with ray pinned to 0.6.5
cpu_est = Estimator(
    source_directory=project_folder,
    entry_script="launch_cpu.py",
    compute_target=cpu_cluster,
    node_count=3,
    distributed_backend="mpi",
    pip_packages=["ray==0.6.5", "psutil", "setproctitle"],
)

In [None]:
# Submit the CPU estimator to the experiment and show the resulting run
cpu_run = exp.submit(cpu_est)
print(cpu_run)

In [None]:
from azureml.widgets import RunDetails

# Display the RunDetails widget for the CPU run
RunDetails(cpu_run).show()

In [None]:
from azureml.train.estimator import Estimator

# Two-node MPI job on the GPU cluster that runs launch_gpu.py from the
# project folder, with ray pinned to 0.6.5
gpu_est = Estimator(
    source_directory=project_folder,
    entry_script="launch_gpu.py",
    compute_target=gpu_cluster,
    node_count=2,
    distributed_backend="mpi",
    pip_packages=["ray==0.6.5", "psutil", "setproctitle"],
)

In [None]:
# Submit the GPU estimator to the experiment and show the resulting run
gpu_run = exp.submit(gpu_est)
print(gpu_run)

In [None]:
from azureml.widgets import RunDetails

# Display the RunDetails widget for the GPU run
RunDetails(gpu_run).show()

In [None]:
# Cancel the GPU run submitted above
gpu_run.cancel()

## Create compute clusters with an admin password for SSH access

In [None]:
import os
from getpass import getpass

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# The Azure virtual network name, subnet, and resource group
vnet_name = 'mlvnet'
subnet_name = 'default'
vnet_resourcegroup_name = 'exmachina'

# Choose a name for your CPU cluster
cpu_cluster_name = "vnetcpu"

# NOTE: if a compute target named `cpu_cluster_name` already exists in the
# workspace, the lookup below succeeds and that target is returned as-is:
# the SSH credentials in the `except` branch are then never applied.
# Use a different name (or delete the existing cluster) to get a cluster
# with SSH credentials.
try:
    cpu_cluster = ComputeTarget( workspace=ws, name=cpu_cluster_name )
    print( "Found existing cpucluster" )
except ComputeTargetException:
    print( "Creating new cpucluster" )

    # SSH admin credentials are never stored in the notebook. They are read
    # from the environment (AML_ADMIN_USERNAME / AML_ADMIN_PASSWORD); when the
    # password variable is unset or empty, it is prompted for without echo.
    admin_username = os.environ.get( "AML_ADMIN_USERNAME" , "themaster" )
    admin_user_password = os.environ.get( "AML_ADMIN_PASSWORD" ) or getpass(
        "SSH admin password for the cluster nodes: " )
    if not admin_user_password:
        raise ValueError( "An SSH admin password is required to create the cluster" )

    # Specify the configuration for the new cluster
    compute_config = AmlCompute.provisioning_configuration( vm_size = "STANDARD_D2_V2" ,
                                                           min_nodes = 0 ,
                                                           max_nodes = 4 ,
                                                           admin_username = admin_username ,
                                                           admin_user_password = admin_user_password ,
                                                           vnet_resourcegroup_name = vnet_resourcegroup_name ,
                                                           vnet_name = vnet_name ,
                                                           subnet_name = subnet_name )

    # Create the cluster with the specified name and configuration
    cpu_cluster = ComputeTarget.create( ws , cpu_cluster_name, compute_config )

    # Wait for the cluster to complete, show the output log
    cpu_cluster.wait_for_completion( show_output=True )

print( cpu_cluster.status.serialize() )