In [3]:
import os
from azureml.core import Workspace, Experiment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.data.data_reference import DataReference
from azureml.core.runconfig import RunConfiguration
from azureml.core import ScriptRunConfig
import json

In [4]:
# Load the workspace/cluster settings from the local JSON config file.
with open('config/config.json', 'r') as config_file:
    config = json.load(config_file)

# Settings consumed by this notebook; a missing key raises KeyError here.
subscription_id = config["SUBSCRIPTION_ID"]
resource_group = config["RESOURCE_GROUP"]
workspace_name = config["WORKSPACE_NAME"]
gpu_cluster_name = config["GPU_CLUSTER_NAME"]

# Handle to the Azure ML workspace used by every later cell.
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code B7UHWEVJW to authenticate.
Interactive authentication successfully completed.


In [5]:
# Directory (relative to this notebook) holding the scripts submitted to the cluster.
scripts_folder = "scripts"

# Read the workspace's compute targets once and reuse the result for both the
# membership test and the lookup below.
compute_targets = ws.compute_targets

if gpu_cluster_name in compute_targets:
    gpu_cluster = compute_targets[gpu_cluster_name]

    if isinstance(gpu_cluster, AmlCompute):
        print('Compute target found. Using: ' + gpu_cluster_name)
    else:
        # A target with this name exists but is not an AmlCompute cluster.
        # Say so explicitly instead of continuing without any message; the
        # target is still used as-is, so later cells behave as before.
        print('Compute target found, but it is not an AmlCompute cluster: ' + gpu_cluster_name)
else:
    print("Creating new cluster")

    # vm_size parameter below could be modified to one of the RAPIDS-supported VM types
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_NC6s_v2",
        min_nodes=1,
        max_nodes=1,
    )

    # create the cluster and block until provisioning has finished
    gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, provisioning_config)
    gpu_cluster.wait_for_completion(show_output=True)

Compute target found. Using: gpu-todrabas


In [6]:
# Default datastore attached to the workspace.
ds = ws.get_default_datastore()

# Folder on the datastore that holds the input files (data already uploaded to the datastore).
# NOTE(review): the captured run logs in this notebook read from mortgage_2000/...,
# not data/nyctaxi - confirm this path is still the intended one.
file_root = 'data/nyctaxi'

# Reference named 'data' pointing at that folder; it is attached to the run
# configuration and passed to the script as --data_dir in later cells.
data_ref = DataReference(
    datastore=ds,
    data_reference_name='data',
    path_on_datastore=file_root,
)

In [None]:
# Disabled alternative to the run configuration defined in the next cell:
# instead of a prebuilt image with its own interpreter, it would build the
# environment from the conda file 'rapids.yml' on top of the Azure ML base GPU
# image. Kept for reference only; nothing in this cell executes.
# cd = CondaDependencies(conda_dependencies_file_path='rapids.yml')
# run_config = RunConfiguration(conda_dependencies=cd)
# run_config.framework = 'python'
# run_config.target = gpu_cluster_name
# run_config.environment.docker.enabled = True
# run_config.environment.docker.gpu_support = True
# run_config.environment.docker.base_image = "mcr.microsoft.com/azureml/base-gpu:intelmpi2018.3-cuda10.0-cudnn7-ubuntu16.04"
# run_config.environment.spark.precache_packages = False
# run_config.data_references={'data':data_ref.to_config()}

In [7]:
# Run configuration shared by the GPU and CPU submissions below.
run_config = RunConfiguration()
run_config.target = gpu_cluster_name
run_config.framework = 'python'

# Make the 'data' reference available to the run.
run_config.data_references = {'data': data_ref.to_config()}

env_cfg = run_config.environment

# Run inside a GPU-enabled Docker container based on the prebuilt image.
env_cfg.docker.enabled = True
env_cfg.docker.gpu_support = True
env_cfg.docker.base_image = "todrabas/mlads_rapids:cuda9.2-runtime-ubuntu18.04"
# env_cfg.docker.base_image = "rapidsai/rapidsai:cuda9.2-runtime-ubuntu18.04"

# Dependencies are user-managed: use the interpreter at this path as-is.
env_cfg.python.user_managed_dependencies = True
env_cfg.python.interpreter_path = '/conda/envs/rapids/bin/python'

env_cfg.spark.precache_packages = False

In [8]:
# Submit the ETL script with --gpu 1 (presumably the RAPIDS/GPU code path;
# confirm against scripts/1_pandasVsRapids_ETL.py). str(data_ref) is the
# placeholder for the 'data' reference passed as --data_dir.
gpu_etl_args = ['--gpu', 1, '--data_dir', str(data_ref)]

src = ScriptRunConfig(
    source_directory=scripts_folder,
    script='1_pandasVsRapids_ETL.py',
    arguments=gpu_etl_args,
    run_config=run_config,
)

# Submit under the GPU experiment and stream the logs until the run finishes.
exp = Experiment(ws, 'rapidstest_etl_gpu')
run = exp.submit(config=src)
run.wait_for_completion(show_output=True)

RunId: rapidstest_etl_gpu_1559615213_48164648
Web View: https://mlworkspace.azure.ai/portal/subscriptions/15ae9cb6-95c1-483d-a0e3-b1a1a3b06324/resourceGroups/MLADS_todrabas/providers/Microsoft.MachineLearningServices/workspaces/todrabas_MLADS_WE/experiments/rapidstest_etl_gpu/runs/rapidstest_etl_gpu_1559615213_48164648

Streaming azureml-logs/80_driver_log.txt

Running ETL...
/mnt/batch/tasks/shared/LS_root/jobs/todrabas_mlads_we/azureml/rapidstest_etl_gpu_1559615213_48164648/mounts/workspaceblobstore/mortgage_2000/acq/Acquisition_2000Q1.txt
/mnt/batch/tasks/shared/LS_root/jobs/todrabas_mlads_we/azureml/rapidstest_etl_gpu_1559615213_48164648/mounts/workspaceblobstore/mortgage_2000/perf/Performance_2000Q1.txt
Creating ever delinquent statuses...
Creating delinquency statuses...
	Processing month: 1
	Processing month: 2
	Processing month: 3
	Processing month: 4
	Processing month: 5
	Processing month: 6
	Processing month: 7
	Processing month: 8
	Processing month: 9
	Processing month: 10
	

{'runId': 'rapidstest_etl_gpu_1559615213_48164648',
 'target': 'gpu-todrabas',
 'status': 'Completed',
 'startTimeUtc': '2019-06-04T02:27:06.260143Z',
 'endTimeUtc': '2019-06-04T02:27:39.941189Z',
 'properties': {'azureml.runsource': 'experiment',
  'ContentSnapshotId': '9031dea4-bb66-42d8-a214-b7d7caeaca6c',
  'azureml.git.repository_uri': 'git@github.com:drabastomek/MLADS_RAPIDS.git',
  'mlflow.source.git.repoURL': 'git@github.com:drabastomek/MLADS_RAPIDS.git',
  'azureml.git.branch': 'devel',
  'mlflow.source.git.branch': 'devel',
  'azureml.git.commit': '1fdd1add65b610adc7c1adf54ec1e66369ee7c85',
  'mlflow.source.git.commit': '1fdd1add65b610adc7c1adf54ec1e66369ee7c85',
  'azureml.git.dirty': 'False'},
 'runDefinition': {'script': '1_pandasVsRapids_ETL.py',
  'arguments': ['--gpu', '1', '--data_dir', '$AZUREML_DATAREFERENCE_data'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'gpu-todrabas',
  'dataReferences': {'data': {'dataSto

In [7]:
# Submit the same ETL script with --gpu 0 (presumably the pandas/CPU code path;
# confirm against scripts/1_pandasVsRapids_ETL.py). It runs on the same compute
# target and run configuration as the GPU submission.
cpu_etl_args = ['--gpu', 0, '--data_dir', str(data_ref)]

src = ScriptRunConfig(
    source_directory=scripts_folder,
    script='1_pandasVsRapids_ETL.py',
    arguments=cpu_etl_args,
    run_config=run_config,
)

# Submit under the CPU experiment and stream the logs until the run finishes.
exp = Experiment(ws, 'rapidstest_etl_cpu')
run = exp.submit(config=src)
run.wait_for_completion(show_output=True)

RunId: rapidstest_etl_cpu_1559596019_974e3590
Web View: https://mlworkspace.azure.ai/portal/subscriptions/15ae9cb6-95c1-483d-a0e3-b1a1a3b06324/resourceGroups/MLADS_todrabas/providers/Microsoft.MachineLearningServices/workspaces/todrabas_MLADS_WE/experiments/rapidstest_etl_cpu/runs/rapidstest_etl_cpu_1559596019_974e3590

Streaming azureml-logs/80_driver_log.txt

Running ETL...
/mnt/batch/tasks/shared/LS_root/jobs/todrabas_mlads_we/azureml/rapidstest_etl_cpu_1559596019_974e3590/mounts/workspaceblobstore/mortgage_2000/acq/Acquisition_2000Q1.txt
/mnt/batch/tasks/shared/LS_root/jobs/todrabas_mlads_we/azureml/rapidstest_etl_cpu_1559596019_974e3590/mounts/workspaceblobstore/mortgage_2000/perf/Performance_2000Q1.txt
Creating ever delinquent statuses...
Creating delinquency statuses...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexi

{'runId': 'rapidstest_etl_cpu_1559596019_974e3590',
 'target': 'gpu-todrabas',
 'status': 'Completed',
 'startTimeUtc': '2019-06-03T21:08:56.092629Z',
 'endTimeUtc': '2019-06-03T21:25:30.303368Z',
 'properties': {'azureml.runsource': 'experiment',
  'ContentSnapshotId': '0d743ad6-71a4-4bbc-b70c-d8bc06c719fe',
  'azureml.git.repository_uri': 'git@github.com:drabastomek/MLADS_RAPIDS.git',
  'mlflow.source.git.repoURL': 'git@github.com:drabastomek/MLADS_RAPIDS.git',
  'azureml.git.branch': 'devel',
  'mlflow.source.git.branch': 'devel',
  'azureml.git.commit': 'ba3ab5b273cbdf8a5bcd3345a4a043542de4442c',
  'mlflow.source.git.commit': 'ba3ab5b273cbdf8a5bcd3345a4a043542de4442c',
  'azureml.git.dirty': 'True'},
 'runDefinition': {'script': '1_pandasVsRapids_ETL.py',
  'arguments': ['--gpu', '0', '--data_dir', '$AZUREML_DATAREFERENCE_data'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'gpu-todrabas',
  'dataReferences': {'data': {'dataStor