In [1]:
import azureml.core
print("SDK version:", azureml.core.VERSION)

SDK version: 1.0.41


In [2]:
import os
from azureml.core import Workspace, Experiment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.data.data_reference import DataReference
from azureml.core.runconfig import RunConfiguration
from azureml.core import ScriptRunConfig
from azureml.widgets import RunDetails

In [24]:
%%script sh --out /dev/null --err /dev/null
cd .. && mkdir -p data/mortgage_2000 && cd data/mortgage_2000 && wget http://rapidsai-data.s3-website.us-east-2.amazonaws.com/notebook-mortgage-data/mortgage_2000.tgz

In [25]:
%%sh 
cd ../data/mortgage_2000 && tar -xvf mortgage_2000.tgz

names.csv
acq/Acquisition_2000Q4.txt
acq/Acquisition_2000Q3.txt
acq/Acquisition_2000Q2.txt
acq/Acquisition_2000Q1.txt
perf/Performance_2000Q4.txt
perf/Performance_2000Q3.txt
perf/Performance_2000Q2.txt
perf/Performance_2000Q1.txt


In [3]:
subscription_id = os.environ.get("SUBSCRIPTION_ID", "15ae9cb6-95c1-483d-a0e3-b1a1a3b06324")
resource_group = os.environ.get("RESOURCE_GROUP", "MLADS_todrabas")
workspace_name = os.environ.get("WORKSPACE_NAME", "todrabas_MLADS_WE")
# workspace_region = os.environ.get("WORKSPACE_REGION", "")

ws = Workspace(workspace_name=workspace_name, subscription_id=subscription_id, resource_group=resource_group)

# write config to a local directory for future use
# ws.write_config()

In [18]:
ws.write_config(path='./config')

In [22]:
config = {}
config["SUBSCRIPTION_ID"] = "15ae9cb6-95c1-483d-a0e3-b1a1a3b06324"
config["RESOURCE_GROUP"] = "MLADS_todrabas"
config["WORKSPACE_NAME"] = "todrabas_MLADS_WE"
config["GPU_CLUSTER_NAME"] = "gpu-todrabas"

with open('config/config.json', 'w') as f:
    f.write(json.dumps(config))

In [58]:
scripts_folder = "scripts"

# import shutil
# shutil.copy('./1_pandasVsRapids_ETL.py', os.path.join(scripts_folder, '1_pandasVsRapids_ETL.py'))

# with open(os.path.join(scripts_folder, './1_pandasVsRapids_ETL.py'), 'r') as process_data_script:
#     print(process_data_script.read())

In [5]:
gpu_cluster_name = "gpu-todrabas"

if gpu_cluster_name in ws.compute_targets:
    gpu_cluster = ws.compute_targets[gpu_cluster_name]
    
    if gpu_cluster and type(gpu_cluster) is AmlCompute:
        print('found compute target. just use it. ' + gpu_cluster_name)
else:
    print("creating new cluster")
    # vm_size parameter below could be modified to one of the RAPIDS-supported VM types
    provisioning_config = AmlCompute.provisioning_configuration(vm_size = "Standard_NC6s_v2", min_nodes=1, max_nodes = 1)

    # create the cluster
    gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, provisioning_config)
    gpu_cluster.wait_for_completion(show_output=True)

creating new cluster
Creating
Succeeded..............
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


In [29]:
import tarfile
import hashlib
from urllib.request import urlretrieve
# from progressbar import ProgressBar

def validate_downloaded_data(path):
    if(os.path.isdir(path) and os.path.exists(path + '//names.csv')) :
        if(os.path.isdir(path + '//acq' ) and len(os.listdir(path + '//acq')) == 8):
            if(os.path.isdir(path + '//perf' ) and len(os.listdir(path + '//perf')) == 11):
                print("Data has been downloaded and decompressed at: {0}".format(path))
                return True
    print("Data has not been downloaded and decompressed")
    return False

# def show_progress(count, block_size, total_size):
#     global pbar
#     global processed
    
#     if count == 0:
#         pbar = ProgressBar(maxval=total_size)
#         processed = 0
    
#     processed += block_size
#     processed = min(processed,total_size)
#     pbar.update(processed)

        
def download_file(fileroot):
    filename = fileroot + '.tgz'
    if(not os.path.exists(filename) or hashlib.md5(open(filename, 'rb').read()).hexdigest() != '82dd47135053303e9526c2d5c43befd5' ):
        url_format = 'http://rapidsai-data.s3-website.us-east-2.amazonaws.com/notebook-mortgage-data/{0}.tgz'
        url = url_format.format(fileroot)
        print("...Downloading file :{0}".format(filename))
        urlretrieve(url, filename)
#         pbar.finish()
        print("...File :{0} finished downloading".format(filename))
    else:
        print("...File :{0} has been downloaded already".format(filename))
    return filename

def decompress_file(filename,path):
    tar = tarfile.open(filename)
    print("...Getting information from {0} about files to decompress".format(filename))
    members = tar.getmembers()
    numFiles = len(members)
    so_far = 0
    for member_info in members:
        tar.extract(member_info,path=path)
#         show_progress(so_far, 1, numFiles)
        so_far += 1
#     pbar.finish()
    print("...All {0} files have been decompressed".format(numFiles))
    tar.close()

In [62]:
fileroot = 'mortgage_2000'
path = '.\\{0}'.format(fileroot)
pbar = None
processed = 0

print(path)
if(not validate_downloaded_data(path)):
    print("Downloading and Decompressing Input Data")
    filename = download_file(fileroot)
    decompress_file(filename,path)
    print("Input Data has been Downloaded and Decompressed")

.\mortgage_2000
Data has not been downloaded and decompressed
Downloading and Decompressing Input Data
...Downloading file :mortgage_2000.tgz
...File :mortgage_2000.tgz finished downloading
...Getting information from mortgage_2000.tgz about files to decompress
...All 9 files have been decompressed
Input Data has been Downloaded and Decompressed


In [65]:
!ls -la .

total 459928
drwxrwxr-x 6 1002 1002      4096 Jun  3 17:22 .
drwxrwxr-x 6 1002 1002      4096 May 28 17:50 ..
drwxr-xr-x 4 root root      4096 May 31 18:22 .\mortgage_2000
drwxrwxr-x 2 1002 1002      4096 May 31 18:40 .ipynb_checkpoints
-rw-rw-r-- 1 1002 1002     43804 May 30 17:39 1_pandasVsRapids_ETL.ipynb
-rw-r--r-- 1 root root      5477 Jun  3 17:04 2_pandasVsRapids_DBSCAN.ipynb
-rw-r--r-- 1 root root    294388 May 30 18:05 3_Rapids_flow_classification.ipynb
-rw-rw-r-- 1 1002 1002     29883 Jun  3 17:22 4_Rapids_AzureML_ETL.ipynb
-rw-r--r-- 1 root root 470557209 Jun  3 17:21 mortgage_2000.tgz
drwxr-xr-x 2 root root      4096 Jun  3 17:17 mortgage_np
drwxr-xr-x 3 root root      4096 May 31 18:58 scripts


In [36]:
# fileroot = 'data\mortagage_2000'
# path = '.\\{0}'.format(fileroot)
# print(path)
ds = ws.get_default_datastore()

# download and uncompress data in a local directory before uploading to data store
# directory specified in src_dir parameter below should have the acq, perf directories with data and names.csv file
ds.upload(src_dir=path, target_path=fileroot, overwrite=True, show_progress=True)

# data already uploaded to the datastore
data_ref = DataReference(data_reference_name='data', datastore=ds, path_on_datastore=fileroot)

Uploading .\mortgage_2000/acq/Acquisition_2000Q1.txt
Uploading .\mortgage_2000/acq/Acquisition_2000Q2.txt
Uploading .\mortgage_2000/acq/Acquisition_2000Q3.txt
Uploading .\mortgage_2000/acq/Acquisition_2000Q4.txt
Uploading .\mortgage_2000/names.csv
Uploading .\mortgage_2000/perf/Performance_2000Q1.txt
Uploading .\mortgage_2000/perf/Performance_2000Q2.txt
Uploading .\mortgage_2000/perf/Performance_2000Q3.txt
Uploading .\mortgage_2000/perf/Performance_2000Q4.txt
Uploaded .\mortgage_2000/names.csv, 1 files out of an estimated total of 9
Uploaded .\mortgage_2000/acq/Acquisition_2000Q1.txt, 2 files out of an estimated total of 9
Uploaded .\mortgage_2000/acq/Acquisition_2000Q2.txt, 3 files out of an estimated total of 9
Uploaded .\mortgage_2000/acq/Acquisition_2000Q4.txt, 4 files out of an estimated total of 9
Uploaded .\mortgage_2000/acq/Acquisition_2000Q3.txt, 5 files out of an estimated total of 9
Uploaded .\mortgage_2000/perf/Performance_2000Q2.txt, 6 files out of an estimated total of 9


In [51]:
filename = 'mortgage.npy.gz'
url = 'https://github.com/rapidsai/notebooks-extended/raw/master/data/mortgage/mortgage.npy.gz'

print("...Downloading file :{0}".format(filename))
urlretrieve(url, filename)
print("...File :{0} finished downloading".format(filename))


...Downloading file :mortgage.npy.gz
...File :mortgage.npy.gz finished downloading


In [9]:
path = 'mortgage_np'
froot = 'mortgage_np'

ds = ws.get_default_datastore()

# download and uncompress data in a local directory before uploading to data store
# directory specified in src_dir parameter below should have the acq, perf directories with data and names.csv file
ds.upload(src_dir=path, target_path=froot, overwrite=True, show_progress=True)

# # data already uploaded to the datastore
# data_ref = DataReference(data_reference_name='data', datastore=ds, path_on_datastore=froot)

Uploading mortgage_np/.ipynb_checkpoints/mortgage-checkpoint.csv
Uploading mortgage_np/mortgage.csv
Uploading mortgage_np/mortgage.npy.gz
Uploaded mortgage_np/.ipynb_checkpoints/mortgage-checkpoint.csv, 1 files out of an estimated total of 3
Uploaded mortgage_np/mortgage.npy.gz, 2 files out of an estimated total of 3
Uploaded mortgage_np/mortgage.csv, 3 files out of an estimated total of 3


$AZUREML_DATAREFERENCE_4fe7bbde9f7c4d149c691834fae9f4bc

In [24]:
import tarfile
import hashlib
from urllib.request import urlretrieve


filename = 'unswiotflow.tar.gz'
url = 'https://github.com/rapidsai/notebooks-extended/blob/master/data/unswiot/unswiotflow.tar.gz?raw=true'

print("...Downloading file :{0}".format(filename))
urlretrieve(url, filename)
print("...File :{0} finished downloading".format(filename))

...Downloading file :unswiotflow.tar.gz
...File :unswiotflow.tar.gz finished downloading


In [30]:
path = 'unswiot'
decompress_file(filename,path)

...Getting information from unswiotflow.tar.gz about files to decompress
...All 3 files have been decompressed


In [31]:
# path = 'mortgage_np'
froot = path

ds = ws.get_default_datastore()

# download and uncompress data in a local directory before uploading to data store
# directory specified in src_dir parameter below should have the acq, perf directories with data and names.csv file
ds.upload(src_dir=path, target_path=froot, overwrite=True, show_progress=True)

# # data already uploaded to the datastore
# data_ref = DataReference(data_reference_name='data', datastore=ds, path_on_datastore=froot)

Uploading unswiot/conn.log
Uploading unswiot/lab_mac_labels_cats.csv
Uploading unswiot/small_sample.pcap
Uploaded unswiot/lab_mac_labels_cats.csv, 1 files out of an estimated total of 3
Uploaded unswiot/small_sample.pcap, 2 files out of an estimated total of 3
Uploaded unswiot/conn.log, 3 files out of an estimated total of 3


$AZUREML_DATAREFERENCE_b3e57f3ce3c840d9a4fc373290c23006

In [38]:
run_config = RunConfiguration()
run_config.framework = 'python'
run_config.environment.python.user_managed_dependencies = True
run_config.environment.python.interpreter_path = '/conda/envs/rapids/bin/python'
run_config.target = gpu_cluster_name
run_config.environment.docker.enabled = True
run_config.environment.docker.gpu_support = True
run_config.environment.docker.base_image = "rapidsai/rapidsai:cuda9.2-runtime-ubuntu18.04"
# run_config.environment.docker.base_image_registry.address = '<registry_url>' # not required if the base_image is in Docker hub
# run_config.environment.docker.base_image_registry.username = '<user_name>' # needed only for private images
# run_config.environment.docker.base_image_registry.password = '<password>' # needed only for private images
run_config.environment.spark.precache_packages = False
run_config.data_references={'data':data_ref.to_config()}

In [48]:
src = ScriptRunConfig(source_directory=scripts_folder, 
                          script='1_pandasVsRapids_ETL.py', 
                          arguments = ['--gpu', 0, '--data_dir', str(data_ref)
                                      ],
                          run_config=run_config
                         )

exp = Experiment(ws, 'rapidstest_cpu')
run = exp.submit(config=src)
# RunDetails(run).show()
run.wait_for_completion(show_output=True)

RunId: rapidstest_cpu_1559329102_76a4e0f3
Web View: https://mlworkspace.azure.ai/portal/subscriptions/15ae9cb6-95c1-483d-a0e3-b1a1a3b06324/resourceGroups/MLADS_todrabas/providers/Microsoft.MachineLearningServices/workspaces/todrabas_MLADS_WE/experiments/rapidstest_cpu/runs/rapidstest_cpu_1559329102_76a4e0f3

Streaming azureml-logs/80_driver_log.txt

Running ETL...
/mnt/batch/tasks/shared/LS_root/jobs/todrabas_mlads_we/azureml/rapidstest_cpu_1559329102_76a4e0f3/mounts/workspaceblobstore/mortgage_2000/acq/Acquisition_2000Q1.txt
/mnt/batch/tasks/shared/LS_root/jobs/todrabas_mlads_we/azureml/rapidstest_cpu_1559329102_76a4e0f3/mounts/workspaceblobstore/mortgage_2000/perf/Performance_2000Q1.txt
Creating ever delinquent statuses...
Creating delinquency statuses...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-vie

{'runId': 'rapidstest_cpu_1559329102_76a4e0f3',
 'target': 'gpu-todrabas',
 'status': 'Completed',
 'startTimeUtc': '2019-05-31T18:58:31.723163Z',
 'endTimeUtc': '2019-05-31T19:15:34.968262Z',
 'properties': {'azureml.runsource': 'experiment',
  'ContentSnapshotId': '5d4be29b-dedc-4f13-81d8-30851a513f2a',
  'azureml.git.repository_uri': 'git@github.com:drabastomek/MLADS_RAPIDS.git',
  'mlflow.source.git.repoURL': 'git@github.com:drabastomek/MLADS_RAPIDS.git',
  'azureml.git.branch': 'devel',
  'mlflow.source.git.branch': 'devel',
  'azureml.git.commit': '1c5ecbfb982fac904d415f3dd0ecfff403d4c510',
  'mlflow.source.git.commit': '1c5ecbfb982fac904d415f3dd0ecfff403d4c510',
  'azureml.git.dirty': 'True'},
 'runDefinition': {'script': '1_pandasVsRapids_ETL.py',
  'arguments': ['--gpu', '0', '--data_dir', '$AZUREML_DATAREFERENCE_data'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'gpu-todrabas',
  'dataReferences': {'data': {'dataStoreNam

In [49]:

# delete the cluster
gpu_cluster.delete()