## Use Ray Tune and MLflow on Hyperplane

In [None]:
!pip install tensorboardX --quiet
!pip install kubernetes==18.20 --quiet

In [2]:
import ray
import tensorflow as tf
import torch 
# Report the framework versions available in this kernel, one per line.
print(
    f'ray version {ray.__version__}',
    f'tf version {tf.__version__}',
    f'torch version {torch.__version__}',
    sep='\n',
)


2021-09-03 17:06:24.358150: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


ray version 1.5.2
tf version 2.4.1
torch version 1.7.1+cpu


In [3]:
# Start the Ray cluster used by the tuning run further down.
from ray_common import initialize_ray_cluster, stop_ray_cluster

# Requested number of workers and per-worker resources.
# NOTE(review): "4000m" / "4.0Gi" look like Kubernetes resource quantities
# (4 CPU cores, 4 GiB RAM) — confirm against ray_common.
num_workers = 2
cpu_per_worker="4000m"
ram_per_worker="4.0Gi"


# Keep the returned value: it is passed to stop_ray_cluster() at the end of
# the notebook.
ray_cluster = initialize_ray_cluster(num_workers, cpu_per_worker, ram_per_worker)
# Bare last expression so the notebook displays the returned value.
ray_cluster

2021-09-03 17:08:14,500	INFO services.py:1245 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Waiting for worker ray-worker-4bca3e32-3370-4d72-8076-3799b5569e98...
Waiting for worker ray-worker-526e0e9c-77ba-444e-af47-8a2d1ca79925...


['ray-worker-4bca3e32-3370-4d72-8076-3799b5569e98',
 'ray-worker-526e0e9c-77ba-444e-af47-8a2d1ca79925']

In [5]:
# Fail fast if this kernel is not attached to a Ray cluster.
# `ray.is_initialized()` is the documented public check; reading
# `ray.worker.global_worker.connected` directly relies on Ray internals.
assert ray.is_initialized(), "Ray is not initialized; start the Ray cluster first"

In [6]:
import os
import tempfile
import time

import mlflow

from ray import tune
from ray.tune.integration.mlflow import MLflowLoggerCallback, mlflow_mixin
# Show which MLflow release this kernel is using.
print(f'mlflow version {mlflow.__version__}')

mlflow version 1.17.0


In [7]:
def evaluation_fn(step, width, height):
    """Return the synthetic loss for a single training step.

    The result is the sum of two terms:

    * the reciprocal of ``0.1 + width * step / 100``, which shrinks as
      ``step`` grows when ``width`` is positive, and
    * a constant offset of ``height * 0.1``.

    Args:
        step: Index of the current training step.
        width: Hyperparameter scaling how quickly the first term decays.
        height: Hyperparameter setting the constant offset.

    Returns:
        The loss value as a float.
    """
    decay_term = (0.1 + width * step / 100) ** (-1)
    offset_term = height * 0.1
    return decay_term + offset_term


In [8]:
def easy_objective(config):
    """Trainable that reports a synthetic loss to Tune once per step.

    Args:
        config: Trial configuration dict. Must contain ``"width"`` and
            ``"height"``; ``"steps"`` is optional and defaults to 100.
    """
    # Hyperparameters chosen for this trial.
    width = config["width"]
    height = config["height"]
    total_steps = config.get("steps", 100)

    for step in range(total_steps):
        # Stand-in for one iteration of a real training procedure.
        score = evaluation_fn(step, width, height)
        # Hand the intermediate result back to Tune.
        tune.report(iterations=step, mean_loss=score)
        time.sleep(0.1)


In [9]:
def tune_function(mlflow_tracking_uri, finish_fast=False):
    """Run ``easy_objective`` with Tune, logging via ``MLflowLoggerCallback``.

    Args:
        mlflow_tracking_uri: Tracking URI handed to the MLflow logger callback.
        finish_fast: When true, each trial runs 5 steps instead of 100.
    """
    # Callback that forwards trial results to the "mixin_example" experiment.
    mlflow_logger = MLflowLoggerCallback(
        tracking_uri=mlflow_tracking_uri,
        experiment_name="mixin_example",
        save_artifact=True,
    )

    # Search space sampled for each of the 5 trials.
    search_space = {
        "width": tune.randint(10, 100),
        "height": tune.randint(0, 100),
        "steps": 5 if finish_fast else 100,
    }

    tune.run(
        easy_objective,
        name="mlflow",
        num_samples=5,
        callbacks=[mlflow_logger],
        config=search_space,
    )

In [10]:
@mlflow_mixin
def decorated_easy_objective(config):
    """Trainable that logs a synthetic loss to both MLflow and Tune.

    Args:
        config: Trial configuration dict. Must contain ``"width"`` and
            ``"height"``; ``"steps"`` is optional and defaults to 100.
    """
    # Hyperparameters chosen for this trial.
    width = config["width"]
    height = config["height"]
    total_steps = config.get("steps", 100)

    for step in range(total_steps):
        # Stand-in for one iteration of a real training procedure.
        score = evaluation_fn(step, width, height)
        # Record the metric in MLflow, keyed by the step index.
        mlflow.log_metrics({"mean_loss": score}, step=step)
        # Hand the intermediate result back to Tune.
        tune.report(iterations=step, mean_loss=score)
        time.sleep(0.1)


In [11]:
def tune_decorated(mlflow_tracking_uri, finish_fast=False):
    """Run ``decorated_easy_objective`` with Tune.

    Args:
        mlflow_tracking_uri: Tracking URI to set on MLflow before the run.
        finish_fast: When true, each trial runs 5 steps instead of 100.
    """
    experiment = "mixin_example"

    # Set the experiment, or create a new one if it does not exist yet.
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    mlflow.set_experiment(experiment_name=experiment)

    # Search space sampled for each of the 5 trials; the "mlflow" entry
    # carries the experiment name and tracking URI along with each trial's config.
    search_space = {
        "width": tune.randint(10, 100),
        "height": tune.randint(0, 100),
        "steps": 5 if finish_fast else 100,
        "mlflow": {
            "experiment_name": experiment,
            "tracking_uri": mlflow.get_tracking_uri(),
        },
    }

    tune.run(
        decorated_easy_objective,
        name="mlflow",
        num_samples=5,
        config=search_space,
    )


## Set up the MLflow tracking URI

In [None]:
import os

# Derive the MLflow tracking URI from the DATABASE_URL_NO_PARAMS environment
# variable by keeping only the part before the ":5432" port suffix.
database_url = os.environ.get('DATABASE_URL_NO_PARAMS')
if not database_url:
    # Without this check a missing variable surfaces as an opaque
    # "'NoneType' object has no attribute 'split'" error.
    raise RuntimeError(
        "DATABASE_URL_NO_PARAMS is not set; cannot derive the MLflow tracking URI"
    )
tracking_uri = database_url.split(':5432')[0]
mlflow.set_tracking_uri(tracking_uri)
# Read the value back so later cells use exactly what MLflow reports.
tracking_uri = mlflow.get_tracking_uri()
# NOTE(review): a database URL may embed credentials — check this output
# before sharing or committing the notebook.
print(tracking_uri)

In [13]:
# Run the mlflow_mixin-decorated experiment with the default (100-step) trials.
tune_decorated(mlflow_tracking_uri=tracking_uri)



Trial name,status,loc,height,width
decorated_easy_objective_36bc0_00000,PENDING,,91,26
decorated_easy_objective_36bc0_00001,PENDING,,61,97
decorated_easy_objective_36bc0_00002,PENDING,,26,58
decorated_easy_objective_36bc0_00003,PENDING,,82,74
decorated_easy_objective_36bc0_00004,PENDING,,57,14


2021-09-03 17:13:11,457	ERROR syncer.py:72 -- Log sync requires rsync to be installed.


Result for decorated_easy_objective_36bc0_00002:
  date: 2021-09-03_17-13-13
  done: false
  experiment_id: c1ffe078065b4841858384a55ee9baae
  hostname: ray-worker-526e0e9c-77ba-444e-af47-8a2d1ca79925
  iterations: 0
  iterations_since_restore: 1
  mean_loss: 12.6
  neg_mean_loss: -12.6
  node_ip: 10.0.179.4
  pid: 66
  time_since_restore: 0.10284662246704102
  time_this_iter_s: 0.10284662246704102
  time_total_s: 0.10284662246704102
  timestamp: 1630689193
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 36bc0_00002
  
Result for decorated_easy_objective_36bc0_00003:
  date: 2021-09-03_17-13-13
  done: false
  experiment_id: d824e4a05adf4fb5a7cc5249962b6796
  hostname: ray-worker-4bca3e32-3370-4d72-8076-3799b5569e98
  iterations: 0
  iterations_since_restore: 1
  mean_loss: 18.200000000000003
  neg_mean_loss: -18.200000000000003
  node_ip: 10.0.179.3
  pid: 65
  time_since_restore: 0.16285395622253418
  time_this_iter_s: 0.16285395622253418
  time_total_s: 0.1628539562

Trial name,status,loc,height,width,loss,iter,total time (s),iterations,neg_mean_loss
decorated_easy_objective_36bc0_00000,RUNNING,10.0.179.3:64,91,26,9.43784,12,2.17164,11,-9.43784
decorated_easy_objective_36bc0_00001,RUNNING,10.0.179.4:64,61,97,6.18518,13,2.14049,12,-6.18518
decorated_easy_objective_36bc0_00002,RUNNING,10.0.179.4:66,26,58,2.74164,13,2.13614,12,-2.74164
decorated_easy_objective_36bc0_00003,RUNNING,10.0.179.3:65,82,74,8.31136,13,2.27568,12,-8.31136
decorated_easy_objective_36bc0_00004,RUNNING,10.0.179.4:65,57,14,6.30976,12,2.11566,11,-6.30976


Result for decorated_easy_objective_36bc0_00000:
  date: 2021-09-03_17-13-18
  done: false
  experiment_id: df826e3298c242b7891af98d2cf2265c
  hostname: ray-worker-4bca3e32-3370-4d72-8076-3799b5569e98
  iterations: 29
  iterations_since_restore: 30
  mean_loss: 9.23089005235602
  neg_mean_loss: -9.23089005235602
  node_ip: 10.0.179.3
  pid: 64
  time_since_restore: 5.217756271362305
  time_this_iter_s: 0.16892600059509277
  time_total_s: 5.217756271362305
  timestamp: 1630689198
  timesteps_since_restore: 0
  training_iteration: 30
  trial_id: 36bc0_00000
  
Result for decorated_easy_objective_36bc0_00004:
  date: 2021-09-03_17-13-18
  done: false
  experiment_id: bf16f9be9b3644148e068a75743168a1
  hostname: ray-worker-526e0e9c-77ba-444e-af47-8a2d1ca79925
  iterations: 29
  iterations_since_restore: 30
  mean_loss: 5.940384615384616
  neg_mean_loss: -5.940384615384616
  node_ip: 10.0.179.4
  pid: 65
  time_since_restore: 5.195038557052612
  time_this_iter_s: 0.17796707153320312
  time_

Trial name,status,loc,height,width,loss,iter,total time (s),iterations,neg_mean_loss
decorated_easy_objective_36bc0_00000,RUNNING,10.0.179.3:64,91,26,9.19524,41,7.19552,40,-9.19524
decorated_easy_objective_36bc0_00001,RUNNING,10.0.179.4:64,61,97,6.12571,41,7.0739,40,-6.12571
decorated_easy_objective_36bc0_00002,RUNNING,10.0.179.4:66,26,58,2.64188,42,7.21684,41,-2.64188
decorated_easy_objective_36bc0_00003,RUNNING,10.0.179.3:65,82,74,8.23367,41,7.152,40,-8.23367
decorated_easy_objective_36bc0_00004,RUNNING,10.0.179.4:65,57,14,5.87544,41,7.18503,40,-5.87544


Result for decorated_easy_objective_36bc0_00000:
  date: 2021-09-03_17-13-23
  done: false
  experiment_id: df826e3298c242b7891af98d2cf2265c
  hostname: ray-worker-4bca3e32-3370-4d72-8076-3799b5569e98
  iterations: 58
  iterations_since_restore: 59
  mean_loss: 9.165876152832674
  neg_mean_loss: -9.165876152832674
  node_ip: 10.0.179.3
  pid: 64
  time_since_restore: 10.30574893951416
  time_this_iter_s: 0.16855120658874512
  time_total_s: 10.30574893951416
  timestamp: 1630689203
  timesteps_since_restore: 0
  training_iteration: 59
  trial_id: 36bc0_00000
  
Result for decorated_easy_objective_36bc0_00004:
  date: 2021-09-03_17-13-23
  done: false
  experiment_id: bf16f9be9b3644148e068a75743168a1
  hostname: ray-worker-526e0e9c-77ba-444e-af47-8a2d1ca79925
  iterations: 58
  iterations_since_restore: 59
  mean_loss: 5.821654501216545
  neg_mean_loss: -5.821654501216545
  node_ip: 10.0.179.4
  pid: 65
  time_since_restore: 10.295065641403198
  time_this_iter_s: 0.17229461669921875
  ti

Trial name,status,loc,height,width,loss,iter,total time (s),iterations,neg_mean_loss
decorated_easy_objective_36bc0_00000,RUNNING,10.0.179.3:64,91,26,9.15543,70,12.1575,69,-9.15543
decorated_easy_objective_36bc0_00001,RUNNING,10.0.179.4:64,61,97,6.11492,70,12.0417,69,-6.11492
decorated_easy_objective_36bc0_00002,RUNNING,10.0.179.4:66,26,58,2.62457,71,12.2459,70,-2.62457
decorated_easy_objective_36bc0_00003,RUNNING,10.0.179.3:65,82,74,8.21927,71,12.2379,70,-8.21927
decorated_easy_objective_36bc0_00004,RUNNING,10.0.179.4:65,57,14,5.80246,70,12.1286,69,-5.80246


Result for decorated_easy_objective_36bc0_00004:
  date: 2021-09-03_17-13-28
  done: false
  experiment_id: bf16f9be9b3644148e068a75743168a1
  hostname: ray-worker-526e0e9c-77ba-444e-af47-8a2d1ca79925
  iterations: 87
  iterations_since_restore: 88
  mean_loss: 5.7814332247557
  neg_mean_loss: -5.7814332247557
  node_ip: 10.0.179.4
  pid: 65
  time_since_restore: 15.366740465164185
  time_this_iter_s: 0.1664295196533203
  time_total_s: 15.366740465164185
  timestamp: 1630689208
  timesteps_since_restore: 0
  training_iteration: 88
  trial_id: 36bc0_00004
  
Result for decorated_easy_objective_36bc0_00000:
  date: 2021-09-03_17-13-28
  done: false
  experiment_id: df826e3298c242b7891af98d2cf2265c
  hostname: ray-worker-4bca3e32-3370-4d72-8076-3799b5569e98
  iterations: 88
  iterations_since_restore: 89
  mean_loss: 9.143516100957354
  neg_mean_loss: -9.143516100957354
  node_ip: 10.0.179.3
  pid: 64
  time_since_restore: 15.409528017044067
  time_this_iter_s: 0.165879487991333
  time_to

Trial name,status,loc,height,width,loss,iter,total time (s),iterations,neg_mean_loss
decorated_easy_objective_36bc0_00000,RUNNING,10.0.179.3:64,91,26,9.13909,99,17.3032,98,-9.13909
decorated_easy_objective_36bc0_00001,RUNNING,10.0.179.4:64,61,97,6.11062,98,17.0039,97,-6.11062
decorated_easy_objective_36bc0_00002,RUNNING,10.0.179.4:66,26,58,2.61756,99,17.3228,98,-2.61756
decorated_easy_objective_36bc0_00003,RUNNING,10.0.179.3:65,82,74,8.21377,99,17.2081,98,-8.21377
decorated_easy_objective_36bc0_00004,RUNNING,10.0.179.4:65,57,14,5.7731,98,17.1056,97,-5.7731


Result for decorated_easy_objective_36bc0_00003:
  date: 2021-09-03_17-13-30
  done: true
  experiment_id: d824e4a05adf4fb5a7cc5249962b6796
  experiment_tag: 3_height=82,width=74
  hostname: ray-worker-4bca3e32-3370-4d72-8076-3799b5569e98
  iterations: 99
  iterations_since_restore: 100
  mean_loss: 8.213631406761179
  neg_mean_loss: -8.213631406761179
  node_ip: 10.0.179.3
  pid: 65
  time_since_restore: 17.405672073364258
  time_this_iter_s: 0.19752287864685059
  time_total_s: 17.405672073364258
  timestamp: 1630689210
  timesteps_since_restore: 0
  training_iteration: 100
  trial_id: 36bc0_00003
  
Result for decorated_easy_objective_36bc0_00000:
  date: 2021-09-03_17-13-31
  done: true
  experiment_id: df826e3298c242b7891af98d2cf2265c
  experiment_tag: 0_height=91,width=26
  hostname: ray-worker-4bca3e32-3370-4d72-8076-3799b5569e98
  iterations: 99
  iterations_since_restore: 100
  mean_loss: 9.138699690402476
  neg_mean_loss: -9.138699690402476
  node_ip: 10.0.179.3
  pid: 64
  ti

Trial name,status,loc,height,width,loss,iter,total time (s),iterations,neg_mean_loss
decorated_easy_objective_36bc0_00000,TERMINATED,,91,26,9.1387,100,17.478,99,-9.1387
decorated_easy_objective_36bc0_00001,TERMINATED,,61,97,6.1104,100,17.4734,99,-6.1104
decorated_easy_objective_36bc0_00002,TERMINATED,,26,58,2.61739,100,17.4868,99,-2.61739
decorated_easy_objective_36bc0_00003,TERMINATED,,82,74,8.21363,100,17.4057,99,-8.21363
decorated_easy_objective_36bc0_00004,TERMINATED,,57,14,5.77163,100,17.5836,99,-5.77163


2021-09-03 17:13:31,501	INFO tune.py:550 -- Total run time: 21.44 seconds (20.65 seconds for the tuning loop).


In [14]:
# Tear down the Ray workers that initialize_ray_cluster() started earlier in
# this notebook.
stop_ray_cluster(ray_cluster)

Deleting ray-worker-4bca3e32-3370-4d72-8076-3799b5569e98
Deleting ray-worker-526e0e9c-77ba-444e-af47-8a2d1ca79925
