### (Advanced) Population Based Training with Ray Tune, TensorFlow, MLflow and TensorBoard

In [1]:
!pip install -r requirements.txt --quiet

In [2]:
from __future__ import print_function

from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import (Input, Activation, Dense, Permute,
                                     Dropout)
from tensorflow.keras.layers import add, dot, concatenate
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import get_file
from tensorflow.keras.preprocessing.sequence import pad_sequences

from filelock import FileLock
import os
import argparse
import tarfile
import numpy as np
import re

from ray import tune
import mlflow
from ray.tune.integration.mlflow import MLflowLoggerCallback, mlflow_mixin

2021-08-29 20:08:46.337740: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [3]:
def tokenize(sent):
    """Return the tokens of a sentence including punctuation.

    Runs of non-word characters are kept as tokens of their own after
    surrounding whitespace is stripped; whitespace-only pieces are dropped.

    >>> tokenize("Bob dropped the apple. Where is the apple?")
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    """
    # The capturing group keeps the separators in the result so punctuation
    # survives.  The group must not be optional: ``(\W+)?`` can match the
    # empty string, and since Python 3.7 ``re.split`` also splits on empty
    # matches, which chops the sentence into single characters.
    return [x.strip() for x in re.split(r"(\W+)", sent) if x and x.strip()]


In [4]:
def parse_stories(lines, only_supporting=False):
    """Parse stories provided in the bAbi tasks format.

    Every non-blank line has the form ``"<id> <text>"``.  An id of 1 starts a
    new story.  A line whose text contains tab characters is a question of
    the form ``question<TAB>answer<TAB>supporting ids``; any other line is a
    statement that is tokenized and appended to the current story.

    Args:
        lines: Iterable of lines, either UTF-8 encoded ``bytes`` or ``str``.
        only_supporting: If true, only the sentences that support the answer
            are kept for each question; otherwise every statement seen so
            far in the current story is kept.

    Returns:
        A list of ``(substory, question_tokens, answer)`` tuples, where
        ``substory`` is a list of tokenized sentences and ``answer`` is the
        raw answer string.
    """
    data = []
    story = []
    for raw_line in lines:
        # Accept both bytes (e.g. from tarfile) and already-decoded text.
        if isinstance(raw_line, bytes):
            raw_line = raw_line.decode("utf-8")
        line = raw_line.strip()
        if not line:
            # Blank lines carry no id and would make the split below fail.
            continue
        nid, line = line.split(" ", 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if "\t" in line:
            q, a, supporting = line.split("\t")
            q = tokenize(q)
            if only_supporting:
                # Only select the related substory; ids are 1-based line ids.
                substory = [story[int(i) - 1] for i in supporting.split()]
            else:
                # Provide all the substories (skip question placeholders).
                substory = [x for x in story if x]
            data.append((substory, q, a))
            # Placeholder keeps list positions aligned with line ids.
            story.append("")
        else:
            story.append(tokenize(line))
    return data


In [5]:
def get_stories(f, only_supporting=False, max_length=None):
    """Read a bAbi task file and return one flat token list per question.

    Args:
        f: File-like object whose ``readlines()`` yields the task lines.
        only_supporting: Forwarded to ``parse_stories``.
        max_length: If truthy, only stories with fewer than ``max_length``
            tokens are kept; stories with ``max_length`` tokens or more are
            discarded.

    Returns:
        A list of ``(story_tokens, question_tokens, answer)`` tuples in which
        the sentences of each story have been concatenated into a single
        list of tokens.
    """

    def flatten(sentences):
        # Linear-time concatenation; ``sum(sentences, [])`` copies the
        # accumulated list on every addition.
        return [token for sentence in sentences for token in sentence]

    parsed = parse_stories(f.readlines(), only_supporting=only_supporting)
    stories = []
    for story, q, answer in parsed:
        # Flatten once and reuse the result for both the filter and output.
        tokens = flatten(story)
        if not max_length or len(tokens) < max_length:
            stories.append((tokens, q, answer))
    return stories


In [6]:
def vectorize_stories(word_idx, story_maxlen, query_maxlen, data):
    """Turn tokenized ``(story, query, answer)`` triples into index arrays.

    Every token is looked up in ``word_idx``.  Story and query index lists
    are passed through ``pad_sequences`` with ``story_maxlen`` and
    ``query_maxlen`` respectively; the answer indices are returned as a
    NumPy array.

    Returns:
        A tuple ``(padded_stories, padded_queries, answers)``.
    """

    def encode(tokens):
        return [word_idx[token] for token in tokens]

    story_ids = []
    query_ids = []
    answer_ids = []
    for story_tokens, query_tokens, answer_word in data:
        story_ids.append(encode(story_tokens))
        query_ids.append(encode(query_tokens))
        answer_ids.append(word_idx[answer_word])

    padded_stories = pad_sequences(story_ids, maxlen=story_maxlen)
    padded_queries = pad_sequences(query_ids, maxlen=query_maxlen)
    return padded_stories, padded_queries, np.array(answer_ids)



In [13]:
def read_data(finish_fast=False):
    """Fetch the bAbi archive and return ``(train_stories, test_stories)``.

    The archive is obtained through ``get_file`` under the name
    ``babi-tasks-v1-2.tar.gz``; the train and test files of the
    ``single_supporting_fact_10k`` challenge are then read with
    ``get_stories``.

    Args:
        finish_fast: If true, keep only the first 64 stories of each split.

    Raises:
        Exception: Whatever ``get_file`` raised, re-raised after printing
            manual download instructions.
    """
    # Alternative public origin:
    # https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz
    # NOTE(review): confirm get_file can fetch an s3:// origin in this
    # environment (or that the archive is already present locally).
    archive_origin = "s3://d2v-tmp/demo/data/qa/babi_tasks_1-20_v1-2.tar.gz"
    try:
        path = get_file("babi-tasks-v1-2.tar.gz", origin=archive_origin)
    except Exception:
        print(
            "Error downloading dataset, please download it manually:\n"
            "$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2"  # noqa: E501
            ".tar.gz\n"
            "$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz"  # noqa: E501
        )
        raise

    # Archive member templates; ``{}`` is filled with "train" or "test".
    challenges = {
        # QA1 with 10,000 samples
        "single_supporting_fact_10k": "tasks_1-20_v1-2/en-10k/qa1_"
        "single-supporting-fact_{}.txt",
        # QA2 with 10,000 samples
        "two_supporting_facts_10k": "tasks_1-20_v1-2/en-10k/qa2_"
        "two-supporting-facts_{}.txt",
    }
    member_template = challenges["single_supporting_fact_10k"]
    train_member = member_template.format("train")
    test_member = member_template.format("test")

    with tarfile.open(path) as tar:
        train_stories = get_stories(tar.extractfile(train_member))
        test_stories = get_stories(tar.extractfile(test_member))

    if finish_fast:
        # Small subset so that a trial completes quickly.
        train_stories, test_stories = train_stories[:64], test_stories[:64]
    return train_stories, test_stories


In [14]:
class MemNNModel(tune.Trainable):
    """Tune ``Trainable`` that trains a memory network on a bAbi QA task.

    Hyperparameters are read from ``self.config`` with these defaults:
    ``dropout`` (0.3), ``lr`` (1e-3), ``rho`` (0.9), ``batch_size`` (32) and
    ``epochs`` (1).  ``config["finish_fast"]`` is required and is forwarded
    to ``read_data``.

    NOTE(review): ``mlflow_mixin`` is applied to individual methods here.
    Confirm against the Ray Tune MLflow integration docs that this is the
    intended usage for class-based trainables.
    """

    @mlflow_mixin
    def build_model(self):
        """Vectorize the loaded stories and build the (uncompiled) model.

        Reads ``self.train_stories`` and ``self.test_stories`` and stores the
        vectorized arrays as ``self.inputs_*``, ``self.queries_*`` and
        ``self.answers_*`` (for ``train`` and ``test``).

        Returns:
            A Keras ``Model`` that takes ``[story, question]`` index
            sequences and outputs a softmax over the vocabulary.
        """
        # Vocabulary and both maximum lengths are computed over the union of
        # the two splits; build the concatenated list only once.
        all_stories = self.train_stories + self.test_stories
        # Every Dropout layer below uses the same configured rate.
        dropout_rate = self.config.get("dropout", 0.3)

        vocab = set()
        for story, q, answer in all_stories:
            vocab |= set(story + q + [answer])
        vocab = sorted(vocab)

        # Reserve 0 for masking via pad_sequences
        vocab_size = len(vocab) + 1
        story_maxlen = max(len(x) for x, _, _ in all_stories)
        query_maxlen = max(len(x) for _, x, _ in all_stories)

        word_idx = {c: i + 1 for i, c in enumerate(vocab)}
        self.inputs_train, self.queries_train, self.answers_train = (
            vectorize_stories(word_idx, story_maxlen, query_maxlen,
                              self.train_stories))
        self.inputs_test, self.queries_test, self.answers_test = (
            vectorize_stories(word_idx, story_maxlen, query_maxlen,
                              self.test_stories))

        # placeholders
        input_sequence = Input((story_maxlen, ))
        question = Input((query_maxlen, ))

        # encoders
        # embed the input sequence into a sequence of vectors
        input_encoder_m = Sequential()
        input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
        input_encoder_m.add(Dropout(dropout_rate))
        # output: (samples, story_maxlen, embedding_dim)

        # embed the input into a sequence of vectors of size query_maxlen
        input_encoder_c = Sequential()
        input_encoder_c.add(
            Embedding(input_dim=vocab_size, output_dim=query_maxlen))
        input_encoder_c.add(Dropout(dropout_rate))
        # output: (samples, story_maxlen, query_maxlen)

        # embed the question into a sequence of vectors
        question_encoder = Sequential()
        question_encoder.add(
            Embedding(
                input_dim=vocab_size, output_dim=64,
                input_length=query_maxlen))
        question_encoder.add(Dropout(dropout_rate))
        # output: (samples, query_maxlen, embedding_dim)

        # encode input sequence and questions (which are indices)
        # to sequences of dense vectors
        input_encoded_m = input_encoder_m(input_sequence)
        input_encoded_c = input_encoder_c(input_sequence)
        question_encoded = question_encoder(question)

        # compute a "match" between the first input vector sequence
        # and the question vector sequence
        # shape: `(samples, story_maxlen, query_maxlen)`
        match = dot([input_encoded_m, question_encoded], axes=(2, 2))
        match = Activation("softmax")(match)

        # add the match matrix with the second input vector sequence
        response = add(
            [match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
        response = Permute(
            (2, 1))(response)  # (samples, query_maxlen, story_maxlen)

        # concatenate the match matrix with the question vector sequence
        answer = concatenate([response, question_encoded])

        # the original paper uses a matrix multiplication.
        # we choose to use a RNN instead.
        answer = LSTM(32)(answer)  # (samples, 32)

        # one regularization layer -- more would probably be needed.
        answer = Dropout(dropout_rate)(answer)
        answer = Dense(vocab_size)(answer)  # (samples, vocab_size)
        # we output a probability distribution over the vocabulary
        answer = Activation("softmax")(answer)

        # build the final model
        return Model([input_sequence, question], answer)

    @mlflow_mixin
    def setup(self, config):
        """Load the data, then build and compile the model for this trial.

        Args:
            config: Trial configuration; ``config["finish_fast"]`` selects
                the reduced dataset in ``read_data``.
        """
        # Presumably serializes dataset download/extraction between trials
        # that share this home directory.
        with FileLock(os.path.expanduser("~/.tune.lock")):
            self.train_stories, self.test_stories = read_data(
                config["finish_fast"])

        # Enable MLflow autologging once per trainable instead of on every
        # training step.
        mlflow.tensorflow.autolog()

        model = self.build_model()
        # ``learning_rate`` is the supported argument name; ``lr`` is a
        # deprecated alias in tf.keras optimizers.
        rmsprop = RMSprop(
            learning_rate=self.config.get("lr", 1e-3),
            rho=self.config.get("rho", 0.9))
        model.compile(
            optimizer=rmsprop,
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"])
        self.model = model

    @mlflow_mixin
    def step(self):
        """Run one training iteration and report the resulting accuracy.

        Returns:
            ``{"mean_accuracy": accuracy}`` where ``accuracy`` comes from
            ``model.evaluate`` on the training arrays.
        """
        # train
        self.model.fit(
            [self.inputs_train, self.queries_train],
            self.answers_train,
            batch_size=self.config.get("batch_size", 32),
            epochs=self.config.get("epochs", 1),
            validation_data=([self.inputs_test, self.queries_test],
                             self.answers_test),
            verbose=0)
        # NOTE(review): the reported metric is measured on the training
        # split, not the test split -- confirm this is intended.
        _, accuracy = self.model.evaluate(
            [self.inputs_train, self.queries_train],
            self.answers_train,
            verbose=0)
        return {"mean_accuracy": accuracy}

    def save_checkpoint(self, checkpoint_dir):
        """Save the model under ``checkpoint_dir`` and return the saved path."""
        file_path = os.path.join(checkpoint_dir, "model")
        self.model.save(file_path)
        return file_path

    def load_checkpoint(self, path):
        """Replace ``self.model`` with the model stored at ``path``.

        NOTE(review): values from ``self.config`` are not re-applied to the
        loaded model here -- confirm this is intended when restoring.
        """
        # See https://stackoverflow.com/a/42763323
        del self.model
        self.model = load_model(path)


When using mlflow_mixin with Ray Client, it is recommended to use a remote tracking server. If you are using a MLflow tracking server backed by the local filesystem, then it must be setup on the server side and not on the client side.
When using mlflow_mixin with Ray Client, it is recommended to use a remote tracking server. If you are using a MLflow tracking server backed by the local filesystem, then it must be setup on the server side and not on the client side.
When using mlflow_mixin with Ray Client, it is recommended to use a remote tracking server. If you are using a MLflow tracking server backed by the local filesystem, then it must be setup on the server side and not on the client side.


## Initialize a Ray cluster

In [16]:
import ray
from ray.tune.schedulers import PopulationBasedTraining

## start the ray cluster
from ray_common import initialize_ray_cluster, stop_ray_cluster

# Cluster sizing handed to the project-local helper `initialize_ray_cluster`.
num_workers = 2
# presumably Kubernetes-style CPU millicores (i.e. 4 CPUs) -- TODO confirm
cpu_per_worker="4000m"
# presumably a Kubernetes-style memory quantity -- TODO confirm
ram_per_worker="4.0Gi"


# Start the cluster; the bare expression on the last line displays whatever
# the helper returned as the cell output.
ray_cluster = initialize_ray_cluster(num_workers, cpu_per_worker, ram_per_worker)
ray_cluster

In [17]:
# Population Based Training scheduler passed to `tune.run` below.
# Each entry of `hyperparam_mutations` is a zero-argument callable that draws
# a fresh value for that hyperparameter:
#   - "dropout" and "rho": uniform sample from [0, 1)
#   - "lr": 10 raised to an integer exponent drawn from -10..-1
# NOTE(review): the unit of `perturbation_interval` depends on the scheduler's
# time attribute, which is not set here -- check the Ray Tune docs.
pbt = PopulationBasedTraining(
    perturbation_interval=2,
    hyperparam_mutations={
        "dropout": lambda: np.random.uniform(0, 1),
        "lr": lambda: 10**np.random.randint(-10, 0),
        "rho": lambda: np.random.uniform(0, 1)
    })

In [18]:
# Point MLflow at the tracking store named by the DATABASE_URL_NO_PARAMS
# environment variable.  `os.environ.get` yields None when it is unset.
mlflow.set_tracking_uri(os.environ.get('DATABASE_URL_NO_PARAMS'))
# NOTE(review): this experiment name differs from the "experiment_name"
# ("mixin_example") placed in the trial config below -- confirm which one the
# runs are expected to land in.
mlflow.set_experiment("pbt_babi_memnn")

In [22]:
# MLflow settings placed under the "mlflow" key of every trial's config.
mlflow_settings = {
    "experiment_name": "mixin_example",
    "tracking_uri": mlflow.get_tracking_uri(),
}

# Starting configuration shared by all trials; "dropout", "lr" and "rho" are
# the keys the PBT scheduler is set up to mutate.
trial_config = {
    "finish_fast": True,
    "batch_size": 32,
    "epochs": 1,
    "dropout": 0.3,
    "lr": 0.01,
    "rho": 0.9,
    "mlflow": mlflow_settings,
}

# Alternative upload target: "gs://pipeline_data/ray_data"
sync_settings = tune.SyncConfig(
    sync_to_driver=False,
    upload_dir="s3://d2v-tmp/demo/ray",
)

results = tune.run(
    MemNNModel,
    name="pbt_babi_memnn",
    scheduler=pbt,
    metric="mean_accuracy",
    mode="max",
    stop={"training_iteration": 2},
    num_samples=2,
    config=trial_config,
    sync_config=sync_settings,
)


[2m[36m(pid=1391)[0m == Status ==
[2m[36m(pid=1391)[0m Memory usage on this node: 2.8/15.6 GiB
[2m[36m(pid=1391)[0m PopulationBasedTraining: 0 checkpoints, 0 perturbs
[2m[36m(pid=1391)[0m Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/5.6 GiB heap, 0.0/2.55 GiB objects (0.0/1.0 example-resource-b, 0.0/1.0 example-resource-a)
[2m[36m(pid=1391)[0m Result logdir: /root/ray_results/pbt_babi_memnn
[2m[36m(pid=1391)[0m Number of trials: 2/2 (2 PENDING)
[2m[36m(pid=1391)[0m +------------------------+----------+-------+
[2m[36m(pid=1391)[0m | Trial name             | status   | loc   |
[2m[36m(pid=1391)[0m |------------------------+----------+-------|
[2m[36m(pid=1391)[0m | MemNNModel_566a9_00000 | PENDING  |       |
[2m[36m(pid=1391)[0m | MemNNModel_566a9_00001 | PENDING  |       |
[2m[36m(pid=1391)[0m +------------------------+----------+-------+
[2m[36m(pid=1391)[0m 
[2m[36m(pid=1391)[0m 


[2m[36m(pid=1397)[0m 2021-08-29 20:11:50.920498: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
[2m[36m(pid=1397)[0m 2021-08-29 20:11:50.920585: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
[2m[36m(pid=1397)[0m 2021-08-29 20:11:50.920624: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ray-triton-cluster-ray-head-rjkjd): /proc/driver/nvidia/version does not exist
[2m[36m(pid=1397)[0m 2021-08-29 20:11:50.920879: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
[2m[36m(pid=1397)[

[2m[36m(pid=1391)[0m Result for MemNNModel_566a9_00000:
[2m[36m(pid=1391)[0m   date: 2021-08-29_20-11-53
[2m[36m(pid=1391)[0m   done: false
[2m[36m(pid=1391)[0m   experiment_id: 9d6dc54f9d8140a9aec5ba426a1f55ba
[2m[36m(pid=1391)[0m   hostname: ray-triton-cluster-ray-head-rjkjd
[2m[36m(pid=1391)[0m   iterations_since_restore: 1
[2m[36m(pid=1391)[0m   mean_accuracy: 0.171875
[2m[36m(pid=1391)[0m   node_ip: 10.0.23.3
[2m[36m(pid=1391)[0m   pid: 1397
[2m[36m(pid=1391)[0m   time_since_restore: 2.742231607437134
[2m[36m(pid=1391)[0m   time_this_iter_s: 2.742231607437134
[2m[36m(pid=1391)[0m   time_total_s: 2.742231607437134
[2m[36m(pid=1391)[0m   timestamp: 1630267913
[2m[36m(pid=1391)[0m   timesteps_since_restore: 0
[2m[36m(pid=1391)[0m   training_iteration: 1
[2m[36m(pid=1391)[0m   trial_id: 566a9_00000
[2m[36m(pid=1391)[0m   
[2m[36m(pid=1391)[0m == Status ==
[2m[36m(pid=1391)[0m Memory usage on this node: 3.4/15.6 GiB
[2m[36m(pid



[2m[36m(pid=1391)[0m Result for MemNNModel_566a9_00000:
[2m[36m(pid=1391)[0m   date: 2021-08-29_20-11-54
[2m[36m(pid=1391)[0m   done: true
[2m[36m(pid=1391)[0m   experiment_id: 9d6dc54f9d8140a9aec5ba426a1f55ba
[2m[36m(pid=1391)[0m   hostname: ray-triton-cluster-ray-head-rjkjd
[2m[36m(pid=1391)[0m   iterations_since_restore: 2
[2m[36m(pid=1391)[0m   mean_accuracy: 0.21875
[2m[36m(pid=1391)[0m   node_ip: 10.0.23.3
[2m[36m(pid=1391)[0m   pid: 1397
[2m[36m(pid=1391)[0m   time_since_restore: 2.9185242652893066
[2m[36m(pid=1391)[0m   time_this_iter_s: 0.17629265785217285
[2m[36m(pid=1391)[0m   time_total_s: 2.9185242652893066
[2m[36m(pid=1391)[0m   timestamp: 1630267914
[2m[36m(pid=1391)[0m   timesteps_since_restore: 0
[2m[36m(pid=1391)[0m   training_iteration: 2
[2m[36m(pid=1391)[0m   trial_id: 566a9_00000
[2m[36m(pid=1391)[0m   
[2m[36m(pid=1391)[0m Result for MemNNModel_566a9_00001:
[2m[36m(pid=1391)[0m   date: 2021-08-29_20-11-55




[2m[36m(pid=1391)[0m Result for MemNNModel_566a9_00001:
[2m[36m(pid=1391)[0m   date: 2021-08-29_20-11-55
[2m[36m(pid=1391)[0m   done: true
[2m[36m(pid=1391)[0m   experiment_id: 3bfbf215518c4959be3d8bb911e59ff5
[2m[36m(pid=1391)[0m   hostname: ray-triton-cluster-ray-worker-k9h7b
[2m[36m(pid=1391)[0m   iterations_since_restore: 2
[2m[36m(pid=1391)[0m   mean_accuracy: 0.3125
[2m[36m(pid=1391)[0m   node_ip: 10.0.23.4
[2m[36m(pid=1391)[0m   pid: 821
[2m[36m(pid=1391)[0m   time_since_restore: 2.954185724258423
[2m[36m(pid=1391)[0m   time_this_iter_s: 0.162428617477417
[2m[36m(pid=1391)[0m   time_total_s: 2.954185724258423
[2m[36m(pid=1391)[0m   timestamp: 1630267915
[2m[36m(pid=1391)[0m   timesteps_since_restore: 0
[2m[36m(pid=1391)[0m   training_iteration: 2
[2m[36m(pid=1391)[0m   trial_id: 566a9_00001
[2m[36m(pid=1391)[0m   
[2m[36m(pid=1391)[0m == Status ==
[2m[36m(pid=1391)[0m Memory usage on this node: 2.9/15.6 GiB
[2m[36m(pid=1

[2m[36m(pid=1391)[0m 2021-08-29 20:11:56,900	INFO tune.py:550 -- Total run time: 9.81 seconds (8.48 seconds for the tuning loop).


In [23]:
# Show the configuration of the best trial (presumably ranked by the
# metric/mode given to `tune.run` above).
print("Best hyperparameters found were: ", results.best_config)

Best hyperparameters found were:  {'finish_fast': True, 'batch_size': 32, 'epochs': 1, 'dropout': 0.3, 'lr': 0.01, 'rho': 0.9, 'mlflow': {'experiment_name': 'mixin_example', 'tracking_uri': 'postgresql://<user>:<password>@postgresql.postgres-4ext18h5:5432/prisma'}}


### Start TensorBoard

In [25]:
# Log directory of the trial with the highest "mean_accuracy"; this is the
# path to pass to TensorBoard.  The bare expression displays it.
logdir = results.get_best_logdir("mean_accuracy", mode="max")
logdir

'/root/ray_results/pbt_babi_memnn/MemNNModel_566a9_00001_1_2021-08-29_20-11-47'

## Set up TensorBoard
- Go to the Ray head node and run the following command, replacing `$logdir` with the path shown above:
  `tensorboard --logdir $logdir`
- Port-forward port 6006 from the head node to localhost.
- Open localhost:6006 in your browser to view TensorBoard.

In [None]:
# Disconnect this notebook from Ray.
# NOTE(review): `stop_ray_cluster` is imported earlier but never called in
# this notebook -- confirm whether the cluster itself must also be stopped.
ray.shutdown()