In [1]:
%connect_info

{
  "shell_port": 49748,
  "iopub_port": 49749,
  "stdin_port": 49750,
  "control_port": 49752,
  "hb_port": 49751,
  "ip": "127.0.0.1",
  "key": "d921f901-03c5468d5a24e4101a265c6b",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-04fdfb77-9818-490e-a512-4dbb224d5443.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
%matplotlib inline

## Possible models

`bert-base-multilingual-cased`: (New, recommended) 12-layer, 768-hidden, 12-heads, 110M parameters. Trained on cased text in the top 104 languages with the largest Wikipedias.

`xlm-mlm-100-1280`: 16-layer, 1280-hidden, 16-heads XLM model trained with MLM (Masked Language Modeling) on 100 languages.

`distilbert-base-multilingual-cased`: 6-layer, 768-hidden, 12-heads, 134M parameters. The multilingual DistilBERT model distilled from the Multilingual BERT model `bert-base-multilingual-cased` checkpoint.

In [3]:
import sys
# Keep only the program name in argv.
# NOTE(review): presumably this stops the flag definitions created further
# down from trying to parse the Jupyter kernel's own command-line arguments
# -- confirm against `define_nn_flags`.
sys.argv = sys.argv[:1]

In [4]:
import os
import pickle
import numpy as np
import tensorflow as tf

from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from evaluation import  evaluate #, MetricsReporterCallback,
from utils import build_model_name, convert_flags_to_dict, define_nn_flags

import ray
from ray import tune
# from ray.tune.integration.keras import TuneReporterCallback
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.suggest import Repeater
from hyperopt import hp

from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

In [5]:
# Seed used for the train/validation split and the hyper-parameter search.
SEED = 42
BASE_DIR = os.path.expanduser("~")     # this will point to the user's home
TRAIN_DIR = "ray_results"

FLAGS = define_nn_flags(tf.compat.v1.flags, BASE_DIR, TRAIN_DIR)
# Layer sizes are converted to integers before they are used to build the model.
FLAGS.layers = [int(i) for i in FLAGS.layers]

_config = convert_flags_to_dict(FLAGS)
# Compare by value: `is 'all'` tests object identity, which only works by
# accident of string interning (and raises a SyntaxWarning on Python 3.8+).
_config["codes"] = (['DE', 'GA', 'HI', 'PT', 'ZH']
                    if FLAGS.language_code == 'all' else [FLAGS.language_code])

# Captured once here; train_model uses it to locate the data directory.
cwd = os.getcwd()

In [6]:

def train_model(config):
    """Build, train and evaluate a feed-forward classifier for one configuration.

    Loads the pickled embedding data for ``config["bert_type"]`` from
    ``<cwd>/data``, concatenates the train and dev splits of every language
    code listed in the module-level ``_config["codes"]``, holds out 15% of the
    training data for validation, builds a ``tf.keras.Sequential`` stack of
    Dense + Dropout layers, fits it, and finally calls ``evaluate`` on the dev
    split.

    Args:
        config (dict): Settings for this run. Keys read directly here:
            ``bert_type``, ``layer_size``, ``nlayers``, ``layers``,
            ``hidden_activation``, ``dropout``, ``output_size``,
            ``output_activation``, ``optimizer``, ``loss_function``,
            ``learning_rate``, ``clipnorm``, ``weighted_loss``, ``train_dir``,
            ``tune``, ``early_stop_patience``, ``early_stop_delta``,
            ``log_tensorboard``, ``lr_decay``, ``start_decay``,
            ``batch_size`` and ``max_epochs``. ``build_model_name`` receives
            the whole dict as well.

    Returns:
        None. Results are produced through the callbacks and ``evaluate``.
    """
    model_name = build_model_name(config)

    # NOTE(review): pickle.load can execute arbitrary code; only use data
    # files that were produced by this project.
    with open('{}/data/{}.embdata.pkl'.format(cwd, config["bert_type"]),
              'rb') as f:
        data = pickle.load(f)

    def _stack(split):
        # Concatenate one split across all selected language codes.
        return np.concatenate(
            [data[code][split] for code in _config["codes"]], axis=0)

    x_train = _stack('x_train')
    y_train = _stack('y_train')
    print(x_train.shape, y_train.shape)

    x_dev = _stack('x_dev')
    y_dev = _stack('y_dev')
    print(x_dev.shape, y_dev.shape)

    # The concatenated copies are all that is needed from here on.
    del data

    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      random_state=SEED)

    model = tf.keras.Sequential()
    # A positive nlayers/layer_size pair overrides the explicit layer list.
    layer_config = ([config["layer_size"]] * config["nlayers"]
                    if config["nlayers"] > 0 and config["layer_size"] > 0
                    else config["layers"])

    # Dense layers, each followed by dropout; only the first declares the
    # input shape.
    for i, layer_size in enumerate(layer_config):
        if i == 0:
            dense_layer = tf.keras.layers.Dense(
                layer_size,
                input_shape=(x_train.shape[-1],),
                activation=config["hidden_activation"])
        else:
            dense_layer = tf.keras.layers.Dense(
                layer_size, activation=config["hidden_activation"])
        model.add(dense_layer)
        model.add(tf.keras.layers.Dropout(config["dropout"]))

    # Output layer: a single sigmoid unit, or two units with the configured
    # activation.
    if config["output_size"] == 1:
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    else:
        model.add(
            tf.keras.layers.Dense(
                2,
                activation=config["output_activation"],
            ))

    # Any optimizer name other than 'adam' / 'rmsprop' falls back to SGD.
    optimizer = {
        'adam': tf.keras.optimizers.Adam,
        'rmsprop': tf.keras.optimizers.RMSprop,
    }.get(config["optimizer"], tf.keras.optimizers.SGD)

    # compiling model
    model.compile(loss=config["loss_function"],
                  optimizer=optimizer(learning_rate=config["learning_rate"],
                                      clipnorm=config["clipnorm"]),
                  metrics=['accuracy'])

    # summary() prints by itself and returns None, so it is not wrapped in
    # print() and is only shown once.
    model.summary()

    class_weights = None
    if config["weighted_loss"]:
        # Keyword arguments: recent scikit-learn releases no longer accept
        # `classes` and `y` positionally.
        weights = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.unique(y_train),
            y=y_train.reshape(-1))
        # Keras expects a {class_index: weight} mapping.
        class_weights = dict(enumerate(weights))

    print('Class weights: {}'.format(class_weights))

    # do this check again because we need y_train to be 1-D for class weights
    if config["output_size"] > 1:
        y_train = tf.keras.utils.to_categorical(y_train)
        y_val = tf.keras.utils.to_categorical(y_val)

    # Join with a path separator so the checkpoint is written inside
    # train_dir rather than to a sibling path named "<train_dir><model_name>".
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(config["train_dir"], model_name),
        save_best_only=True)
    callbacks = [checkpoint]

    if config["tune"]:
        # MetricsReporterCallback is defined in a later notebook cell; it must
        # have been executed before this function is called.
        callbacks.append(
            MetricsReporterCallback(custom_validation_data=(x_val, y_val)))

    if config["early_stop_patience"] > 0:
        early_stop = tf.keras.callbacks.EarlyStopping(
            monitor='loss',
            min_delta=config["early_stop_delta"],
            patience=config["early_stop_patience"])
        callbacks.append(early_stop)

    if config["log_tensorboard"]:
        tensorboard = tf.keras.callbacks.TensorBoard(
            log_dir=os.path.join(config["train_dir"], 'logs'))
        callbacks.append(tensorboard)

    def lr_scheduler(epoch, lr):     # pylint: disable=C0103
        # Multiplies the incoming rate by lr_decay ** (epochs past
        # start_decay); the factor is 1 until start_decay is reached.
        lr_decay = config["lr_decay"]**max(epoch - config["start_decay"], 0.0)
        return lr * lr_decay

    if config["start_decay"] > 0:
        lrate = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
        callbacks.append(lrate)

    print('Train...')
    model.fit(x_train,
              y_train,
              class_weight=class_weights,
              batch_size=config["batch_size"],
              epochs=config["max_epochs"],
              callbacks=callbacks,
              verbose=2,
              validation_data=(x_val, y_val))

    # #####
    # Evaluation time
    #
    evaluate(model, test_data=(x_dev, y_dev))

In [17]:
# Hyperopt search space handed to HyperOptSearch. Every entry is a hyperopt
# expression; the label passed to hp.* repeats the dictionary key.
search_space = {
    "dropout":
        hp.uniform("dropout", 0.0, 0.9),
    "max_epochs":
        hp.choice("max_epochs", [10, 20, 30, 50]),
    "early_stop_delta":
        hp.choice("early_stop_delta", [0.001, 0.0001]),
    "early_stop_patience":
        hp.choice("early_stop_patience", [10, 20]),
    "hidden_activation":
        hp.choice("hidden_activation", ['tanh', 'relu', 'elu', 'selu']),
    "output_activation":
        hp.choice("output_activation", ['sigmoid', 'softmax']),
    "clipnorm":
        hp.choice("clipnorm", [0.5, 1.0, 2.5, 5.0, 10.0]),
    "learning_rate":
        hp.loguniform("learning_rate", np.log(1e-4), np.log(1e-0)),
    "batch_size":
        hp.choice("batch_size", [20, 24, 32, 64, 128]),
    # Integer draws, scaled: nlayers by 1 and layer_size by 10.
    "nlayers":
        hp.randint("nlayers", 1, 5) * 1,
    "layer_size":
        hp.randint("layer_size", 1, 101) * 10,
}

# Fixed settings layered on top of the flag-derived configuration.
_config.update(
    hidden_activation='relu',
    optimizer='adam',
    threads=1,
    output_size=2,
    num_samples=500,
)

In [18]:

class MetricsReporterCallback(tf.keras.callbacks.Callback):
    """Keras callback that reports classification metrics to Ray Tune.

    At the end of every epoch the model predicts on ``custom_validation_data``,
    the rounded predictions are scored with ``classification_report`` and the
    resulting per-label, macro and weighted figures are merged into the Keras
    ``logs`` dict before it is reported to Tune.
    """

    # (suffix used in the log key, key inside a classification_report entry)
    _REPORT_FIELDS = (
        ("precision", "precision"),
        ("recall", "recall"),
        ("f1_score", "f1-score"),
        ("support", "support"),
    )
    # (prefix used in the log key, top-level key of the classification_report)
    _REPORT_GROUPS = (
        ("label0", "0"),
        ("label1", "1"),
        ("macro", "macro avg"),
        ("weighted", "weighted avg"),
    )

    def __init__(self, reporter=None, freq="epoch", logs=None, custom_validation_data=None):
        """Initializer.

        Args:
            reporter: Unused; kept so the signature stays unchanged.
            freq (str): Accepted for signature compatibility but currently
                ignored: reporting always happens once per epoch.
            logs: Unused; kept so the signature stays unchanged.
            custom_validation_data: ``(x, y)`` pair scored after every epoch.
                Must be truthy.
        """
        assert custom_validation_data, "validation_data should not be None"
        super(MetricsReporterCallback, self).__init__()
        self.custom_validation_data = custom_validation_data
        self.iteration = 0
        # if freq not in ["batch", "epoch"]:
        #     raise ValueError("{} not supported as a frequency.".format(freq))
        self.freq = "epoch"
        # Latest classification_report dict; None until the first epoch ends.
        self._results = None
        self._batch_count = 0

    @staticmethod
    def _add_negated_losses(logs):
        """Add a ``neg_<name>`` entry for every loss-like metric in ``logs``."""
        for metric in list(logs):
            if "loss" in metric and "neg_" not in metric:
                logs["neg_" + metric] = -logs[metric]

    def on_batch_end(self, batch, logs=None):
        """Merge the latest metrics into the batch logs.

        Reporting to Tune only happens when ``self.freq == "batch"``, which
        the constructor never sets, so in practice this returns right after
        updating ``logs``.
        """
        # _update_logs tolerates logs=None (the declared default); it used to
        # fail with AttributeError in that case.
        logs = self._update_logs(logs, predict=self.iteration == 0)

        if not self.freq == "batch":
            return
        self.iteration += 1
        self._add_negated_losses(logs)
        # NOTE(review): this path uses tune.report while on_epoch_end uses
        # tune.track.log -- confirm which one the installed Ray provides.
        if "acc" in logs:
            tune.report(keras_info=logs, mean_accuracy=logs["acc"])
        else:
            tune.report(keras_info=logs, mean_accuracy=logs.get("accuracy"))

    def on_epoch_end(self, batch, logs=None):
        """Score the validation data and report the epoch's metrics to Tune."""
        print('Updating metrics')
        val_predict = (np.asarray(
            self.model.predict(self.custom_validation_data[0]))).round()

        self._results = classification_report(
            self.custom_validation_data[1], val_predict, output_dict=True)

        logs = self._update_logs(logs)

        if not self.freq == "epoch":
            return
        self.iteration += 1
        self._add_negated_losses(logs)
        if "acc" in logs:
            tune.track.log(keras_info=logs, mean_accuracy=logs["acc"])
        else:
            tune.track.log(keras_info=logs, mean_accuracy=logs.get("accuracy"))

    def _update_logs(self, logs, predict=True):
        """Write the latest classification metrics into ``logs`` and return it.

        Args:
            logs (dict or None): Dict to update in place; a new dict is
                created when ``None`` is given.
            predict: Unused; kept so the signature stays unchanged.

        Returns:
            dict: ``logs`` with ``label0_*``, ``label1_*``, ``macro_*``,
            ``weighted_*`` and ``f1_score`` entries. All are ``0.0`` until the
            first classification report is available.
        """
        if logs is None:
            logs = {}

        report = self._results
        for prefix, report_key in self._REPORT_GROUPS:
            for suffix, field in self._REPORT_FIELDS:
                logs["{}_{}".format(prefix, suffix)] = (
                    0.0 if report is None else report[report_key][field])
        # Convenience alias: the F1 of label "1".
        logs["f1_score"] = 0.0 if report is None else report["1"]["f1-score"]

        return logs

In [19]:

# Progress reporter for tune.run. It has to be defined in this cell: the name
# `reporter` was previously only assigned in commented-out lines, so a fresh
# kernel failed with NameError at `progress_reporter=reporter`.
# Alternative for terminal runs: reporter = tune.CLIReporter()
reporter = tune.JupyterNotebookReporter(overwrite=False)
# reporter.add_metric_column('keras_info/label1_f1_score', 'f1-score')

ray.shutdown()     # Restart Ray defensively in case the ray connection is lost.
ray.init(num_cpus=2)
results = tune.run(
    train_model,
    name="tune-nn-bert-classifier",
    config=_config,
    stop={
        "keras_info/f1_score": 0.99,
        "training_iteration": 10**8
    },
    resources_per_trial={
        "cpu": 1,
        "gpu": 0
    },
    num_samples=_config["num_samples"],
    checkpoint_freq=0,
    checkpoint_at_end=False,
    # The callback reports its metrics under `keras_info=...` and never
    # reports an `epoch` key, so the scheduler uses the same flattened metric
    # name as the stop criterion and search algorithm, and Tune's own
    # `training_iteration` counter (one report per epoch) as the time axis.
    scheduler=AsyncHyperBandScheduler(
        time_attr='training_iteration',
        metric='keras_info/f1_score',
        mode='max',
        max_t=400,
        grace_period=20),
    search_alg=HyperOptSearch(
        search_space,
        metric="keras_info/f1_score",
        mode="max",
        random_state_seed=SEED,
        # Initial point. For hp.choice entries the value is the index into
        # the list of options; for hp.randint it is the raw draw before the
        # scaling applied in search_space.
        points_to_evaluate=[{
            "dropout": 0.2,
            "max_epochs": 2,
            "early_stop_delta": 0,
            "early_stop_patience": 0,
            "hidden_activation": 1,
            "output_activation": 0,
            "clipnorm": 3,
            "learning_rate": 0.0001,
            "batch_size": 3,
            "nlayers": 2,
            "layer_size": 100
        }]
    ),
    progress_reporter=reporter,
    verbose=1)
# Persist the per-trial results table next to the training artifacts.
results.dataframe().to_csv(
    '{0}/nn_results{1}layers.csv'.format(
        _config["train_dir"], _config['layers']))

2020-05-29 18:57:43,832	INFO resource_spec.py:212 -- Starting Ray with 2.88 GiB memory available for workers and up to 1.46 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-05-29 18:57:44,119	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


Trial name,status,loc,batch_size,clipnorm,dropout,early_stop_delta,early_stop_patience,hidden_activation,layer_size,learning_rate,max_epochs,nlayers,output_activation
train_model_6cdb5e42,PENDING,,32,5.0,0.626338,0.001,20,relu,30,0.0354279,30,2,softmax
train_model_6ce31d26,PENDING,,20,2.5,0.0364661,0.001,20,relu,320,0.266372,50,4,sigmoid
train_model_6ce57e4a,PENDING,,128,2.5,0.463574,0.001,10,elu,900,0.0355628,10,3,sigmoid
train_model_6ceb7598,PENDING,,64,2.5,0.0720379,0.0001,20,tanh,770,0.0121221,50,2,sigmoid
train_model_6ceded28,PENDING,,32,0.5,0.0859095,0.001,20,selu,360,0.00820041,30,3,softmax
train_model_6cf07a16,PENDING,,24,5.0,0.557945,0.0001,20,selu,900,0.000641441,20,3,softmax
train_model_6cf2e90e,PENDING,,32,1.0,0.884504,0.001,20,relu,430,0.539511,50,1,sigmoid
train_model_6cf562c4,PENDING,,64,2.5,0.535158,0.001,20,relu,670,0.00156185,30,4,sigmoid
train_model_6d08e268,PENDING,,128,10.0,0.555145,0.001,20,selu,850,0.000279136,10,1,sigmoid
train_model_6d0b85e0,PENDING,,128,2.5,0.190273,0.001,20,tanh,610,0.00872228,20,2,sigmoid


[2m[36m(pid=33443)[0m 
[2m[36m(pid=33443)[0m Model name nn_classifier.distilbert-base-multilingual-cased.all.30epochs.20-0.0010eStop.[100, 100]-relulayers-0.626338dropout.2-softmax.output.0.5.thresh.weighted.binary_crossentropyLoss.32batch.adam.0.0354lr.0.8696-0decay.5.00norm.ckpt
[2m[36m(pid=33443)[0m 
[2m[36m(pid=33442)[0m 
[2m[36m(pid=33442)[0m Model name nn_classifier.distilbert-base-multilingual-cased.all.30epochs.10-0.0010eStop.[100, 100]-relulayers-0.200000dropout.2-sigmoid.output.0.5.thresh.weighted.binary_crossentropyLoss.64batch.adam.0.0001lr.0.8696-0decay.5.00norm.ckpt
[2m[36m(pid=33442)[0m 
[2m[36m(pid=33443)[0m 2020-05-29 18:57:57,083	INFO trainable.py:217 -- Getting current IP.
[2m[36m(pid=33442)[0m 2020-05-29 18:57:57,083	INFO trainable.py:217 -- Getting current IP.


KeyboardInterrupt: 

[2m[36m(pid=33442)[0m (66338, 768) (66338,)
[2m[36m(pid=33442)[0m (4330, 768) (4330,)
[2m[36m(pid=33443)[0m (66338, 768) (66338,)
[2m[36m(pid=33443)[0m (4330, 768) (4330,)
[2m[36m(pid=33443)[0m 2020-05-29 18:57:59.230103: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
[2m[36m(pid=33443)[0m 2020-05-29 18:57:59.279858: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f92bee5ca40 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
[2m[36m(pid=33443)[0m 2020-05-29 18:57:59.279917: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
[2m[36m(pid=33442)[0m 2020-05-29 18:57:59.271177: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
[2m[36m(pid=33442)[0m 2020-05-29 18:57:59.3

In [10]:
from hyperopt import hp
from hyperopt.pyll.stochastic import sample

In [11]:

# Scratch copy of the tuning search space, used to inspect sampled values.
# Kept identical to the definition used for the tuning run: the layer_size
# upper bound is 101 (not 100) so the raw draw 100 -- used by the run's
# points_to_evaluate -- is inside the range.
search_space = {
    "dropout":
        hp.uniform("dropout", 0.0, 0.9),
    "max_epochs":
        hp.choice("max_epochs", [10, 20, 30, 50]),
    "early_stop_delta":
        hp.choice("early_stop_delta", [0.001, 0.0001]),
    "early_stop_patience":
        hp.choice("early_stop_patience", [10, 20]),
    "hidden_activation":
        hp.choice("hidden_activation", ['tanh', 'relu', 'elu', 'selu']),
    "output_activation":
        hp.choice("output_activation", ['sigmoid', 'softmax']),
    "clipnorm":
        hp.choice("clipnorm", [0.5, 1.0, 2.5, 5.0, 10.0]),
    "learning_rate":
        hp.loguniform("learning_rate", np.log(1e-4), np.log(1e-0)),
    "batch_size":
        hp.choice("batch_size", [20, 24, 32, 64, 128]),
    "nlayers":
        hp.randint('nlayers', 1, 5) * 1,
    "layer_size":
        hp.randint('layer_size', 1, 101) * 10,
}

In [12]:
# Draw one random configuration from the search space to inspect the values
# it produces.
sample(search_space)

{'batch_size': 24,
 'clipnorm': 1.0,
 'dropout': 0.19698578209841475,
 'early_stop_delta': 0.001,
 'early_stop_patience': 10,
 'hidden_activation': 'relu',
 'layer_size': 70,
 'learning_rate': 0.41547657526854137,
 'max_epochs': 10,
 'nlayers': 3,
 'output_activation': 'softmax'}