## Findings
  * general
    * batch size should be at least 1x(number of classes) in order to take advantage of MTL
    * Always start training a model with dropout=ker_reg=act_reg=0, look at bias and variance then add until good fit
    * No pooling gives very fast results but strong overfitting and large model size
    * Don't worry about class weights unless heavily (20x or more) imbalanced
    * head:0x1024 works best; adding more hidden layers underfits, and 0x512 performs about the same
  * multi-label output
    * sigmoid output makes model very sensitive to learning_rate.
      * I have found with VGG16 around 5e-5 is a good start
      * Use e.g. setup_callbacks(log_dir, hist=2, grads=True) to enable gradient outputs; check that class_logits_out becomes spread between 0 and 1, and check that gradients are not 0 (they should be around 1e-3).
  * single-label output
    * Pretty stable with any architecture
    

## Bugs
  * There may be a GPU memory leak somewhere... keras does not recycle models properly. I'll try to find this.

In [None]:
%matplotlib notebook

from collections import Counter
from itertools import cycle
import json
import os
import pickle
import re
import sys
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from imgaug import augmenters as iaa
from keras.applications.inception_v3 import InceptionV3
from keras.applications.resnet50 import preprocess_input, ResNet50
from keras.applications.vgg16 import VGG16
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, TensorBoard, EarlyStopping, LearningRateScheduler, Callback
from keras.layers import (
    Flatten, GlobalAveragePooling2D, GlobalMaxPooling2D, Input, Dense, Dropout)
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l1_l2
from keras.metrics import binary_accuracy, categorical_accuracy
from pycocotools.coco import COCO
from skimage.transform import resize
from sklearn.metrics import precision_recall_curve, average_precision_score
import keras.backend as K
import keras.initializers
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf

from abyss_deep_learning.keras.classification import (
    ClassificationDataset, PRTensorBoard, Inference,
    caption_map_gen, multihot_gen, augmentation_gen, skip_bg_gen, cached_gen)
from abyss_deep_learning.keras.utils import (
    batching_gen, lambda_gen, calc_class_weights, count_labels_multi, count_labels_single, gen_dump_data)
import abyss_deep_learning.abyss_dataset as dataset_model

# JSON Prep

In [None]:
# from abyss_deep_learning.utils import instance_to_caption
# train_dict = instance_to_caption(json.load(open(working_dir+"training.json","r")))
# with open(working_dir+"train-nb.json","w") as f:
#     json.dump(train_dict,f)
# val_dict = instance_to_caption(json.load(open(working_dir+"validation.json","r")))
# with open(working_dir+"val-nb.json","w") as f:
#     json.dump(val_dict,f)

# Setup Data

In [None]:
############### CONFIGURE ALL VARIABLES IN THIS CELL ########################
# num_classes is not set here: it is assumed from the caption_map entries.

# What to resize images to before the CNN
image_dims = (299, 299, 3)
# Should be at least 1x <number of classes>
batch_size = 10
# Pretrained networks are in float32
NN_DTYPE = np.float32

# Caption type can be either "single" or "multi". This sets up various other parameters in the system.
caption_type = "multi"

# Maps caption strings to class numbers. Enumerating the names guarantees a
# minimal set of class numbers (eg {0, 1, 2} and never {4, 7, 8}).
caption_map = {
    caption_name: class_number
    for class_number, caption_name in enumerate(("IP", "JD_ML", "DD", "JD_S", "ED_All"))}

# Import or define the right translator
from abyss_deep_learning.datasets.translators import AbyssCaptionTranslator, CloudFactoryCaptionTranslator
translator = AbyssCaptionTranslator() # CloudFactoryCaptionTranslator()

database_dir = "/data/abyss/projectmax/feature-detection"
dataset_name = "ours"

# One dataset per split, built in train / val / test order.
# NOTE(review): the test split is loaded from val.json, exactly like the
# validation split - confirm whether a separate test file should be used.
coco_train, coco_val, coco_test = (
    ClassificationDataset(
        caption_map, translator,
        os.path.join(database_dir, json_template.format(dataset_name)))
    for json_template in ("{:s}/train.json", "{:s}/val.json", "{:s}/val.json"))

In [None]:
# Show, for every split, the distinct label combinations that occur in it.
print("Combinations of labels present")
for split_title, split_coco in (("Train:", coco_train), ("Val:", coco_val), ("Test:", coco_test)):
    print(split_title)
    print({
        tuple(split_coco.load_caption(image['id']))
        for image in split_coco.imgs.values()})

In [None]:
# Find a balanced set
def balanced_set(coco):
    """Greedily pick image ids so that every caption is about equally represented.

    Captions are topped up rarest-first with randomly chosen images (the
    candidate lists are shuffled with ``np.random``) until every caption has at
    least as many picks as the rarest caption has occurrences in the dataset.
    Annotations without a ``caption`` key, empty labels and the "background"
    label are ignored throughout.

    Args:
        coco: Object exposing ``anns`` (dict of annotation dicts holding an
            ``image_id`` and an optional comma-separated ``caption`` string)
            and ``imgs`` (dict keyed by image id).

    Returns:
        set: The selected image ids; empty when no usable caption exists.
    """
    def labels_of(ann):
        # A single filter used everywhere, so the caption statistics, the
        # candidate lists and the per-image label lists always agree.
        if 'caption' not in ann:
            return []
        return [label for label in ann['caption'].split(',')
                if label and label != "background"]

    labelled = [(ann['image_id'], labels_of(ann)) for ann in coco.anns.values()]
    captions = [label for _, labels in labelled for label in labels]
    if not captions:
        # Nothing to balance (min() below would fail on an empty sequence).
        return set()
    smallest_caption_value = min(Counter(captions).values())
    unique_captions = np.unique(captions)

    # Single pass over the annotations instead of one scan per caption/image.
    images_in_caption = {caption: [] for caption in unique_captions}
    captions_in_image = {image_id: [] for image_id in coco.imgs}
    for image_id, labels in labelled:
        # An annotation contributes its image at most once per caption.
        for label in dict.fromkeys(labels):
            images_in_caption[label].append(image_id)
        if image_id in captions_in_image:
            captions_in_image[image_id].extend(labels)

    print("images_in_caption", {k: len(i) for k, i in images_in_caption.items()})
    for images in images_in_caption.values():
        np.random.shuffle(images)
    print("captions_in_image")
    print([len(labels) for labels in captions_in_image.values()])

    out = {caption: [] for caption in unique_captions}

    def add_to_counts(image_id):
        # Credit the image to every caption it carries ...
        for caption in captions_in_image[image_id]:
            out[caption].append(image_id)
        # ... and make sure it cannot be drawn again for another caption.
        for images in images_in_caption.values():
            if image_id in images:
                images.remove(image_id)

    while any(len(picked) < smallest_caption_value for picked in out.values()):
        rarest = min(out, key=lambda caption: len(out[caption]))
        candidates = images_in_caption[rarest]
        if not candidates:
            # No image left for the rarest caption: stop instead of failing.
            break
        add_to_counts(candidates.pop())

    print("balanced images in caption")
    print({k: len(v) for k, v in out.items()})
    return {image_id for picked in out.values() for image_id in picked}

# Balance each split (train, val, test - in that order), then report the sizes.
balanced_image_ids_train, balanced_image_ids_val, balanced_image_ids_test = (
    balanced_set(split_coco) for split_coco in (coco_train, coco_val, coco_test))
for size_title, split_ids in (
        ("balanced train set size", balanced_image_ids_train),
        ("balanced val set size", balanced_image_ids_val),
        ("balanced test set size", balanced_image_ids_test)):
    print(size_title, len(split_ids))

In [None]:
# Reverse lookup: class number -> caption string
caption_map_r = dict(zip(caption_map.values(), caption_map.keys()))
num_classes = len(caption_map)
# Whole batches per pass over a split (floor division drops the remainder)
steps_per_epoch, steps_per_epoch_val = (
    split_coco.num_images() // batch_size for split_coco in (coco_train, coco_val))
for summary_title, summary_value in (
        ("Number of classes:", num_classes),
        ("Steps per epoch:", steps_per_epoch),
        ("Steps per steps_per_epoch_val:", steps_per_epoch_val)):
    print(summary_title, summary_value)

In [None]:
def preprocess(image, caption):
    """Resize *image* to ``image_dims`` and run it through ``preprocess_input``.

    The caption is passed through untouched so the function can be mapped over
    ``(image, caption)`` pairs.
    """
    resized = resize(image, image_dims, preserve_range=True)
    network_input = preprocess_input(resized.astype(NN_DTYPE), mode='tf')
    return network_input, caption

def postprocess(image):
    """Map a network-space image in [-1, 1] back to displayable uint8 pixels.

    Values are scaled with ``(image + 1) * 127.5`` and truncated. Anything that
    falls outside [0, 255] after scaling is clipped, because casting an
    out-of-range float to uint8 would otherwise wrap around and show up as
    speckles in the displayed image.

    Args:
        image: Array-like of floats, nominally in [-1, 1].

    Returns:
        numpy.ndarray: uint8 array with the same shape as *image*.
    """
    pixels = (np.asarray(image) + 1) * 127.5
    return np.clip(pixels, 0, 255).astype(np.uint8)
     
def pipeline(gen, aug_config=None):
    """Chain the data generators that turn raw dataset samples into training samples.

    Stages, in order: caption mapping, ``preprocess``, multi-hot encoding,
    background skipping and augmentation. Augmentation is only enabled when an
    ``aug_config`` is given.
    """
    mapped = caption_map_gen(gen, caption_map)
    preprocessed = lambda_gen(mapped, func=preprocess)
    encoded = multihot_gen(preprocessed, num_classes=num_classes)
    foreground_only = skip_bg_gen(encoded)
    return augmentation_gen(
        foreground_only, aug_config, enable=(aug_config is not None))


# Augmentation settings; pipeline() hands them to augmentation_gen.
aug_config = dict(
    flip_lr_percentage=0.5,
    flip_ud_percentage=0.5,
    affine=dict(
        order=1,
        scale=dict(x=(0.8, 1.2), y=(0.8, 1.2)),
        rotate=(-10, 10),
        shear=(-5, 5),
        mode='constant',
    ),
    # Colour augmentation is currently switched off:
    # color=dict(
    #     probability=1.00,
    #     hue=(0, 0),
    #     saturation=(0, 0),
    #     value=(0, 0),
    # ),
)
# aug_config = None # Uncomment to remove augmentation (goes around 50% faster but much worse results)

In [None]:
use_cached_gens = False
use_balanced_set = False

# Build the three sample generators in train / val / test order. With
# use_balanced_set the image ids come from the balanced sets, otherwise
# imgIds=None is passed. Only the training generator is augmented.
train_gen, val_gen, test_gen = (
    pipeline(
        split_coco.generator(
            imgIds=(list(balanced_ids) if use_balanced_set else None),
            shuffle_ids=True),
        aug_config=split_aug_config)
    for split_coco, balanced_ids, split_aug_config in (
        (coco_train, balanced_image_ids_train, aug_config),
        (coco_val, balanced_image_ids_val, None),
        (coco_test, balanced_image_ids_test, None)))

# NOTE(review): the cache sizes below always come from the balanced sets, even
# when use_balanced_set is False - confirm that this is intended.
if use_cached_gens:
    print("USING CACHED VAL/TEST DATA")
    # Training data is only cached when it is not augmented.
    if aug_config is None:
        print("USING CACHED TRAIN DATA")
        train_gen = cached_gen(train_gen, len(balanced_image_ids_train))
    val_gen, test_gen = (
        cached_gen(val_gen, len(balanced_image_ids_val)),
        cached_gen(test_gen, len(balanced_image_ids_test)))

In [None]:
%%timeit -n1 -r1
# Print the first eleven mapped validation targets and flag the ones that sum to zero.
val_samples = caption_map_gen(coco_val.generator(), caption_map)
for i, (image, target) in enumerate(val_samples):
    is_background = np.sum(target) == 0
    print("out", target)
    if is_background:
        print("BG")
    if i >= 10:
        break

In [None]:
# Plot one ground-truth sample from each generator side by side.
for i, (train, val, test) in enumerate(zip(train_gen, val_gen, test_gen)):
    samples = (train, val, test)
    for data in samples:
        print(data[0].shape, data[1], (np.min(data[0]), np.max(data[0])))
    plt.figure()
    # Columns 1..3 hold train, val and test respectively.
    for column, data in enumerate(samples, start=1):
        plt.subplot(1, 3, column)
        plt.imshow(postprocess(data[0]))
        present_captions = [caption_map_r[int(cap_id)] for cap_id in np.argwhere(data[1])]
        plt.title(', '.join(present_captions))

    # Raise the threshold to display more than the first triple.
    if i >= 0:
        break
print("Left to right: ground truth samples from train, val test")

In [None]:
# This cell intentionally left blank due to display bug above.

In [None]:
# Label counter matching the caption type ("single" uses the single-label counter).
count_function = {"single": count_labels_single}.get(caption_type, count_labels_multi)

# for label, gen, coco, balanced_image_ids in zip(
#         ["train", "val", "test"],
#         [train_gen, val_gen, test_gen],
#         [coco_train, coco_val, coco_test],
#         [balanced_image_ids_train, balanced_image_ids_val, balanced_image_ids_test]):
#     data = gen_dump_data(gen, len(balanced_image_ids))
#     counter = count_function(data)
#     print(label, counter)

# Dump validation samples from the generator; the test data is the same object.
val_data = gen_dump_data(val_gen, len(balanced_image_ids_val))
test_data = val_data
class_weights = None
# Uncomment below line to use class weights, not needed if using balanced_set
# class_weights = calc_class_weights(gen_dump_data(train_gen, len(balanced_image_ids_train)), caption_type)

print("training class weights:")
print(class_weights)

# Baseline scores: what a model predicting 0 for every class would achieve.
for baseline_title, baseline_metric in (
        ("Binary accuracy if you were to output all 0s", binary_accuracy),
        ("Categorical accuracy if you were to output all 0s", categorical_accuracy)):
    print(baseline_title)
    acc = baseline_metric(val_data[1], val_data[1] * 0).eval(session=K.get_session())
    print(np.mean(acc))
print("percent of data covered by class", val_data[1].sum(axis=0) / val_data[1].shape[0])

# Setup model

In [None]:
def add_model_regularization(model, kernel_regularizer_l2, activity_regularizer_l1):
    """Attach size-normalised regularizers to trainable layers and rebuild the model.

    For every trainable layer that is not a batch-norm layer, the L2 kernel
    factor is divided by the number of kernel elements and the L1 activity
    factor by the number of output elements per sample. The model is then
    re-created from its config with the previous weights restored, presumably
    so that the newly assigned regularizers are picked up.

    Args:
        model: Keras model to regularize. The Keras session is cleared while
            rebuilding, so use the returned model afterwards.
        kernel_regularizer_l2: L2 factor for kernels; falsy disables it.
        activity_regularizer_l1: L1 factor for activations; falsy disables it.

    Returns:
        The rebuilt model carrying the original weights.
    """
#     # Add L2 Regularization
#     # Skip gamma and beta weights of batch normalization layers.
#     if kernel_regularizer_l2:
#         reg_losses = [
#             keras.regularizers.l2(kernel_regularizer_l2)(w) / tf.cast(tf.size(w), tf.float32)
#             for w in model.trainable_weights
#             if not any([l in w.name for l in ['gamma', 'beta']])]
#         model.add_loss(tf.add_n(reg_losses, name='l2_reg_loss'))
#     if activity_regularizer_l1:
#         reg_losses = [
#             keras.regularizers.l1(activity_regularizer_l1)(
#                 layer.get_output_at(0)) / tf.cast(tf.size(layer.get_output_at(0)), tf.float32)
#             for layer in model.layers
#             if layer.trainable and not any([l in layer.name for l in ['class_logits', 'batch_norm']])]
#         model.add_loss(tf.add_n(reg_losses, name='l1_reg_loss'))
    for layer in model.layers:
        if not layer.trainable or 'batch_norm' in layer.name:
            continue
        if hasattr(layer, 'kernel_regularizer') and kernel_regularizer_l2:
            if 'kernel' in layer.weights[0].name:
                # np.prod replaces np.product, which was removed in NumPy 2.0.
                size = np.prod(layer.weights[0].shape.as_list())
                if size:
                    layer.kernel_regularizer = l1_l2(0, kernel_regularizer_l2 / size)
        if hasattr(layer, 'activity_regularizer') and activity_regularizer_l1:
#            if 'class_logits' in layer.name:
            # Skip the leading (batch) dimension of the output shape.
            size = np.prod(layer.get_output_shape_at(0)[1:])
            if size:
                layer.activity_regularizer = l1_l2(activity_regularizer_l1 / size, 0)

    # Suspect this is where GPU memory leak is coming from
    model_config = model.get_config()
    model_weights = model.get_weights()
    model = None
    K.clear_session()
    model = Model.from_config(model_config)
    model.set_weights(model_weights)
    return model
    
def create_new_head(base_model, model_params, opt_params=None):
    """Put a new classification head on *base_model* and compile the result.

    Make sure base_model has include_top=False.

    Args:
        base_model: Feature extractor whose output feeds the new head.
        model_params: Object providing ``loss``, ``caption_type``,
            ``activation``, ``pool``, ``dropout``, ``num_hidden_layers``,
            ``num_hidden_neurons``, ``num_classes``, ``train_features``,
            ``kernel_regularizer_l2`` and ``activity_regularizer_l1``.
            If ``loss`` is None it is derived from ``caption_type``. If
            ``activation`` is None it is derived the same way and written back
            to ``model_params.activation``.
        opt_params: Keyword arguments for ``model.compile`` (defaults to the
            'nadam' optimizer). The dict passed in is not modified.

    Returns:
        The compiled model.
    """
    # Work on a copy: the loss is added below and must not leak to the caller.
    opt_params = dict(opt_params) if opt_params else {"optimizer": 'nadam'}

    if model_params.loss is None:
        if model_params.caption_type == "single":
            opt_params['loss'] = "categorical_crossentropy"
        elif model_params.caption_type == "multi":
            # weights = np.array([
                # i[1] for i in sorted(model_params.class_weights.items())])[np.newaxis, ...] \
                # if model_params.class_weights else 1.0
            opt_params['loss'] = 'binary_crossentropy'
    else:
        opt_params['loss'] = model_params.loss

    if model_params.activation is None:
        if model_params.caption_type == "single":
            model_params.activation = "softmax"
        else:
            model_params.activation = "sigmoid"

    # Collapse the feature map: global pooling if requested, otherwise flatten.
    x = base_model.output
    if model_params.pool == 'avg':
        x = GlobalAveragePooling2D()(x)
    elif model_params.pool == 'max':
        x = GlobalMaxPooling2D()(x)
    else:
        x = Flatten()(x)

    x = BatchNormalization()(x)
    if model_params.dropout:
        x = Dropout(model_params.dropout)(x)

    # Optional hidden layers, each followed by batch norm and optional dropout.
    for _ in range(model_params.num_hidden_layers):
        x = Dense(model_params.num_hidden_neurons, activation='relu',
                  kernel_initializer=keras.initializers.he_uniform())(x)
        x = BatchNormalization()(x)
        if model_params.dropout:
            x = Dropout(model_params.dropout)(x)

    predictions = Dense(
        model_params.num_classes,
        activation=model_params.activation,
        kernel_initializer=keras.initializers.he_uniform(),
#         bias_initializer=keras.initializers.he_uniform(),
        name='class_logits')(x)

    # Train or freeze the whole backbone.
    for layer in base_model.layers:
        layer.trainable = model_params.train_features

    model = add_model_regularization(
        Model(inputs=base_model.input, outputs=predictions),
        model_params.kernel_regularizer_l2,
        model_params.activity_regularizer_l1)

    model.compile(**opt_params, metrics=['binary_accuracy', 'categorical_accuracy'])
    return model

In [None]:
def get_gradients(model):
    """Return the trainable weights and the loss gradients with respect to them.

    Returns:
        tuple: ``(weights, gradients)`` where ``weights`` is a fresh list of
        ``model.trainable_weights`` and ``gradients`` is whatever the model's
        optimizer returns for ``model.total_loss`` and those weights.
    """
    trainable = list(model.trainable_weights)
    gradients = model.optimizer.get_gradients(model.total_loss, trainable)
    return trainable, gradients

def evaluate_model(model, test_data, thresh=0.5):
    """Predict on *test_data* and print the element-wise accuracy.

    For caption type 'single' a sample counts as correct when the arg-max class
    of prediction and ground truth agree. Otherwise every label is compared
    after thresholding both sides with *thresh*.

    Returns:
        tuple: ``(Y_true, Y_pred, TP)`` with ``TP`` the boolean array of
        correct decisions.
    """
    Y_true = test_data[1]
    Y_pred = model.predict(test_data[0])
    if caption_type == 'single':
        TP = np.argmax(Y_true, axis=-1) == np.argmax(Y_pred, axis=-1)
    else:
        TP = (Y_true > thresh) == (Y_pred > thresh)
    acc = np.count_nonzero(TP) / TP.size

    print("Test using {:d} samples:".format(len(test_data[0])))
    print("accuracy", acc)
    return Y_true, Y_pred, TP

def display_performance(Y_true, Y_pred, TP):
    """Print summary scores and plot per-class and micro-averaged PR curves.

    The figure is written to the module-level ``model_plot_path`` before it is
    shown, because with non-interactive backends the figure may already be
    gone after ``plt.show()`` and the saved image would be blank.

    Args:
        Y_true: Ground-truth label array, one column per class.
        Y_pred: Predicted scores, same layout as *Y_true*.
        TP: Unused; kept for interface compatibility with evaluate_model().
    """
    precision = dict()
    recall = dict()
    average_precision = dict()
    for i in range(num_classes):
        precision[i], recall[i], _ = precision_recall_curve(Y_true[:, i],
                                                            Y_pred[:, i])
        average_precision[i] = average_precision_score(Y_true[:, i], Y_pred[:, i])

    # A "micro-average": quantifying score on all classes jointly
    precision["micro"], recall["micro"], _ = precision_recall_curve(Y_true.ravel(),
        Y_pred.ravel())
    average_precision["micro"] = average_precision_score(Y_true, Y_pred,
                                                         average="micro")
    print('Average precision score, micro-averaged over all classes: {0:0.2f}'
          .format(average_precision["micro"]))

    # Exact accuracy: every label of a sample must match at threshold 0.5.
    z = np.all((Y_pred > 0.5) == Y_true, axis=1)
    acc = np.count_nonzero(z) / z.size
    print("exact accuracy", acc)
    # Binary accuracy: each label is scored on its own.
    z = ((Y_pred > 0.5) == Y_true)
    acc = np.count_nonzero(z) / z.size
    print("binary accuracy", acc)

    # setup plot details
    colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])

    plt.figure(figsize=(8, 10))
    f_scores = np.linspace(0.2, 0.8, num=4)
    lines = []
    labels = []
    for f_score in f_scores:
        x = np.linspace(0.01, 1)
        y = f_score * x / (2 * x - f_score)
        l, = plt.plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2)
        plt.annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, y[45] + 0.02))

    # One legend entry (the last curve drawn) stands for all iso-f1 curves.
    lines.append(l)
    labels.append('iso-f1 curves')
    l, = plt.plot(recall["micro"], precision["micro"], color='gold', lw=2)
    lines.append(l)
    labels.append('micro-average Precision-recall (area = {0:0.2f})'
                  ''.format(average_precision["micro"]))

    for i, color in zip(range(num_classes), colors):
        l, = plt.plot(recall[i], precision[i], color=color, lw=2)
        lines.append(l)
        labels.append('{0} (area = {1:0.2f})'
                      ''.format(caption_map_r[i], average_precision[i]))

    fig = plt.gcf()
    fig.subplots_adjust(bottom=0.25)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Micro Average Precision vs. Recall')
    plt.legend(lines, labels, loc=(0, -.4), prop=dict(size=14))
    # Save first, then show (see docstring).
    fig.savefig(model_plot_path, dpi=150)
    plt.show()
    
def save_model(model, name, class_map_r, prediction_type,
               model_weights_path, model_def_path, model_info_path, history,
               test_metrics=None, description="", backbone="inceptionv3"):
    """Write the model definition, its weights and a JSON info file to disk.

    Args:
        model: Trained model; must contain a layer named "class_logits".
        name: Name stored in the info file.
        class_map_r: Mapping of class number to caption, stored as "classes".
        prediction_type: Stored as "prediction_type" in the info file.
        model_weights_path: Destination of the weights file.
        model_def_path: Destination of the model JSON definition.
        model_info_path: Destination of the info JSON.
        history: Training history object providing ``history``, ``epoch``,
            ``params`` and ``model.loss``.
        test_metrics: Optional metrics stored under "metrics" -> "test".
        description: Free-text description stored in the info file.
        backbone: Backbone name recorded in the info file. The default keeps
            the previously hard-coded value; pass the real backbone name when
            another feature extractor is used.
    """
    from abyss.utils import JsonNumpyEncoder

    # Training metrics: the per-epoch history plus the epoch list and fit params.
    train_metrics = dict(history.history)
    train_metrics.update({
        "epoch": history.epoch,
        "params": history.params
    })

    model_info = {
        "name": name,
        "description": description,
        "weights": model_weights_path,
        # Use the argument rather than the module-level caption_type.
        "prediction_type": prediction_type,
        "model": model_def_path,
        "classes": class_map_r,
        "architecture": {
            "backbone": backbone,
            "logit_activation": model.get_layer("class_logits").activation.__name__,
            "input_shape": image_dims
        },
        "metrics": {
            "loss_function": str(history.model.loss),
            "train": train_metrics
        }
    }
    if test_metrics:
        model_info['metrics']['test'] = test_metrics

    print("Writing model def to " + model_def_path)
    with open(model_def_path, "w") as def_file:
        def_file.write(model.to_json())

    print("Writing model weights to " + model_weights_path)
    model.save_weights(model_weights_path)

    print("Writing model info to " + model_info_path)
    with open(model_info_path, "w") as info_file:
        info_file.write(json.dumps(model_info, cls=JsonNumpyEncoder))
        
def hamming_loss(y_true, y_pred):
    """Soft Hamming loss: the mean of ``y_true * (1 - y_pred) + (1 - y_true) * y_pred``."""
    positive_term = y_true * (1 - y_pred)
    negative_term = (1 - y_true) * y_pred
    return K.mean(positive_term + negative_term)


def check_gradients(model):
    """Check the gradient/weight norm ratio on one training sample.

    The first sample of ``train_gen`` is fed through the graph; for every
    non-bias weight the ratio of gradient norm to weight norm is computed.

    Returns:
        bool: False (after printing the mean ratio) when the mean ratio is
        below 5e-4 or above 3e-1, True otherwise.
    """
    # Take a single (image, label) sample from the training generator.
    grad_test = None
    for image, label in train_gen:
        grad_test = (image, label)
        break

    weights, grads = get_gradients(model)
    # Placeholders are addressed by tensor name; the sample gets a batch axis.
    feed_dict = {
        "class_logits_sample_weights:0": np.ones(2),
        "input_1:0": grad_test[0][np.newaxis, ...],
        "class_logits_target:0": grad_test[1][np.newaxis, ...]
    }
    rates = [
        np.linalg.norm(g.eval(feed_dict, K.get_session()))
        / np.linalg.norm(w.eval(K.get_session()))
        for w, g in zip(weights, grads)
        if 'bias' not in w.name]

    mean_rate = np.mean(rates)
    # These values change with network structure
    if mean_rate < 5e-4 or mean_rate > 3e-1:
        print("Bad gradients ({:.3e}).".format(mean_rate))
        return False
    return True

# Model selection and training
You may have to change these callbacks to suit the dataset and model.
Note that calculating gradients and histograms on large, many-layered networks (ResNet and Inception) takes a long time (about 5 minutes for each epoch in which they are calculated), so you may only want to do this infrequently or not at all.

In [None]:
class LogLR(Callback):
    """Callback that adds the optimizer's learning rate to the epoch logs as 'lr'."""

    def __init__(self, **kwargs):
        # Keyword arguments are accepted for compatibility and ignored.
        super(LogLR, self).__init__()

    def on_epoch_end(self, epoch, logs=None):
        """Store the current learning rate under 'lr' unless it is already logged."""
        if logs is None:
            # No dict to write into, so there is nothing to record.
            return
        # Write into the dict we were given, even when it is still empty:
        # rebinding a new local dict would silently drop the value.
        if 'lr' not in logs:
            logs['lr'] = K.get_value(self.model.optimizer.lr)

def setup_callbacks(log_dir, schedule=None, hist=False, grads=False):
    """Create the training callbacks and the directory they write checkpoints to.

    Args:
        log_dir: Directory for TensorBoard logs; checkpoints go to
            ``<log_dir>/models`` (created if missing).
        schedule: 'plateau' to reduce the learning rate when val_loss stalls,
            a callable to use as a learning-rate schedule, or None for no
            learning-rate callback at all.
        hist: Histogram frequency passed to the TensorBoard callback (falsy
            means 0).
        grads: Passed to the TensorBoard callback as ``write_grads``.

    Returns:
        list: The configured callbacks.
    """
    # Plain Python instead of the `!mkdir -p` shell magic, so the function also
    # works outside IPython and with any characters in log_dir.
    os.makedirs(os.path.join(log_dir, "models"), exist_ok=True)
    best_path = os.path.join(log_dir, "models/best.{epoch:03d}-{val_loss:.4f}.h5")
    callbacks=[
        ModelCheckpoint(
            best_path, monitor='val_loss', verbose=1,
            save_best_only=True, save_weights_only=True, mode='auto', period=1),
        PRTensorBoard(
            log_dir=log_dir,
            histogram_freq=(hist or 0), batch_size=batch_size,
            write_graph=False,
            write_grads=grads,
            write_images=False),
#             EarlyStopping(
#                 monitor='val_loss', min_delta=0.0, patience=12, verbose=1, mode='auto'),
#         clr_callback.CyclicLR(base_lr=1e-4, max_lr=0.1, min_lr=0.01, step_size=(2*steps_per_epoch))
        LogLR()
    ]
    if schedule == 'plateau':
        callbacks.append(
            ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=30, cooldown=0, verbose=1))
    elif schedule is not None:
        # A scheduler wrapping None would fail as soon as the first epoch starts.
        callbacks.append(LearningRateScheduler(schedule, verbose=1))
    return callbacks

def go(model, callbacks, epochs, class_weights, initial_epoch=0):
    """Train *model* on batched ``train_gen`` data and return what fit_generator returns.

    Validation uses the pre-dumped ``val_data``. Batch size and step counts
    come from the module-level configuration.
    """
    training_batches = batching_gen(train_gen, batch_size=batch_size)
    fit_options = dict(
        validation_data=tuple(val_data),
        steps_per_epoch=steps_per_epoch,
        validation_steps=steps_per_epoch_val,
        class_weight=class_weights,
        callbacks=callbacks,
        epochs=epochs,
        verbose=1,
        initial_epoch=initial_epoch,
        workers=10,
    )
    return model.fit_generator(training_batches, **fit_options)

In [None]:
# Modify this class to change search parameters
class Experiment(object):
    """Hyper-parameters of one training run, plus the model built from them.

    An experiment can be turned into a one-line description with
    ``serialize()`` (also used as the log directory name) and restored from
    such a line with ``deserialize()``.
    """

    def __init__(self, data_name, batch_size, input_shape):
        """Set up the default configuration.

        ``num_classes`` and ``caption_type`` are taken from the module-level
        configuration; activation and loss follow from the caption type.
        """
        self.data_name = data_name
        self.num_classes = num_classes
        self.caption_type = caption_type
        self.model = None
        self.input_shape = input_shape
        self.batch_size = batch_size
        self.id = str(np.random.randint(0, 9999))
        if self.caption_type == 'single':
            self.activation = 'softmax'
            self.loss = 'categorical_crossentropy'
        else:
            self.activation = 'sigmoid'
            self.loss = 'binary_crossentropy'

        self.feature_extractor = VGG16
        self.model_name = self.feature_extractor.__name__
        self.learning_rate = 5e-5
        self.dropout = None
        self.train_features = False
        self.pool = 'avg'
        self.num_hidden_layers = 0
        self.num_hidden_neurons = 1024
        self.pretrained_weights = 'imagenet'
        self.class_weights = None
        self.kernel_regularizer_l2 = None
        self.activity_regularizer_l1 = None

    def random_init(self):
        """Draw a random configuration for hyper-parameter search."""
        self.feature_extractor = np.random.choice([VGG16, ResNet50, InceptionV3])
        self.model_name = self.feature_extractor.__name__
        # Log-uniform sampling of the learning rate and regularization factors.
        self.learning_rate = 10 ** np.random.uniform(-5, -4)
        self.dropout = np.random.uniform(0.0, 0.6)
        self.train_features = bool(np.random.binomial(1, 0.75))
        self.pool = np.random.choice(['avg', 'max'])
        # Draw a scalar: int() on a 1-element array is deprecated in NumPy.
        self.num_hidden_layers = int(np.random.choice([0, 1], p=(0.75, 0.25)))
        self.num_hidden_neurons = 1024
        self.pretrained_weights = np.random.choice(['imagenet', None])
        self.class_weights = None
        self.kernel_regularizer_l2 = 10 ** np.random.uniform(-5, -3)
        self.activity_regularizer_l1 = 10 ** np.random.uniform(-5, -3)

    def serialize(self):
        """Return the configuration as space-separated ``key:value`` fields."""
        self.model_name = self.feature_extractor.__name__
        return ' '.join([
            ':'.join([key, str(value)])
            for key, value in [
                ("id", self.id),
                ("data", self.data_name),
                ("batchsize", self.batch_size),
                ("activation", self.activation),
                ("model", self.model_name),
                ("head", "{:d}x{:d}".format(self.num_hidden_layers, self.num_hidden_neurons)),
                ("train", 'all' if self.train_features else 'heads'),
                ("from", str(self.pretrained_weights)),
                ("loss", self.loss.__name__ if callable(self.loss) else str(self.loss)),
                ("init_lr", "{:.3e}".format(self.learning_rate)),
                ("act_reg", "{:.3e}".format(self.activity_regularizer_l1 or 0)),
                ("ker_reg", "{:.3e}".format(self.kernel_regularizer_l2 or 0)),
                ("dropout", "{:.1f}".format(self.dropout or 0)),
                ("pool", str(self.pool)),
                ("CW", str(True if self.class_weights else False))]])

    def deserialize(self, string):
        """Restore the configuration from a line produced by ``serialize()``.

        Fields that do not consist of exactly one ``key:value`` pair are
        ignored. The data name is not restored. Class weights are never
        restored (they are not serialized).

        Raises:
            KeyError: If a required field is missing or the model name is not
                one of VGG16, ResNet50 or InceptionV3.
        """
        def optional(text):
            # serialize() writes str(None) for unset values; map it back.
            return None if text == 'None' else text

        strs = dict(([tuple(field.split(':')) for field in string.split(' ') if len(field.split(':')) == 2]))
        self.id = strs['id']
        self.model_name = strs['model']
        self.batch_size = int(strs['batchsize'])
        self.num_hidden_layers, self.num_hidden_neurons = [int(s) for s in strs['head'].split('x')]
        self.train_features = strs['train'] == 'all'
        self.pretrained_weights = optional(strs['from'])
        self.activation = strs['activation']
        self.loss = strs['loss']
        self.learning_rate = float(strs['init_lr'])
        # Mirror serialize(): act_reg is the activity L1, ker_reg the kernel L2.
        self.activity_regularizer_l1 = float(strs['act_reg'])
        self.kernel_regularizer_l2 = float(strs['ker_reg'])
        self.dropout = float(strs['dropout'])
        # Keep the pooling name itself; the head builder compares it to 'avg'/'max'.
        self.pool = optional(strs['pool'])
        self.class_weights = None #strs['CW'] == 'True' #TODO
        if self.class_weights:
            raise NotImplementedError("Have not yet serialized class weights")
        self.feature_extractor = {
            'VGG16': VGG16,
            'ResNet50': ResNet50,
            'InceptionV3': InceptionV3
        }[strs['model']]

    def describe(self):
        """Return the serialized configuration with one ``key: value`` per line."""
        return self.serialize().replace(' ', "\n").replace(":", ": ")

    def make_model(self):
        """Clear the Keras session and build a fresh compiled model in ``self.model``."""
        self.model = None
        K.clear_session()
        self.model = create_new_head(
            self.feature_extractor(
                include_top=False,
                weights=self.pretrained_weights,
                input_shape=self.input_shape),
            self,
            opt_params={'optimizer': 'nadam'}
        )

In [None]:
# Root directory; each experiment gets a sub-directory named after its serialized form.
search_output_dir = "/data/log/cnn/fd/tuesday3" # Change this output dir.
# Number of epochs passed to go() for every experiment.
num_epochs_train = 100
# Optional serialized Experiment to load the network structure from (None: use the defaults).
model_instance = None
# model_instance = "id:3880 data:ours batchsize:5 activation:sigmoid model:VGG16 head:0x1024 train:all from:imagenet loss:binary_crossentropy init_lr:2e-04 act_reg:1e-5 ker_reg:3e-5 dropout:0.25 pool:avg CW:False"
# Return value of go() for each experiment, keyed by experiment name.
history_data = {}

def lr_schedule_exp(epoch, base_lr=1e-3, gamma=0.98):
    """Return the exponentially decayed learning rate ``base_lr * gamma ** epoch``."""
    decay = pow(gamma, epoch)
    return base_lr * decay

### Pick one
schedule = 'plateau'
# schedule = lambda epoch, lr: lr_schedule_exp(epoch, base_lr=5e-5, gamma=0.98) # Exponential decay
# schedule = None


for attempt_no in range(1):
    # Every attempt starts from an empty Keras session and a fresh Experiment.
    K.clear_session()
    exp = Experiment(dataset_name, batch_size, image_dims)
    if model_instance:
        # Reuse the network structure (not the weights) of model_instance under a new random id.
        exp.deserialize(model_instance)
        exp.id = str(np.random.randint(0, 999))
    experiment_name = exp.serialize()
    print('=' * 80)
    print(exp.describe())
    # Comment this out if you want to reuse model in memory, but tensorboard stats
    # will be ruined unless you change initial_epoch
    exp.make_model()

    # All artefacts of this attempt live in a directory named after the experiment.
    log_dir = os.path.join(search_output_dir, experiment_name)
    model_def_path = os.path.join(log_dir, "model_def.json")
    model_weights_path = os.path.join(log_dir, "model_weights.h5")
    model_info_path = os.path.join(log_dir, "model.json")
    model_plot_path = os.path.join(log_dir, "precision-recall.png")

    print(experiment_name)
    print(log_dir)
    print("Training: {:d} layers".format(sum(1 for layer in exp.model.layers if layer.trainable)))

    # Train from epoch 0 with the experiment's initial learning rate.
    K.set_value(exp.model.optimizer.lr, exp.learning_rate)
    callbacks = setup_callbacks(log_dir, schedule=schedule, hist=2, grads=True)
    history_data[experiment_name] = go(
        exp.model,
        callbacks,
        num_epochs_train,
        exp.class_weights,
        initial_epoch=0)

    # Evaluate on the test data at a 0.5 threshold.
    Y_true, Y_pred, TP = evaluate_model(exp.model, test_data, thresh=0.5)
    display_performance(Y_true, Y_pred, TP)

    save_model(
        exp.model,
        name=experiment_name,
        class_map_r=caption_map_r,
        prediction_type=caption_type,
        model_weights_path=model_weights_path,
        model_def_path=model_def_path,
        model_info_path=model_info_path,
        test_metrics=None,
        history=history_data[experiment_name],
        description="Test model for 5 FDs"
    )

# Persist only the plain `.history` dict of every run.
if history_data:
    with open(os.path.join(search_output_dir, "history-{:d}epoch.pkl".format(num_epochs_train)), "wb") as file:
        pickle.dump({name: run.history for name, run in history_data.items()}, file)

### Should you need to load this pkl:
# with open(os.path.join(search_output_dir, "history-100epoch.pkl"), "rb") as file:
#     history = pickle.load(file)

In [None]:
# lrs = [key[1] for key, val  in sorted(history_data.items(), key=lambda x: x[0])]
# val_loss = [history.history['val_loss'][-1] for key, history  in sorted(history_data.items(), key=lambda x: x[0])]
# loss = [history.history['loss'][-1] for lr, history  in sorted(history_data.items(), key=lambda x: x[0])]
# acc = [history.history['binary_accuracy'][-1] for lr, history  in sorted(history_data.items(), key=lambda x: x[0])]
# val_acc = [history.history['val_binary_accuracy'][-1] for lr, history  in sorted(history_data.items(), key=lambda x: x[0])]

# plt.figure()
# plt.subplot(1, 2, 1)
# plt.semilogx(lrs, loss, '.b', label='loss')
# plt.semilogx(lrs, val_loss, '.r', label='val_loss')
# plt.legend()
# plt.title("Loss Vs. LR (100 Epoch)")
# plt.subplot(1, 2, 2)
# plt.semilogx(lrs, acc, '.b', label='binary_accuracy')
# plt.semilogx(lrs, val_acc, '.r', label='val_binary_accuracy')
# plt.legend()
# plt.title("Accuracy Vs. LR (100 Epoch)")
# plt.tight_layout()

# Continue training specific model

In [None]:
# Drop any model left in the notebook and reset the Keras session before loading.
model = None
K.clear_session()
search_output_dir = "/data/log/cnn/fd/tuesday3" # Change this output dir.
model_instance = "id:2274 data:ours batchsize:50 activation:sigmoid model:VGG16 head:0x1024 train:heads from:imagenet loss:binary_crossentropy init_lr:5.000e-05 act_reg:0.000e+00 ker_reg:0.000e+00 dropout:0.0 pool:avg CW:False"
model_best_weight = "model_weights.h5"

### Don't set below
exp = Experiment(dataset_name, batch_size, image_dims)
exp.deserialize(model_instance)
exp.id = "c{:d}".format(np.random.randint(0,999))
experiment_name = exp.serialize()

# Inputs: artefacts of the run that is being continued.
model_dir_in = os.path.join(search_output_dir, model_instance)
model_weights_in_path = os.path.join(model_dir_in, model_best_weight)
model_info_in_path = os.path.join(model_dir_in, "model.json")

# Outputs: everything for the continued run goes under "<original run>/continued/<new name>".
log_dir = os.path.join(model_dir_in, "continued", experiment_name)
best_path = os.path.join(log_dir, "models/best.{epoch:03d}-{val_loss:.4f}.h5")
model_def_path = os.path.join(log_dir, "model_def.json")
# Defined here so that saving the continued model writes into log_dir instead of
# reusing (or missing) a model_weights_path left over from another cell.
model_weights_path = os.path.join(log_dir, "model_weights.h5")
model_info_path = os.path.join(log_dir, "model.json")
model_plot_path = os.path.join(log_dir, "precision-recall.png")

print(model_weights_in_path)
if not os.path.exists(model_weights_in_path):
    raise OSError("path does not exist")
# Same effect as `mkdir -p`, without shelling out.
os.makedirs(os.path.join(log_dir, "models"), exist_ok=True)

print("loading")
print(model_info_in_path)
exp.model = Inference(model_info_in_path).model
# base_model = p.feature_extractor(
#     include_top=False, weights=p.pretrained_weights, input_shape=image_dims)
# model = create_new_head(
#     base_model, p.num_classes, p.caption_type, p, 
#     opt_params={'optimizer': 'nadam'}
# )
# Load the weights file selected by model_best_weight (the same file whose existence was checked above).
exp.model.load_weights(model_weights_in_path)
# model = add_model_regularization(model, params.kernel_regularizer_l2, params.activity_regularizer_l1)

# for layer in model.layers:
#     layer.trainable = True

exp.model.compile( # TODO, load this from JSON, manually change this if you are doing single label
    'nadam',
    loss=exp.loss,
    metrics=['binary_accuracy', 'categorical_accuracy'])

print(experiment_name)
print(log_dir)
print("Training: {:d} layers".format(len([1 for layer in exp.model.layers if layer.trainable])))

In [None]:
# Epoch counter to resume from, and the (cumulative) epoch to stop at.
initial_epoch = 100
num_epoch = 200 # cumulative with initial_epoch
class_weights = None # Can't currently resume training with imbalance data #TODO
new_learning_rate = 5e-4
### Pick one schedule
schedule = 'plateau'
# schedule = lambda epoch, lr: lr_schedule_exp(epoch, base_lr=5e-5, gamma=0.98) # Exponential decay
# schedule = None


#### Don't set below

# Overwrite the optimizer's learning rate before resuming.
K.set_value(exp.model.optimizer.lr, new_learning_rate)
callbacks = setup_callbacks(log_dir, schedule=schedule, hist=2, grads=True)
history_data[experiment_name] = go(
    exp.model,
    callbacks,
    num_epoch,
    exp.class_weights,
    initial_epoch=initial_epoch)

Y_true, Y_pred, TP = evaluate_model(exp.model, test_data, thresh=0.5)
display_performance(Y_true, Y_pred, TP)

# NOTE(review): model_weights_path is not assigned in this cell; make sure it
# points inside log_dir before saving.
save_model(
    exp.model,
    name=experiment_name,
    class_map_r=caption_map_r,
    prediction_type=caption_type,
    model_weights_path=model_weights_path,
    model_def_path=model_def_path,
    model_info_path=model_info_path,
    test_metrics=None,
    history=history_data[experiment_name],
    description="Test model for 5 FDs"
)

# LR Search

In [None]:
class LrSearch():
    """Random search over learning rates for one serialized experiment.

    Each sampled learning rate is trained for a few epochs starting from the
    same initial weights; the resulting histories are kept in ``self.history``
    keyed by learning rate and can be summarised with ``plot()``.
    """

    def __init__(self, instance_str):
        """Parse ``instance_str`` into the parameters used to build each model."""
        self.model = None
        self.model_init_w = None
        self.model_instance = instance_str
        # Parse the string that was passed in; previously the notebook-global
        # `model_instance` was read here, silently ignoring the argument.
        self.params = ExperimentParameters.from_string(instance_str)
        self.history = {}

    def new_model(self):
        """Build a new model; every model after the first is reset to the first one's weights."""
        p = self.params
        # Only the very first model's weights are remembered.
        save_weights = self.model is None

        self.model = None
#         K.clear_session()
        # NOTE(review): this create_new_head call passes num_classes and
        # caption_type positionally, unlike the call in make_model above --
        # confirm it matches the current create_new_head signature.
        self.model = create_new_head(
            p.feature_extractor(
            include_top=False, weights=p.pretrained_weights, input_shape=image_dims),
            p.num_classes, p.caption_type, p, 
            opt_params={'optimizer': 'nadam'}
        )

        if save_weights:
            self.model_init_w = self.model.get_weights()
        else:
            self.model.set_weights(self.model_init_w)
        print("Trainable layers: {:d}".format(sum(layer.trainable for layer in self.model.layers)))

    def go(self, epochs, num_steps=10):
        """Train ``num_steps`` models for ``epochs`` epochs each, one per sampled learning rate.

        Learning rates are drawn as ``10 ** uniform(-9, 0)``.  Previous results
        are discarded.  Returns the dict mapping learning rate to the object
        returned by ``fit_generator``.
        """
        self.history = {}

        for base_lr in 10 ** np.random.uniform(-9, 0, num_steps):
            print("Learning rate = {:.3e}".format(base_lr))
            self.new_model()
            K.set_value(self.model.optimizer.lr, base_lr)
            # train_gen, val_data, batch_size and the step counts are notebook globals.
            self.history[base_lr] = self.model.fit_generator(
                batching_gen(train_gen, batch_size=batch_size),
                validation_data=tuple(val_data),
                steps_per_epoch=steps_per_epoch,
                validation_steps=steps_per_epoch_val,
                class_weight=None,
                callbacks=[], 
                epochs=epochs,
                verbose=1, workers=10)
        return self.history

    def plot(self):
        """Plot final accuracy/loss and their change since the first epoch against learning rate.

        Raises:
            ValueError: if there are no results to plot yet.
        """
        if not self.history:
            raise ValueError("No learning-rate search results to plot; call go() first")

        columns = ['lr', 'binary_accuracy', 'loss', 'diff_acc', 'diff_loss']
        rows = []
        for lr, h in self.history.items():
            acc = h.history['binary_accuracy']
            loss = h.history['loss']
            # Final value of each metric plus how far it moved from the first epoch.
            rows.append([lr, acc[-1], loss[-1], acc[-1] - acc[0], loss[-1] - loss[0]])
        df = pd.DataFrame(rows, columns=columns, dtype=float).set_index('lr').sort_index()
        df.plot(logx=True)

In [None]:
# Experiment whose learning rate is being searched.
model_instance = "id:3880 data:ours batchsize:5 activation:sigmoid model:VGG16 head:0x1024 train:heads from:imagenet loss:binary_crossentropy init_lr:6.582e-05 act_reg:1e-4 ker_reg:1e-4 dropout:0.25 pool:avg CW:False"
search = LrSearch(model_instance)
# 50 sampled learning rates, 5 epochs each.
search.go(epochs=5, num_steps=50)
search.plot()

# Heads only: 4.4e-5 to 2e-2

# The cells below are untested with the new changes; don't use them

In [None]:
# layer=model.layers[-3]
# print(layer)
# for weight in layer.weights:
#     weight.initializer.run(session=K.get_session())
# # w = layer.get_weights()
# # plt.figure()
# # plt.hist(w[0].ravel(), bins=100)

In [None]:
# # More training
# K.set_value(model.optimizer.lr, 1e-4)

# go(300, class_weights, initial_epoch=200)
# (Y_true, Y_pred, TP) = evaluate_model(model, test_data, thresh=0.5)
# display_performance(Y_true, Y_pred, TP)

# # save_model(
# #     model, name=experiment_name + "-second",
# #     class_map_r=caption_map_r, prediction_type=caption_type,
# #     model_weights_path=model_weights_path, model_def_path=model_def_path, model_info_path=model_info_path,
# #     test_metrics=None, history=history_data[experiment_name],
# #     description="Test model for 5 FDs"
# # )

# Model Testing

In [None]:
from herbicide.utils import vis_square

# A test sample is "right" only when every entry of its TP row is true.
TP_mask = np.logical_and.reduce(TP, axis=1)
wrong = test_data[0][~TP_mask]
right = test_data[0][TP_mask]
wrong.shape

plt.figure()
vis_square(wrong)
plt.title("Incorrectly Predicted")

plt.figure()
vis_square(right)
plt.title("Correctly Predicted")

# Binary coded the labels then count them wrt TP/FP
print("num labels", test_data[1].sum(axis=0))
# Positional weights that turn a label row into one integer code (first class = highest bit).
_bit_weights = 2 ** np.arange(num_classes)[::-1]
coded = np.sum(test_data[1][~TP_mask] * _bit_weights, axis=1).astype(int)
print("binary coded class error count:", dict(sorted(Counter(coded).items(), key=lambda item: item[0])))
coded = np.sum(test_data[1][TP_mask] * _bit_weights, axis=1).astype(int)
print("binary coded class correct count:", dict(sorted(Counter(coded).items(), key=lambda item: item[0])))
print(Y_pred[TP_mask])

# Learning Curve

In [None]:
# def learning_curve(dataset, lr, steps, val_data, log_dir):
#     def save_model(path):
#         print("Saving", path)
#         os.makedirs(os.path.dirname(path), exist_ok=True)
#         model.save_weights(path)
#     def setup_callbacks():
#         return [
# #                 ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, cooldown=5, verbose=1),
# #                 ModelCheckpoint(
# #                     model_best_path, monitor='val_loss', verbose=1,
# #                     save_best_only=True, save_weights_only=True, mode='auto', period=1),
# #                 ModelCheckpoint(
# #                     best_path, monitor='val_loss', verbose=1,
# #                     save_best_only=False, save_weights_only=True, mode='auto', period=50),
#                 PRTensorBoard(
#                     log_dir=model_log_dir,
#                     histogram_freq=0,
#                     batch_size=batch_size,
#                     write_graph=False,
#                     write_grads=False,
#                     write_images=False),
#         #         EarlyStopping(
#         #             monitor='val_loss', min_delta=0.0, patience=40, verbose=1, mode='auto')
#         ]
#     def create_new_model(load_base=False):
#         clear_session()
#         model = create_new_head(
#             InceptionV3(include_top=False, weights='imagenet', input_shape=image_dims),
#             num_classes, caption_type, opt_params={'optimizer': Nadam()},
#             class_weights=None, train_features=False, l2_reg=None)
#         if load_base:
#             print("Loading base model")
#             model.load_weights(base_model_path, by_name=True)
#         return model

#     def train():
#         print("Training")
#         K.set_value(model.optimizer.lr, lr)
#         history[subset_size] = model.fit_generator(
#             batching_gen(gen, batch_size=batch_size),
#             validation_data=tuple(val_data),
#             steps_per_epoch=(subset_size // batch_size),
#             validation_steps=steps_per_epoch_val,
#             class_weight=model_class_weights,
#             callbacks=setup_callbacks(), 
#             epochs=50,
#             verbose=1)
#     model_class_weights = None
#     model = None
#     model_path = None
#     image_ids = [image['id'] for image in dataset.imgs.values()]
#     np.random.shuffle(image_ids)
#     num_images = len(image_ids)
#     print("num_images", num_images)
#     history = {}
#     base_model_path = os.path.join(log_dir, "base", "weights.h5")
#     model_path = base_model_path
#     for subset_size in np.linspace(0, num_images, steps + 1).astype(int):
#         if subset_size > 0:
#             imgIds = image_ids[:subset_size]
#             gen = pipeline(
#                 dataset.generator(shuffle_ids=False, imgIds=imgIds),
#                 aug_config=None)
#             model_class_weights = calc_class_weights(gen, dataset) # TODO

#             model_path = os.path.join(log_dir, "subset-of-{:d}/weights.h5".format(subset_size))
#             model_log_dir = os.path.dirname(model_path)
#             model_best_path = os.path.join(log_dir, "subset-of-{:d}/best.h5".format(subset_size))
#             os.makedirs(model_log_dir, exist_ok=True)

#             print("learning curve(lr={:.3e}, size={:d})".format(lr, subset_size))
#             print("model_log_dir", model_log_dir)
#             print("training class weights")
#             print(model_class_weights)
#         model = create_new_model(load_base=(subset_size > 0))
#         if subset_size:
#             train()
#         save_model(model_path)
#     return history

# model = None
# lr = 1e-5
# learning_curve_dir = "/data/log/cnn/fd/learning_curve_5--{:.2e}".format(lr)
# lc_history = learning_curve(coco_train, lr, 5, val_data, learning_curve_dir)
# val_loss = np.array([(size, h.history['val_loss'][-1]) for size, h in lc_history.items()])
# train_loss = np.array([(size, h.history['loss'][-1]) for size, h in lc_history.items()])
# plt.figure()
# plt.plot(train_loss[:, 0], train_loss[:, 1], 'b.')
# plt.plot(val_loss[:, 0], val_loss[:, 1], 'r.')
# plt.xlabel("Number of Training Samples")
# plt.ylabel("Loss")
# plt.savefig(os.path.join(learning_curve_dir, "plot.png"), dpi=150)

In [None]:
# !rm -R /data/log/cnn/fd/learning-curve/

In [None]:
# images = None
# for images, labels in batching_gen(train_gen, batch_size=batch_size):
#     print(images.shape, labels.shape)
    
#     pred = model.predict(images)
#     print(labels)
#     print(pred)
#     print(K.eval(K.tf.losses.sigmoid_cross_entropy(labels, pred)))
#     break

In [None]:
# for unique_label in np.unique(val_data[1], axis=0):
#     unique_data = [val_data[0][i] for i in range(len(val_data[0])) if np.all(val_data[1][i] == unique_label)]
#     num_data = len(unique_data)
#     print(unique_label, num_data)
#     plt.figure()
#     vis_square(np.array(unique_data))
#     plt.title(unique_label)

# Check Update/Weight Ratio

In [None]:

# for i, (w, g) in enumerate(zip(weights, grads)):
#     grad_norm = np.linalg.norm(g.eval(feed_dict, K.get_session()))
#     weight_norm = np.linalg.norm(w.eval(K.get_session()))
#     rate = grad_norm / weight_norm
#     print(i, rate)


# Visualize filters

In [None]:
from herbicide.utils import vis_square

# Find the first trainable kernel that has 4 axes and show its filters.
# The transpose below needs exactly 4 axes, so other kernels (e.g. 2-D ones)
# are printed but skipped instead of crashing the cell, and a layer without
# any kernel no longer leaves `value` undefined or stale.
conv_kernel = None
for layer in model.layers:
    for weight in layer.trainable_weights:
        if 'kernel' not in weight.name:
            continue
        print(weight.name)
        value = K.eval(weight.value())
        print(value.shape)
        if value.ndim == 4:
            conv_kernel = value
            break
    if conv_kernel is not None:
        break

if conv_kernel is None:
    print("No trainable 4-D kernel found; nothing to visualize")
else:
    plt.figure()
    # Move the last axis to the front so vis_square receives one entry per filter.
    # NOTE(review): assumes the kernel layout is (rows, cols, in, out) -- confirm.
    vis_square(conv_kernel.transpose((3, 0, 1, 2)))


In [None]:
caption_stats = []
for i, (image, caption) in enumerate(coco_train.generator(imgIds=balanced_image_ids_train)):
    plt.figure()
    plt.imshow(image)
    plt.title(str(caption))
    if i == 10:
        break
    caption_stats.append(caption)