## Findings
  * general
    * batch size should be at least 1x(number of classes) in order to take advantage of MTL
    * Always start training a model with dropout=ker_reg=act_reg=0, look at bias and variance then add until good fit
    * No pooling gives very fast results but strong overfitting and large model size
    * Don't worry about class weights unless heavily (20x or more) imbalanced
    * head:0x** works best, any more layers underfits
    * Training with dropout makes val_loss very noisy, so automatic best-model saving doesn't work well; enable regular (periodic) saving instead
    * Don't use Dropout and Batch Norm together (if you do, use a very small dropout rate; see https://arxiv.org/pdf/1801.05134.pdf)
    
  * multi-label output
    * sigmoid output makes model very sensitive to learning_rate.
      * I have found with VGG16 around 5e-5 is a good start
      * Use e.g. setup_callbacks(hist=2, grads=True) to enable gradient outputs; check whether class_logits_out is becoming spread between 0 and 1, and check that the gradients are not 0 (they should be around 1e-3).
  * single-label output
    * Pretty stable with any architecture
    

## Bugs
  * There may be a GPU memory leak somewhere... keras does not recycle models properly. I'll try to find this.

In [None]:
%matplotlib notebook

import os
# os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda/lib64:' + os.environ['LD_LIBRARY_PATH']
from abyss_deep_learning.utils import config_gpu
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import keras.backend as K
config_gpu([1])

# Setup application specifics
Configure the cell below:
* Training augmentation (AUG_CONFIG)
* Pre- and post- processing of data
* Various arguments
* Caption map and caption translator

In [None]:
def setup_args():
    '''Assemble the notebook-wide configuration dictionary.

    The returned dict bundles the caption map and its translator, the training
    augmentation, dataset location, image size/dtype, a handful of boolean
    switches, and three callables ('preprocess_data', 'postprocess_data',
    'pipeline') that read their settings back out of the same dict.
    '''
    from keras.applications.resnet50 import preprocess_input
    from bidict import bidict
    from imgaug import augmenters as iaa
    from imgaug.parameters import Normal
    from skimage.transform import resize

    from abyss_deep_learning.datasets.translators import AnnotationTranslator

    class CaptionMapper(AnnotationTranslator):
        '''Turn a comma-separated caption string into a list of integer class ids.'''
        def __init__(self, caption_map):
            self.caption_map = caption_map
            self.num_classes = len(caption_map)

        def filter(self, annotation):
            # Only annotations that carry a caption are of interest.
            return 'caption' in annotation

        def translate(self, annotation):
            # Captions missing from the map are dropped silently.
            class_ids = []
            for caption in annotation['caption'].split(','):
                if caption in self.caption_map:
                    class_ids.append(self.caption_map[caption])
            return class_ids

    def preprocess_data(image):
        '''Resize the image to config['image_dims'] and apply preprocess_input(mode='tf').'''
        resized = resize(image, config['image_dims'], preserve_range=True, mode='constant')
        as_float = resized.astype(config['nn_dtype'])
        return preprocess_input(as_float, mode='tf')

    def postprocess_data(image):
        '''Map a preprocessed image back to uint8 via (image + 1) * 127.5, for display.'''
        rescaled = (image + 1) * 127.5
        return rescaled.astype(np.uint8)

    def pipeline(gen, aug_config=None):
        '''Wrap a dataset generator with multi-hot encoding followed by augmentation.

        Augmentation is only enabled when aug_config is given.
        '''
        from abyss_deep_learning.keras.classification import multihot_gen, augmentation_gen

        encoded = multihot_gen(gen, num_classes=config['num_classes'])
        return augmentation_gen(encoded, aug_config, enable=(aug_config is not None))

    caption_map = bidict({"IP": 0, "ED": 1})

    # Geometric augmentation: random flips plus one affine warp.
    geometric_steps = [
        iaa.Fliplr(0.5),
        iaa.Flipud(0.5),
        iaa.Affine(
            scale=(0.8, 1.2),
            translate_percent=(-0.2, 0.2),
            rotate=(-22.5, 22.5),
            mode='constant', cval=0, order=0
        ),
    ]
    # Colour augmentation: add noise to each channel while in HSV, then convert back.
    colour_steps = [iaa.ChangeColorspace(from_colorspace="RGB", to_colorspace="HSV")]
    for channel in range(3):
        colour_steps.append(iaa.WithChannels(channel, iaa.Add(Normal(0, 256 / 6))))
    colour_steps.append(iaa.ChangeColorspace(from_colorspace="HSV", to_colorspace="RGB"))

    train_augmenter = iaa.Sequential(geometric_steps + [iaa.Sequential(colour_steps)])

    caption_types = ['single', 'multi']
    data_location = {
        'base_dir': "/data/abyss/projectmax/feature-detection/large-fromCF",
        'name': "alltogether-unique",
        'sets': ('train', 'val', 'test')
    }

    config = {
        'annotation_translator': CaptionMapper(caption_map),
        'augmentation': train_augmenter,        # Training augmentation
        'caption_map': caption_map,             # Caption string <-> class id (bidict)
        'caption_type': caption_types[1],       # Either "single" or "multi";
                                                # this sets up various other parameters in the system.
        'data': data_location,
        'image_dims': (480//2, 640//2, 3),      # What to resize images to before the CNN
        'nn_dtype': np.float32,                 # Pretrained networks are in float32
        'num_classes': len(caption_map),
        'use_balanced_set': False,              # Force the use of the largest class-balanced dataset
        'use_cached': False,                    # Cache the dataset in memory
        'use_class_weights': True,              # Use class population to weight in the training loss
        'use_parallel': False,                  # Use multiple GPUs
        'preprocess_data': preprocess_data,
        'postprocess_data': postprocess_data,
        'pipeline': pipeline
    }

    return config
# Notebook-wide configuration dictionary; later cells read it as the global ARGS.
ARGS = setup_args()

# Setup Datasets

In [None]:
def setup_datasets(args):
    '''Load one ImageClassificationDataset per entry of args['data']['sets'].

    Each dataset is read from "<base_dir>/<name>/<set>.json" and its class
    statistics are printed as it is loaded.

    Returns a dict mapping set name to dataset.
    '''
    from abyss_deep_learning.datasets.coco import ImageClassificationDataset

    data_cfg = args['data']
    loaded = {}
    for split in data_cfg['sets']:
        json_path = os.path.join(
            data_cfg['base_dir'],
            "{:s}/{:s}.json".format(data_cfg['name'], split))
        loaded[split] = ImageClassificationDataset(
            json_path,
            translator=args['annotation_translator'],
            cached=args['use_cached'],
            preprocess_data=args['preprocess_data'])
        print("\n", split)
        loaded[split].print_class_stats()

    # Summary once every split has been loaded.
    print("\nNumber of classes:", args['num_classes'])
    print("captions:")
    print(args['caption_map'])
    return loaded
# Dict of datasets keyed by set name; later cells read it as the global DATASET.
DATASET = setup_datasets(ARGS)

In [None]:
def test_dataset_speed(set_name='train'):
    '''Draw a single sample from DATASET[set_name] and print a short summary.

    Intended to be wrapped in %%timeit to measure how long one sample takes.
    Prints "BG" when the target sums to zero, otherwise the image shape and
    the target.

    set_name: key into DATASET selecting which set to sample.
    '''
    # Use the requested set rather than always sampling 'train'.
    image, target = DATASET[set_name].sample()
    if np.sum(target) == 0:
        print("BG")
    else:
        print(image.shape)
        print(target)

In [None]:
# Uncomment the magic below to time how long drawing one sample takes.
# %%timeit -n5 -r1
test_dataset_speed('train')

In [None]:
def view_dataset_samples(num_rows=2):
    '''Plot num_rows samples from every dataset in DATASET, one column per dataset.

    Each image is run through ARGS['pipeline'] (without augmentation), mapped
    back to uint8 with ARGS['postprocess_data'] and titled with its captions.
    After each column, the shape, label and value range of the last image drawn
    are printed.

    num_rows: number of samples to draw per dataset.
    '''
    from itertools import islice

    # One column per dataset instead of assuming exactly three sets.
    num_cols = len(DATASET)
    plt.figure()
    print("Column-wise left to right, bottom row:")
    for col, (name, ds) in enumerate(DATASET.items()):
        print(name, end=' ')
        samples = ARGS['pipeline'](ds.generator(shuffle_ids=True))
        image = label = None
        for row, (image, label) in enumerate(islice(samples, num_rows)):
            print(label)
            plt.subplot(num_rows, num_cols, num_cols * row + col + 1)
            plt.imshow(ARGS['postprocess_data'](image))
            # flatnonzero yields scalar indices, so int() never sees an array.
            # assumes label is a 1-D multi-hot vector — TODO confirm
            plt.title(', '.join(
                [ARGS['caption_map'].inv[int(cap_id)] for cap_id in np.flatnonzero(label)]))
            plt.axis('off')
        if image is None:
            # Nothing was drawn for this dataset, so there is nothing to summarise.
            print('no samples')
            continue
        print('shape: {}, label: {}, min: {:.1f}, mean: {:.1f}, max: {:.1f}'.format(
            image.shape, label, image.min(), image.mean(), image.max()))

# Preview two samples from each dataset.
view_dataset_samples(num_rows=2)

In [None]:
# This cell intentionally left blank due to display bug above.

In [None]:
# %%timeit -n1 -r1
# Takes 1.52 seconds with pipeline, use_cached=False
# Takes 1.52 seconds without pipeline, use_cached=False
# Takes  seconds with pipeline, use_cached=True
# Takes  seconds without pipeline, use_cached=True
def dump_dataset(dataset, num_data, aug_config=None):
    '''Materialise up to num_data pipeline outputs from a dataset into arrays.

    dataset: object whose generator() is fed through ARGS['pipeline'].
    num_data: maximum number of (datum, target) pairs to collect.
    aug_config: augmentation passed to the pipeline; None disables it.

    Returns (data, targets) of dtype ARGS['nn_dtype']. If the generator yields
    fewer than num_data items, the arrays are trimmed to the number actually
    collected so that no uninitialised rows are returned.
    '''
    from itertools import islice

    data = np.empty((num_data,) + tuple(ARGS['image_dims']), dtype=ARGS['nn_dtype'])
    targets = np.empty((num_data, ARGS['num_classes']), dtype=ARGS['nn_dtype'])
    stream = ARGS['pipeline'](dataset.generator(), aug_config)
    count = 0
    # islice bounds the loop, so num_data == 0 collects nothing instead of failing.
    for datum, target in islice(stream, num_data):
        data[count], targets[count] = datum, target
        count += 1
    return data[:count], targets[:count]

# (data, targets) arrays for the whole 'val' set, without augmentation; read later as the global VAL_DATA.
VAL_DATA = dump_dataset(DATASET['val'], num_data=len(DATASET['val'].data_ids), aug_config=None)

# Setup model

In [None]:
# Modify this class to change search parameters
class Experiment(object):
    '''Hyper-parameters, Keras model and training helpers for one experiment.

    The class reads the notebook globals ARGS, DATASET, VAL_DATA, K, np, os and
    plt, so the cells defining them must have been run first.
    '''

    # Backbone names understood by make_backbone() and deserialize().
    BACKBONE_NAMES = ('VGG16', 'ResNet50', 'InceptionV3')

    def __init__(self, data_name, input_shape):
        '''Initialise the experiment with default hyper-parameters.

        data_name: dataset name recorded in serialize(); None falls back to
            ARGS['data']['name'].
        input_shape: input shape handed to the backbone constructor.
        '''
        self.callbacks = None
        self.history = dict()
        # Honour the argument instead of always reading the global configuration.
        self.data_name = ARGS['data']['name'] if data_name is None else data_name
        self.num_classes = ARGS['num_classes']
        self.caption_type = ARGS['caption_type']
        self.model = None
        self.model2 = None
        self.input_shape = input_shape
        self.batch_size = ARGS['num_classes']
        self.id = str(np.random.randint(0, 9999))
        if self.caption_type == 'single':
            self.activation = 'softmax'
            self.loss = 'categorical_crossentropy'
        else:
            self.activation = 'sigmoid'
            self.loss = 'binary_crossentropy'

        self.backbone = 'VGG16'
        self.learning_rate = 5e-5
        self.dropout = None
        self.train_features = True
        self.pool = 'avg'
        self.num_hidden_layers = 0
        self.num_hidden_neurons = 0
        self.pretrained_weights = 'imagenet'
        self.class_weights = None
        self.kernel_regularizer_l2 = None
        self.activity_regularizer_l1 = None

    @property
    def backbone(self):
        '''Name of the backbone architecture (a key of make_backbone's lookup table).'''
        return self._backbone

    @backbone.setter
    def backbone(self, value):
        # model_name mirrors the backbone so serialize() reports it under "MO".
        self._backbone = value
        self.model_name = value

    def random_init(self):
        '''Randomise backbone, learning rate, dropout and regularizers for a parameter search.'''
        # Modify this for parameter search
        self.backbone = np.random.choice(['VGG16', 'ResNet50', 'InceptionV3'])

        self.learning_rate = 10 ** np.random.uniform(-5, -3)
        self.dropout = np.random.uniform(0.0, 0.5)
        self.kernel_regularizer_l2 = 10 ** np.random.uniform(-5, -2)
        self.activity_regularizer_l1 = 10 ** np.random.uniform(-5, -2)

    def serialize(self):
        '''Return the settings as a single line of space-separated "KEY:value" fields.'''
        if self.class_weights is None:
            class_weights = str(None)
        else:
            class_weights = ','.join(
                "{:d}={:.2f}".format(k, v) for k, v in self.class_weights.items())
        fields = [
            ("ID", self.id),
            ("DS", self.data_name),
            ("BS", self.batch_size),
            ("AC", self.activation),
            ("MO", self.model_name),
            ("HE", "{:d}x{:d}".format(self.num_hidden_layers, self.num_hidden_neurons)),
            ("TR", 'all' if self.train_features else 'heads'),
            ("FT", str(self.pretrained_weights)),
            ("LF", self.loss.__name__ if callable(self.loss) else str(self.loss)),
            ("LR", "{:.1e}".format(self.learning_rate)),
            ("AR", "{:.1e}".format(self.activity_regularizer_l1 or 0)),
            ("KR", "{:.1e}".format(self.kernel_regularizer_l2 or 0)),
            ("DO", "{:.2f}".format(self.dropout or 0)),
            ("PO", str(self.pool)),
            ("CW", class_weights),
        ]
        return ' '.join(':'.join([key, str(value)]) for key, value in fields)

    def deserialize(self, string):
        '''Restore settings from a string produced by serialize().

        Fields that do not split into exactly one "KEY:value" pair are ignored.
        The "DS" field is not restored; data_name keeps its current value.

        Raises:
            KeyError: if a required field is missing.
            ValueError: if the "MO" field is not one of BACKBONE_NAMES.
        '''
        pairs = [field.split(':') for field in string.split(' ')]
        strs = {pair[0]: pair[1] for pair in pairs if len(pair) == 2}

        # Validate before mutating anything so a bad string leaves the object untouched.
        backbone = strs['MO']
        if backbone not in self.BACKBONE_NAMES:
            raise ValueError(
                "Unknown backbone {!r}; expected one of {}".format(backbone, self.BACKBONE_NAMES))

        self.id = strs['ID']
        self.batch_size = int(strs['BS'])
        self.num_hidden_layers, self.num_hidden_neurons = [int(s) for s in strs['HE'].split('x')]
        self.train_features = strs['TR'] == 'all'
        self.pretrained_weights = None if strs['FT'] == 'None' else strs['FT']
        self.activation = strs['AC']
        self.loss = strs['LF']
        self.learning_rate = float(strs['LR'])
        # "AR" is the activity (L1) regularizer and "KR" the kernel (L2) one, as in serialize().
        self.activity_regularizer_l1 = float(strs['AR'])
        self.kernel_regularizer_l2 = float(strs['KR'])
        self.dropout = float(strs['DO'])
        self.pool = strs['PO']
        if strs['CW'].lower() in ['none', 'false', '0']:
            self.class_weights = None
        else:
            self.class_weights = {
                int(field.split('=')[0]): float(field.split('=')[1])
                for field in strs['CW'].split(',')
            }
        # make_backbone() looks the architecture up by name, so store the name
        # (the setter also updates model_name).
        self.backbone = backbone

    def describe(self):
        '''Return serialize() reformatted with one "KEY: value" per line.'''
        return self.serialize().replace(' ', "\n").replace(":", ": ")

    def make_backbone(self, with_reg=False):
        '''Instantiate the backbone network named by self.backbone, without its top.

        with_reg: when True, attach the configured activity/kernel regularizers
            to the backbone layers and rebuild the model from its config.

        Raises:
            KeyError: if self.backbone is not a known architecture name.
        '''
        from keras.models import Model
        from keras.applications.inception_v3 import InceptionV3
        from keras.applications.resnet50 import ResNet50
        from keras.applications.vgg16 import VGG16

        def add_model_regularization(model, activity_regularizer_l1, kernel_regularizer_l2):
            '''Assign regularization to layers, save the model structure and reload it to make the losses contribute.'''
            from keras.regularizers import l1_l2

            for layer in model.layers:
                if not layer.trainable or 'batch_norm' in layer.name:
                    continue
                if hasattr(layer, 'kernel_regularizer') and kernel_regularizer_l2:
                    print("kernel_regularizer: ", layer.name)
                    if 'kernel' in layer.weights[0].name:
                        # The coefficient is divided by the number of kernel elements.
                        size = np.prod(layer.weights[0].shape.as_list())
                        if size:
                            layer.kernel_regularizer = l1_l2(0, kernel_regularizer_l2 / size)
                if hasattr(layer, 'activity_regularizer') and activity_regularizer_l1 and layer.name != 'predictions':
                    print("activity_regularizer: ", layer.name)
                    # The coefficient is divided by the number of output elements per sample.
                    size = np.prod(layer.get_output_shape_at(0)[1:])
                    if size:
                        layer.activity_regularizer = l1_l2(activity_regularizer_l1 / size, 0)

            # Suspect this is where GPU memory leak is coming from
            model_config = model.get_config()
            model_weights = model.get_weights()
            model = None
            K.clear_session()
            model = Model.from_config(model_config)
            model.set_weights(model_weights)
            return model

        backbone_t = {
            'VGG16': VGG16,
            'ResNet50': ResNet50,
            'InceptionV3': InceptionV3,
        }[self.backbone]
        model = backbone_t(
            include_top=False,
            weights=self.pretrained_weights,
            input_shape=self.input_shape,
            pooling=None
        )
        model.name = "backbone"
        if with_reg:
            model = add_model_regularization(model, self.activity_regularizer_l1, self.kernel_regularizer_l2)
        return model

    def make_head(self, feature_shape):
        '''Build the classification head that sits on top of the backbone features.

        The head is global average pooling, then num_hidden_layers blocks of
        Dropout -> Dense -> PReLU, then a final Dropout named 'class_logits'
        and a Dense layer named 'predictions'. Batch norm is not used in case
        the backbone doesn't utilise it; Dropout is used instead.

        feature_shape: shape of the backbone output, excluding the batch axis.

        Returns a Keras Model named 'classify_head'.
        '''
        from keras.models import Model
        import keras.layers as layers
        from keras.layers.advanced_activations import PReLU
        from keras.regularizers import l1_l2

        if self.activation is None:
            self.activation = "sigmoid" if self.caption_type == "multi" else "softmax"

        # The defaults from __init__ are None, meaning "disabled"; hand Keras a
        # numeric 0 instead, the same value serialize() reports for them.
        dropout_rate = self.dropout or 0.0
        kernel_l2 = self.kernel_regularizer_l2 or 0.0
        activity_l1 = self.activity_regularizer_l1 or 0.0

        features_input = layers.Input(shape=feature_shape, name='head_input')
        # NOTE(review): pooling is always global average; self.pool is not consulted here.
        x = layers.GlobalAveragePooling2D()(features_input)

        for _ in range(self.num_hidden_layers):
            x = layers.Dropout(dropout_rate)(x)
            x = layers.Dense(
                self.num_hidden_neurons,
                kernel_regularizer=l1_l2(0, kernel_l2),
                activity_regularizer=l1_l2(activity_l1, 0))(x)
            x = PReLU()(x)

        x = layers.Dropout(dropout_rate, name='class_logits')(x)
        predictions = layers.Dense(
            self.num_classes,
            activation=self.activation,
            kernel_regularizer=l1_l2(0, kernel_l2),
            kernel_initializer='uniform',
            name='predictions')(x)
        return Model(inputs=features_input, outputs=predictions, name='classify_head')

    def make_model(self, weights=None, backbone_layer=None, parallel=False):
        '''Create self.model (backbone followed by head), clearing the Keras session first.

        weights: optional weight list applied with set_weights().
        backbone_layer: optional backbone layer name; the backbone is truncated
            so that this layer's output becomes the feature map.
        parallel: when True, also build a 2-GPU replica in self.model2.
        '''
        from keras.models import Model
        self.model = None
        # Drop any replica from a previous call; it belongs to the session cleared below.
        self.model2 = None
        K.clear_session()
        backbone = self.make_backbone(with_reg=False)
        if backbone_layer:
            backbone = Model(backbone.input, backbone.get_layer(backbone_layer).output, name='backbone')
        backbone.trainable = self.train_features
        head = self.make_head(backbone.output.shape.as_list()[1:])

        self.model = Model(backbone.inputs, head(backbone(backbone.input)))
        if weights:
            self.model.set_weights(weights)
        # NOTE: selectively freezing early convolution layers is not implemented;
        # train_features switches the whole backbone on or off.
        if parallel:
            from keras.utils import multi_gpu_model
            self.model2 = multi_gpu_model(self.model, gpus=2)#, cpu_merge=True, cpu_relocation=False)

    def get_train_model(self):
        '''Return the multi-GPU replica when one exists, otherwise the base model.'''
        return self.model2 or self.model

    def compile_model(self, opt_params=None):
        '''Compile the training model with self.loss.

        opt_params: keyword arguments for Model.compile(); defaults to
            {'optimizer': 'nadam'}. The caller's dict is not modified.
        '''
        # Work on a copy so adding 'loss' does not leak into the caller's dict.
        params = dict(opt_params) if opt_params else {'optimizer': 'nadam'}
        params['loss'] = self.loss
        self.get_train_model().compile(**params, metrics=['binary_accuracy', 'categorical_accuracy'])

    def set_logdir(self, log_dir):
        '''Set the log directory and derive the output file paths inside it.'''
        self.log_dir = log_dir
        self.model_def_path = os.path.join(log_dir, "model_def.json")
        self.model_weights_path = os.path.join(log_dir, "model_weights.h5")
        self.model_info_path = os.path.join(log_dir, "model.json")
        self.model_plot_path = os.path.join(log_dir, "precision-recall.png")

    def setup_callbacks(self, schedule=None, hist=False, grads=False):
        '''Create the training callbacks and store them in self.callbacks.

        set_logdir() must have been called first.

        schedule: optional extra callback (e.g. a learning-rate schedule) to append.
        hist: histogram/embedding frequency passed to PRTensorBoard; falsy disables it.
        grads: whether PRTensorBoard should write gradients.
        '''
        from keras.callbacks import TerminateOnNaN, ModelCheckpoint, EarlyStopping
        from abyss_deep_learning.keras.classification import PRTensorBoard

        models_dir = os.path.join(self.log_dir, "models")
        # Plain-Python replacement for the "!mkdir -p" shell magic, so this also
        # works outside IPython.
        os.makedirs(models_dir, exist_ok=True)
        best_path = os.path.join(models_dir, "best.{epoch:03d}-{val_loss:.4f}.h5")
        self.callbacks = [
            TerminateOnNaN(),
            ModelCheckpoint(
                best_path, monitor='val_loss', verbose=1,
                save_best_only=True, save_weights_only=True, mode='auto', period=1),
            PRTensorBoard(
                log_dir=self.log_dir,
                histogram_freq=(hist or 0),
                batch_size=10,
                write_graph=False,
                write_grads=grads,
                write_images=False,
                embeddings_freq=(hist or 0),
                embeddings_layer_names=['class_logits/cond/Merge:0'],
                embeddings_metadata='metadata.tsv',
                embeddings_data=VAL_DATA[0]
            ),
            EarlyStopping(
                monitor='val_loss', min_delta=0.0, patience=50, verbose=1, mode='auto'),
            # clr_callback.CyclicLR(base_lr=1e-4, max_lr=0.1, min_lr=0.01, step_size=(2*steps_per_epoch))
        ]
        if schedule:
            self.callbacks.append(schedule)

    def go(self, epochs, initial_epoch=0, val_data=None):
        '''Train the model with fit_generator on the augmented 'train' dataset.

        epochs: epoch index to train up to.
        initial_epoch: epoch index to start from.
        val_data: validation data forwarded to fit_generator.

        Returns the History object, which is also stored in self.history.
        '''
        from abyss_deep_learning.keras.utils import batching_gen

        steps_per_epoch = len(DATASET['train'].data_ids) // self.batch_size
        steps_per_epoch_val = VAL_DATA[0].shape[0] // self.batch_size
        print("Steps per epoch:", steps_per_epoch)
        print("Steps per steps_per_epoch_val:", steps_per_epoch_val)

        train_gen = ARGS['pipeline'](DATASET['train'].generator(
            shuffle_ids=True, endless=True), aug_config=ARGS['augmentation'])
        common = {
            # NOTE(review): class weights are read from the dataset, not from self.class_weights.
            "class_weight": DATASET['train'].class_weights,
            "callbacks": self.callbacks,
            "epochs": epochs,
            "verbose": 1,
            "initial_epoch": initial_epoch,
        }

        self.history = self.get_train_model().fit_generator(
            batching_gen(train_gen, batch_size=self.batch_size),
            validation_data=val_data,
            steps_per_epoch=steps_per_epoch,
            validation_steps=steps_per_epoch_val,
            workers=10,
            **common)
        return self.history

    def evaluate_model(self, test_data, thresh=0.5):
        '''Predict on test_data and print the resulting accuracy.

        test_data: (inputs, targets) pair.
        thresh: decision threshold used for multi-label outputs.

        Returns (Y_true, Y_pred, TP) where TP is a boolean array marking correct
        decisions: per sample (argmax match) for 'single' captions, per element
        (thresholded match) otherwise.
        '''
        def multi_label_decision(y_true, y_pred):
            return (y_true > thresh) == (y_pred > thresh)

        def single_label_decision(y_true, y_pred):
            return np.argmax(y_true, axis=-1) == np.argmax(y_pred, axis=-1)

        # The caption type lives on the instance; there is no bare caption_type name here.
        decision_function = single_label_decision if self.caption_type == 'single' else multi_label_decision

        Y_true = test_data[1]
        Y_pred = self.get_train_model().predict(test_data[0])
        TP = decision_function(Y_true, Y_pred)
        acc = np.count_nonzero(TP) / TP.size

        print("Test using {:d} samples:".format(len(test_data[0])))
        print("accuracy", acc)
        return Y_true, Y_pred, TP

    def save_model(
            self, class_map_r, prediction_type,
            test_metrics=None, description=""):
        '''Write the model definition, weights and an info JSON to the paths from set_logdir().

        Does nothing when no model has been built.

        class_map_r: stored under "classes" in the info file.
        prediction_type: currently unused; ARGS['caption_type'] is recorded instead.
        test_metrics: optional metrics stored under metrics.test (only when a
            training history exists).
        description: free-text description stored in the info file.
        '''
        if self.model is None:
            return
        import json
        from abyss.utils import JsonNumpyEncoder

        def merged(a, b):
            combined = dict(a)
            combined.update(b)
            return combined

        model_info = {
            "name": self.serialize(),
            "description": description,
            "weights": self.model_weights_path,
            "prediction_type": ARGS['caption_type'],
            "model": self.model_def_path,
            "classes": class_map_r,
            "architecture": {
                # TODO IMPORTANT: hard-coded, does not reflect self.backbone.
                "backbone": "inceptionv3",
                "logit_activation": self.model.get_layer("classify_head").get_layer("predictions").activation.__name__,
                "input_shape": self.input_shape
            }

        }
        # self.history is an empty dict (falsy) until go() has run.
        if self.history:
            model_info['metrics'] = {
                "loss_function": str(self.history.model.loss),
                "train": merged(
                    self.history.history,
                    {
                        "epoch": self.history.epoch,
                        "params": self.history.params
                    })
            }
            if test_metrics:
                model_info['metrics']['test'] = test_metrics

        print("Writing model def to " + self.model_def_path)
        with open(self.model_def_path, "w") as def_file:
            def_file.write(self.model.to_json())

        print("Writing model weights to " + self.model_weights_path)
        self.model.save_weights(self.model_weights_path)

        print("Writing model info to " + self.model_info_path)
        with open(self.model_info_path, "w") as info_file:
            info_file.write(json.dumps(model_info, cls=JsonNumpyEncoder))

    def display_performance(self, Y_true, Y_pred, TP):
        '''Print accuracy figures and plot per-class and micro-averaged precision-recall curves.

        The figure is saved to self.model_plot_path and then shown.

        Y_true: ground-truth array, indexed as [sample, class].
        Y_pred: predicted scores with the same layout.
        TP: unused.
        '''
        from itertools import cycle
        from sklearn.metrics import precision_recall_curve, average_precision_score
        precision = dict()
        recall = dict()
        average_precision = dict()
        for i in range(self.num_classes):
            precision[i], recall[i], _ = precision_recall_curve(Y_true[:, i],
                                                                Y_pred[:, i])
            average_precision[i] = average_precision_score(Y_true[:, i], Y_pred[:, i])

        # A "micro-average": quantifying score on all classes jointly
        precision["micro"], recall["micro"], _ = precision_recall_curve(Y_true.ravel(),
            Y_pred.ravel())
        average_precision["micro"] = average_precision_score(Y_true, Y_pred,
                                                             average="micro")
        print('Average precision score, micro-averaged over all classes: {0:0.2f}'
              .format(average_precision["micro"]))

        # Exact accuracy: every class of a sample must be right.
        z = np.all((Y_pred > 0.5) == Y_true, axis=1)
        acc = np.count_nonzero(z) / z.size
        print("exact accuracy", acc)
        # Binary accuracy: each (sample, class) entry counts on its own.
        z = ((Y_pred > 0.5) == Y_true)
        acc = np.count_nonzero(z) / z.size
        print("binary accuracy", acc)

        # setup plot details
        colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])

        plt.figure(figsize=(8, 10))
        f_scores = np.linspace(0.2, 0.8, num=4)
        lines = []
        labels = []
        for f_score in f_scores:
            x = np.linspace(0.01, 1)
            y = f_score * x / (2 * x - f_score)
            l, = plt.plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2)
            plt.annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, y[45] + 0.02))

        # One legend entry stands for all iso-f1 curves (handle of the last one drawn).
        lines.append(l)
        labels.append('iso-f1 curves')
        l, = plt.plot(recall["micro"], precision["micro"], color='gold', lw=2)
        lines.append(l)
        labels.append('micro-average Precision-recall (area = {0:0.2f})'
                      ''.format(average_precision["micro"]))

        for i, color in zip(range(self.num_classes), colors):
            l, = plt.plot(recall[i], precision[i], color=color, lw=2)
            lines.append(l)
            labels.append('{0} (area = {1:0.2f})'
                          ''.format(ARGS['caption_map'].inv[i], average_precision[i]))

        fig = plt.gcf()
        fig.subplots_adjust(bottom=0.25)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Micro Average Precision vs. Recall')
        plt.legend(lines, labels, loc=(0, -.4), prop=dict(size=14))
        # Save before showing: with blocking backends the figure may be gone after show().
        plt.savefig(self.model_plot_path, dpi=150)
        plt.show()

# Model selection and training
You may have to change these callbacks to suit the dataset and model.
Note that calculating gradients and histograms on deeply layered networks (ResNet and Inception) takes a long time (about 5 minutes for each epoch in which they are calculated), so you may only want to do this infrequently or not at all.

## Model Train

Learning rates for VGG given output layer:
* VGG16:block4_conv3 0x0: 5e-3
* VGG16:block3_conv3 0x0: 1e-3
* VGG16:block3_conv3 1x512: 5e-4 (rough) 2.5e-4 (smooth but worse)
* VGG16:block3_conv3 2x512: 5e-4 (rough) 2.5e-4 (smooth but worse)

Regularization:
* L2 kernel doesn't really kick in till > 1e-2
* L1 activity doesn't really kick in till > 1e-3

In [None]:
def try_experiment(val_data):
    """Train and save one experiment per entry of `backbone_layers`.

    For every (backbone layer name, base learning rate) pair the Keras session
    is cleared, a fresh `Experiment` is configured, and the model is trained
    in the stages described by `num_epochs_train` and
    `learning_rate_multipliers`: stage 0 builds the model with `weights=None`,
    later stages rebuild it from the previous stage's weights and mark the
    'backbone' layer trainable.  The model is saved in a `finally` clause, so
    it is written even when a stage is interrupted or fails.

    Args:
        val_data: Validation data, forwarded unchanged to `exp.go(val_data=...)`.

    Side effects:
        Rebinds the notebook-level names `exp`, `history_data`,
        `search_output_dir` and `num_epochs_train`; later cells (history
        pickling, continued training) read them at notebook scope.
    """
    global exp, history_data, search_output_dir, num_epochs_train
    search_output_dir = "/data/log/cnn/fd/large-fromCF/thursday2" # Change this output dir.
    # Stage i trains for num_epochs_train[i] epochs at lr * learning_rate_multipliers[i].
    num_epochs_train = [20, 40]
    learning_rate_multipliers = [1, 0.02]
    # Set to a serialized experiment string to reuse its structure (not its weights), e.g.:
    # model_instance = "ID:4704 DS:alltogether BS:5 AC:sigmoid MO:simple-2 HE:0x32 TR:all FT:None LF:binary_crossentropy LR:3.7e-04 AR:8.6e-02 KR:2.9e-02 DO:0.00 PO:avg CW:0=2.10,1=0.58,2=0.59,3=0.49,4=1.41,5=1.66,6=3.74,7=1.98 method:step"
    model_instance = None
    history_data = {}

    def lr_schedule_exp(epoch, base_lr=1e-3, gamma=0.98):
        """Exponential decay: base_lr * gamma ** epoch."""
        return base_lr * gamma ** epoch

    def lr_schedule_step(epoch, base_lr, steps):
        """Step decay: multiply base_lr by every multiplier whose start epoch has been reached.

        `steps` maps start epoch -> multiplier; multipliers accumulate.
        """
        lr = base_lr
        for step_epoch, lr_mult in steps.items():
            if epoch >= step_epoch:
                lr *= lr_mult
        return lr

    def init_schedule():
        """Return a new learning-rate callback; created per stage so no state is shared."""
        from keras.callbacks import ReduceLROnPlateau, LearningRateScheduler
        ### Pick one
        schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=20, cooldown=10, verbose=1)
        # schedule = LearningRateScheduler(
        #     # lambda epoch, lr: lr_schedule_exp(epoch, base_lr=5e-3, gamma=0.98) # Exponential decay
        #     lambda epoch, lr: lr_schedule_step(epoch, base_lr=5e-3, steps={1: 0.1, 5: 0.1}) # Step decay
        # )
        # schedule = None
        return schedule

    # (backbone cut-off layer, base learning rate) pairs to try.
    backbone_layers = [
        ("block2_conv2", 5e-4),
        ("block3_conv1", 1e-3),
        ("block3_conv2", 1e-3),
        ("block3_conv3", 1e-3),
        ("block4_conv1", 5e-3),
        ("block4_conv2", 5e-3),
        ("block4_conv3", 5e-3),
        ("block5_conv1", 1e-2),
        ("block5_conv2", 1e-2),
        ("block5_conv3", 1e-2),
    ]
    for backbone_layer, lr in backbone_layers:
        K.clear_session()
        exp = Experiment(ARGS['data']['name'], ARGS['image_dims'])
        exp.class_weights = DATASET['train'].class_weights
        exp.batch_size = 6 # keep this divisible by the amount of GPUs
        if model_instance:  # If loading network structure from model_instance (not weights)
            exp.deserialize(model_instance)
            exp.id = str(np.random.randint(0, 999))
        else:
            exp.random_init()
            exp.pretrained_weights = 'imagenet'
            exp.train_features = False
            exp.dropout = 0.5
            exp.pool = 'avg'
            exp.num_hidden_layers = 1
            exp.num_hidden_neurons = 256
            exp.learning_rate = lr #10**np.random.uniform(-7, -2)
            exp.activity_regularizer_l1 = 1e-4 #10**np.random.uniform(-4, 1)
            exp.kernel_regularizer_l2 = 1e-4 #10**np.random.uniform(-4, 1)
            exp.backbone = 'VGG16' #simple_model_factory(4, exp.activity_regularizer_l1, exp.kernel_regularizer_l2)
        experiment_name = exp.serialize() + " BBL:" + backbone_layer #+ " method:plateau"
        print('=' * 80)
        print(exp.describe())

        log_dir = os.path.join(search_output_dir, experiment_name)
        exp.set_logdir(log_dir)

        print(experiment_name)
        print(log_dir)
        try:
            current_epoch = 0
            # Initialize heads with high LR then train whole model with low LR
            for i, (epochs, lr_mult) in enumerate(zip(num_epochs_train, learning_rate_multipliers)):
                # Carry the weights of the previous stage into the rebuilt model.
                weights = None if i == 0 else exp.model.get_weights()
                exp.make_model(weights, backbone_layer=backbone_layer, parallel=ARGS['use_parallel'])
                if i > 0:
                    # Must happen before compile_model() below.
                    backbone = exp.get_train_model().get_layer('backbone')
                    backbone.trainable = True
                exp.compile_model()
                exp.setup_callbacks(schedule=init_schedule(), hist=5, grads=True)
                exp.get_train_model().summary()

                K.set_value(exp.get_train_model().optimizer.lr, lr * lr_mult)
                print("Training: {:d} layers".format(len([1 for layer in exp.model.layers if layer.trainable])))
                exp.model.summary()
                exp.model.get_layer("backbone").summary()
                exp.model.get_layer("classify_head").summary()
                # NOTE: each stage overwrites the entry, so only the last stage's history is kept.
                history_data[experiment_name] = exp.go(
                    current_epoch + epochs,
                    initial_epoch=current_epoch,
                    val_data=val_data)
                current_epoch += epochs
        except KeyboardInterrupt:
            # Interrupting skips the rest of this experiment; the loop then
            # moves on to the next backbone layer after the model is saved.
            history_data[experiment_name] = None
        finally:
            exp.save_model(
                class_map_r=dict(ARGS['caption_map'].inv),
                prediction_type=ARGS['caption_type'],
                test_metrics=None,
                description="Test model for 5 FDs"
            )
try_experiment(val_data=VAL_DATA)

In [None]:
# Display the output tensor of the "classify_head/class_logits" layer of the current experiment's model.
exp.model.get_layer("classify_head/class_logits").output

In [None]:
# Evaluate the current experiment on test_data (thresh=0.5) and feed the
# returned tuple straight into display_performance.
exp.display_performance(*exp.evaluate_model(test_data, thresh=0.5))

In [None]:
# Deliberate barrier: raising here aborts a "Run All" so the cells below are
# only executed when run by hand.
raise Exception("Stop run all")

In [None]:
# Persist the per-experiment Keras training histories gathered in history_data.
if history_data:
    import pickle

    # num_epochs_train may be a single int or a per-stage list such as [20, 40];
    # "{:d}" cannot format a list, so label the file with the total epoch count.
    total_epochs = int(np.sum(num_epochs_train))
    history_path = os.path.join(search_output_dir, "history-{:d}epoch.pkl".format(total_epochs))
    # Entries that are None (no history object recorded) have no `.history`
    # attribute, so they are left out of the dump.
    completed = {
        key: history.history
        for key, history in history_data.items()
        if history is not None
    }
    with open(history_path, "wb") as file:
        pickle.dump(completed, file)

In [None]:
# #### Should you need to load this pkl:
# # with open(os.path.join(search_output_dir, "history-100epoch.pkl"), "rb") as file:
# #     history = pickle.load(file)

# lrs = [key[1] for key, val  in sorted(history_data.items(), key=lambda x: x[0])]
# val_loss = [history.history['val_loss'][-1] for key, history  in sorted(history_data.items(), key=lambda x: x[0])]
# loss = [history.history['loss'][-1] for lr, history  in sorted(history_data.items(), key=lambda x: x[0])]
# acc = [history.history['binary_accuracy'][-1] for lr, history  in sorted(history_data.items(), key=lambda x: x[0])]
# val_acc = [history.history['val_binary_accuracy'][-1] for lr, history  in sorted(history_data.items(), key=lambda x: x[0])]

# plt.figure()
# plt.subplot(1, 2, 1)
# plt.semilogx(lrs, loss, '.b', label='loss')
# plt.semilogx(lrs, val_loss, '.r', label='val_loss')
# plt.legend()
# plt.title("Loss Vs. LR (100 Epoch)")
# plt.subplot(1, 2, 2)
# plt.semilogx(lrs, acc, '.b', label='binary_accuracy')
# plt.semilogx(lrs, val_acc, '.r', label='val_binary_accuracy')
# plt.legend()
# plt.title("Accuracy Vs. LR (100 Epoch)")
# plt.tight_layout()

In [None]:
# Point the current experiment's artefact paths at the files of an existing run directory.
log_dir = "/data/log/cnn/fd/large-fromCF/thursday/ID:3379 DS:alltogether-unique BS:7 AC:sigmoid MO:InceptionV3 HE:0x0 TR:all FT:imagenet LF:binary_crossentropy LR:2.7e-05 AR:1.8e-05 KR:3.1e-05 DO:0.00 PO:avg CW:0=3.05,1=0.54,2=1.02,3=0.54,4=4.57,5=0.76,6=2.29 method:plateau"
for attr_name, file_name in (
        ("model_def_path", "model_def.json"),
        ("model_weights_path", "model_weights.h5"),
        ("model_info_path", "model.json"),
        ("model_plot_path", "precision-recall.png")):
    # Each attribute is set to <log_dir>/<file_name>.
    setattr(exp, attr_name, os.path.join(log_dir, file_name))

In [None]:
# model_instance = "ID:3379 DS:alltogether-unique BS:7 AC:sigmoid MO:InceptionV3 HE:0x0 TR:all FT:imagenet LF:binary_crossentropy LR:2.7e-05 AR:1.8e-05 KR:3.1e-05 DO:0.00 PO:avg CW:0=3.05,1=0.54,2=1.02,3=0.54,4=4.57,5=0.76,6=2.29 method:plateau"
# exp = Experiment(dataset_name, image_dims)
# exp.deserialize(model_instance)
# exp.make_model(parallel=True)
# exp.get_train_model().load_weights("/data/log/cnn/fd/large-fromCF/thursday/ID:3379 DS:alltogether-unique BS:7 AC:sigmoid MO:InceptionV3 HE:0x0 TR:all FT:imagenet LF:binary_crossentropy LR:2.7e-05 AR:1.8e-05 KR:3.1e-05 DO:0.00 PO:avg CW:0=3.05,1=0.54,2=1.02,3=0.54,4=4.57,5=0.76,6=2.29 method:plateau/models/best.040-0.2809.h5")
# # exp.id = "3379"
# exp.set_logdir(os.path.join(search_output_dir, 'fritatta'))
# Evaluate the current experiment on test_data (thresh=0.5) and feed the
# returned tuple straight into display_performance.
exp.display_performance(*exp.evaluate_model(test_data, thresh=0.5))
# exp.history = dict()
# exp.save_model(
#     caption_map_r,
#     prediction_type=caption_type, test_metrics=None, description="Test model for 5 FDs")

# Continue training specific model

In [None]:
# Rebuild a previously saved experiment so that its training can be continued.
model = None  # drop any stale model reference before the session is cleared
K.clear_session()
search_output_dir = "/data/log/cnn/fd/large-fromCF/thursday" # Change this output dir.
model_instance = "ID:3379 DS:alltogether-unique BS:7 AC:sigmoid MO:InceptionV3 HE:0x0 TR:all FT:imagenet LF:binary_crossentropy LR:2.7e-05 AR:1.8e-05 KR:3.1e-05 DO:0.00 PO:avg CW:0=3.05,1=0.54,2=1.02,3=0.54,4=4.57,5=0.76,6=2.29 method:plateau"
model_best_weight = "model_weights.h5"

### Don't set below
exp = Experiment(dataset_name, image_dims)
exp.deserialize(model_instance)
# Give the continued run its own random id and override the dropout rate;
# both end up in the serialized name below.
exp.id = "{:d}".format(np.random.randint(0, 999))
exp.dropout = 0.5

experiment_name = exp.serialize()

# Inputs come from the original run directory; outputs go to a
# "continued/<experiment_name>" sub-directory of it.
model_weights_in_path = os.path.join(search_output_dir, model_instance, model_best_weight)
model_info_in_path = os.path.join(search_output_dir, model_instance, "model.json")
log_dir = os.path.join(search_output_dir, model_instance, "continued", experiment_name)
best_path = os.path.join(log_dir, "models/best.{epoch:03d}-{val_loss:.4f}.h5")
model_def_path = os.path.join(log_dir, "model_def.json")
model_info_path = os.path.join(log_dir, "model.json")
model_plot_path = os.path.join(log_dir, "precision-recall.png")

print(model_weights_in_path)
if not os.path.exists(model_weights_in_path):
    # FileNotFoundError is a subclass of OSError, so existing handlers still match.
    raise FileNotFoundError("path does not exist: {}".format(model_weights_in_path))
# Only create the output tree once the input weights are known to exist.
os.makedirs(os.path.join(log_dir, "models"), exist_ok=True)

print("loading")
print(model_info_in_path)
exp.model = Inference(model_info_in_path).model
# base_model = p.feature_extractor(
#     include_top=False, weights=p.pretrained_weights, input_shape=image_dims)
# model = create_new_head(
#     base_model, p.num_classes, p.caption_type, p, 
#     opt_params={'optimizer': 'nadam'}
# )
exp.model.load_weights(model_weights_in_path)
# model = add_model_regularization(model, params.kernel_regularizer_l2, params.activity_regularizer_l1)

# for layer in model.layers:
#     layer.trainable = True

exp.model.compile( # TODO, load this from JSON, manually change this if you are doing single label
    'nadam',
    loss=exp.loss,
    metrics=['binary_accuracy', 'categorical_accuracy'])

print(experiment_name)
print(log_dir)
print("Training: {:d} layers".format(len([1 for layer in exp.model.layers if layer.trainable])))

In [None]:
# Continue training the reloaded model, then evaluate and save it.
from keras.callbacks import ReduceLROnPlateau

initial_epoch = 2
num_epoch = 200 # cumulative with initial_epoch
class_weights = None # Can't currently resume training with imbalance data #TODO
new_learning_rate = 1e-6
### Pick one schedule
schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.85, patience=3, cooldown=0, verbose=1)
# schedule = lambda epoch, lr: lr_schedule_exp(epoch, base_lr=5e-5, gamma=0.98) # Exponential decay
# schedule = None


#### Don't set below

# Weights of the continued run are written next to its other artefacts in log_dir.
model_weights_path = os.path.join(log_dir, "model_weights.h5")

K.set_value(exp.model.optimizer.lr, new_learning_rate)
callbacks = setup_callbacks(log_dir, schedule=schedule, hist=2, grads=True)
# Pass the `class_weights` knob set above (not exp.class_weights), so the
# "no class weights when resuming" setting actually takes effect.
history_data[experiment_name] = go(exp.model, callbacks, num_epoch, class_weights, initial_epoch=initial_epoch)

(Y_true, Y_pred, TP) = evaluate_model(exp.model, test_data, thresh=0.5)
display_performance(Y_true, Y_pred, TP)

save_model(
    exp.model, name=experiment_name,
    class_map_r=dict(ARGS['caption_map'].inv), prediction_type=ARGS['caption_type'],
    model_weights_path=model_weights_path, model_def_path=model_def_path, model_info_path=model_info_path,
    test_metrics=None, history=history_data[experiment_name],
    description="Test model for 5 FDs"
)

In [None]:
# Remove the notebook-level `exp` binding so the experiment object is no longer referenced from here.
del exp

# LR Search

In [None]:
class LrSearch():
    """Random learning-rate search for one serialized experiment configuration.

    Every trial rebuilds the model from the same initial weights, overrides
    the optimizer's learning rate and trains for a fixed number of epochs.
    Results are collected in `self.history`, keyed by learning rate.
    """

    def __init__(self, instance_str):
        """Store the serialized experiment string; nothing is built until `go()`.

        Args:
            instance_str: Serialized experiment description, later passed to
                `Experiment.deserialize`.
        """
        self.model_init_w = None  # weights of the first model built, reused by later trials
        self.model_instance = instance_str
        self.exp = None
        self.history = {}  # learning rate -> object returned by `Experiment.go`

    def new_model(self):
        """Clear the Keras session and build + compile a fresh model.

        The weights of the first model ever built are remembered and passed
        to `make_model` on every later call, so all trials start from the
        same initial weights.
        """
        K.clear_session()
        self.exp = Experiment(dataset_name, image_dims)
        self.exp.deserialize(self.model_instance)
        self.exp.make_model(self.model_init_w, parallel=True)
        self.exp.compile_model()
        print("Trainable layers: {:d}".format(sum(layer.trainable for layer in self.exp.model.layers)))
        if self.model_init_w is None:
            self.model_init_w = self.exp.model.get_weights()

    def go(self, epochs, num_steps=10, lr_range=(-9, 0)):
        """Train `num_steps` models at randomly drawn learning rates.

        Args:
            epochs: Passed to `Experiment.go` for every trial.
            num_steps: Number of learning rates to draw.
            lr_range: (low, high) exponents; each rate is 10 ** uniform(low, high).

        Returns:
            `self.history`, mapping each learning rate to the result of
            `Experiment.go` for that trial.
        """
        for base_lr in 10 ** np.random.uniform(*lr_range, size=num_steps):
            print("Learning rate = {:.3e}".format(base_lr))
            self.new_model()
            K.set_value(self.exp.get_train_model().optimizer.lr, base_lr)
            self.history[base_lr] = self.exp.go(epochs)
        return self.history

    def _summary_frame(self):
        """Build one row per learning rate: final metrics and their change since the first epoch."""
        rows = []
        for lr, trial in self.history.items():
            accuracy = trial.history['binary_accuracy']
            loss = trial.history['loss']
            rows.append({
                'lr': float(lr),
                'binary_accuracy': float(accuracy[-1]),
                'loss': float(loss[-1]),
                'diff_acc': float(accuracy[-1] - accuracy[0]),
                'diff_loss': float(loss[-1] - loss[0]),
            })
        columns = ['lr', 'binary_accuracy', 'loss', 'diff_acc', 'diff_loss']
        return pd.DataFrame(rows, columns=columns).set_index('lr').sort_index()

    def plot(self):
        """Plot final accuracy/loss and their first-to-last change against learning rate (log x-axis).

        Does nothing except print a notice when no trial has been recorded
        yet; there would be nothing to concatenate or plot.
        """
        if not self.history:
            print("No search history to plot; call go() first.")
            return
        self._summary_frame().plot(logx=True)

In [None]:
# Remove the notebook-level `exp` binding so the experiment object is no longer referenced from here.
del exp

In [None]:
# Serialized experiment configuration to run the learning-rate search on.
model_instance = "ID:5740 DS:alltogether BS:4 AC:sigmoid MO:VGG16 HE:0x64 TR:all FT:imagenet LF:binary_crossentropy LR:5.1e-04 AR:1.0e-06 KR:1.0e-06 DO:0.00 PO:avg CW:0=4.26,1=0.64,2=0.49,3=7.11 method:plateau"
search = LrSearch(model_instance)
# NOTE(review): the values below look like learning rates found in earlier searches — confirm.
# Hexpo: 4.5e-4 (1e-4 to 6e-4)
# VGG Heads only: 4.4e-5 to 2e-2
# VGG Imagenet: 2e-5

In [None]:
# Run 4 trials of 5 epochs each with learning rates drawn from 10**-6 .. 10**-4, then plot the summary.
search.go(5, num_steps=4, lr_range=(-6, -4))
search.plot()
# del search

In [None]:
# Remove the notebook-level `search` binding (and with it this reference to its experiment and histories).
del search

# Everything below is untested with the new changes; don't use it

In [None]:
# def get_gradients(model):
#     """Get the gradients of the loss with respect to the weights."""
#     weights = [tensor for tensor in model.trainable_weights 
#                if model.trainable_weights]
#     return weights, model.optimizer.get_gradients(model.total_loss, weights)

        
# def hamming_loss(y_true, y_pred):
#     return K.mean(y_true * (1 - y_pred) + (1 - y_true) * y_pred)


# def check_gradients(model):
#     grad_test = None
#     for image, label in train_gen:
#         grad_test = (image, label)
#         break
#     rates = []
#     weights, grads = get_gradients(model)
#     feed_dict = {
#         "class_logits_sample_weights:0": np.ones(2),
#         "input_1:0": grad_test[0][np.newaxis, ...],
#         "class_logits_target:0": grad_test[1][np.newaxis, ...]
#     }
#     for i, (w, g) in enumerate(zip(weights, grads)):
#         if 'bias' in w.name:
#             continue
#         grad_norm = np.linalg.norm(g.eval(feed_dict, K.get_session()))
#         weight_norm = np.linalg.norm(w.eval(K.get_session()))
#         rate = grad_norm / weight_norm
#         rates.append(rate)
#     if np.mean(rates) < 5e-4 or np.mean(rates) > 3e-1: # These values change with network structure
#         print("Bad gradients ({:.3e}).".format(np.mean(rates)))
#         return False
#     return True

In [None]:
# layer=model.layers[-3]
# print(layer)
# for weight in layer.weights:
#     weight.initializer.run(session=K.get_session())
# # w = layer.get_weights()
# # plt.figure()
# # plt.hist(w[0].ravel(), bins=100)

In [None]:
# # More training
# K.set_value(model.optimizer.lr, 1e-4)

# go(300, class_weights, initial_epoch=200)
# (Y_true, Y_pred, TP) = evaluate_model(model, test_data, thresh=0.5)
# display_performance(Y_true, Y_pred, TP)

# # save_model(
# #     model, name=experiment_name + "-second",
# #     class_map_r=caption_map_r, prediction_type=caption_type,
# #     model_weights_path=model_weights_path, model_def_path=model_def_path, model_info_path=model_info_path,
# #     test_metrics=None, history=history_data[experiment_name],
# #     description="Test model for 5 FDs"
# # )

# Model Testing

In [None]:
# Show which test images were predicted correctly / incorrectly and how the
# errors are distributed over label combinations.
from collections import Counter

from herbicide.utils import vis_square

# A sample counts as correct only if every entry along axis 1 of TP is truthy.
TP_mask = np.logical_and.reduce(TP, axis=1)
right = test_data[0][TP_mask]
wrong = test_data[0][~TP_mask]
# A bare `wrong.shape` in the middle of a cell displays nothing, so print it.
print("wrong images shape:", wrong.shape)
plt.figure()
vis_square(wrong)
plt.title("Incorrectly Predicted")
plt.figure()
vis_square(right)
plt.title("Correctly Predicted")

# Binary coded the labels then count them wrt TP/FP
print("num labels", test_data[1].sum(axis=0))
# Each label row is read as a binary number, first column being the most significant bit.
bit_weights = 2 ** np.arange(num_classes)[::-1]
for description, mask in (
        ("binary coded class error count:", ~TP_mask),
        ("binary coded class correct count:", TP_mask)):
    coded = np.sum(test_data[1][mask] * bit_weights, axis=1).astype(int)
    print(description, dict(sorted(Counter(coded).items(), key=lambda x: x[0])))
print(Y_pred[TP_mask])

# Learning Curve

In [None]:
# def learning_curve(dataset, lr, steps, val_data, log_dir):
#     def save_model(path):
#         print("Saving", path)
#         os.makedirs(os.path.dirname(path), exist_ok=True)
#         model.save_weights(path)
#     def setup_callbacks():
#         return [
# #                 ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, cooldown=5, verbose=1),
# #                 ModelCheckpoint(
# #                     model_best_path, monitor='val_loss', verbose=1,
# #                     save_best_only=True, save_weights_only=True, mode='auto', period=1),
# #                 ModelCheckpoint(
# #                     best_path, monitor='val_loss', verbose=1,
# #                     save_best_only=False, save_weights_only=True, mode='auto', period=50),
#                 PRTensorBoard(
#                     log_dir=model_log_dir,
#                     histogram_freq=0,
#                     batch_size=batch_size,
#                     write_graph=False,
#                     write_grads=False,
#                     write_images=False),
#         #         EarlyStopping(
#         #             monitor='val_loss', min_delta=0.0, patience=40, verbose=1, mode='auto')
#         ]
#     def create_new_model(load_base=False):
#         clear_session()
#         model = create_new_head(
#             InceptionV3(include_top=False, weights='imagenet', input_shape=image_dims),
#             num_classes, caption_type, opt_params={'optimizer': Nadam()},
#             class_weights=None, train_features=False, l2_reg=None)
#         if load_base:
#             print("Loading base model")
#             model.load_weights(base_model_path, by_name=True)
#         return model

#     def train():
#         print("Training")
#         K.set_value(model.optimizer.lr, lr)
#         history[subset_size] = model.fit_generator(
#             batching_gen(gen, batch_size=batch_size),
#             validation_data=tuple(val_data),
#             steps_per_epoch=(subset_size // batch_size),
#             validation_steps=steps_per_epoch_val,
#             class_weight=model_class_weights,
#             callbacks=setup_callbacks(), 
#             epochs=50,
#             verbose=1)
#     model_class_weights = None
#     model = None
#     model_path = None
#     image_ids = [image['id'] for image in dataset.imgs.values()]
#     np.random.shuffle(image_ids)
#     num_images = len(image_ids)
#     print("num_images", num_images)
#     history = {}
#     base_model_path = os.path.join(log_dir, "base", "weights.h5")
#     model_path = base_model_path
#     for subset_size in np.linspace(0, num_images, steps + 1).astype(int):
#         if subset_size > 0:
#             imgIds = image_ids[:subset_size]
#             gen = pipeline(
#                 dataset.generator(shuffle_ids=False, imgIds=imgIds),
#                 aug_config=None)
#             model_class_weights = calc_class_weights(gen, dataset) # TODO

#             model_path = os.path.join(log_dir, "subset-of-{:d}/weights.h5".format(subset_size))
#             model_log_dir = os.path.dirname(model_path)
#             model_best_path = os.path.join(log_dir, "subset-of-{:d}/best.h5".format(subset_size))
#             os.makedirs(model_log_dir, exist_ok=True)

#             print("learning curve(lr={:.3e}, size={:d})".format(lr, subset_size))
#             print("model_log_dir", model_log_dir)
#             print("training class weights")
#             print(model_class_weights)
#         model = create_new_model(load_base=(subset_size > 0))
#         if subset_size:
#             train()
#         save_model(model_path)
#     return history

# model = None
# lr = 1e-5
# learning_curve_dir = "/data/log/cnn/fd/learning_curve_5--{:.2e}".format(lr)
# lc_history = learning_curve(coco_train, lr, 5, val_data, learning_curve_dir)
# val_loss = np.array([(size, h.history['val_loss'][-1]) for size, h in lc_history.items()])
# train_loss = np.array([(size, h.history['loss'][-1]) for size, h in lc_history.items()])
# plt.figure()
# plt.plot(train_loss[:, 0], train_loss[:, 1], 'b.')
# plt.plot(val_loss[:, 0], val_loss[:, 1], 'r.')
# plt.xlabel("Number of Training Samples")
# plt.ylabel("Loss")
# plt.savefig(os.path.join(learning_curve_dir, "plot.png"), dpi=150)

In [None]:
# !rm -R /data/log/cnn/fd/learning-curve/

In [None]:
# images = None
# for images, labels in batching_gen(train_gen, batch_size=batch_size):
#     print(images.shape, labels.shape)
    
#     pred = model.predict(images)
#     print(labels)
#     print(pred)
#     print(K.eval(K.tf.losses.sigmoid_cross_entropy(labels, pred)))
#     break

In [None]:
# for unique_label in np.unique(val_data[1], axis=0):
#     unique_data = [val_data[0][i] for i in range(len(val_data[0])) if np.all(val_data[1][i] == unique_label)]
#     num_data = len(unique_data)
#     print(unique_label, num_data)
#     plt.figure()
#     vis_square(np.array(unique_data))
#     plt.title(unique_label)

# Check Update/Weight Ratio

In [None]:

# for i, (w, g) in enumerate(zip(weights, grads)):
#     grad_norm = np.linalg.norm(g.eval(feed_dict, K.get_session()))
#     weight_norm = np.linalg.norm(w.eval(K.get_session()))
#     rate = grad_norm / weight_norm
#     print(i, rate)


# Visualize filters

In [None]:
# Visualise the kernel of the first layer that has a trainable kernel weight.
from herbicide.utils import vis_square

for layer in model.layers:
    kernels = [weight for weight in layer.trainable_weights if 'kernel' in weight.name] #  Assumes FD is not trainable
    if not kernels:
        # No trainable kernel here (frozen layer, or only non-kernel weights):
        # keep looking instead of plotting an undefined or stale `value`.
        continue
    for weight in kernels:
        print(weight.name)
        value = K.eval(weight.value())
        print(value.shape)

    # Only the last kernel of the layer is drawn. The transpose moves the last
    # axis to the front, so the kernel must be 4-D.
    plt.figure()
    vis_square(value.transpose((3, 0, 1, 2)))
    break


In [None]:
# Preview training samples: one figure per sample, titled with its caption.
# The sample at index 10 is still displayed, but the loop stops before its
# caption is recorded, so at most ten captions are collected.
caption_stats = []
samples = coco_train.generator(imgIds=balanced_image_ids_train)
for index, (image, caption) in enumerate(samples):
    plt.figure()
    plt.imshow(image)
    plt.title(str(caption))
    if index >= 10:
        break
    caption_stats.append(caption)