In [1]:
#!pip install tqdm
#!pip install opencv-python

# Libraries

In [2]:
from __future__ import absolute_import
from __future__ import print_function

import os
from math import ceil

import keras
from keras.optimizers import Adam, SGD
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TerminateOnNaN, CSVLogger
from keras import backend as K
from keras.models import load_model
import numpy as np

from models.keras_ssd300 import ssd_300
from keras_loss_function.keras_ssd_loss import SSDLoss
from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
from keras_layers.keras_layer_DecodeDetections import DecodeDetections
from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast
from keras_layers.keras_layer_L2Normalization import L2Normalization

from ssd_encoder_decoder.ssd_input_encoder import SSDInputEncoder
from ssd_encoder_decoder.ssd_output_decoder import decode_detections, decode_detections_fast

from data_generator.object_detection_2d_data_generator import DataGenerator
from data_generator.object_detection_2d_geometric_ops import Resize
from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
from data_generator.data_augmentation_chain_original_ssd import SSDDataAugmentation
from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms

Using TensorFlow backend.


# Global Parameters

In [3]:
# Global parameters
# Input geometry of the network: height, width and number of color channels.
IMAGE_HEIGHT = 300
IMAGE_WIDTH = 300
IMAGE_CHANNELS = 3
IMAGE_SHAPE = (IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS)

# Per-channel mean of the images in the dataset.
# Do not change this value if you're using any of the pre-trained weights.
MEAN_COLOR = [123, 117, 104]

# The original SSD works on BGR images, so the model is told to reverse the
# color channel order of its inputs.
SWAP_CHANNELS = [2, 1, 0]

# Anchor box scaling factors used in the original SSD300 for the Pascal VOC datasets.
SCALES = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]

# Anchor box aspect ratios used in the original SSD300, one list per predictor
# layer; the order matters. Each entry is an independent list object.
_THREE_RATIOS = [1.0, 2.0, 0.5]
_FIVE_RATIOS = _THREE_RATIOS + [3.0, 1.0/3.0]
ASPECT_RATIOS = [
    list(_THREE_RATIOS),
    list(_FIVE_RATIOS),
    list(_FIVE_RATIOS),
    list(_FIVE_RATIOS),
    list(_THREE_RATIOS),
    list(_THREE_RATIOS),
]

# Space between two adjacent anchor box center points, per predictor layer.
STEPS = [8, 16, 32, 64, 100, 300]

# Offset of the first anchor box center point from the top and left image
# borders, as a fraction of the step size; one value per predictor layer.
OFFSETS = [0.5] * len(STEPS)

# Whether or not to clip the anchor boxes to lie entirely within the image boundaries.
CLIP_BOXES = False

# Variances by which the encoded target coordinates are divided, as in the
# original implementation.
VARIANCES = [0.1, 0.1, 0.2, 0.2]

# Forwarded as `normalize_coords` to both the model and the input encoder.
NORMALIZE_COORDS = True

# The XML parser needs to know what object class names to look for and in
# which order to map them to integers.
CLASSES = [
    'background',
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
]

# Number of object classes, i.e. every entry of CLASSES except 'background'.
N_CLASSES = len(CLASSES) - 1

MODEL_NAME = 'model.h5'

# Hyperparameters

In [4]:
# Get hyperparameters
# Only `batch_size` is active in this cell. The commented-out lines sketch how
# each value could be read from an `env.hyperparameters` object instead; `env`
# is not defined in the visible part of this notebook, so they stay disabled.
#batch_size = env.hyperparameters.get('batch_size', default=16, object_type=int)
# Number of samples per batch.
batch_size = 32
#lr = env.hyperparameters.get('learning_rate', default=.001, object_type=float)
#alpha = env.hyperparameters.get('alpha', default=1.0, object_type=float)
#beta_1 = env.hyperparameters.get('beta_1', default=0.9, object_type=float)
#beta_2 = env.hyperparameters.get('beta_2', default=0.999, object_type=float)
#epsilon = env.hyperparameters.get('epsilon', default=1e-08, object_type=float)
#decay = env.hyperparameters.get('decay', default=0.0, object_type=float)
#EPOCHS = env.hyperparameters.get('epochs', default=10, object_type=int)
#gpu_count = env.hyperparameters.get('gpu_count', default=0, object_type=int)

# Model

In [5]:
# Build the SSD300 network in training mode from the global anchor box configuration.
ssd_config = dict(
    image_size=IMAGE_SHAPE,
    n_classes=N_CLASSES,
    mode='training',
    l2_regularization=0.0005,
    scales=SCALES,
    aspect_ratios_per_layer=ASPECT_RATIOS,
    two_boxes_for_ar1=True,
    steps=STEPS,
    offsets=OFFSETS,
    clip_boxes=CLIP_BOXES,
    variances=VARIANCES,
    normalize_coords=NORMALIZE_COORDS,
    subtract_mean=MEAN_COLOR,
    swap_channels=SWAP_CHANNELS,
)
model = ssd_300(**ssd_config)

# Initialise from the weights file below; `by_name=True` asks Keras to match
# the stored weights to layers by their names.
weights_path = './VGG_ILSVRC_16_layers_fc_reduced.h5'
model.load_weights(weights_path, by_name=True)

# Compile with the Adam optimizer and the SSD loss.
adam = Adam(lr=0.001,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-08,
            decay=0.0)
ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

# Generate Training Data
>__NOTE:__ This will be done on the SageMaker Notebook Instance to download to S3 and create the various channels

In [6]:
# Both datasets are loaded into memory rather than read from an HDF5 file.
train_dataset = DataGenerator(load_images_into_memory=True, hdf5_dataset_path=None)
val_dataset = DataGenerator(load_images_into_memory=True, hdf5_dataset_path=None)

# The directories that contain the images.
VOC_2007_images_dir = '/tmp/VOCdevkit/VOC2007/JPEGImages/'
VOC_2012_images_dir = '/tmp/VOCdevkit/VOC2012/JPEGImages/'

# The directories that contain the annotations.
VOC_2007_annotations_dir = '/tmp/VOCdevkit/VOC2007/Annotations/'
VOC_2012_annotations_dir = '/tmp/VOCdevkit/VOC2012/Annotations/'

# The paths to the image sets.
VOC_2007_train_image_set_filename = '/tmp/VOCdevkit/VOC2007/ImageSets/Main/train.txt'
VOC_2012_train_image_set_filename = '/tmp/VOCdevkit/VOC2012/ImageSets/Main/train.txt'
VOC_2007_val_image_set_filename = '/tmp/VOCdevkit/VOC2007/ImageSets/Main/val.txt'
VOC_2012_val_image_set_filename = '/tmp/VOCdevkit/VOC2012/ImageSets/Main/val.txt'
VOC_2007_trainval_image_set_filename = '/tmp/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt'
VOC_2012_trainval_image_set_filename = '/tmp/VOCdevkit/VOC2012/ImageSets/Main/trainval.txt'
VOC_2007_test_image_set_filename = '/tmp/VOCdevkit/VOC2007/ImageSets/Main/test.txt'

# Parser options that are identical for the training and the validation set.
shared_parse_options = dict(classes=CLASSES,
                            include_classes='all',
                            exclude_truncated=False,
                            ret=False)

# Training dataset: the VOC2007 and VOC2012 `trainval` image sets.
train_dataset.parse_xml(
    images_dirs=[VOC_2007_images_dir, VOC_2012_images_dir],
    image_set_filenames=[VOC_2007_trainval_image_set_filename,
                         VOC_2012_trainval_image_set_filename],
    annotations_dirs=[VOC_2007_annotations_dir, VOC_2012_annotations_dir],
    exclude_difficult=False,
    **shared_parse_options
)

# Validation dataset: the VOC2007 `test` image set, parsed with `exclude_difficult=True`.
val_dataset.parse_xml(
    images_dirs=[VOC_2007_images_dir],
    image_set_filenames=[VOC_2007_test_image_set_filename],
    annotations_dirs=[VOC_2007_annotations_dir],
    exclude_difficult=True,
    **shared_parse_options
)

# Optional: Convert the dataset into an HDF5 dataset. This will require more disk space, but will
# speed up the training. Doing this is not relevant in case you activated the `load_images_into_memory`
# option in the constructor, because in that case the images are in memory already anyway. The two
# calls below are therefore left commented out; uncomment them if you want to create HDF5 datasets.

#train_dataset.create_hdf5_dataset(file_path='train.h5',
#                                  resize=False,
#                                  variable_image_size=True,
#                                  verbose=True)

#val_dataset.create_hdf5_dataset(file_path='test.h5',
#                                resize=False,
#                                variable_image_size=True,
#                                verbose=True)

Processing image set 'trainval.txt': 100%|██████████| 5011/5011 [00:18<00:00, 271.68it/s]
Processing image set 'trainval.txt': 100%|██████████| 11540/11540 [00:38<00:00, 302.34it/s]
Loading images into memory: 100%|██████████| 16551/16551 [01:24<00:00, 196.63it/s]
Processing image set 'test.txt': 100%|██████████| 4952/4952 [00:16<00:00, 295.65it/s]
Loading images into memory: 100%|██████████| 4952/4952 [00:22<00:00, 222.87it/s]


>__NOTE:__ This will be done on the Training container

In [10]:
# Transformation for the training generator: the original SSD augmentation chain.
ssd_data_augmentation = SSDDataAugmentation(img_height=IMAGE_HEIGHT,
                                            img_width=IMAGE_WIDTH,
                                            background=MEAN_COLOR)

# Transformations for the validation generator: channel conversion and resizing only.
convert_to_3_channels = ConvertTo3Channels()
resize = Resize(height=IMAGE_HEIGHT, width=IMAGE_WIDTH)

# The encoder constructor needs the spatial dimensions of the model's predictor
# layers to create the anchor boxes; read them from the layers listed here.
predictor_layer_names = ['conv4_3_norm_mbox_conf',
                         'fc7_mbox_conf',
                         'conv6_2_mbox_conf',
                         'conv7_2_mbox_conf',
                         'conv8_2_mbox_conf',
                         'conv9_2_mbox_conf']
predictor_sizes = [model.get_layer(layer_name).output_shape[1:3]
                   for layer_name in predictor_layer_names]

# The encoder receives the same anchor box configuration as the model.
encoder_config = dict(
    img_height=IMAGE_HEIGHT,
    img_width=IMAGE_WIDTH,
    n_classes=N_CLASSES,
    predictor_sizes=predictor_sizes,
    scales=SCALES,
    aspect_ratios_per_layer=ASPECT_RATIOS,
    two_boxes_for_ar1=True,
    steps=STEPS,
    offsets=OFFSETS,
    clip_boxes=CLIP_BOXES,
    variances=VARIANCES,
    matching_type='multi',
    pos_iou_threshold=0.5,
    neg_iou_limit=0.5,
    normalize_coords=NORMALIZE_COORDS,
)
ssd_input_encoder = SSDInputEncoder(**encoder_config)

# Create the generator handles that will be passed to Keras' `fit_generator()` function.
# Each call gets its own freshly built `returns` set.
generator_outputs = ('processed_images', 'encoded_labels')

train_transformations = [ssd_data_augmentation]
train_generator = train_dataset.generate(
    batch_size=batch_size,
    shuffle=True,
    transformations=train_transformations,
    label_encoder=ssd_input_encoder,
    returns=set(generator_outputs),
    keep_images_without_gt=False,
)

val_transformations = [convert_to_3_channels, resize]
val_generator = val_dataset.generate(
    batch_size=batch_size,
    shuffle=False,
    transformations=val_transformations,
    label_encoder=ssd_input_encoder,
    returns=set(generator_outputs),
    keep_images_without_gt=False,
)

In [11]:
# Query how many samples each dataset holds.
train_dataset_size = train_dataset.get_dataset_size()
val_dataset_size = val_dataset.get_dataset_size()

# Report both sizes, right-aligned in a six-character column.
size_report = [
    ("Number of images in the training dataset:\t{:>6}", train_dataset_size),
    ("Number of images in the validation dataset:\t{:>6}", val_dataset_size),
]
for template, n_images in size_report:
    print(template.format(n_images))

Number of images in the training dataset:	 16551
Number of images in the validation dataset:	  4952


# Set Training Parameters

In [12]:
# Define a learning rate schedule.
def lr_schedule(epoch):
    """Return the learning rate to use for the given epoch.

    The rate is a step function of the epoch number:
    0.001 while `epoch < 80`, 0.0001 while `epoch < 100`, and 0.00001 afterwards.

    Args:
        epoch: Epoch number as supplied by the caller
            (presumably the zero-based index Keras passes; confirm).

    Returns:
        float: The learning rate for that epoch.
    """
    # (exclusive upper epoch bound, learning rate) pairs in ascending order.
    for upper_bound, rate in ((80, 0.001), (100, 0.0001)):
        if epoch < upper_bound:
            return rate
    return 0.00001

# Define model callbacks.
# Note: in a SageMaker container, MODEL_NAME should include the output filepath
# under which you want to save the model.

# Drop only the final extension of MODEL_NAME. `os.path.splitext` keeps any
# directory part and any earlier dots intact (e.g. './out/model.h5' -> './out/model'),
# whereas splitting on the first '.' would truncate such names.
checkpoint_stem = os.path.splitext(MODEL_NAME)[0]

# Checkpoint the full model (not just the weights) whenever `val_loss` improves;
# the epoch number is formatted into the file name.
model_checkpoint = ModelCheckpoint(filepath=checkpoint_stem + '_epoch-{epoch:02d}.h5',
                                   monitor='val_loss',
                                   verbose=1,
                                   save_best_only=True,
                                   save_weights_only=False,
                                   mode='auto',
                                   period=1)

# Log per-epoch metrics to a CSV file; `append=True` keeps the rows of earlier runs.
csv_logger = CSVLogger(filename='training_log.csv',
                       separator=',',
                       append=True)

# Apply the step-wise learning rate schedule defined by `lr_schedule`.
learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule,
                                                verbose=1)

# Stop training as soon as the loss becomes NaN.
terminate_on_nan = TerminateOnNaN()

callbacks = [model_checkpoint,
             csv_logger,
             learning_rate_scheduler,
             terminate_on_nan]

# Train

In [13]:
# Training runs from `initial_epoch` up to `final_epoch`; raise `initial_epoch`
# to resume an earlier run. Each epoch consists of a fixed number of batches,
# independent of the dataset size.
initial_epoch   = 0
final_epoch     = 120
steps_per_epoch = 1000

# Number of batches needed to cover the whole validation set, rounding up so
# that the last partial batch is included. Pure integer arithmetic is used
# because `ceil(a / b)` silently floors first under Python 2 integer division
# (this notebook imports from `__future__` but not `division`).
validation_steps = (val_dataset_size + batch_size - 1) // batch_size

history = model.fit_generator(generator=train_generator,
                              steps_per_epoch=steps_per_epoch,
                              epochs=final_epoch,
                              callbacks=callbacks,
                              validation_data=val_generator,
                              validation_steps=validation_steps,
                              initial_epoch=initial_epoch)

Epoch 1/120

Epoch 00001: LearningRateScheduler setting learning rate to 0.001.
   3/1000 [..............................] - ETA: 9:55:13 - loss: 94.3858  

KeyboardInterrupt: 