# Initialize imports

In [1]:
import json
import math
import os

from keras.backend import clear_session
import matplotlib.pyplot as plt
import numpy as np

from abyss_deep_learning.datasets.coco import ImageClassificationDataset
from abyss_deep_learning.datasets.translators import CocoCaptionTranslator
from abyss_deep_learning.keras.classification import caption_map_gen, onehot_gen, hamming_loss
from abyss_deep_learning.keras.utils import lambda_gen, batching_gen
from abyss_deep_learning.keras.models import ImageClassifier
from abyss_deep_learning.utils import balanced_set

Using TensorFlow backend.


# Directory paths

In [5]:
# Directory holding the annotation JSON files; also passed as image_dir when the
# datasets are constructed further down.
DATA_DIR = "/mnt/ssd1/processed/industry-data/swc/train_1/cloudfactory/datasets/with-bg/notebook-ready/split-batch1/"
# File name of the training annotations, joined onto DATA_DIR when train_ds is loaded.
JSON_FILE = "train-nb.json"

In [None]:
# Alternative dataset location, kept for reference:
# DATA_DIR = "/mnt/ssd1/processed/industry-data/project-max/ml/cloud-factory-data/with-bg/multi-label-datasets
# /forwards/notebook-ready"

# NOTE(review): this assignment overrides the DATA_DIR defined in the
# "Directory paths" cell; whichever of the two cells ran last wins.
DATA_DIR = "/home/users/khu/src/abyss/project-max/ml/cloud-factory-data/with-bg/multi-label-datasets/both/notebook-ready"

# A notebook has no usable __file__, so anchor relative paths on the kernel's
# working directory explicitly (resolved through symlinks) instead of relying on
# dirname(realpath('__file__')) happening to strip a bogus file name.
DIR_NAME = os.path.realpath(os.getcwd())
# Directory that holds every image; the JSON-fixing cells point image paths here.
IM_DIR = os.path.join(DIR_NAME, "../../../../project-max/all-images")

# Correct image paths in train JSON file
# Count occurrences of classes for multi label

In [None]:
# Rewrite every image path in the training annotations so that it points into
# IM_DIR, and count how often each caption (class label) occurs.
# The file name comes from JSON_FILE so the counts always describe the same file
# that train_ds loads.
# NOTE: this cell overwrites the annotation file in place.
train_json_path = os.path.join(DATA_DIR, JSON_FILE)

with open(train_json_path, 'r') as json_file:
    data = json.load(json_file)

# Keep only the file name of each image and re-root it under IM_DIR.
for image in data["images"]:
    image['path'] = os.path.join(IM_DIR, os.path.basename(image['path']))

# An annotation's caption is a comma-separated list of labels; tally each label.
# Labels are deliberately not stripped so the keys match what pipeline() produces
# when it splits captions on ','.
caption_count = {}
for ann in data["annotations"]:
    for cap in ann['caption'].split(','):
        caption_count[cap] = caption_count.get(cap, 0) + 1

with open(train_json_path, 'w') as json_file:
    json.dump(data, json_file, indent=4)

## Count of all classes in training set

In [None]:
# Raw number of occurrences of each caption across the training annotations.
print(caption_count)

In [None]:
# Express each caption's count as a percentage of all caption occurrences.
total = sum(caption_count.values())
caption_pct = {
    label: occurrences / total * 100
    for label, occurrences in caption_count.items()
}
print(caption_pct)

# Correct image paths in validation JSON file

In [None]:
# Re-root every image path in the validation annotations under IM_DIR and write
# the result back to the same file (the file is overwritten in place).
val_json_path = os.path.join(DATA_DIR, "val-nb.json")

with open(val_json_path, 'r') as json_file:
    data = json.load(json_file)

for image in data["images"]:
    # Everything after the last '/' is the bare file name.
    image_name = image['path'].rpartition('/')
    file_name = image_name[2]
    image['path'] = os.path.join(IM_DIR, file_name)

with open(val_json_path, 'w') as json_file:
    json.dump(data, json_file, indent=4)

# Load annotations into memory

In [6]:
# Training dataset: annotations come from DATA_DIR/JSON_FILE and captions are
# split on ',' by the translator.
train_annotations_path = os.path.join(DATA_DIR, JSON_FILE)
train_translator = CocoCaptionTranslator(separator=",")
train_ds = ImageClassificationDataset(
    train_annotations_path,
    image_dir=DATA_DIR,
    cached=False,
    translator=train_translator,
)

loading annotations into memory...
Done (t=0.02s)
creating index...
index created!


In [None]:
# Materialise whatever balanced_set yields for the training COCO object into a
# list; it is later passed as data_ids to train_ds.generator().
# Presumably these are ids of a class-balanced subset — TODO confirm against balanced_set.
balanced = list(balanced_set(train_ds.coco))

In [7]:
# NOTE(review): this is a bare attribute access, not a call. The output recorded
# for this cell ends in KeyError: 'class_weights', so it did not finish cleanly
# when last run — confirm whether print_class_stats is a property or needs "()".
train_ds.print_class_stats

{'SeRJ', 'SO', 'StB', 'F', 'SJ', 'SeRM', 'StD', 'StJL', 'StiL', 'SeRT', 'SeE', 'StJR', 'SeRF', 'StF', 'STF', 'StCR', 'BG', 'SeRB'}


KeyError: 'class_weights'

In [None]:
# Validation dataset: same construction as the training set, reading
# DATA_DIR/val-nb.json.
val_annotations_path = os.path.join(DATA_DIR, "val-nb.json")
val_translator = CocoCaptionTranslator(separator=',')
val_ds = ImageClassificationDataset(
    val_annotations_path,
    image_dir=DATA_DIR,
    cached=False,
    translator=val_translator,
)

In [None]:
# Calls a private (underscore-prefixed) method of the dataset, then prints its
# captions attribute.
# Presumably _calc_class_stats() is what populates val_ds.captions — TODO confirm.
val_ds._calc_class_stats()
print(val_ds.captions)

# Caption map for the one-hot generator

In [None]:
# Give every caption a consecutive integer index, following the order in which
# captions were first seen while counting.
caption_map = dict(zip(caption_count, range(len(caption_count))))
print(caption_map)

# Reverse caption map to determine what class an integer represents

In [None]:
# Inverse lookup: integer index -> caption.
caption_map_r = dict(zip(caption_map.values(), caption_map.keys()))
print(caption_map_r)

# Determine class weights to penalize frequently occurring classes

In [None]:
# Weight each class by 1 / count**2, scale the weight vector to unit L2 norm,
# and key the result by the integer class index from caption_map.
sorted_labels = sorted(caption_count)
squared_counts = np.array(
    [caption_count[label] ** 2 for label in sorted_labels], dtype=np.float32
)
inverse_weights = 1 / squared_counts
inverse_weights /= np.linalg.norm(inverse_weights)
weight_by_label = dict(zip(sorted_labels, inverse_weights.tolist()))

# Re-key from caption string to class index, in caption_map order.
class_weights = {
    caption_map[label]: weight_by_label[label] for label in caption_map
}

print("class_weights:")
print(class_weights)

# Define pipeline for the generators

In [None]:
def pipeline(gen, caption_map):
    """Wrap a dataset generator so that it yields one-hot encoded targets.

    Three stages are chained:
      1. lambda_gen leaves x untouched and replaces y by the set of captions
         obtained from popping one element of y and splitting it on ',',
         i.e.  {'ED,IP'} -> {'ED', 'IP'}
      2. caption_map_gen converts caption labels to integers via caption_map
      3. onehot_gen turns those integers into a vector of 1's and 0's of
         length len(caption_map), where 1 marks a present label
    """
    def split_captions(x, y):
        # y is a set holding one comma-separated caption string.
        return (x, set(y.pop().split(',')))

    as_label_sets = lambda_gen(gen, split_captions)
    as_indices = caption_map_gen(as_label_sets, caption_map)
    return onehot_gen(as_indices, len(caption_map))

def create_new_model(classes=None):
    """Clear the Keras session and build a fresh Xception-based ImageClassifier.

    Args:
        classes: Number of output units. Defaults to len(caption_map) so the
            model output always has the same length as the one-hot targets
            produced by pipeline(); pass an int to override.

    Returns:
        A new ImageClassifier configured for multi-label training (sigmoid
        outputs with binary cross-entropy), initialised from ImageNet weights.
    """
    if classes is None:
        # The one-hot targets have one entry per caption in caption_map, so a
        # fixed class count would break as soon as the label set changes.
        classes = len(caption_map)
    clear_session()
    model = ImageClassifier(
        backbone='xception', input_shape=(None, None, 3), classes=classes,
        init_lr=1e-5, init_weights='imagenet',
        trainable=True, loss='binary_crossentropy', output_activation='sigmoid',
        metrics=['accuracy', hamming_loss]
    )
    return model

# Process 100 images 1 by 1 per epoch for 10 epochs

# Validate on 100 samples per epoch

In [None]:
# Training configuration: one image per step, a fixed number of steps per epoch.
batch_size = 1
#steps_per_epoch = math.floor(len(train_ds.coco.getImgIds()) / batch_size)
steps_per_epoch = 100

model = create_new_model()
print("Break-even loss is", -np.log(1 / model.classes))
model.save_on_epoch_end()

# Training batches are drawn from the balanced id list with shuffling;
# validation batches come from an endless generator over the validation set.
train_batches = batching_gen(
    pipeline(train_ds.generator(data_ids=balanced, shuffle_ids=True), caption_map),
    batch_size=batch_size,
)
val_batches = batching_gen(
    pipeline(val_ds.generator(endless=True), caption_map),
    batch_size=batch_size,
)

model.fit_generator(
    train_batches,
    validation_data=val_batches,
    validation_steps=100,
    epochs=10,
    use_multiprocessing=True,
    steps_per_epoch=steps_per_epoch,
    verbose=1,
)

In [None]:
# Print the validation_data attribute of the model's history object.
# NOTE(review): what this attribute holds depends on the Keras version — confirm it is populated.
print(model.history.validation_data)

In [None]:
# Load weights from a specific checkpoint under ./logs. The file name is
# hard-coded (epoch 01, val_loss 0.39), so this fails unless that exact file
# exists — update it to a checkpoint that is actually present.
# Presumably model.model_ is the underlying Keras model — TODO confirm.
model.model_.load_weights('./logs/weights_epoch:01-val_loss:0.39.hdf5')

In [None]:
# Draw one validation sample, compare the model's per-class outputs with the
# true label indices, and display that same image.
val_image, val_labels = val_ds.sample()

print("Predictions (%)")
# The model expects a batch, so add a leading axis and take the first result row.
print(model.predict_proba(np.expand_dims(val_image, axis=0))[0])
print("Actual")
print([caption_map[x] for x in val_labels])
# Show the image that was actually predicted on; drawing a second sample here
# could display a different image from the one the printed numbers refer to.
plt.imshow(val_image)