## Images Visual Relationship project
This project is based on the [Google AI Open Images - Visual Relationship Track Kaggle Challenge](https://www.kaggle.com/c/google-ai-open-images-visual-relationship-track).

The challenge is to build the best performing algorithm for automatically detecting relationships triplets.

In [1]:
import argparse
import csv
import pickle
import os
import sys
import random
import math
import re
import time
import wget
import numpy as np
from PIL import Image
import cv2
import glob
from shutil import copyfile
import collections
from joblib import Parallel, delayed

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import keras
from keras import __version__ as keras_version
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Lambda, Cropping2D, Reshape
from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D
from keras import backend as K

import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Visualizations will be shown in the notebook.
%matplotlib inline

# https://github.com/fizyr/keras-retinanet/blob/master/examples/ResNet50RetinaNet.ipynb
import keras_retinanet
from keras_retinanet import models
from keras_retinanet.utils.image import read_image_bgr, preprocess_image, resize_image
from keras_retinanet.utils.visualization import draw_box, draw_caption
from keras_retinanet.utils.colors import label_color

# set tf backend to allow memory to grow, instead of claiming everything
#import tensorflow as tf

#def get_session():
#    config = tf.ConfigProto()
#    config.gpu_options.allow_growth = True
#    return tf.Session(config=config)

# use this environment flag to change which GPU to use
#os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# set the modified tf session as backend in keras
#keras.backend.tensorflow_backend.set_session(get_session())

# Directory to save logs and trained model
# https://github.com/fizyr/keras-retinanet/releases/download/0.5.0/resnet50_coco_best_v2.1.0.h5
RCNN_COCO_MODEL_URL = "https://github.com/fizyr/keras-retinanet/releases/download/0.5.0/resnet50_coco_best_v2.1.0.h5"
RCNN_COCO_MODEL = "resnet50_coco_best_v2.1.0.h5"

# Local path to trained weights file
MODEL_PATH = os.path.join(os.path.join(os.getcwd(), "pretrained_models"), RCNN_COCO_MODEL)

# Download COCO trained weights from Releases if needed
if not os.path.exists(MODEL_PATH):
    print ('Downloading COCO trained weights: {} to path: {}'.format(RCNN_COCO_MODEL, MODEL_PATH))  
    wget.download(RCNN_COCO_MODEL_URL, MODEL_PATH)  
else:
    print ('Using COCO trained weights: {} from path: {}, downloaded from: {}'.format(
        RCNN_COCO_MODEL, MODEL_PATH, RCNN_COCO_MODEL_URL))
    
import warnings
warnings.simplefilter('ignore', UserWarning)

Using TensorFlow backend.


Using COCO trained weights: resnet50_coco_best_v2.1.0.h5 from path: /home/powell/work/scpd/vision_project/visual_relationship/pretrained_models/resnet50_coco_best_v2.1.0.h5, downloaded from: https://github.com/fizyr/keras-retinanet/releases/download/0.5.0/resnet50_coco_best_v2.1.0.h5


### Helper APIs

In [2]:
"""
Helper API
"""
def process_labels_from_csv_input(prefix='data/raw', 
                                  labels_csv_fname_list=['class-descriptions-boxable.csv', 
                                                         'class-descriptions.csv']):
    label_dict = {}
    # Process the labels info files and create a key value pair
    for flabel_csv in labels_csv_fname_list:
        with open(os.path.join(prefix, flabel_csv)) as f:
            rows = csv.reader(f)
            for row in rows:
                if row[0] in label_dict:
                    #print('Label already exists: {}:{}, new label: {}:{}'.
                    #       format(row[0], label_dict[row[0]], row[0], row[1]))
                    assert row[1] == label_dict[row[0]]
                label_dict[row[0]]=row[1]
                
    return label_dict
                
def process_raw_csv_input(prefix='data/raw', train_csv_fname = 'challenge-2018-train-vrd.csv', 
                          labels_csv_fname_list = ['class-descriptions-boxable.csv', 'class-descriptions.csv']):
    """
    Process the labels from given label names and create three categories
    a. Entity
    b. Attribute
    c. Relationship
    """
    
    label_dict = process_labels_from_csv_input(prefix, labels_csv_fname_list)
    
    # Process the training data and create x, y i.e x->(imageid, (bounding box data)) y->(label1,label2,relationship)
    label1_dict = collections.defaultdict(int)
    label2_dict = collections.defaultdict(int)
    relationship_dict = collections.defaultdict(int)
    xy_list = []
    missing_label_dict = {}
    ignore_header = True
    miss_count = 0
    with open(os.path.join(prefix, train_csv_fname)) as f:
        rows = csv.reader(f)
        for row in rows:
            miss = False
            if ignore_header:
                ignore_header = False
                continue
            x = (row[0], (row[3:11]))
            y = (row[1], row[2], row[11])
            if y[0] not in label_dict:
                if y[0] not in missing_label_dict:
                    print('Label1 missing: {}'.format(y[0]))
                miss = True
                missing_label_dict[y[0]] = y[0]
            else:
                label1_dict[y[0]] += 1
            if y[1] not in label_dict:
                if y[1] not in missing_label_dict:
                    print('Label2 missing: {}, label1 : {}, relation: {}'.format(y[1], label_dict[y[0]], row[11]))
                miss_count += 1
                miss = True
                missing_label_dict[y[1]] = y[1]
            else:
                label2_dict[y[1]] += 1
            relationship_dict[y[2]] += 1
            if miss is False:
                xy_list.append((x, y))
    print ("Missing label count: {}".format(miss_count))             
    return xy_list, (label1_dict, label2_dict, relationship_dict), label_dict

def get_data_dir_from_raw_single_dir(X_dict, prefix='data', dir_list=None, out_dir='processed'):
    X_fset = set()
    copy_prefix_dir = os.path.join(prefix, out_dir)
    for d in dir_list:
        copy_dir = os.path.join(os.getcwd(), os.path.join(copy_prefix_dir, d))
        os.makedirs(copy_dir, exist_ok=True)
        flist = glob.glob(os.path.join(os.path.join(prefix, d), '*.jpg'))
        for f in flist:
            fid = os.path.basename(f).split('.')[0]
            if fid in X_dict:
                dst_f = os.path.join(copy_dir, os.path.basename(f))
                X_fset.add(dst_f)
                X_dict[fid] = (X_dict[fid], dst_f)
                copyfile(os.path.join(os.getcwd(), f), dst_f)
                
    return X_fset

def get_data_from_dir_recursive(xy_list, prefix='data/processed', dir_input='raw'):
    """
    Load the file path for each image id. The dictionary can only have image file path since
    a single image can have multiple labels i.e multiple y values.

    Example of entry in xy_list:
    Train data xy_list[0]: (
                            ('fe58ec1b06db2bb7', ['0.005', '0.033125', '0.58', '0.62777776', 
                                                   '0.005', '0.033125', '0.58', '0.62777776']) , 
                            ('/m/04bcr3', '/m/083vt', 'is'))
    """
    cwd = os.getcwd()
    xy_list_valid = []      # xy_list that has valid image files available
    X_id_to_file_dict = {}  # id of the image to file dictionary
    def process_files(dir_path):
        flist = glob.glob(os.path.join(dir_path, '*.jpg'))
        print('Processing dir: {}, image count: {}'.format(dir_path, len(flist)))
            
        for f in flist:
            fid = os.path.basename(f).split('.')[0]
            if fid in X_id_to_file_dict:
                print ('Error id exists twice: {}-{}-{}'.format(fid, f, X_id_to_file_dict[fid]))
                continue
            else:
                X_id_to_file_dict[fid] = os.path.join(cwd, f)
                
    def helper(dir_input_full):
        l = next(os.walk(dir_input_full))[1]
        if len(l) == 0:   
            return
        
        for d in l:
            dir_path = os.path.join(dir_input_full, d)
            process_files(dir_path)
            helper(dir_path)
    
    process_files(os.path.join(prefix, dir_input))
    helper(os.path.join(prefix, dir_input))
    
    for xy in xy_list:
        if xy[0][0] in X_id_to_file_dict:
            xy_list_valid.append(xy)

    return xy_list_valid, X_id_to_file_dict

def bounding_box_to_plt(image, b):
    """
    Convert one bounding box data into what mathplotlib understands
    [XMin1,    XMax1,     YMin1,   YMax1,        XMin2,    XMax2,    YMin2,   YMax2]
    ['0.005', '0.033125', '0.58', '0.62777776', '0.005', '0.033125', '0.58', '0.62777776']
    for: https://matplotlib.org/api/_as_gen/matplotlib.patches.Rectangle.html#matplotlib.patches.Rectangle
    """
    xsize = image.shape[1]
    ysize = image.shape[0]
    xy = (int(float(b[0]) * xsize), int(float(b[2]) * ysize))   # (XMin1 * xsize, YMin1 * ysize)
    width = int(float(b[1]) * xsize) - xy[0]        # XMax1 * xsize - XMin1 * xsize
    height = int(float(b[3]) * ysize) - xy[1]       # YMax1 * ysize - Ymin * ysize 
    return (xy, width, height)

def two_bounding_boxes_to_plt(image, b):
    """
    Convert two bounding box data into what mathplotlib understands
    """
    return [bounding_box_to_plt(image, b[0:4]), bounding_box_to_plt(image, b[4:len(b)])]
    
def show_images(images,titles=None, bounding_boxes_list=[]):
    """Display a list of images"""
    n_ims = len(images)
    if titles is None: titles = ['(%d)' % i for i in range(1,n_ims + 1)]
    fig = plt.figure()
    n = 1
    
    for i in range(0, len(images)):
        image = images[i]
        title = "None"
        if titles is not None and len(titles) > i:
            title = titles[i]
        
        bounding_boxes = None
        if bounding_boxes_list is not None and len(bounding_boxes_list) > i:
            bounding_boxes = bounding_boxes_list[i]

        a = fig.add_subplot(1,n_ims,n) # Make subplot
        if len(image.shape) == 2 or image.shape[2] == 1: # Is image grayscale?
            plt.imshow(np.resize(image, (image.shape[0], image.shape[1])), interpolation="bicubic", cmap="gray") # Only place in this blog you can't replace 'gray' with 'grey'
        else:
            plt.imshow(image, interpolation="bicubic")
            if bounding_boxes is not None:
                box1, box2 = two_bounding_boxes_to_plt(image, bounding_boxes)
                rect1 = patches.Rectangle((box1[0]),box1[1],box1[2],linewidth=2,edgecolor='y',facecolor='none')
                rect2 = patches.Rectangle((box2[0]),box2[1],box2[2],linewidth=2,edgecolor='g',facecolor='none')
                a.add_patch(rect1)
                a.add_patch(rect2)
        if titles is not None:
            a.set_title(title + ' {}x{}'.format(image.shape[0], image.shape[1]))
        n += 1
    fig.set_size_inches(np.array(fig.get_size_inches()) * n_ims)
    plt.axis('off')
    plt.show()
    
def show_given_images(xy_given_list, id_to_file_dict):
    img_list = []
    label_list = []
    bounding_boxes_list = []
    for xy in xy_given_list:
        fid = xy[0][0]
        bounding_boxes_list.append(xy[0][1])
        y = xy[1]
        label1 = y[0]
        label2 = y[1]
        if label1 in label_dict:
            label1 = label_dict[label1]
        if label2 in label_dict:
            label2 = label_dict[label2]
        
        label_list.append('{} {} {}'.format(label1, y[2], label2))
        if fid not in id_to_file_dict:
            print ('Error could not find id: {} in id_to_file_dict'.format(fid))
            raise 
        img_list.append(cv2.cvtColor(cv2.imread(id_to_file_dict[fid], cv2.IMREAD_COLOR), cv2.COLOR_RGB2BGR))
    print ('Label_list" {}'.format(label_list))
    show_images(img_list, titles=label_list, bounding_boxes_list=bounding_boxes_list)
    
def show_random_images(xy_given_list, id_to_file_dict, count=4):
    xy_rnd_idx_list = np.random.choice(len(xy_given_list), count, replace=False)
    xy_rnd_list = [ xy_given_list[x] for x in xy_rnd_idx_list]
    show_given_images(xy_rnd_list, id_to_file_dict)
    return xy_rnd_list

def resize_all(id_to_file_dict, prefix='data/processed', output_dir='resized_images', xsize=223, ysize=223, count=None):
    output_dir = os.path.join(prefix, output_dir)
    os.makedirs(output_dir, exist_ok=True)
    ret_id_to_file_dict = {}
    c = 0
    for k, v in id_to_file_dict.items():
        if v is None:
            continue
        try:
            if os.path.isfile(v) is False:
                print('Invalid file failed for {}'.format(v))
                continue
        except:
            print('Invalid file failed for {}'.format(v))
            raise
        out_file = os.path.join(output_dir, os.path.basename(v))
        
        # If the file exists then 
        if os.path.isfile(out_file):
            ret_id_to_file_dict[k] = out_file
            continue
            
        if count is not None and c > count:
            break
        
        resize_img = cv2.resize(cv2.imread(v, cv2.IMREAD_COLOR),(ysize, xsize))
        out_file = os.path.join(output_dir, os.path.basename(v))
        cv2.imwrite(out_file, resize_img)
        
        ret_id_to_file_dict[k] = out_file
    return ret_id_to_file_dict

### Resize and data prep code
The following few cells help with saving resized images. Had to resize them to 96x96 to avoid out of memory issues. Have to fix these buy bumping up the memory on the box. Currently it only has 32GB. The full sized images are 28GB. Reducing them to 96x96 gets us to 803MB.

In [None]:
# Uncomment if you want to resize the data again
#X_resized_id_to_file_dict = resize_all(X_id_to_file_dict)

In [None]:
print ('Showing resized image with resized bounding box examples with bounding boxes')

showen_list = show_random_images(xy_list, X_id_to_file_dict, count=4)

print ('Showing resized but with original bounding box data image examples with bounding boxes')

show_given_images(showen_list, X_resized_id_to_file_dict)

In [None]:
xy_list, train_data_label_tuple, label_dict = process_raw_csv_input()
xy_list, X_resized_id_to_file_dict = get_data_from_dir_recursive(xy_list, dir_input='resized_images')

In [None]:
print ("Size of xy_list: {}".format(len(xy_list)))
print ("xy_list[0]: {}, xy_list[0][0]: {}".format(xy_list[0], xy_list[0][0]))
print ("Example file: {}".format(X_resized_id_to_file_dict[xy_list[0][0][0]]))

In [None]:
random_image_list = show_random_images(xy_list, X_resized_id_to_file_dict, count=4)

### Load the images and prep for the network.

In [None]:
# Output data for retinanet to train the labels that work for us i.e
# the object label and subject label
def prepare_dataset_for_retinanet(xy_train_list, xy_test_list, id_to_file_dict, count=None):
    def fix_boxes(box, xsize, ysize):
        # input is Xmin, Xmax, Ymin, Ymax in %/faction of xsize and ysize
        # retinanet needs x1,y1,x2,y2 in absolute of xsize and ysize
        b = [ int(float(box[0])*xsize), int(float(box[2])*ysize), int(float(box[1])*xsize), 
             int(float(box[3])*ysize) ]
        """
        Some of the bounding boxes does not satisfyin invariant constraint
        x1 < x2 and y1 < y2. In such cases retinanet ignores those images.
        Try to fix them here?. Fortunately they are only few such images.
        if b[0] > b[2]:
            tmp = b[2]
            b[2] = b[0]
            b[0] = tmp
        if b[1] > b[3]:
            tmp = b[3]
            b[3] = b[1]
            b[1] = tmp
        """
        return b
    
    image_size_dict = {}
    def get_dataset(xy_input_list):
        dataset = {'features': [], 'bounding_box_with_label': []}
        c = 0
        for xy in xy_input_list:
            # Add for first label
            if count is not None and c > count:
                break
            c += 1
            dataset['features'].append(id_to_file_dict[xy[0][0]])
            if xy[0][0] in image_size_dict:
                xsize, ysize = image_size_dict[xy[0][0]]
            else:
                img = cv2.imread(id_to_file_dict[xy[0][0]], cv2.IMREAD_COLOR)
                xsize = img.shape[1]
                ysize = img.shape[0]
                image_size_dict[xy[0][0]] = (xsize, ysize)

            dataset['bounding_box_with_label'].append(fix_boxes(xy[0][1][0:4], xsize, ysize) + [str(xy[1][0])])
            # Add for second label       
            dataset['features'].append(id_to_file_dict[xy[0][0]])
            dataset['bounding_box_with_label'].append(
            fix_boxes(xy[0][1][4:len(xy[0][1])], xsize, ysize) + [str(xy[1][1])])
        return dataset

    dataset_train = get_dataset(xy_input_list=xy_train_list)
    dataset_test = get_dataset(xy_input_list=xy_test_list)
    
    return dataset_train, dataset_test

def prepare_dataset_for_model(xy_train_list, xy_test_list, id_to_file_dict, count=None):
    image_size_dict = {}
    def get_dataset(xy_input_list):
        dataset = {'features': [],  'labels_orig':[]}
        c = 0
        for xy in xy_input_list:
            # Add for first label
            if count is not None and c > count:
                break
            c += 1
            dataset['features'].append(id_to_file_dict[xy[0][0]])
            dataset['labels_orig'].append(xy[1])
        return dataset
    
    dataset_train = get_dataset(xy_input_list=xy_train_list)
    dataset_test = get_dataset(xy_input_list=xy_test_list)
    
    return dataset_train, dataset_test
    
# Write the CSV annotations
def write_csv_annotations_for_retinanet(features, labels, 
                                        annotations_file='fullsize_train_annotations.csv'):
    with open(annotations_file, mode='w', newline='', encoding='utf8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for X, Y in zip(features, labels):
            row = [X] + Y
            writer.writerow(row)
            
def write_csv_classes_for_retinanet(train_data_label_tuple_input, classes_file='fullsize_classes.csv'):
    labels_combined_dict = set()
    # Make sure we have unique set of labels only the subject and object label. Should not contain
    # relationship label.
    for label in train_data_label_tuple[0].keys():
        labels_combined_dict.add(label)
    for label in train_data_label_tuple[1].keys():
        labels_combined_dict.add(label)

    with open(classes_file, mode='w', newline='', encoding='utf8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        count = 0
        for label in labels_combined_dict:
            row = [str(label), count]
            writer.writerow(row)
            count+=1

### Prepare the dataset for retinatnet training.

We take the full sized images label set given by the **challenge-2018-train-vrd.csv** which has 354122 entries and prepare it by breaking each entry into two examples i.e 1 per given bounding box and label.

The following step will take some time since retinanet input needs bounding box in absolute dimensions rather than the perctange/fractional input given by OID data.

In [None]:
xy_list, train_data_label_tuple, label_dict = process_raw_csv_input(
    prefix='data/raw', train_csv_fname = 'challenge-2018-train-vrd.csv', 
    labels_csv_fname_list = ['class-descriptions-boxable.csv', 'class-descriptions.csv'])
xy_full_list, X_full_id_to_file_dict = get_data_from_dir_recursive(xy_list, dir_input='raw')

### Break the input set to train and test set.

Before generating the CSV input to Retinanet break the data up. So that we can use test set for evaulating the model and use the same test set for final model evaulation.

In [None]:
def train_test_split(xy_input_list, test_size=0.20, seed=42):
    np.random.seed(seed=42)
    np.random.shuffle(xy_input_list)
    split_idx = int(round(float(len(xy_input_list)) * (1.0-test_size)))
    return xy_input_list[0:split_idx], xy_input_list[split_idx:len(xy_input_list)]

In [None]:
xy_train_list, xy_test_list = train_test_split(xy_input_list=xy_full_list)

In [None]:
print('Size of full set: {}, size of train set: {}, size of test set: {}, split %: {}'.format(
    len(xy_full_list), len(xy_train_list), len(xy_test_list), round(100*float(len(xy_test_list))/len(xy_full_list), 3)))

In [None]:
# Prepare dataset for model without the bounding boxes from full size images 
#dataset_model_train, dataset_model_test = prepare_dataset_for_model(xy_train_list, xy_test_list, 
#                                                                    X_full_id_to_file_dict)

In [None]:
# Print an example from datatset to make sure it is sane.
print ('Size of dataset: {}'.format(len(dataset_model_train['features'])))
rnd_idx = np.random.choice(len(dataset_model_train['features'])-1, 2, replace=False)
for i in rnd_idx:
    print ('dataset[features][{}]: {}, dataset[labels_orig][{}]: {}'.format(i,
        dataset_model_train['features'][i], i, dataset_model_train['labels_orig'][i]))
    print ('dataset[features][{}]: {}, dataset[labels_orig]][{}]: {}'.format(i+1,
        dataset_model_train['features'][i+1], i+1, dataset_model_train['labels_orig'][i+1]))

In [None]:
# Print an example from datatset to make sure it is sane.
print ('Size of dataset: {}'.format(len(dataset_model_test['features'])))
rnd_idx = np.random.choice(len(dataset_model_test['features'])-1, 2, replace=False)
for i in rnd_idx:
    print ('dataset[features][{}]: {}, dataset[labels_orig][{}]: {}'.format(i,
        dataset_model_test['features'][i], i, dataset_model_test['labels_orig'][i]))
    print ('dataset[features][{}]: {}, dataset[labels_orig][{}]: {}'.format(i+1,
        dataset_model_test['features'][i+1], i+1, dataset_model_test['labels_orig'][i+1]))

In [None]:
# Prepare dataset for reinanet from full size images 
#dataset_train, dataset_test = prepare_dataset_for_retinanet(xy_train_list, xy_test_list, 
#                                                            X_full_id_to_file_dict)

In [None]:
# Print an example from datatset to make sure it is sane.
print ('Size of dataset: {}'.format(len(dataset_train['features'])))
rnd_idx = np.random.choice(len(dataset_train['features'])-1, 2, replace=False)
for i in rnd_idx:
    print ('dataset[features][{}]: {}, dataset[bounding_box_with_label][{}]: {}'.format(i,
        dataset_train['features'][i], i, dataset_train['bounding_box_with_label'][i]))
    print ('dataset[features][{}]: {}, dataset[bounding_box_with_label][{}]: {}'.format(i+1,
        dataset_train['features'][i+1], i+1, dataset_train['bounding_box_with_label'][i+1]))

In [None]:
# Print an example from datatset to make sure it is sane.
print ('Size of dataset: {}'.format(len(dataset_test['features'])))
rnd_idx = np.random.choice(len(dataset_test['features'])-1, 2, replace=False)
for i in rnd_idx:
    print ('dataset[features][{}]: {}, dataset[bounding_box_with_label][{}]: {}'.format(i,
        dataset_test['features'][i], i, dataset_test['bounding_box_with_label'][i]))
    print ('dataset[features][{}]: {}, dataset[bounding_box_with_label][{}]: {}'.format(i+1,
        dataset_test['features'][i+1], i+1, dataset_test['bounding_box_with_label'][i+1]))

In [None]:
# Store the retinanet train data and train labels set as input for retinanet training from console.
annotations_file='fullsize_train_annotations.csv'
annotations_test_file='fullsize_test_annotations.csv'
classes_file='fullsize_classes.csv'

write_csv_annotations_for_retinanet(features=dataset_train['features'], 
                                    labels=dataset_train['bounding_box_with_label'], 
                                    annotations_file=annotations_file)
write_csv_annotations_for_retinanet(features=dataset_test['features'], 
                                    labels=dataset_test['bounding_box_with_label'], 
                                    annotations_file=annotations_file)
write_csv_classes_for_retinanet(train_data_label_tuple_input=train_data_label_tuple, classes_file=classes_file)

# Store the model train and test data as pickle.
import pickle
train_model_pickle ='fullsize_train_model.p'
test_model_pickle = 'fullsize_test_model.p'
with open(train_model_pickle, "wb" ) as f:
    pickle.dump(dataset_model_train, f)
with open(test_model_pickle, "wb") as f:
    pickle.dump(dataset_model_test, f)

### Following is the training script run to train on console

```
$ cd keras-retinanet/
$ keras_retinanet/bin/train.py --imagenet-weights --freeze-backbone --no-evaluation --random-transform --epochs 50  --image-min-side 256 --image-max-side 427 --batch-size 1 csv /home/powell/work/scpd/vision_project/visual_relationship/fullsize_train_annotations.csv /home/powell/work/scpd/vision_project/visual_relationship/fullsize_classes.csv
```

Resnet50 is the default backbone used by retinnet and its weights are frozen at training time.

*Total params: 37,627,657*
*Trainable params: 14,066,505*
*Non-trainable params: 23,561,152*

The epoch snapshots will be stored in **keras-retinanet/snapshots** and tensorboard log data is in **keras-retinanet/logs**.

The model output will be as follows:
**keras-retinanet/snapshots/resnet50_csv_epoch_number.h5**, example **keras-retinanet/snapshots/resnet50_csv_01.h5** after 1 epoch of training.

### Retinanet training epoch log

```
==================
Total params: 37,627,657
Trainable params: 14,066,505
Non-trainable params: 23,561,152
__________________________________________________________________________________________________
None
Epoch 1/50
10000/10000 [==============================] - 539s 54ms/step - loss: 3.2986 - regression_loss: 2.4344 - classification_loss: 0.8642

Epoch 00001: saving model to ./snapshots/resnet50_csv_01.h5
Epoch 2/50
10000/10000 [==============================] - 546s 55ms/step - loss: 2.9532 - regression_loss: 2.1671 - classification_loss: 0.7861

Epoch 00002: saving model to ./snapshots/resnet50_csv_02.h5
Epoch 3/50
 2322/10000 [=====>........................] - ETA: 7:02 - loss: 2.9135 - regression_loss: 2.1233 - classification_loss: 0.7901keras_retinanet/bin/../../keras_retinanet/preprocessing/generator.py:165: UserWarning: Image with id 28596 (shape (768, 1024, 3)) contains the following invalid boxes: [[  766.   707.  1594.   919.]
 [  766.   707.  1594.   919.]].
  annotations['bboxes'][invalid_indices, :]
 5743/10000 [================>.............] - ETA: 3:55 - loss: 2.8614 - regression_loss: 2.1055 - classification_loss: 0.7559keras_retinanet/bin/../../keras_retinanet/preprocessing/generator.py:165: UserWarning: Image with id 9600 (shape (1024, 768, 3)) contains the following invalid boxes: [[  449.   458.   785.  1016.]
 [  449.   458.   785.  1016.]].
  annotations['bboxes'][invalid_indices, :]
10000/10000 [==============================] - 551s 55ms/step - loss: 2.8223 - regression_loss: 2.0815 - classification_loss: 0.7408

Epoch 00003: saving model to ./snapshots/resnet50_csv_03.h5
Epoch 4/50
10000/10000 [==============================] - 552s 55ms/step - loss: 2.7240 - regression_loss: 2.0191 - classification_loss: 0.7050

Epoch 00004: saving model to ./snapshots/resnet50_csv_04.h5
Epoch 5/50
10000/10000 [==============================] - 540s 54ms/step - loss: 2.6735 - regression_loss: 1.9777 - classification_loss: 0.6958

Epoch 00005: saving model to ./snapshots/resnet50_csv_05.h5
Epoch 6/50
10000/10000 [==============================] - 540s 54ms/step - loss: 2.6272 - regression_loss: 1.9394 - classification_loss: 0.6878

Epoch 00006: saving model to ./snapshots/resnet50_csv_06.h5
Epoch 7/50
10000/10000 [==============================] - 541s 54ms/step - loss: 2.5968 - regression_loss: 1.9287 - classification_loss: 0.6681

Epoch 00007: saving model to ./snapshots/resnet50_csv_07.h5
Epoch 8/50
10000/10000 [==============================] - 542s 54ms/step - loss: 2.5604 - regression_loss: 1.8956 - classification_loss: 0.6648

Epoch 00008: saving model to ./snapshots/resnet50_csv_08.h5
Epoch 9/50
10000/10000 [==============================] - 539s 54ms/step - loss: 2.5373 - regression_loss: 1.8783 - classification_loss: 0.6590

Epoch 00009: saving model to ./snapshots/resnet50_csv_09.h5
Epoch 10/50
10000/10000 [==============================] - 538s 54ms/step - loss: 2.4884 - regression_loss: 1.8454 - classification_loss: 0.6430

Epoch 00010: saving model to ./snapshots/resnet50_csv_10.h5
Epoch 11/50
10000/10000 [==============================] - 545s 54ms/step - loss: 2.4869 - regression_loss: 1.8444 - classification_loss: 0.6424

Epoch 00011: saving model to ./snapshots/resnet50_csv_11.h5
Epoch 12/50
10000/10000 [==============================] - 545s 55ms/step - loss: 2.4859 - regression_loss: 1.8377 - classification_loss: 0.6482

Epoch 00012: saving model to ./snapshots/resnet50_csv_12.h5
Epoch 13/50
10000/10000 [==============================] - 536s 54ms/step - loss: 2.4415 - regression_loss: 1.8194 - classification_loss: 0.6222

Epoch 00013: saving model to ./snapshots/resnet50_csv_13.h5
Epoch 14/50
10000/10000 [==============================] - 544s 54ms/step - loss: 2.4291 - regression_loss: 1.7997 - classification_loss: 0.6293
```

### Load the Retinanet trained model and evaluate on test set

In [23]:
# Retinanet model testing
# adjust this to point to your downloaded/trained model
# models can be downloaded here: https://github.com/fizyr/keras-retinanet/releases
model_path = os.path.join('retinanet_snapshot', 'resnet50_csv_12.h5')

# load retinanet model
retinanet_model = models.load_model(model_path, backbone_name='resnet50')

# if the model is not converted to an inference model, use the line below
# see: https://github.com/fizyr/keras-retinanet#converting-a-training-model-to-inference-model
retinanet_model_infer = models.convert_model(retinanet_model)

print(retinanet_model_infer.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
padding_conv1 (ZeroPadding2D)   (None, None, None, 3 0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, None, None, 6 9408        padding_conv1[0][0]              
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, None, None, 6 256         conv1[0][0]                      
__________________________________________________________________________________________________
conv1_relu

In [None]:
# evaluate the model
scores = retinanet_model_infer.evaluate(np.array(X_test), np.array(y_test))
print('\n{}: {}'.format(retinanet_model_infer.metrics_names[1], scores[1]*100))

In [4]:
# Load the labels
lable_tag_to_name = process_labels_from_csv_input()
def load_labels(label_tag_to_name, labels_file="fullsize_classes.csv"):
    labels_to_names = {}
    # Process the labels info files and create a key value pair
    with open(labels_file, encoding='utf8') as f:
        rows = csv.reader(f)
        for row in rows:
            labels_to_names[int(row[1])]=label_tag_to_name[row[0]]
    return labels_to_names

labels_to_names = load_labels(lable_tag_to_name)

In [5]:
print (sorted(labels_to_names.items()))

[(0, 'Camera'), (1, 'Briefcase'), (2, 'Rugby ball'), (3, 'Chopsticks'), (4, 'Bottle'), (5, 'Knife'), (6, 'Beer'), (7, 'Boy'), (8, 'Pretzel'), (9, 'Table tennis racket'), (10, 'Piano'), (11, 'Snake'), (12, 'Van'), (13, 'Sofa bed'), (14, 'Tennis ball'), (15, 'Table'), (16, 'Bed'), (17, 'Dog'), (18, 'Coffee cup'), (19, 'Textile'), (20, 'Microwave oven'), (21, 'Wine glass'), (22, 'Ski'), (23, 'Elephant'), (24, 'Violin'), (25, 'Plastic'), (26, 'Suitcase'), (27, 'Flute'), (28, 'Man'), (29, 'Mug'), (30, 'Mobile phone'), (31, 'Handbag'), (32, 'Bench'), (33, 'Coffee table'), (34, 'Oven'), (35, 'Girl'), (36, 'Wood'), (37, 'Fork'), (38, 'Tennis racket'), (39, 'Bicycle'), (40, 'Monkey'), (41, 'Cat'), (42, 'Desk'), (43, 'Chair'), (44, 'Football'), (45, 'Leather'), (46, 'Woman'), (47, 'Car'), (48, 'Drum'), (49, 'Backpack'), (50, 'Motorcycle'), (51, 'Snowboard'), (52, 'Microphone'), (53, 'Taxi'), (54, 'Horse'), (55, 'Surfboard'), (56, 'Guitar'), (57, 'Hamster'), (58, 'Spoon'), (59, 'Racket'), (60, 'D

In [None]:
image = read_image_bgr(X_resized_test_id_to_file_dict[xy_test_list[15][0][0]])

# copy to draw on
draw = image.copy()
draw = cv2.cvtColor(draw, cv2.COLOR_BGR2RGB)

# preprocess image for network
image = preprocess_image(image)
image, scale = resize_image(image, min_side=246, max_side=427)

print ('Scale: {}'.format(scale))
# process image
start = time.time()
boxes, scores, labels = reinanet_model_infer.predict_on_batch(np.expand_dims(image, axis=0))
print("processing time: ", time.time() - start)
#print ('Boxes: {}, scores: {}, labels: {}'.format(boxes, scores, labels))

# correct for image scale
boxes /= scale

# visualize detections
top_k=10
box_score_label_list = [ x for x in zip(boxes[0], scores[0], labels[0]) ]
box_score_label_list = sorted(box_score_label_list, key=lambda x: x[1], reverse=True)

for i in range(0, top_k):
    box, score, label = box_score_label_list[i]
    # scores are sorted so we can break
        
    color = label_color(label)
    
    b = box.astype(int)
    draw_box(draw, b, color=color)
    
    label_name = label
    #print (label_name)
    if label_name in labels_to_names:
        label_name = labels_to_names[label]
    print ('Label: {}, Score: {}'.format(label_name, score))
    caption = "{} {:.3f}".format(label_name, score)
    draw_caption(draw, b, caption)
    
plt.figure(figsize=(15, 15))
plt.axis('off')
plt.imshow(draw)
plt.show()

### Load the data from pickle

In [6]:
import pickle
train_model_pickle ='fullsize_train_model.p'
test_model_pickle = 'fullsize_test_model.p'
with open(train_model_pickle, "rb" ) as f:
    dataset_model_train = pickle.load(f)
with open(test_model_pickle, "rb") as f:
    dataset_model_test = pickle.load(f)

### Load the images and prep for the network.

In [7]:
from keras_retinanet.utils.image import read_image_bgr, preprocess_image, resize_image, compute_resize_scale

In [11]:
from log_progress import log_progress

image_processed_dict = {}
def process_data_sets_for_model(dataset_raw, min_side=256, max_side=427, mode='caffe', count = None):
    dataset_processed = {'features': [], 'scale': [], 'labels': [], 'labels_orig': []}
    c = 0
    for x, y in log_progress(iter(zip(dataset_raw['features'], dataset_raw['labels_orig'])), every=10):
        if c >= count:
            break
        c += 1
        img = None
        if x in image_processed_dict:
            img = image_processed_dict[x]
        else:
            img = read_image_bgr(x)
            img, scale = resize_image(img, min_side=min_side, max_side=max_side)
            img = preprocess_image(img, mode=mode)
            image_processed_dict[x] = img
        dataset_processed['features'].append(img)
        dataset_processed['scale'].append(scale)
        dataset_processed['labels_orig'].append(y)
    return dataset_processed
    
dataset_train_processed = process_data_sets_for_model(dataset_model_train, count=5000)
dataset_test_processed = process_data_sets_for_model(dataset_model_test, count=1000)

### Use Multi Label Binarizer to encode the labels for the images.
Split the data into training and test sets.

In [12]:
print ('Training data set size: {}'.format(len(dataset_train_processed['features'])))
print ('Test data set size: {}'.format(len(dataset_test_processed['features'])))  

print ('Before Multi-Encoding eature shape: {}, label: {}'.format(dataset_train_processed['features'][0].shape, dataset_train_processed['labels_orig'][0]))

# Multi-label encoding
mlb = MultiLabelBinarizer()
mlb_fit = mlb.fit(dataset_train_processed['labels_orig'] + dataset_test_processed['labels_orig'])
                  
dataset_train_processed['labels'] = mlb.transform(dataset_train_processed['labels_orig'])
dataset_test_processed['labels'] = mlb.transform(dataset_test_processed['labels_orig'])
                  
print ('After Multi-Encoding eature shape: {}, label: {}'.format(dataset_train_processed['features'][0].shape, dataset_train_processed['labels'][0]))
print('MLB classes size {}, classes: {}'.format(len(mlb.classes_), mlb.classes_))

Training data set size: 5000
Test data set size: 1000
Before Multi-Encoding eature shape: (256, 387, 3), label: ('/m/04bcr3', '/m/083vt', 'is')
After Multi-Encoding eature shape: (256, 387, 3), label: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
MLB classes size 61, classes: ['/m/01226z' '/m/01599' '/m/01940j' '/m/0199g' '/m/019w40' '/m/01_5g'
 '/m/01bl7v' '/m/01mzpv' '/m/01s55n' '/m/01y9k5' '/m/01yrx' '/m/026t6'
 '/m/02jvh9' '/m/02p5f1q' '/m/0342h' '/m/03bt1vf' '/m/03k3r' '/m/03m3pdh'
 '/m/03ssj5' '/m/04_sv' '/m/04bcr3' '/m/04ctx' '/m/04dr76w' '/m/04lbp'
 '/m/04yx4' '/m/050k8' '/m/0584n8' '/m/05_5p_0' '/m/05r5c' '/m/05r655'
 '/m/05z87' '/m/071p9' '/m/078jl' '/m/078n6m' '/m/07y_7' '/m/080hkjn'
 '/m/083vt' '/m/09tvcd' '/m/0bt9lr' '/m/0cmx8' '/m/0cvnqh' '/m/0dnr7'
 '/m/0dt3t' '/m/0dv5r' '/m/0dv9c' '/m/0h2r6' '/m/0h8my_4' '/m/0hg7b'
 '/m/0k4j' '/m/0l14j_' '/m/0pg52' '/m/0wdt60w' 'at' 'hits' 'holds'
 'inside_of

### Triplet Relation model.
We will use the trained Retinanet model which will output bounding boxes proposals and labels for them. We will add few more layers on top of the model and add add logit at the end.

In [25]:
from keras_retinanet.utils.model import freeze
def RelationshipModel(input_model, output_size):
    """
    Model is regular CNN without Pooling for some or all layers
    """
    # Taken from https://github.com/fizyr/keras-retinanet
    assert(all(output in input_model.output_names for output in ['regression', 'classification'])), \
        "Input is not a training model (no 'regression' and 'classification' outputs were found, outputs are: {}).".format(input_model.output_names)

    input_model.layers.pop()
    input_model = freeze(input_model)   
        
    model = Sequential()
    # Layer 1
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.5))
    # Layer 2
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    # Output Layer
    model.add(Dense(output_size, activation='sigmoid'))

    return keras.models.Model(inputs=input_model.inputs, outputs=model(input_model.output), name="triplet_relation")

### Loss function and Optimization

We use the **Focal** loss function and **Adam** optimizer. Batch size is set to 64 and number of default epochs are 32.
**We note that more investigation is need to find out a correct loss function and an accuracy measure to help SGD find the optimal weights.**

In [26]:
from keras_retinanet import losses

def train_model(model, x_train, y_train, learn_rate=0.001, epochs=32, batch_size=64, verbose=1):
    """
    Train the model.
    Using as loss function and Adam optimizer, default learning rate is .001
    """
    model.compile(loss=losses.focal, 
        optimizer=keras.optimizers.Adam(lr=learn_rate, decay=learn_rate / epochs), metrics=['accuracy'])
    return model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
        verbose=verbose, validation_split=0.2, shuffle=True)

In [27]:
# Get the model
print ('Retinanet model input: {}'.format(retinanet_model.input.shape))
model = RelationshipModel(retinanet_model, output_size=len(mlb.classes_))
model.summary()

# Train the model
epochs = 4
batch_size = 8
model_history = train_model(model, np.array(X_train), np.array(y_train), epochs=epochs, batch_size=batch_size)

Retinanet model input: (?, ?, ?, 3)


TypeError: int() argument must be a string, a bytes-like object or a number, not 'tuple'

### Save the model
Save the model that has been trained.

In [None]:
# Save the weights.
model.save_weights('model_weights.h5')

# Save the model architecture.
with open('model_architecture.json', 'w') as f:
    f.write(model.to_json())
    
# Save the MLB labels for later use.
with open("mlb_labels.p", "wb") as f:
    f.write(pickle.dumps(mlb))

In [None]:
# plot the training loss and accuracy
plt.style.use("ggplot")
plt.figure()
plt.plot(np.arange(0, epochs), model_history.history["loss"], label="train_loss")
plt.plot(np.arange(0, epochs), model_history.history["val_loss"], label="val_loss")
plt.plot(np.arange(0, epochs), model_history.history["acc"], label="train_acc")
plt.plot(np.arange(0, epochs), model_history.history["val_acc"], label="val_acc")
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="upper left")
plt.savefig("model_training_plot.png")

### Invalid accuracy score of test set.
Due to multi-label classification problem the accuracy is artificially high due the fact that many labels are correctly classified as False.

In [None]:
# evaluate the model
scores = model.evaluate(np.array(dataset_test_processed['features']), np.array(dataset_test_processed['labels']))
print('\n{}: {}'.format(model.metrics_names[1], scores[1]*100))

In [None]:
pred = model.predict(np.expand_dims(X_train[0], axis=0))
print('Pred : {}'.format((pred * 100).flatten().tolist()))
print('Truth: {}'.format(y_train[0].flatten().tolist()))

def check_pred(pred, ground):
    pred = pred.flatten().tolist()
    ground = ground.flatten().tolist()
    max_check = float('-inf')
    
    # Get the maximum in pred not corresponding to
    # categories set to true in ground.
    l = [] # Save the ground indices where True exists.
    for i in range(0, len(ground)):
        if ground[i] == 0:
            max_check = max(max_check, pred[i])
        else:
            l.append(i)
    
    # Check that all other elements that do not
    # correspond to prediction do not exceed the minumum above.
    count = 0
    for i in l:
        if max_check < pred[i]:
            count +=1
    
    # Did we get all, if not how many?.
    return count == len(l), count

print ('check_pred: {}'.format(check_pred(pred, y_train[0])))

### Test set error calculation
Iterate through every test set example get the prediction using the model and check with the available label.

In [None]:
def accuracy_calc_helper(feature_set, label_set):
    accurate_prediction_count_dict = {0: [], 1: [], 2: [], 3:[]}
    for i in range(0, len(feature_set)):
        pred = model.predict(np.expand_dims(feature_set[i], axis=0))
        success, success_count = check_pred(pred, label_set[i])
        if success:
            assert success_count == 3
        accurate_prediction_count_dict[success_count].append((i, pred))

    acc_percentages = {}
    for k, v in accurate_prediction_count_dict.items():
        acc_percentages[k] = str(round(len(v)*100.0/len(feature_set), 2))+'%'
        
    return accurate_prediction_count, acc_percentages

accurate_prediction_count, acc_percentages = accuracy_calc_helper(X_test, y_test)

print('Total Test samples: {}, Accuracy percentage map: {}'.format(len(X_test), acc_percentages))

### Train set error calculation
Let us check if network is actually working for the training set.
We showcase here the pitfall of loss function used in training which gives a wrong idea of accuracy at training time.

In [None]:
accurate_prediction_count, acc_percentages = accuracy_calc_helper(X_train, y_train)
print('Total Train samples: {}, Accuracy percentage map: {}'.format(len(X_train), acc_percentages))

### Summary

Things to do.

1. Investigate a better loss function and different network architectures. 
2. Incorporate advances in semantic segmentation networks by using the bounding box data. 
3. Investigate which pre trained network to use to help with object detection part of the network.
4. Investigate the feasibility of mining language data.