# Prepare

This notebook prepares all the data files required for the challenge, including (but not limited to) HDF5 batches and JPEG images.

In [1]:
# Input locations: the zip archive of training images and the tab-separated concepts file.
INPUT_PATH_IMAGES   = "/home/ubuntu/data/CaptionTraining2018.zip"
INPUT_PATH_CONCEPTS = "/home/ubuntu/data/ConceptDetectionTraining2018-Concepts.csv"

In [2]:
# Output directories: processed images go under OUTPUT_PATH_IMAGES + "jpeg/",
# pickled dictionaries go directly under OUTPUT_PATH_CONCEPTS.
OUTPUT_PATH_IMAGES   = "/home/ubuntu/temp/images/"
OUTPUT_PATH_CONCEPTS = "/home/ubuntu/temp/concepts/"

In [3]:
# BATCH_SIZE sizes the per-batch buffers in the commented-out HDF5 draft near the
# end of this notebook. SPLIT_SIZE is not referenced by any cell shown here --
# presumably the fraction of data held out for validation (TODO confirm).
BATCH_SIZE = 1000
SPLIT_SIZE = 0.10

In [4]:
# Target size, in pixels, that process_jpeg() resizes every image to.
IMAGE_WIDTH  = 224
IMAGE_HEIGHT = 224

In [5]:
# Matches one of the method keys accepted by normalize(): "+0_+1" or "-1_+1".
# NOTE(review): no cell shown here passes this constant anywhere yet.
VALUES_RANGE = "-1_+1"

In [6]:
# Placeholders; both are filled further down with the name -> index dictionaries
# read back from their pickle files.
UNIQUE_IMAGES   = None
UNIQUE_CONCEPTS = None

In [7]:
# Placeholder; filled further down with the image -> concept-indices dictionary
# read back from its pickle file.
IMAGE_LABEL = None

In [8]:
# install packages.
# {sys.prefix} is interpolated by IPython so that conda installs tqdm into the
# prefix of the interpreter running this kernel.
import sys
!conda install --yes --prefix {sys.prefix} tqdm

Solving environment: done


  current version: 4.4.8
  latest version: 4.4.10

Please update conda by running

    $ conda update -n base conda



## Package Plan ##

  environment location: /usr

  added / updated specs: 
    - tqdm


The following NEW packages will be INSTALLED:

    ca-certificates: 2017.08.26-h1d4fec5_0
    certifi:         2018.1.18-py36_0     
    libedit:         3.1-heed3624_0       
    libffi:          3.2.1-hd88cf55_4     
    libgcc-ng:       7.2.0-h7cc24e2_2     
    libstdcxx-ng:    7.2.0-h7a57d05_2     
    ncurses:         6.0-h9df7e31_2       
    openssl:         1.0.2n-hb7f436b_0    
    pip:             9.0.1-py36h6c6f9ce_4 
    python:          3.6.4-hc3d631a_1     
    readline:        7.0-ha6073c6_4       
    setuptools:      38.5.1-py36_0        
    sqlite:          3.22.0-h1bed415_0    
    tk:              8.6.7-hc745277_3     
    tqdm:            4.19.4-py36ha5a5176_0
    wheel:           0.30.0-py36hfd4bba0_1
    xz:              5.2.3-h5

In [9]:
# import libraries.
import zipfile
import csv
import pickle
import numpy
import cv2
import random
import h5py
import tqdm

In [10]:
# open the image archive.
def open_images(path):
    """Open the zip archive at `path` and derive how its members are named.

    Args:
        path: location of the zip archive (anything zipfile.ZipFile accepts).

    Returns:
        A tuple ``(archive, prefix, suffix)`` where ``archive`` is the open
        ZipFile, ``prefix`` is the name of its first entry and ``suffix`` is
        "." followed by the text after the last "." in its second entry's name.

    Raises:
        IndexError: if the archive holds fewer than two entries.
    """
    archive = zipfile.ZipFile(path)
    first_entry, second_entry = archive.filelist[0], archive.filelist[1]
    extension = second_entry.filename.rsplit('.', 1)[-1]
    return (archive, first_entry.filename, '.' + extension)

In [11]:
# open the archive once; `images`, `prefix` and `suffix` are read as globals by
# load() and process_jpeg() further down.
images, prefix, suffix = open_images(INPUT_PATH_IMAGES)

In [12]:
# read the tab-separated concepts file.
def open_concepts(path):
    """Parse the concepts file into a list of ``[image, concepts]`` rows.

    Every line of the file must hold exactly two tab-separated fields: an
    image name and a ';'-separated list of concepts.

    Args:
        path: location of the tab-separated file.

    Returns:
        A list with one ``[image, [concept, ...]]`` entry per line, in file order.

    Raises:
        ValueError: if a line does not consist of exactly two fields.
    """
    import sys

    # Python 3's csv.reader needs a text-mode file opened with newline='';
    # Python 2's needs binary mode, which is what this function always used.
    if sys.version_info[0] >= 3:
        csvfile = open(path, 'r', newline='')
    else:
        csvfile = open(path, 'rb')
    with csvfile:
        return [
            [image, captions.split(";")]
            for image, captions in csv.reader(csvfile, delimiter='\t')
        ]

In [13]:
# load every [image, [concept, ...]] row of the concepts file into memory.
concepts = open_concepts(INPUT_PATH_CONCEPTS)

In [14]:
# collect the distinct file names stored in the archive.
def find_unique_images(data):
    """Return the set of entry names in `data` that contain a '.'.

    Args:
        data: an object exposing ``filelist``, an iterable of entries that
            each have a ``filename`` attribute (e.g. a zipfile.ZipFile).

    Returns:
        A set of file names; entries whose name has no '.' are left out.
    """
    return {entry.filename for entry in data.filelist if '.' in entry.filename}

In [15]:
# distinct file names found in the archive.
# NOTE(review): `unique_images` is reassigned by the find_unique_concepts(...)
# cell below, so this value is only live until then.
unique_images = find_unique_images(images)

In [16]:
# collect the distinct image names and concepts of the concepts file.
def find_unique_concepts(data):
    """Return the distinct image names and the distinct concepts found in `data`.

    Args:
        data: iterable of ``(image, concepts)`` pairs, where ``concepts`` is
            an iterable of hashable values.

    Returns:
        A tuple ``(unique_images, unique_concepts)`` of two sets.
    """
    images_seen = set()
    concepts_seen = set()
    for name, row_concepts in data:
        images_seen.add(name)
        concepts_seen.update(row_concepts)
    return images_seen, concepts_seen

In [17]:
# distinct image names and distinct concepts, both taken from the concepts file.
unique_images, unique_concepts = find_unique_concepts(concepts)

In [18]:
# sanity check: the archive holds as many image files as the concepts file has
# distinct image names (this compares the counts only, not the names).
len(find_unique_images(images)) == len(find_unique_concepts(concepts)[0])

True

In [19]:
# number the items of a collection.
def zip_with_index(data):
    """Map every item of `data` to its position in iteration order.

    Args:
        data: iterable of hashable items.

    Returns:
        A dict ``{item: index}``; if an item occurs more than once, the index
        of its last occurrence is kept.
    """
    return {item: position for position, item in enumerate(data)}

In [20]:
# turn the set of image names into a name -> index dictionary.
# The indices follow the set's iteration order.
unique_images = zip_with_index(unique_images)

In [21]:
# turn the set of concepts into a concept -> index dictionary.
# The indices follow the set's iteration order.
unique_concepts = zip_with_index(unique_concepts)

In [22]:
# write an object to disk with pickle.
def save_pickle(data, path):
    """Pickle `data` into the file at `path`, replacing any existing file.

    Args:
        data: any picklable object.
        path: destination file name.
    """
    # the context manager flushes and closes the file even if dumping fails;
    # previously the handle was never closed explicitly.
    with open(path, "wb") as handle:
        pickle.dump(data, handle)

In [23]:
# persist the image name -> index dictionary.
save_pickle(unique_images, OUTPUT_PATH_CONCEPTS + "unique_images.pickle")

In [24]:
# persist the concept -> index dictionary.
save_pickle(unique_concepts, OUTPUT_PATH_CONCEPTS + "unique_concepts.pickle")

In [25]:
# read an object back from a pickle file.
def load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    Args:
        path: name of a file previously written with pickle.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load files that
    # this notebook (or another trusted source) produced.
    # the context manager closes the file; previously the handle was leaked.
    with open(path, "rb") as handle:
        return pickle.load(handle)

In [26]:
# read the image name -> index dictionary back from disk.
UNIQUE_IMAGES = load_pickle(OUTPUT_PATH_CONCEPTS + "unique_images.pickle")

In [27]:
# read the concept -> index dictionary back from disk.
UNIQUE_CONCEPTS = load_pickle(OUTPUT_PATH_CONCEPTS + "unique_concepts.pickle")

In [28]:
# translate the concepts of every image into their indices.
def generate_labels(data, indexies):
    """Build a dictionary from image name to the indices of its concepts.

    Args:
        data: iterable of ``(image, concepts)`` pairs.
        indexies: mapping from concept to integer index.

    Returns:
        A dict ``{image: [index, ...]}`` with the indices in the order the
        concepts are listed; a repeated image keeps its last row.

    Raises:
        KeyError: if a concept is missing from `indexies`.
    """
    return {
        image: [indexies[concept] for concept in concepts]
        for image, concepts in data
    }

In [29]:
# map every image of the concepts file to the list of indices of its concepts.
image_label = generate_labels(concepts, UNIQUE_CONCEPTS)

In [30]:
# persist the image -> concept-indices dictionary.
save_pickle(image_label, OUTPUT_PATH_CONCEPTS + "image_label.pickle")

In [31]:
# read the image -> concept-indices dictionary back from disk.
IMAGE_LABEL = load_pickle(OUTPUT_PATH_CONCEPTS + "image_label.pickle")

In [32]:
# expand a list of indices into a 0/1 vector.
def sparse_to_dense(sparse, length):
    """Return a vector of `length` zeros with a 1 at every index in `sparse`.

    Args:
        sparse: iterable of integer positions to switch on.
        length: size of the resulting vector.

    Returns:
        A numpy array created by ``numpy.zeros(length)`` with the listed
        positions set to 1.
    """
    dense = numpy.zeros(length)
    positions = numpy.asarray(list(sparse), dtype=numpy.intp)
    dense[positions] = 1
    return dense

In [33]:
# shuffle the keys of a dictionary.
def shuffle_a_dictionary(data):
    """Return the keys of `data` as a list in random order.

    Args:
        data: a dictionary; it is not modified.

    Returns:
        A new list holding every key of `data` exactly once, shuffled with
        the `random` module.
    """
    # on Python 3 dict.keys() is a view that random.shuffle cannot reorder in
    # place, so copy the keys into a list first.
    keys = list(data.keys())
    random.shuffle(keys)
    return keys

In [34]:
# decode one image straight out of the archive.
def load(path):
    """Read the archive member `path` and decode it as a colour image.

    The member is read from the module-level `images` archive.

    Args:
        path: name of the member inside the archive.

    Returns:
        Whatever ``cv2.imdecode`` yields for the member's bytes with the
        ``cv2.IMREAD_COLOR`` flag.
    """
    raw_bytes = images.read(path)
    encoded = numpy.frombuffer(raw_bytes, numpy.uint8)
    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)

In [35]:
# scale an image to a fixed size.
def resize(image, width, height):
    """Resize `image` to `width` x `height` using bicubic interpolation.

    Args:
        image: image array accepted by ``cv2.resize``.
        width: target width.
        height: target height.

    Returns:
        The resized image returned by ``cv2.resize``.
    """
    target_size = (width, height)
    return cv2.resize(image, target_size, interpolation=cv2.INTER_CUBIC)

In [36]:
# equalize the histogram of each channel.
def equalize(image):
    """Apply ``cv2.equalizeHist`` to the first three channels of `image`, in place.

    Args:
        image: array indexed as ``image[:, :, channel]``; it is modified.

    Returns:
        The same `image` object, after the update.
    """
    for index in (0, 1, 2):
        equalized = cv2.equalizeHist(image[:, :, index])
        image[:, :, index] = equalized
    return image

In [37]:
# rescale pixel values into a fixed range.
def normalize(image, method):
    """Rescale the first three channels of `image` from 0..255 into a fixed range.

    Args:
        image: array indexed as ``image[:, :, channel]``. Floating-point
            arrays are updated in place; arrays of any other dtype are first
            copied to float32, because writing fractional or negative results
            back into an integer array would truncate or wrap them.
        method: "+0_+1" to map into [0, 1] or "-1_+1" to map into [-1, 1].
            Any other value leaves the image untouched.

    Returns:
        The rescaled floating-point array, or `image` itself, unchanged, when
        `method` is not recognised.
    """
    # pick the scale and offset once instead of re-testing `method` per channel.
    if method == "+0_+1":
        scale, shift = 255.0 / 1.0, 0.0
    elif method == "-1_+1":
        scale, shift = 255.0 / 2.0, 1.0
    else:
        return image
    if not numpy.issubdtype(image.dtype, numpy.floating):
        image = image.astype(numpy.float32)
    # only the first three channels are rescaled, as before.
    image[:, :, :3] = image[:, :, :3] / scale - shift
    return image

In [38]:
# move the channel axis to the front.
def transpose(image):
    """Reorder the axes of a 3-D `image` from (0, 1, 2) to (2, 0, 1).

    Args:
        image: a three-dimensional array.

    Returns:
        The result of ``numpy.transpose`` with the last axis moved first.
    """
    channels_first = (2, 0, 1)
    return numpy.transpose(image, axes=channels_first)

In [39]:
# write an image to disk.
def save_jpeg(path, image):
    """Write `image` to `path` with ``cv2.imwrite``.

    The parent directory of `path` is created when it does not exist yet.

    Args:
        path: destination file name; its extension selects the format.
        image: image array accepted by ``cv2.imwrite``.

    Raises:
        IOError: if ``cv2.imwrite`` reports that nothing was written.
    """
    import os

    directory = os.path.dirname(path)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    # cv2.imwrite signals failure through its return value rather than an
    # exception; surface it instead of silently producing no file.
    if not cv2.imwrite(path, image):
        raise IOError("could not write image to " + path)

In [40]:
# compute 1-based bounds over a collection.
def bound_jpeg(images):
    """Return ``(1, len(images) + 1)``.

    Args:
        images: any sized collection.

    Returns:
        A tuple ``(start, stop)`` with ``start`` equal to 1 and ``stop`` one
        more than the number of items.
    """
    return 1, len(images) + 1

In [41]:
# resize every image and save the result.
def process_jpeg(images):
    """Resize every image named by the keys of `images` and write it to disk.

    For each key, the archive member ``prefix + key + suffix`` is loaded,
    resized to IMAGE_WIDTH x IMAGE_HEIGHT and saved as
    ``OUTPUT_PATH_IMAGES + "jpeg/" + key + suffix``. Keys are visited in
    random order and progress is shown with a tqdm notebook bar.

    Args:
        images: dictionary whose keys are image names without prefix/suffix.
            This parameter shadows the module-level `images` archive inside
            this function only; load() still reads from the archive.
    """
    # the start/stop bounds that used to be computed here were never used.
    for image in tqdm.tqdm_notebook(shuffle_a_dictionary(images)):
        source = prefix + image + suffix
        target = OUTPUT_PATH_IMAGES + "jpeg/" + image + suffix
        save_jpeg(target, resize(load(source), IMAGE_WIDTH, IMAGE_HEIGHT))

In [42]:
# resize and save every image that has labels (the keys of IMAGE_LABEL).
process_jpeg(IMAGE_LABEL)




In [43]:
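# TODO: unfinished draft of the HDF5 batch writer, kept for reference only
# (not runnable as written: the inner `for image in` statement is incomplete).
# for batch in tqdm.tqdm_notebook(range(start, stop)):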
# # ...
# def save_hdf5(path, images):
#     for image in 
#     with h5py.File(path , 'w') as file:
#         file.create_dataset('name', data=batch_name[:i])
#         file.create_dataset('label', data=batch_label[:i])
#         file.create_dataset('data', data=batch_data[:i])
#         batch_name = BATCH_SIZE * [None]
#         batch_label = numpy.zeros((BATCH_SIZE, len(unique_concepts_data)))
#         batch_data = numpy.zeros((BATCH_SIZE, IMAGE_WIDTH, IMAGE_HEIGHT, 3))
#         path = OUTPUT_PATH_IMAGES + "batch" + '{:0{width}}'.format(part, width=length) + ".hdf5"
#    #batch_data[i, :, :, :] = 
#    #batch_label[i, :] = sparse_to_dense(images[image], len(unique_concepts_data))
#    #batch_name[i] = image
#    #i += 1
#    #images.pop(image, None)

In [44]:
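# # TODO: unused draft -- number of batches needed to cover all images.
# # NOTE(review): `/` below presumably assumes integer division; confirm before enabling.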
# def compute_process_boundaries(unique_images, batch_size):
#     start = 1
#     stop = (len(unique_images) / batch_size if len(unique_images) % batch_size == 0 else len(unique_images) / batch_size + 1) + 1
#     length = len(str(stop))
#     return start, stop, length