In [299]:
import os
import pickle
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
# Configure the matplotlib backend to render plots inline in IPython
%matplotlib inline

In [300]:
# Dimensions of every processed sample.
img_height = 28
img_width  = 28
img_depth  = 1
# Number of intensity levels per channel; process_image uses it to centre and
# scale pixel values. Kept as a float so `pixel_depth / 2` is not truncated.
# NOTE(review): assumes 8-bit image channels -- confirm against the source images.
pixel_depth = 255.0

# NOTE: unpickling can execute arbitrary code; only load a dataset.pickle that
# was produced locally by a trusted notebook.
# The `with` block closes the file handle as soon as the load finishes.
with open('dataset.pickle', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)
train_dataset = dataset['train']
test_dataset  = dataset['test']
valid_dataset = dataset['valid']
extra_dataset = dataset['extra']

In [301]:
def image_number_edges(tops, heights, widths, lefts):
    """Return the smallest box that encloses all of the given boxes.

    Args:
        tops: Top (y) coordinate of each box.
        heights: Height of each box.
        widths: Width of each box.
        lefts: Left (x) coordinate of each box.

    Returns:
        Tuple ``(x1, y1, x2, y2)`` of Python ints: minimum left, minimum top,
        maximum ``left + width`` and maximum ``top + height``. Values are
        truncated with ``int``.
    """
    rights = np.add(lefts, widths)
    bottoms = np.add(tops, heights)
    corners = (np.amin(lefts), np.amin(tops), np.amax(rights), np.amax(bottoms))
    return tuple(int(corner) for corner in corners)

def process_image(image, edges):
    """Crop, resize and normalise one image.

    Args:
        image: PIL image to process.
        edges: Crop box handed unchanged to ``image.crop``.

    Returns:
        Array obtained by averaging over axis 2 the pixel values after
        subtracting ``pixel_depth / 2`` and dividing by ``pixel_depth``.
        Assumes the image has a channel axis (e.g. RGB) -- a single-channel
        image has no axis 2.
    """
    target_size = (img_width, img_height)
    cropped = image.crop(edges)
    resized = cropped.resize(target_size)
    pixels = np.asarray(resized).astype(float)
    normalised = (pixels - pixel_depth / 2) / pixel_depth
    # Collapse the channel axis into a single intensity per pixel.
    return np.average(normalised, axis=2)

def load_image(folder, filename):
    """Open ``filename`` inside ``folder`` with PIL and return the image object."""
    path = "%s/%s" % (folder, filename)
    return Image.open(path)

def create_arrays(num_samples):
    """Allocate the output buffers for ``num_samples`` samples.

    Both arrays are uninitialised; callers are expected to fill every row.

    Returns:
        ``(input_dataset, label_dataset)`` where ``input_dataset`` is float32
        with shape ``(num_samples, img_height, img_width)`` and
        ``label_dataset`` is int32 with shape ``(num_samples, 6)``.
    """
    image_shape = (num_samples, img_height, img_width)
    label_shape = (num_samples, 6)
    return (
        np.empty(image_shape, dtype=np.float32),
        np.empty(label_shape, dtype=np.int32),
    )

## Dataset generation

In [343]:
def generate_input_output(dataset, images_folder):
    """Convert one dataset split into a pixel array and a label array.

    Args:
        dataset: Mapping with the keys 'images', 'labels', 'tops', 'heights',
            'lefts' and 'widths'; each value is indexed per sample. The
            per-sample label and box entries must support NumPy-style
            boolean indexing and slicing.
        images_folder: Folder the image files named in ``dataset['images']``
            are loaded from.

    Returns:
        ``(inputs, outputs)``: the arrays allocated by ``create_arrays``, with
        row ``i`` holding the processed image and the labels of sample ``i``.
    """
    images = dataset['images']
    labels = dataset['labels']
    tops = dataset['tops']
    heights = dataset['heights']
    lefts = dataset['lefts']
    widths = dataset['widths']

    dataset_size = len(images)
    inputs, outputs = create_arrays(dataset_size)

    for i in range(dataset_size):
        if i % 5000 == 0:
            print(i, "elapsed out of ", dataset_size, "for: ", images_folder)
        img = load_image(images_folder, images[i])
        # Only the first d_count boxes are used: one per non-zero label entry.
        d_count = labels[i][labels[i] != 0].shape[0]
        # Keyword arguments keep each coordinate list bound to the matching
        # parameter; image_number_edges takes widths *before* lefts, so a
        # positional (tops, heights, lefts, widths) call silently swaps them.
        edges = image_number_edges(
            tops=tops[i][:d_count],
            heights=heights[i][:d_count],
            widths=widths[i][:d_count],
            lefts=lefts[i][:d_count],
        )
        inputs[i] = process_image(img, edges)
        outputs[i] = labels[i]
    return inputs, outputs

In [344]:
# Build the pixel and label arrays for every split; the second argument is the
# folder the split's image files are read from.
train_data, train_labels = generate_input_output(dataset=train_dataset, images_folder='train')
test_data, test_labels = generate_input_output(dataset=test_dataset, images_folder='test')
# NOTE(review): the validation split is read from the 'extra' folder -- confirm
# that this is where its images live.
valid_data, valid_labels = generate_input_output(dataset=valid_dataset, images_folder='extra')
extra_data, extra_labels = generate_input_output(dataset=extra_dataset, images_folder='extra')

(0, 'elapsed out of ', 33402, 'for: ', 'train')
(5000, 'elapsed out of ', 33402, 'for: ', 'train')
(10000, 'elapsed out of ', 33402, 'for: ', 'train')
(15000, 'elapsed out of ', 33402, 'for: ', 'train')
(20000, 'elapsed out of ', 33402, 'for: ', 'train')
(25000, 'elapsed out of ', 33402, 'for: ', 'train')
(30000, 'elapsed out of ', 33402, 'for: ', 'train')
(0, 'elapsed out of ', 13068, 'for: ', 'test')
(5000, 'elapsed out of ', 13068, 'for: ', 'test')
(10000, 'elapsed out of ', 13068, 'for: ', 'test')
(0, 'elapsed out of ', 2000, 'for: ', 'extra')
(0, 'elapsed out of ', 125000, 'for: ', 'extra')
(5000, 'elapsed out of ', 125000, 'for: ', 'extra')
(10000, 'elapsed out of ', 125000, 'for: ', 'extra')
(15000, 'elapsed out of ', 125000, 'for: ', 'extra')
(20000, 'elapsed out of ', 125000, 'for: ', 'extra')
(25000, 'elapsed out of ', 125000, 'for: ', 'extra')
(30000, 'elapsed out of ', 125000, 'for: ', 'extra')
(35000, 'elapsed out of ', 125000, 'for: ', 'extra')
(40000, 'elapsed out of ', 

### Serialization

In [346]:
def maybe_pickle(filename, force=False):
    """Write the processed splits to ``<filename>.pickle`` unless it already exists.

    Reads the module-level ``train_*``, ``test_*``, ``extra_*`` and ``valid_*``
    data/label arrays and stores them in one dict keyed by split name.

    Args:
        filename: Output path without the ``.pickle`` extension.
        force: When true, overwrite an existing file instead of skipping.

    Failures while writing are reported with ``print`` rather than raised.
    """
    target = filename + '.pickle'
    if os.path.exists(target) and not force:
        print('%s already present - Skipping pickling.' % filename)
        return

    print('Pickling %s.pickle' % filename)
    payload = {
        'train': {'data': train_data, 'labels': train_labels},
        'test':  {'data': test_data, 'labels': test_labels},
        'extra': {'data': extra_data, 'labels': extra_labels},
        'valid': {'data': valid_data, 'labels': valid_labels}
    }
    opened = False
    try:
        with open(target, 'wb') as f:
            opened = True
            pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        # Single formatted string so the message reads the same on Python 2 and 3.
        print('Unable to save data to %s.pickle: %s' % (filename, e))
        # A dump that fails midway leaves a truncated file behind. Remove it,
        # otherwise the next call would find it and skip pickling. Only do so
        # when this call actually opened (and therefore truncated) the file.
        if opened and os.path.exists(target):
            try:
                os.remove(target)
            except OSError as cleanup_error:
                print('Unable to remove partial file %s.pickle: %s' % (filename, cleanup_error))

In [347]:
# Write all splits to svhn_tf.pickle; skipped when that file already exists.
maybe_pickle('svhn_tf')

Pickling svhn_tf.pickle
