In [128]:
import os
import pickle
import numpy as np
from PIL import Image
from sklearn.cross_validation import train_test_split
from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt
%matplotlib inline

In [129]:
# Geometry of every preprocessed sample: process_image resizes each crop to
# (img_width, img_height) and create_arrays allocates buffers of that size.
img_height = 32
img_width  = 32
# NOTE(review): img_depth and pixel_depth are not referenced by the code
# visible in this notebook; presumably kept for downstream use -- confirm.
img_depth  = 3
pixel_depth = 255

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load a 'dataset.pickle' that was produced by a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('dataset.pickle', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)
train_dataset = dataset['train']
test_dataset  = dataset['test']
extra_dataset = dataset['extra']

In [280]:
def image_number_edges(tops, heights, widths, lefts):
    """Return the smallest box that encloses every per-digit box of an image.

    Each argument is a sequence with one entry per digit box: the box's top
    coordinate, its height, its width and its left coordinate.

    Returns:
        (x1, y1, x2, y2) as Python ints, where (x1, y1) is the minimum
        left/top and (x2, y2) is the maximum of left+width / top+height.
        Values are truncated by int().
    """
    right_sides = np.add(lefts, widths)
    bottom_sides = np.add(tops, heights)
    corners = (
        np.amin(lefts),
        np.amin(tops),
        np.amax(right_sides),
        np.amax(bottom_sides),
    )
    return tuple(int(coordinate) for coordinate in corners)

def augment_image_borders(edges, image_w, image_h):
    """Grow a bounding box by a margin and clamp it to the image area.

    The box is extended by 10% of its own width/height towards the left/top
    and reaches 120% of its width/height measured from the original left/top
    corner (i.e. 20% past the original right/bottom edge).

    Args:
        edges: (x1, y1, x2, y2) bounding box.
        image_w: image width; upper bound for the right edge.
        image_h: image height; upper bound for the bottom edge.

    Returns:
        (left, top, right, bottom) as Python ints, all inside the image.
    """
    x1, y1, x2, y2 = edges

    num_width = x2 - x1
    num_height = y2 - y1

    # Clamp at 0 so a box close to the top-left corner cannot yield negative
    # crop coordinates, mirroring the clamp applied to right/bottom below.
    left = max(int(np.floor(x1 - .1 * num_width)), 0)
    top = max(int(np.floor(y1 - .1 * num_height)), 0)
    right = min(int(np.ceil(x1 + 1.2 * num_width)), int(image_w))
    bottom = min(int(np.ceil(y1 + 1.2 * num_height)), int(image_h))

    return (left, top, right, bottom)

def process_image(image, edges):
    """Crop an image around its digits and return standardized grayscale data.

    Args:
        image: PIL image; its size is read from image.size as (width, height).
        edges: (x1, y1, x2, y2) bounding box of the digits, which is widened
            by augment_image_borders before cropping.

    Returns:
        The result of standarize_image_data applied to the crop after it has
        been resized to (img_width, img_height) and converted to mode 'L'.
    """
    crop_box = augment_image_borders(edges, image.size[0], image.size[1])
    cropped = image.crop(crop_box)
    # Image.ANTIALIAS was only a legacy alias of LANCZOS and was removed in
    # Pillow 10; LANCZOS selects the same resampling filter.
    resized = cropped.resize((img_width, img_height), Image.LANCZOS)
    data = np.asarray(resized.convert('L'))
    return standarize_image_data(data)

def standarize_image_data(data):
    """Center pixel data on its mean and scale it by its standard deviation.

    The mean and the sample standard deviation (ddof=1) are computed over the
    whole array in float32.

    Returns:
        (data - mean) / std as a new array; the input is left untouched.
    """
    center = np.mean(data, dtype='float32')
    spread = np.std(data, dtype='float32', ddof=1)
    # A (nearly) constant image has a spread of ~0; divide by 1 instead so the
    # result is simply the centered data rather than inf/nan.
    scale = 1. if spread < 1e-4 else spread
    return (data - center) / scale

def load_image(folder, filename):
    """Open the image file named ``filename`` located in ``folder``.

    Returns:
        Whatever Image.open returns for the path "<folder>/<filename>".
    """
    image_path = "%s/%s" % (folder, filename)
    return Image.open(image_path)

def create_arrays(num_samples):
    """Allocate the input and label buffers for ``num_samples`` samples.

    Returns:
        (input_dataset, label_dataset): a float32 array of shape
        (num_samples, img_height, img_width) and an int32 array of shape
        (num_samples, 6). Both come straight from the np.ndarray constructor,
        so their contents are uninitialized until the caller fills them.
    """
    image_shape = (num_samples, img_height, img_width)
    # NOTE(review): 6 presumably matches the length of each label row in the
    # source dataset -- confirm against the pickle's schema.
    label_shape = (num_samples, 6)
    return (np.ndarray(image_shape, dtype=np.float32),
            np.ndarray(label_shape, dtype=np.int32))

## Dataset generation

In [281]:
# Samples whose digit count is listed here are left out of the generated arrays.
skip_digits = [0,6]
def generate_input_output(dataset, images_folder):
    """Build the preprocessed image array and label array for one split.

    Args:
        dataset: mapping with the keys 'images', 'labels', 'tops', 'heights',
            'lefts' and 'widths', each indexable by sample position. A label
            row is indexed with a boolean mask, so rows are assumed to be
            numpy arrays.
        images_folder: folder the image files named in dataset['images'] are
            loaded from.

    Returns:
        (inputs, outputs) trimmed to the samples that were kept: the processed
        images and their label rows, in dataset order.
    """
    images = dataset['images']
    labels = dataset['labels']
    tops = dataset['tops']
    heights = dataset['heights']
    lefts = dataset['lefts']
    widths = dataset['widths']

    dataset_size = len(images)
    inputs, outputs = create_arrays(dataset_size)
    images_count = 0

    for i in range(dataset_size):
        if i%5000 == 0: print(i, "elapsed out of ", dataset_size, "for: ", images_folder)

        # Label entries equal to 10 are not counted as digits.
        d_count = labels[i][labels[i] != 10].shape[0]
        # Decide whether to skip before touching the file system.
        if d_count in skip_digits: continue

        img = load_image(images_folder, images[i])
        # Keyword arguments on purpose: image_number_edges is declared as
        # (tops, heights, widths, lefts); passing lefts/widths positionally in
        # the opposite order made the left edge the minimum *width*.
        edges = image_number_edges(
            tops=tops[i][:d_count],
            heights=heights[i][:d_count],
            widths=widths[i][:d_count],
            lefts=lefts[i][:d_count])
        inputs[images_count] = process_image(img, edges)
        outputs[images_count] = labels[i]
        images_count += 1
    return inputs[:images_count], outputs[:images_count]

In [282]:
# Preprocess every split; images_folder is the folder the split's image files
# are read from.
train_data, train_labels = generate_input_output(dataset=train_dataset, images_folder='train')
test_data, test_labels = generate_input_output(dataset=test_dataset, images_folder='test')
extra_data, extra_labels = generate_input_output(dataset=extra_dataset, images_folder='extra')

(0, 'elapsed out of ', 33402, 'for: ', 'train')
(5000, 'elapsed out of ', 33402, 'for: ', 'train')
(10000, 'elapsed out of ', 33402, 'for: ', 'train')
(15000, 'elapsed out of ', 33402, 'for: ', 'train')
(20000, 'elapsed out of ', 33402, 'for: ', 'train')
(25000, 'elapsed out of ', 33402, 'for: ', 'train')
(30000, 'elapsed out of ', 33402, 'for: ', 'train')
(0, 'elapsed out of ', 13068, 'for: ', 'test')
(5000, 'elapsed out of ', 13068, 'for: ', 'test')
(10000, 'elapsed out of ', 13068, 'for: ', 'test')
(0, 'elapsed out of ', 202353, 'for: ', 'extra')
(5000, 'elapsed out of ', 202353, 'for: ', 'extra')
(10000, 'elapsed out of ', 202353, 'for: ', 'extra')
(15000, 'elapsed out of ', 202353, 'for: ', 'extra')
(20000, 'elapsed out of ', 202353, 'for: ', 'extra')
(25000, 'elapsed out of ', 202353, 'for: ', 'extra')
(30000, 'elapsed out of ', 202353, 'for: ', 'extra')
(35000, 'elapsed out of ', 202353, 'for: ', 'extra')
(40000, 'elapsed out of ', 202353, 'for: ', 'extra')
(45000, 'elapsed out 

In [284]:
# Pool the 'train' and 'extra' samples along the sample axis so they can be
# split into training and validation sets together.
train_extra_data = np.concatenate((train_data, extra_data), axis=0)
train_extra_labels = np.concatenate((train_labels, extra_labels), axis=0)

print(train_extra_data.shape)
print(train_extra_labels.shape)

(235754, 32, 32)
(235754, 6)


In [285]:
# Fixed seed so the train/validation partition is identical on every run;
# without it train_test_split shuffles differently each time.
split_seed = 42
train_data_final, valid_data_final, train_labels_final, valid_labels_final = train_test_split(
    train_extra_data, train_extra_labels, train_size=230000, random_state=split_seed)

# The test split is used as generated, without further partitioning.
test_data_final, test_labels_final = test_data, test_labels

### Serialization

In [287]:
def maybe_pickle(filename, force=False):
    """Write the final train/test/valid arrays to ``<filename>.pickle``.

    The data comes from the module-level *_data_final / *_labels_final
    variables. Nothing is written when the file already exists, unless
    ``force`` is true. A failure while writing is reported with print()
    instead of being raised.

    Args:
        filename: output path without the '.pickle' extension.
        force: overwrite an existing file when true.
    """
    target = filename + '.pickle'
    if not force and os.path.exists(target):
        print('%s already present - Skipping pickling.' % filename)
        return

    print('Pickling %s.pickle' % filename)
    splits = (
        ('train', train_data_final, train_labels_final),
        ('test', test_data_final, test_labels_final),
        ('valid', valid_data_final, valid_labels_final),
    )
    dataset = {
        split_name: {'data': split_data, 'labels': split_labels}
        for split_name, split_data, split_labels in splits
    }
    try:
        with open(target, 'wb') as f:
            pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        print('Unable to save data to %s.pickle' % filename, ':', e)

In [288]:
# Persist the final splits to svhn_grayscale.pickle (skipped if it already exists).
maybe_pickle(filename='svhn_grayscale')

Pickling svhn_grayscale.pickle
