In [1]:
import os
import pickle
import numpy as np
from PIL import Image 
from keras.utils import to_categorical
from numpy import array, argmax
import random
import h5py
import cv2
import gzip
import datetime

Using TensorFlow backend.


In this Jupyter notebook, we will generate datasets for the digit classifier from the processed SVHN dataset. We are also going to generate a second dataset for a model that classifies how many digits the number in a given image has. For the second dataset, besides the SVHN dataset, we will use other datasets that don't contain any digits, because we want our final approach to recognize such images as well. 

## 1 )  SVHN dataset preparation for classification model
For further data preparation we use the data preprocessed in the [1__Preprocess.ipynb](1__Preprocess.ipynb) Jupyter notebook.

To generate the training images we apply the same image preprocessing as [Ian J. Goodfellow](https://arxiv.org/abs/1312.6082). Firstly, we find the smallest rectangular bounding box that contains all individual character bounding boxes, then expand it by 30 % in all directions. Secondly, we crop the image to that bounding box and resize the crop to 64 × 64 pixels. Lastly, we crop a 54 × 54 pixel image from a random location within the 64 × 64 pixel image.<br>
Also, we are going to normalize image by the equation: <br>
<br>
$ \normalsize image_{norm} = \frac{image - 127.5} {255} $

We will augment the data by repeating this process two times for every image, taking a different random crop location in each repetition.
For the validation and test datasets, we just crop the areas surrounded by the bounding boxes and resize them to 54 × 54 pixels. 

In [2]:
def getLenNumber(boxes):
    """Return the number of digit boxes, capped at 4."""
    return min(len(boxes), 4)
    
def generate_dataset(dataset, test = False, image_size = 64, length = False, grey = False):
    """Turn SVHN annotation records into normalized image and one-hot label arrays.

    Parameters
    ----------
    dataset : sequence of dict
        Each record provides ``'filename'`` (image path) and ``'boxes'``, a
        list of dicts with ``'top'``, ``'left'``, ``'height'``, ``'width'``
        and ``'label'`` for every digit.
    test : bool
        Only the exact value ``True`` selects the validation/test
        preprocessing (tight crop around the whole number). Any other value
        selects the training preprocessing (box expanded by 30% on each side,
        clipped to the image borders). The identity check is kept on purpose
        so that existing calls passing another value in this position keep
        behaving as before.
    image_size : int
        Side length, in pixels, of the square output images.
    length : bool
        ``False``: labels encode the digits, shape ``(n, 3, 11)``, where
        class 10 means "blank"; records with more than 3 digits are skipped.
        ``True``: labels encode the digit count capped at 4, shape ``(n, 5)``.
    grey : bool
        Produce single-channel (luminance) images instead of RGB ones.

    Returns
    -------
    (images, labels)
        ``images`` is float32 with values ``(pixel - 127.5) / 255``. Both
        arrays hold one row per processed record only; skipped records do
        not leave uninitialised rows behind.
    """
    num_channels = 1 if grey else 3
    pixel_depth = 255.0  # Number of levels per pixel.
    images = np.ndarray(shape=(len(dataset), image_size, image_size, num_channels),
                        dtype=np.float32)
    # Only the label array that is actually returned gets allocated.
    label_shape = (5,) if length else (3, 11)
    labels = np.ndarray(shape=(len(dataset),) + label_shape)
    num_image = 0

    for data in dataset:
        boxes = data["boxes"]

        # The digit classifier handles at most 3 digits; skip longer numbers.
        if len(boxes) > 3 and not length:
            continue

        # Edges of every digit bounding box
        top = [box['top'] for box in boxes]
        left = [box['left'] for box in boxes]
        bottom = [box['top'] + box['height'] for box in boxes]
        right = [box['left'] + box['width'] for box in boxes]

        # Smallest box containing every digit box. The far edges come from
        # the largest bottom/right values, which need not belong to the box
        # with the largest top/left.
        num_top = np.amin(top)
        num_left = np.amin(left)
        num_height = np.amax(bottom) - num_top
        num_width = np.amax(right) - num_left

        # The context manager closes the image file once the pixels are read.
        with Image.open(data['filename']) as source_image:
            if test is True:
                crop_box = (num_left, num_top, num_left + num_width, num_top + num_height)
            else:
                # Expand the box by 30% on every side, clipped to the image.
                source_width, source_height = source_image.size
                num_bottom = np.amin([np.ceil(num_top + 1.3 * num_height), source_height])
                num_right = np.amin([np.ceil(num_left + 1.3 * num_width), source_width])
                num_top = np.amax([np.floor(num_top - 0.3 * num_height), 0])
                num_left = np.amax([np.floor(num_left - 0.3 * num_width), 0])
                crop_box = (int(num_left), int(num_top), int(num_right), int(num_bottom))

            image_data = source_image.crop(crop_box) \
                                     .convert('L' if grey else 'RGB') \
                                     .resize((image_size, image_size))
            pixels = np.array(image_data)

        # Normalize; the reshape adds the channel axis for greyscale images.
        normalized_image = (pixels - pixel_depth / 2) / pixel_depth
        images[num_image] = normalized_image.reshape(image_size, image_size, num_channels)

        if length:
            labels[num_image] = to_categorical(getLenNumber(boxes), num_classes=5)
        else:
            # SVHN stores the digit 0 as label 10; class 10 is reused below
            # as the "blank" category that pads numbers shorter than 3 digits.
            label = [0.0 if box["label"] == 10 else box["label"] for box in boxes]
            label.extend([10.0] * (3 - len(label)))
            labels[num_image] = to_categorical(label, num_classes=11)
        num_image += 1

    # Drop the rows reserved for records that were skipped.
    return images[:num_image], labels[:num_image]

In [3]:
def augment_dataset(data, labels, positions, image_size = 54, length = False):
    """Augment images by taking random ``image_size`` x ``image_size`` crops.

    Parameters
    ----------
    data : np.ndarray
        Images of shape ``(n, height, width, channels)``.
    labels : np.ndarray
        One label row per image; the trailing shape is copied to the output.
    positions : int
        Number of crops per image. When ``length`` is false, images whose
        second digit slot is the blank class (10) get twice as many crops.
    image_size : int
        Side length of the square crops.
    length : bool
        ``True`` when ``labels`` are digit-count labels, for which the
        blank-digit doubling does not apply.

    Returns
    -------
    (new_dataset, new_labels)
        float32 crops and their labels, in input order.

    Raises
    ------
    ValueError
        If the images are smaller than ``image_size``.
    """
    max_row_offset = data.shape[1] - image_size
    max_col_offset = data.shape[2] - image_size
    if max_row_offset < 0 or max_col_offset < 0:
        raise ValueError("images are smaller than the requested crop size")

    def _is_single_digit(index):
        # Value comparison: identity (`is`) on ints is an implementation detail.
        return (not length) and int(argmax(labels[index, 1])) == 10

    def _sample_offsets(max_offset, count):
        # Distinct offsets when enough exist, otherwise allow repeats.
        population = range(max_offset + 1)
        if count <= len(population):
            return random.sample(population, count)
        return [random.choice(population) for _ in range(count)]

    k = sum(1 for i in range(len(data)) if _is_single_digit(i))
    print(k)

    total = (len(data) + k) * positions
    new_dataset = np.ndarray(shape=(total, image_size, image_size, data.shape[-1]),
                             dtype=np.float32)
    new_labels = np.ndarray(shape=(total,) + tuple(labels.shape[1:]))
    num_image = 0

    for i in range(len(data)):
        pos = positions * 2 if _is_single_digit(i) else positions

        # Offsets span every position where the crop still fits in the image.
        width = _sample_offsets(max_col_offset, pos)
        heigth = _sample_offsets(max_row_offset, pos)

        for j in range(pos):
            new_dataset[num_image] = data[i,
                                          heigth[j]:heigth[j] + image_size,
                                          width[j]:width[j] + image_size,
                                          :]
            new_labels[num_image] = labels[i]
            num_image += 1

    return new_dataset, new_labels

Training dataset generation and augmentation.

In [4]:
def read_pickle_file(data):
    """Load and return the object stored in the pickle file at path ``data``.

    Unpickling can execute arbitrary code, so only use it on trusted files.
    """
    with open(data, 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [None]:
%%time
#load data
train_dataset = read_pickle_file(os.path.join("SVHN",'training.pickle'))
#generate and augment data
train_data, train_labels = generate_dataset(train_dataset, 64)
train_data, train_labels = augment_dataset(train_data, train_labels, 2)
del train_dataset

Augmented train dataset shuffling.

In [6]:
# Shuffle the train images and labels with one shared permutation
idx = np.random.permutation(train_data.shape[0])
train_data = np.take(train_data, idx, axis=0)
train_labels = np.take(train_labels, idx, axis=0)

Validation and testing datasets generation.

In [7]:
%%time
#load and prepare validation dataset
validate_dataset = read_pickle_file(os.path.join("SVHN",'validation.pickle'))
test_dataset = read_pickle_file(os.path.join("SVHN",'test.pickle'))

test_data, test_labels = generate_dataset(test_dataset, True, 54)
validate_data, validate_labels = generate_dataset(validate_dataset,True, 54)
del validate_dataset, test_dataset

Wall time: 1min 11s


Generated datasets saving in hdf5 format files.

In [8]:
# Mode 'w' truncates an existing file, so every dataset has to be written in
# this single pass; writing only a subset would drop the others.
with h5py.File('D:/classifier_1.hdf5', 'w') as f:
    f.create_dataset("validate", data=validate_data, compression="gzip")
    f.create_dataset("validate labels", data=validate_labels, compression="gzip")
    f.create_dataset("train", data=train_data, compression="gzip")
    f.create_dataset("train labels", data=train_labels, compression="gzip")
    f.create_dataset("test", data=test_data, compression="gzip")
    f.create_dataset("test labels", data=test_labels, compression="gzip")

## 2 ) SVHN  and other objects dataset preparation for classification model

As mentioned before, our number classification approach needs to recognize when the picture does not show a number at all. 
Therefore, we are going to combine the SVHN dataset with images that contain other objects instead of numbers, taken from the [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html), [Caltech 101](http://www.vision.caltech.edu/Image_Datasets/Caltech101/) and [Caltech 256](http://www.vision.caltech.edu/Image_Datasets/Caltech256/) datasets. The SVHN dataset will be generated the same way as in the first section; the only difference is that we include images of numbers with more than 3 digits.

## 2.1) SVHN dataset generation.

In [5]:
# Read pickle files
def read_pickle_file(data):
    """Unpickle and return the content of the file located at ``data``.

    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(data, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

# Set a new path of images.
def set_new_file_path(data,new_path):
    """Prefix each record's ``'filename'`` with ``new_path`` and return ``data``.

    The records are modified in place, so calling this twice on the same
    records prefixes the path twice.
    """
    for position in range(len(data)):
        record = data[position]
        record["filename"] = os.path.join(new_path, record['filename'])
    return data


In [6]:
# Read the annotations of every SVHN split and make the image file names
# relative to the working directory
extra_dir, train_dir, test_dir = (os.path.join("SVHN", split)
                                  for split in ("extra", "train", "test"))

extra_data = set_new_file_path(read_pickle_file(os.path.join(extra_dir, 'extra.pickle')), extra_dir)
train_data = set_new_file_path(read_pickle_file(os.path.join(train_dir, 'train.pickle')), train_dir)
test_data = set_new_file_path(read_pickle_file(os.path.join(test_dir, 'test.pickle')), test_dir)

# One pool of records; it is re-split into train/validation/test further below
dataset = extra_data + train_data + test_data
del extra_data, train_data, test_data

In [7]:
#Filtering data
def filter_data(data):
    """Return the indexes of the records that are usable for dataset generation.

    A record is dropped when
    * it has no boxes at all,
    * its first digit has label 10 (the SVHN encoding of the digit 0), or
    * any of its boxes has a negative ``top`` or ``left`` coordinate.

    Records are not filtered by their number of digits here.
    """
    ind = []

    for i, record in enumerate(data):
        boxes = record["boxes"]
        if len(boxes) == 0:
            continue
        if boxes[0]["label"] == 10:
            continue
        if any(box["top"] < 0 or box["left"] < 0 for box in boxes):
            continue
        ind.append(i)
    return ind

# Keep only the records whose indexes pass the filter
dataset_ind = filter_data(dataset)
dataset = np.take(np.array(dataset), dataset_ind, axis=0)

In [8]:
def get_indexes(data_length, validate_size, is_test_ind = False, test_size = 0):
    """Randomly split ``range(data_length)`` into disjoint index sets.

    Returns ``(validate_ind, train_ind)``, or
    ``(test_ind, validate_ind, train_ind)`` when ``is_test_ind`` is true.
    The validation and test indexes are drawn without replacement; the
    train indexes are everything that is left, in ascending order.
    """
    all_indexes = np.arange(data_length)
    validate_ind = np.random.choice(all_indexes, validate_size, replace=False)
    remaining = np.setdiff1d(all_indexes, validate_ind)

    if is_test_ind:
        test_ind = np.random.choice(remaining, test_size, replace=False)
        return test_ind, validate_ind, np.setdiff1d(remaining, test_ind)

    return validate_ind, remaining

In [9]:
# Sizes of the validation and test splits; all remaining images are used for training
images_size = len(dataset)
validate_size, test_size = 8000, 4000
test_ind, val_ind, train_ind = get_indexes(data_length=len(dataset),
                                           validate_size=validate_size,
                                           is_test_ind=True,
                                           test_size=test_size)

In [10]:
%%time
SVHN_train_data_4, SVHN_train_length_4 = generate_dataset(dataset[train_ind], False, 64, True)
SVHN_train_data_4, SVHN_train_length_4 = augment_dataset(SVHN_train_data_4,  SVHN_train_length_4, 2, 54, True)

SVHN_validate_data_4, SVHN_validate_length_4 = generate_dataset(dataset[test_ind],True, 54, True)
SVHN_test_data_4, SVHN_test_length_4 = generate_dataset(dataset[val_ind], True, 54, True)


0
Wall time: 12min 45s


## 2.2) Object data preparation
CIFAR images preparation.

In [11]:
def unpickle(file):
    """Load the pickle file at path ``file`` with ``encoding='bytes'``.

    With this encoding, 8-bit strings stored in the pickle are returned as
    ``bytes`` objects, so string keys of a loaded dict are bytes as well.
    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(file, 'rb') as fo:
        return pickle.load(fo, encoding='bytes')

In [12]:
image_height, image_width = 54, 54

# Read every batch file once and join them in a single concatenation.
# The listing is sorted so that the row order of cifar_data is reproducible.
cifar_batches = [unpickle(os.path.join("Cifar", file))[b"data"]
                 for file in sorted(os.listdir("Cifar"))]
cifar_raw = np.concatenate(cifar_batches, axis=0)
del cifar_batches

# Rows are reshaped to (N, 3, 32, 32) and the channel axis is moved last
# -> (N, 32, 32, 3); then the same normalization as for SVHN is applied.
cifar_raw = cifar_raw.reshape((-1, 3, 32, 32)).transpose(0, 2, 3, 1)
cifar_raw = (cifar_raw - 255.0/2) / 255.0

cifar_data = np.ndarray(shape=(len(cifar_raw), image_height, image_width, 3),
                         dtype=np.float32)
# Every CIFAR image gets the "0 digits" class
cifar_labels = np.tile(to_categorical(0, num_classes=5), (len(cifar_raw), 1))

for i in range(len(cifar_raw)):
    cifar_data[i,:,:,:] = cv2.resize(cifar_raw[i], ( image_width, image_height))
del cifar_raw

In [13]:
# Sanity check: there must be one label row per image
print(cifar_data.shape[0], cifar_labels.shape[0])

50000 50000


Caltech 101 and Caltech 256 images preparation.

In [14]:
def get_files_paths(primary_folder):
    """List the paths of all entries found one level below ``primary_folder``.

    Every entry of ``primary_folder`` is treated as a folder and listed in
    turn; the result holds ``primary_folder/<folder>/<file>`` paths.
    """
    return [
        os.path.join(primary_folder, folder, file)
        for folder in os.listdir(primary_folder)
        for file in os.listdir(os.path.join(primary_folder, folder))
    ]

In [15]:
# Collect the image paths of both Caltech collections (Caltech 101 paths come first)
obj256, obj101 = (get_files_paths(folder)
                  for folder in ("256_ObjectCategories", "101_ObjectCategories"))
paths = obj101 + obj256

In [16]:
%%time
#2min 12s;
def read_data(paths, image_width, image_height):
    """Load images, take one random crop from each and return them normalized.

    Images that cannot be read are reported by printing their index and
    skipped; images smaller than 70 x 110 (height x width) are skipped
    silently. From every remaining image a random window of 60-80 rows and
    95-125 columns is cut out, converted from OpenCV's BGR channel order to
    RGB and resized to ``image_width`` x ``image_height``.

    Returns
    -------
    (images, labels)
        ``images`` is float32 of shape ``(kept, image_height, image_width, 3)``
        with values ``(pixel - 127.5) / 255``; ``labels`` holds one one-hot
        row of class 0 (out of 5 classes) per kept image.
    """
    pixel_depth = 255.0  # Number of levels per pixel.
    half_pixel_depth = pixel_depth / 2
    dataset = np.ndarray(shape=(len(paths), image_height, image_width, 3),
                         dtype=np.float32)
    j = 0  # number of images stored so far, i.e. the next free row

    for i in range(len(paths)):
        image = cv2.imread(paths[i])
        if image is None:
            # cv2.imread returns None for files it cannot decode
            print(i)
            continue

        shape = image.shape
        if shape[0] < 70 or shape[1] < 110:
            continue

        height = min(shape[0], random.randint(60,80))
        width = min(shape[1], random.randint(95,125))
        y0 = random.randint(0, shape[0] - height)
        x0 = random.randint(0, shape[1] - width)

        try:
            # cv2.imread yields BGR; convert so channels match the RGB
            # images produced from SVHN and CIFAR.
            crop = cv2.cvtColor(image[y0:y0+height, x0:x0+width], cv2.COLOR_BGR2RGB)
            resized = cv2.resize(crop, (image_width, image_height))
        except cv2.error:
            print(i)
            continue

        # Write to the next free row so that dataset[:j] has no gaps.
        dataset[j, :, :, :] = (resized - half_pixel_depth) / pixel_depth
        j += 1

    labels = np.array([to_categorical(0, num_classes=5)]*j).reshape((-1,5))

    return dataset[:j], labels


# Random 54 x 54 crops of the Caltech images, all labelled as "0 digits"
obj_data, obj_labels = read_data(paths, image_width=54, image_height=54)

13851
29888
Wall time: 2min 13s


CIFAR, Caltech 101 and Caltech 256 datasets merging.

In [17]:
# Caltech rows first, CIFAR rows after them (images and labels alike)
obj_data = np.concatenate((obj_data, cifar_data), axis=0)
obj_labels = np.concatenate((obj_labels, cifar_labels), axis=0)

## 2.3) SVHN and object dataset merging.

We will append object dataset to SVHN train, validate, and test dataset proportionately by their sizes.

In [18]:
# Split the object images between test, validation and train in the same
# proportions as the SVHN splits.
total = len(SVHN_train_data_4) + len(SVHN_validate_data_4) + len(SVHN_test_data_4)
test_obj_size = int(len(SVHN_test_data_4)/total * len(obj_data))
validate_obj_size = int(len(SVHN_validate_data_4)/total *  len(obj_data))
test_ind, val_ind, train_ind =  get_indexes(len(obj_data), validate_obj_size, True, test_obj_size)

In [19]:
def shuffle_data(data, labels):
    """Return ``data`` and ``labels`` reordered by one shared random permutation."""
    order = np.random.permutation(len(data))
    return data[order], labels[order]

def append_datasets(data1, data2, data1_labels, data2_labels):
    """Join two datasets along the first axis and shuffle the result.

    Returns the shuffled ``(data, labels)`` pair; rows and labels are kept
    aligned.
    """
    merged = np.concatenate((data1, data2), axis=0)
    merged_labels = np.concatenate((data1_labels, data2_labels), axis=0)
    return shuffle_data(merged, merged_labels)

Appending the validation and test datasets and writing them to the HDF5 file.

In [20]:
# Mix the SVHN images with their share of object images, for test and validation
test_data, test_labels = append_datasets(data1=SVHN_test_data_4,
                                         data2=obj_data[test_ind],
                                         data1_labels=SVHN_test_length_4,
                                         data2_labels=obj_labels[test_ind])
validate_data, validate_labels = append_datasets(data1=SVHN_validate_data_4,
                                                 data2=obj_data[val_ind],
                                                 data1_labels=SVHN_validate_length_4,
                                                 data2_labels=obj_labels[val_ind])

In [21]:
# Write the validation and test arrays to a fresh HDF5 file
with h5py.File('D:/classifier_2.hdf5', 'w') as f:
    for h5_name, h5_values in (("validate", validate_data),
                               ("validate labels", validate_labels),
                               ("test", test_data),
                               ("test labels", test_labels)):
        f.create_dataset(h5_name, data=h5_values, compression="gzip")
# Drop the loop variables too, otherwise they keep the last array referenced
del h5_name, h5_values
del validate_data, validate_labels, test_data, test_labels

# From here on only the object images of the train split are needed
obj_data = obj_data[train_ind]
obj_labels = obj_labels[train_ind]

Appending the object and SVHN train datasets in memory requires more RAM than the 16 GB my computer has. We work around this by appending the datasets inside the HDF5 file instead.

In [22]:
# Merge the train labels (SVHN rows first, object rows after) and shuffle them.
# `ind` is kept: the train images are written in this same order later on.
train_labels = np.concatenate((SVHN_train_length_4, obj_labels), axis=0)
ind = np.random.permutation(train_labels.shape[0])
train_labels = train_labels[ind]

In [None]:
# Add the shuffled train labels to the existing HDF5 file
with h5py.File('D:/classifier_2.hdf5', mode='a') as hdf5_file:
    hdf5_file.create_dataset("train labels", data=train_labels, compression="gzip")
del train_labels

In [23]:
def create_dataset(indx, SVHN_data, obj_data, file_name, dataset_name, write = True):
    """Gather images in ``indx`` order and store them in an HDF5 dataset.

    ``indx`` addresses the virtual concatenation of ``SVHN_data`` followed by
    ``obj_data``: an index below ``len(SVHN_data)`` selects a row of
    ``SVHN_data``, any other index ``i`` selects
    ``obj_data[i - len(SVHN_data)]``.

    Parameters
    ----------
    indx : sequence of int
        Row order of the images to store.
    SVHN_data, obj_data : np.ndarray
        Image arrays; the per-image shape is taken from ``SVHN_data``.
    file_name : str
        HDF5 file, opened in append mode.
    dataset_name : str
        Name of the dataset inside the file.
    write : bool
        ``True`` creates the dataset (resizable along the first axis);
        ``False`` appends the rows to the already existing dataset.
    """
    image_shape = tuple(SVHN_data.shape[1:])
    data = np.ndarray(shape=(len(indx),) + image_shape,
                      dtype=np.float32)
    train_size = len(SVHN_data)

    # Copy row by row: indexing both sources with whole index arrays would
    # create additional full-size temporary arrays.
    for j, i in enumerate(indx):
        if i >= train_size:
            data[j] = obj_data[i-train_size]
        else:
            data[j] = SVHN_data[i]

    with h5py.File(file_name, 'a') as f:
        if write:
            f.create_dataset(dataset_name, data= data, compression="gzip",
                             maxshape=(None,) + image_shape)
        else:
            target = f[dataset_name]
            # Remember where the new rows start; also correct for an empty batch.
            start = target.shape[0]
            target.resize(start + data.shape[0], axis = 0)
            target[start:] = data
            

In [24]:
%%time
#train images writing to the h2py files 
ind_m = int(len(ind)/2)
print(datetime.datetime.now())
create_dataset(ind[:ind_m], SVHN_train_data_4, obj_data, "D:/classifier_2.hdf5", "train data")
print(datetime.datetime.now())
create_dataset(ind[ind_m:], SVHN_train_data_4, obj_data, "D:/classifier_2.hdf5", "train data", write = False)

2019-07-22 15:53:21.446478
2019-07-22 16:06:43.791616
Wall time: 22min 50s
