In [3]:
''' PARSE DATA '''
import pandas as pd
import os

# Dataset folder, relative to the notebook's working directory. The CSV has no
# header row; column 0 is used below as the image path and column 1 as the label.
root = 'letters_data'
labels_csv = os.path.join(root, 'labels.csv')
df = pd.read_csv(labels_csv, encoding='ISO-8859-8', header=None)
# Strip surrounding whitespace from every cell. Like the str.strip call it
# wraps, this raises TypeError if a cell is not a string (e.g. an empty cell).
df = df.apply(lambda col: col.map(str.strip))

# clean data
# Drop rows whose label is longer than one character and carries no '!' marker
# ("doubles"), and rows explicitly labelled 'X'. Only the label column matters,
# so the predicates are evaluated on that column alone.
label_col = df[1]
doubles = (label_col.str.len() > 1) & ~label_col.str.contains('!', regex=False)
X       = label_col == 'X'

df = df[~doubles & ~X].reset_index(drop=True)

# label fix: shin (ש) -> he (ה)
# A single .iloc[row, col] call writes into df itself; the chained form
# df.iloc[row][col] = ... may assign to a temporary copy and lose the fix.
# The row number refers to the filtered, re-indexed frame built just above.
df.iloc[3659, 1] = 'ה!'

# Build the image paths: drop the first character of each CSV path (presumably
# a leading path separator -- verify against labels.csv) and convert Windows
# backslashes to forward slashes before joining onto the dataset root.
file_list = [os.path.join(root, rel_path[1:].replace('\\', '/')) for rel_path in df[0]]
label_list = df[1].tolist()

# Map every distinct label to its rank in sorted order, then encode the labels.
# `l` keeps its original name in case later cells refer to it.
l = sorted(set(label_list))
label_to_idx = {label: idx for idx, label in enumerate(l)}
label_list = [label_to_idx[s] for s in label_list]

In [4]:
''' MAKE IMAGES '''
import scipy.misc as sm
import numpy as np

# Load every listed image as RGB.
imgs = []
for image_path in file_list:
    imgs.append(sm.imread(image_path, mode='RGB'))

# Resize each image to 32x32 and give it a leading axis of length one so the
# results can be concatenated into a single array along that axis.
resized_imgs = [sm.imresize(picture, (32, 32))[np.newaxis, :, :, :] for picture in imgs]

images = np.vstack(resized_imgs)
labels = np.array(label_list)

In [5]:
# Shuffle images and labels with the same permutation, then hold out the first
# N_TEST samples as the test set.
#
# The permutation comes from a dedicated, seeded generator so that a fresh
# Restart & Run All always produces the same train/test membership (the split
# is pickled further down). A private RandomState is used instead of
# np.random.seed so the global random stream used elsewhere is left untouched.
# Note: re-running this cell without re-running the one that builds `images`
# shuffles the already-shuffled arrays again.
SPLIT_SEED = 0
N_TEST = 100

split_rng = np.random.RandomState(SPLIT_SEED)
p = split_rng.permutation(images.shape[0])
images = images[p]
labels = labels[p]

test         = images[:N_TEST]
test_labels  = labels[:N_TEST]
train        = images[N_TEST:]
train_labels = labels[N_TEST:]

In [12]:
''' MAKE MASKS '''
import os
import matplotlib.pyplot as plt
from skimage import feature
import scipy.misc as sm
import cv2
# Directory the mask templates are read from and the train/test pickles are
# written to. Defaults to the original author's checkout; set the
# LETTERS_DATA_DIR environment variable to run the notebook on another machine.
data_dir = os.environ.get('LETTERS_DATA_DIR', '/home/lioruzan/pixel-cnn/data/letters_data')

def get_edges(imname):
    """Return the Canny edge map of an image after a morphological opening.

    ``imname`` is joined onto ``data_dir``; when ``imname`` is already an
    absolute path, ``os.path.join`` returns it unchanged. The image is opened
    with a 5x5 all-ones structuring element before Canny (sigma=3) is applied.
    """
    source_path = os.path.join(data_dir, imname)
    structuring_element = np.ones((5, 5), np.uint8)
    opened = cv2.morphologyEx(sm.imread(source_path), cv2.MORPH_OPEN, structuring_element)
    # Compute the Canny filter on the opened image
    return feature.canny(opened, sigma=3)

def get_masks(template_name, curset, t):
    """Cut one random window per sample out of a template image.

    Window centres are drawn uniformly from the edge pixels of the template
    (as returned by ``get_edges``) that lie at least ``t/2`` pixels away from
    every border. Each window spans ``t//2`` pixels on either side of its
    centre, is resized to 32x32, given a trailing axis of length one and
    assigned (with broadcasting) to the matching entry of the result.

    Parameters
    ----------
    template_name : str
        Path of the template image; it is read once for edge detection and
        once for patch extraction.
    curset : numpy.ndarray
        Array whose shape and dtype the result copies; one mask is produced
        per entry along its first axis.
    t : int
        Window size in template pixels.

    Returns
    -------
    numpy.ndarray
        ``np.zeros_like(curset)`` with every entry filled from a random patch.

    Raises
    ------
    ValueError
        If masks are requested but no edge pixel is far enough from the
        border for a window of size ``t`` to fit. The previous rejection
        loop would spin forever in that situation.
    """
    edges    = get_edges(template_name)
    template = sm.imread(template_name)
    masks = np.zeros_like(curset)
    n_masks = curset.shape[0]
    if n_masks == 0:
        return masks

    # Keep only edge pixels around which a full window fits, using the same
    # bounds the original per-sample rejection loop tested.
    rows, cols = np.nonzero(edges)
    half = t / 2
    fits = ((rows >= half) & (rows < edges.shape[0] - half) &
            (cols >= half) & (cols < edges.shape[1] - half))
    rows, cols = rows[fits], cols[fits]
    if rows.size == 0:
        raise ValueError(
            'no edge pixel of %r leaves room for a window of size %r'
            % (template_name, t))

    # One uniformly chosen valid centre per requested mask.
    picks = np.random.randint(rows.size, size=n_masks)
    for c, pick in enumerate(picks):
        i, j = rows[pick], cols[pick]
        patch = template[i - t // 2:i + t // 2, j - t // 2:j + t // 2]
        # NOTE(review): the 2-D slice plus added trailing axis assumes the
        # template is a single-channel image -- confirm for new templates.
        masks[c] = sm.imresize(patch, (32, 32))[:, :, np.newaxis]
    return masks

In [13]:
# Window size, in template pixels, passed to get_masks for both splits.
t = 256

# Each split draws its patches from its own template image.
train_template = os.path.join(data_dir, 'P976-Fg001-R-C01-R02-D04082013-T145409-LR924 _012_mask.jpg')
test_template = os.path.join(data_dir, 'P976-Fg004-R-C01-R02-D04082013-T143613-LR924 _012_mask.jpg')

train_masks = get_masks(train_template, train, t)
test_masks = get_masks(test_template, test, t)

In [15]:
# Preview one generated test mask inline. scipy.misc.imshow writes a temporary
# file and launches an external viewer (and is gone from newer SciPy releases),
# so the matplotlib handle imported above is used instead.
plt.imshow(test_masks[7])
plt.show()

In [16]:
import pickle as pkl

# Write the train split first, then the test split, each as a dict holding the
# images, their integer labels and the generated masks.
for pkl_name, payload in (
    ('letters_train.pkl', {'data': train, 'labels': train_labels, 'masks': train_masks}),
    ('letters_test.pkl', {'data': test, 'labels': test_labels, 'masks': test_masks}),
):
    with open(os.path.join(data_dir, pkl_name), 'wb') as f:
        pkl.dump(payload, f)