In [2]:
''' PARSE DATA '''
import pandas as pd
import os

# Read the label table: no header row, ISO-8859-8 encoded text.
labels_csv = os.path.join('letters_data', 'labels.csv')
df = pd.read_csv(labels_csv, encoding='ISO-8859-8', header=None)
# Every cell is stripped with str.strip, so every cell must be a string.
df = df.applymap(str.strip)

# clean data
# Only column 1 is inspected, so the masks are built from that column alone
# instead of mapping over the whole frame.
#   doubles: value longer than one character that does not contain '!'
#   X:       value exactly equal to 'X'
label_col = df[1]
doubles = (label_col.str.len() > 1) & ~label_col.str.contains('!', regex=False)
X = label_col == 'X'

df = df[~doubles & ~X].reset_index(drop=True)

# label fix ש -> ה
# Single-step positional assignment. The chained form `df.iloc[3659][1] = ...`
# assigns into an intermediate row object and is not guaranteed to update df.
# With header=None the columns are labelled 0..n-1, so position 1 is column 1.
df.iloc[3659, 1] = 'ה!'

root = 'letters_data'

# Materialise the rows once, then derive both lists from them.
rows = [row for _, row in df.iterrows()]

# Column 0 is a stored path: drop its first character, turn backslashes into
# forward slashes and anchor it under `root`.
file_list = [os.path.join(root, row[0][1:].replace('\\', '/')) for row in rows]
# Column 1 is the label.
label_list = [row[1] for row in rows]

# Number the distinct labels by their position in sorted order, then replace
# every label with its number.
l = sorted(set(label_list))
label_to_idx = {label: rank for rank, label in enumerate(l)}
label_list = [label_to_idx[label] for label in label_list]

In [4]:
''' MAKE IMAGE TENSOR '''
import scipy.misc as sm
import numpy as np

def parse_resize(imgs, s=32):
    """Place every image at the centre of its own s-by-s, 3-channel canvas.

    Parameters
    ----------
    imgs : sequence of arrays
        Images indexed as (rows, cols, channels). Each one is written into a
        3-channel slot, so it has to be assignable to shape (rows, cols, 3).
    s : int
        Side length of the square canvas.

    Returns
    -------
    numpy.ndarray
        uint8 array of shape (len(imgs), s, s, 3); pixels not covered by an
        image stay 0.

    Notes
    -----
    An image taller than `s` is first rescaled by the factor s/height; if its
    width is still larger than `s` it is rescaled again by s/width. Both
    rescales go through `sm.imresize(..., interp='bicubic')`, which (per the
    SciPy docs) reads a float size as a fraction of the current size.
    """
    canvas = np.zeros((len(imgs), s, s, 3), dtype=np.uint8)
    mid = s // 2
    for idx, picture in enumerate(imgs):
        # Axis 0 (height) is checked before axis 1 (width); the width check
        # sees the shape produced by the height rescale, if there was one.
        for axis in (0, 1):
            extent = picture.shape[axis]
            if extent > s:
                picture = sm.imresize(picture, s / extent, interp='bicubic')
        rows, cols = picture.shape[0], picture.shape[1]
        # Top-left corner chosen so that the picture straddles the canvas centre.
        top = mid - rows // 2
        left = mid - cols // 2
        canvas[idx, top:top + rows, left:left + cols, :] = picture
    return canvas

# Decode every listed file as RGB, pack the results into the fixed-size image
# tensor, and turn the integer-encoded labels into an array.
imgs = [sm.imread(path, mode='RGB') for path in file_list]
images = parse_resize(imgs, s=32)
labels = np.array(label_list)

In [5]:
''' IMAGE STATS '''
# Per-axis statistics of the image shapes, printed one line per group:
#   1) median, mean, standard deviation
#   2) maximum, minimum
#   3) index of the image holding the maximum / the minimum
shapes = [img.shape for img in imgs]
for reducers in ((np.median, np.mean, np.std), (np.max, np.min), (np.argmax, np.argmin)):
    print(*(reduce(shapes, axis=0) for reduce in reducers))

[ 192.  167.    3.] [ 208.19724556  168.49743231    3.        ] [ 47.07156859  29.70098797   0.        ]
[346 231   3] [143 111   3]
[947  86   0] [1566  952    0]


In [7]:
# Samples whose path contains 'candidates_38' or 'candidates_39' go to the
# test set; every other sample goes to the training set.
l = len(file_list)
held_out = ('candidates_38', 'candidates_39')

# One boolean mask drives both index arrays, so the two sets cannot drift
# apart. flatnonzero always returns integer indices, even when a side is empty
# (np.array([]) would be float64 and unusable for indexing).
is_test = np.array([any(tag in path for tag in held_out) for path in file_list], dtype=bool)
train_idx = np.flatnonzero(~is_test)
test_idx = np.flatnonzero(is_test)

test_images  = images[test_idx]
test_labels  = labels[test_idx]
train_images = images[train_idx]
train_labels = labels[train_idx]

In [8]:
''' MAKE MASKS '''
import os
import matplotlib.pyplot as plt
from skimage import feature
import scipy.misc as sm
import cv2
data_dir = '/home/lioruzan/pixel-cnn/data/letters_data'

def get_edges(imname):
    """Return the Canny edge map of an image after a morphological opening.

    `imname` is joined onto the module-level `data_dir` before reading. The
    image is opened with a 5x5 all-ones uint8 kernel, then passed to
    `feature.canny` with sigma=3, whose result is returned unchanged.
    """
    source = sm.imread(os.path.join(data_dir, imname))
    opened = cv2.morphologyEx(source, cv2.MORPH_OPEN, np.ones((5, 5), np.uint8))
    return feature.canny(opened, sigma=3)

def get_masks(template_name, curset, t):
    """Build one mask per sample from random windows of a template image.

    For every entry along the first axis of `curset`, a random edge pixel of
    the template (as found by `get_edges`) is drawn, the window of half-width
    t//2 around it is cut out of the template, resized to 32x32 and stored.

    Parameters
    ----------
    template_name : str
        Path of the template image; read here and also handed to `get_edges`.
    curset : numpy.ndarray
        Only its shape and dtype are used: the result is created with
        `np.zeros_like(curset)` and one mask is written per `curset[c]`.
        NOTE(review): the 32x32 resize assumes curset is (N, 32, 32, C) - confirm.
    t : int
        Side length of the window cut from the template.

    Returns
    -------
    numpy.ndarray
        Array shaped like `curset`, floor-divided by 255.

    Raises
    ------
    ValueError
        If masks are requested but no edge pixel is far enough from the
        template border to hold a window of size `t`. The previous
        rejection-sampling loop never terminated in that case.
    """
    edges = get_edges(template_name)
    template = sm.imread(template_name)

    # Keep only the edge pixels that can serve as a window centre: the same
    # test the rejection loop used to apply, evaluated once up front.
    rows, cols = np.nonzero(edges)
    margin = t / 2
    fits = ((rows >= margin) & (rows < edges.shape[0] - margin) &
            (cols >= margin) & (cols < edges.shape[1] - margin))
    rows, cols = rows[fits], cols[fits]

    masks = np.zeros_like(curset)
    count = curset.shape[0]
    if count and len(rows) == 0:
        raise ValueError(
            'no edge pixel leaves room for a window of size %r in %r' % (t, template_name))

    half = t // 2
    for c in range(count):
        pick = np.random.randint(len(rows))
        i, j = rows[pick], cols[pick]
        patch = template[i - half:i + half, j - half:j + half]
        # A trailing axis of length 1 lets the patch broadcast over the last
        # axis of masks[c].
        masks[c] = sm.imresize(patch, (32, 32))[:, :, np.newaxis]
    return masks // 255

In [9]:
# Window size cut from the template images before shrinking to mask size.
t = 256

# Train and test masks are drawn from two different template images.
train_template = os.path.join(data_dir, 'P976-Fg001-R-C01-R02-D04082013-T145409-LR924 _012_mask.jpg')
test_template = os.path.join(data_dir, 'P976-Fg004-R-C01-R02-D04082013-T143613-LR924 _012_mask.jpg')

train_masks = get_masks(train_template, train_images, t)
test_masks = get_masks(test_template, test_images, t)

In [10]:
# Visual spot check: show the mask at index 9 of the test set, multiplied by
# 255 before display.
sm.imshow(test_masks[9]*255)

In [11]:
import pickle as pkl

# Write the train and test splits to data_dir, one pickle per split, each a
# dict with the keys 'data', 'labels' and 'masks'.
for fname, payload in (
    ('letters_train.pkl', {'data': train_images, 'labels': train_labels, 'masks': train_masks}),
    ('letters_test.pkl', {'data': test_images, 'labels': test_labels, 'masks': test_masks}),
):
    with open(os.path.join(data_dir, fname), 'wb') as f:
        pkl.dump(payload, f)

In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.misc as sm
data_dir = '/home/lioruzan/pixel-cnn/data/letters_data'
import pickle as pkl

test_pkl_path = os.path.join(data_dir, 'letters_test.pkl')
with open(test_pkl_path, 'rb') as f:
    test_letters = pkl.load(f)

''' update labels to be correct rotations for test time '''
from scipy.ndimage import measurements as me

# The centre of mass of channel 0 of each mask is reduced to a pair
# (row // 16, col // 16); that pair selects the label.
quadrant_to_rotation = {(0, 0): 0, (0, 1): 1, (1, 1): 2, (1, 0): 3}

rotations = np.zeros(len(test_letters['masks']))
for c, m in enumerate(test_letters['masks']):
    xm, ym = me.center_of_mass(m[:, :, 0])
    quadrant = (int(xm) // 16, int(ym) // 16)
    # A pair outside the table keeps the initial value 0.
    rotations[c] = quadrant_to_rotation.get(quadrant, 0)

# Replace the stored labels with the labels computed from the masks and write
# the file back in place.
test_letters['labels'] = rotations
with open(test_pkl_path, 'wb') as f:
    pkl.dump(test_letters, f)


In [None]:
''' figure out batch size that minimizes loss of test samples '''
# For each label value, print the remainder of its sample count modulo the
# candidate batch size paired with it.
for rotation, batch_size in ((0, 32), (1, 20), (2, 4), (3, 12)):
    print((rotations == rotation).sum() % batch_size)

In [10]:
''' modify letters_train.pkl to remove labels '''
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.misc as sm
data_dir = '/home/lioruzan/pixel-cnn/data/letters_data'
import pickle as pkl

# Round trip through the same file: load the training pickle, blank its
# 'labels' entry, and write it back in place.
train_pkl_path = os.path.join(data_dir, 'letters_train.pkl')

with open(train_pkl_path, 'rb') as f:
    train_letters = pkl.load(f)

train_letters['labels'] = None

with open(train_pkl_path, 'wb') as f:
    pkl.dump(train_letters, f)

In [1]:
''' modify letters_test.pkl to remove labels '''
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.misc as sm
data_dir = '/home/lioruzan/pixel-cnn/data/letters_data'
import pickle as pkl

# Round trip through the same file: load the test pickle, blank its 'labels'
# entry, and write it back in place.
test_pkl_path = os.path.join(data_dir, 'letters_test.pkl')

with open(test_pkl_path, 'rb') as f:
    test_letters = pkl.load(f)

test_letters['labels'] = None

with open(test_pkl_path, 'wb') as f:
    pkl.dump(test_letters, f)

# VERSION 2

In [None]:
''' PARSE DATA '''
import pandas as pd
import os
root = '/home/lioruzan/letters_to_classify_v2/'

# NOTE(review): the CSV is opened relative to the working directory; `root` is
# only applied to the image paths below - confirm that is intended.
labels_csv = 'all.csv'
df = pd.read_csv(labels_csv, encoding='ISO-8859-8', header=None)

# clean data
# Rows whose column-1 value is 'back' are dropped.
X = df[1].map(lambda value: value == 'back')
df = df[~X].reset_index(drop=True)

# Boolean columns marking the rows labelled '1' and '0'.
ones = df[1].map(lambda value: value == '1')
zeros = df[1].map(lambda value: value == '0')

# Column 0 is joined onto `root` (and the result stripped) to form the file
# path; column 1 is the label.
rows = [row for _, row in df.iterrows()]
file_list = [os.path.join(root, row[0]).strip() for row in rows]
label_list = [row[1] for row in rows]

# Positions and file paths of the samples in each of the two classes.
zeros_ind = [k for k, label in enumerate(label_list) if int(label) == 0]
ones_ind = [k for k, label in enumerate(label_list) if int(label) == 1]

zeros_files = [file_list[k] for k in zeros_ind]
ones_files = [file_list[k] for k in ones_ind]

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
# Show up to ten images of class 1, taken from the front of a random
# permutation of that class.
inds = np.random.permutation(len(ones_files))
for path in (ones_files[k] for k in inds[:10]):
    plt.imshow(plt.imread(path))
    plt.show()

# Cell output: number of samples in class 1.
len(ones_ind)