# Build a model to decode sequences of digits from images

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
import sys

## Load data

In [2]:
# Seed NumPy's global RNG so that the np.random.shuffle call inside load_data
# (used for the train/validation split) gives the same split on every run.
np.random.seed(133)

def load_data(file_name,valid_size=0):
    """Load images and labels from a pickle file, optionally splitting off a validation set.

    The pickle is expected to contain a mapping with the keys "image" and "label".

    Args:
        file_name: path of the pickle file to read.
        valid_size: number of samples to move into the validation set. With
            the default of 0 no split (and no shuffling) takes place.

    Returns:
        (images, labels) when valid_size is falsy, otherwise
        (train_images, train_labels, valid_images, valid_labels), where the
        samples were shuffled with np.random.shuffle before being split.
    """
    # Pickle streams are bytes: text mode ('r') fails on Python 3 and can
    # corrupt the stream on platforms that translate line endings.
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only load files that come from a trusted source.
    with open(file_name, 'rb') as f:
        data = pickle.load(f)
    labels = data["label"]
    images = data["image"]
    if not valid_size:
        return images, labels
    # zip() is a one-shot iterator on Python 3; materialise it so that it can
    # be shuffled in place and sliced.
    all_data = list(zip(images, labels))
    np.random.shuffle(all_data)
    valid_data = all_data[:valid_size]
    train_data = all_data[valid_size:]
    valid_images, valid_labels = [list(column) for column in zip(*valid_data)]
    train_images, train_labels = [list(column) for column in zip(*train_data)]
    return train_images, train_labels, valid_images, valid_labels
valid_size = 1670  # about 5% of the data
train_images, train_labels, valid_images, valid_labels = load_data(
    "train.pickle", valid_size)
test_images, test_labels = load_data("test.pickle")

# Report how many images and label lists ended up in each split.
for split_name, split_images, split_labels in (
        ("train:", train_images, train_labels),
        ("validation:", valid_images, valid_labels),
        ("test:", test_images, test_labels)):
    print(split_name, len(split_images), len(split_labels))

train: 31732 31732
validation: 1670 1670
test: 13068 13068


In [3]:
# Images are img_size x img_size pixels with num_channels channels.
num_channels = 3
img_size = 32
# Label values 1..11 as float32; 11 represents "no more digit".
distinct_labels = np.arange(1, 12, dtype=np.float32)
distinct_labels_size = distinct_labels.shape[0]
# The last label value is the blank (end of sequence) marker.
blanc_label = distinct_labels[-1]

def reshape_image(image):
    """Return `image` as a float32 array of shape (-1, img_size, img_size, num_channels)."""
    batched_shape = (-1, img_size, img_size, num_channels)
    batched = np.reshape(image, batched_shape)
    return batched.astype(np.float32)

def reshape_label(label):
    """One-hot encode `label` against distinct_labels.

    Returns a float32 array of shape (1, distinct_labels_size) holding 1.0 at
    the position where distinct_labels equals `label` and 0.0 elsewhere.
    """
    matches = distinct_labels == label
    one_hot_row = np.reshape(matches, (1, distinct_labels_size))
    return one_hot_row.astype(np.float32)

def reduce_label(reshaped_label):
    """Turn an encoded label (as produced by reshape_label) back into its label value.

    The first row of `reshaped_label` is multiplied element-wise with
    distinct_labels and the products are summed.
    """
    first_row = np.transpose(reshaped_label[0])
    weighted = first_row * distinct_labels
    return np.sum(weighted)

def print_label(label):
    """Return the text shown for a label value: 10 -> "0", 11 -> " ", otherwise the integer digit."""
    for special_value, text in ((10., "0"), (11.0, " ")):
        if label == special_value:
            return text
    return str(int(label))

# Sanity-check the label helpers on a few known values.
print("img size: ", img_size)
print("labels: ", distinct_labels)
five_one_hot = reshape_label(5.0)
print("5.0 reshaped: ", five_one_hot)
printed_labels = [print_label(label_value) for label_value in distinct_labels]
print("printed labels: ", printed_labels)
print("blanc label", blanc_label)
# Encode the blank label and decode it again: the round trip must give 11.0.
reshaped_blanc_label = reshape_label(blanc_label)
print("blanc label reshaped", reshaped_blanc_label, reshaped_blanc_label.shape)
blanc_round_trip = reduce_label(reshaped_blanc_label)
print("blanc label reduced :", blanc_round_trip)

img size:  32
labels:  [  1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.]
5.0 reshaped:  [[ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]]
printed labels:  ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0', ' ']
blanc label 11.0
blanc label reshaped [[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]] (1, 11)
blanc label reduced : 11.0


In [4]:
# Number of samples of each split to expand in the next steps; lower these
# values to work on a subset of the data only.
train_subset, valid_subset, test_subset = (
    len(split) for split in (train_images, valid_images, test_images))

def expand_image(image,image_labels):
    """Expand one image into one (image, label) sample per digit.

    The blank label is appended to the digit labels so that the last sample
    of every image teaches the model to predict "no more digit".

    Args:
        image: pixel data accepted by reshape_image.
        image_labels: sequence with the label values of the image, in order.

    Returns:
        (reshaped_images, reshaped_labels): two lists of equal length. The
        first holds the same reshaped image object once per label, the second
        the labels encoded by reshape_label.
    """
    reshaped_image = reshape_image(image)
    # Copy into a real list first: `+` on a NumPy array would add the blank
    # label element-wise instead of appending it, and on a tuple it raises.
    labels_with_blanc = list(image_labels) + [blanc_label]  # the last prediction should be the blank label
    reshaped_labels = [reshape_label(image_label) for image_label in labels_with_blanc]
    reshaped_images = [reshaped_image] * len(labels_with_blanc)
    return reshaped_images, reshaped_labels
    
def expand_data(images,labels):
    """Expand every image into one sample per digit, writing progress to stdout.

    Returns two parallel lists: the expanded images and the expanded labels
    as produced by expand_image.
    """
    print("initial size: ", len(images), len(labels))
    total_images = len(images)
    expanded_images, expanded_labels = [], []
    for image_index, (image, image_labels) in enumerate(zip(images, labels)):
        per_digit_images, per_digit_labels = expand_image(image, image_labels)
        expanded_images.extend(per_digit_images)
        expanded_labels.extend(per_digit_labels)
        # Progress: a percentage every 500 images, a dot every 50 in between.
        if image_index % 500 == 0:
            progress = "%d%%" % (image_index*100/total_images)
        elif image_index % 50 == 0:
            progress = "."
        else:
            continue
        sys.stdout.write(progress)
        sys.stdout.flush()
    print("transformed size: ", len(expanded_images), len(expanded_labels))
    return expanded_images, expanded_labels
    
# Expand each split (truncated to its *_subset size) into per-digit samples.
print("expanding train data")
expanded_train_data, expanded_train_labels = expand_data(
    train_images[:train_subset], train_labels[:train_subset])

print("expanding valid data")
expanded_valid_data, expanded_valid_labels = expand_data(
    valid_images[:valid_subset], valid_labels[:valid_subset])

print("expanding test data")
expanded_test_data, expanded_test_labels = expand_data(
    test_images[:test_subset], test_labels[:test_subset])

expanding train data
initial size:  31732 31732
0%.........1%.........3%.........4%.........6%.........7%.........9%.........11%.........12%.........14%.........15%.........17%.........18%.........20%.........22%.........23%.........25%.........26%.........28%.........29%.........31%.........33%.........34%.........36%.........37%.........39%.........40%.........42%.........44%.........45%.........47%.........48%.........50%.........51%.........53%.........55%.........56%.........58%.........59%.........61%.........63%.........64%.........66%.........67%.........69%.........70%.........72%.........74%.........75%.........77%.........78%.........80%.........81%.........83%.........85%.........86%.........88%.........89%.........91%.........92%.........94%.........96%.........97%.........99%....transformed size:  101331 101331
expanding valid data
initial size:  1670 1670
0%.........29%.........59%.........89%...transformed size:  5328 5328
expanding test data
initial size:  13068 13068


In [6]:
class IndexGenerator(object):
    """Hands out start offsets of successive batch_size-long windows over `labels`.

    After each call the internal offset is moved forward until the label just
    before it is the blank label, i.e. until it sits at the start of an image.
    When fewer than batch_size labels remain, it wraps around to 0.
    """

    def __init__(self, labels, batch_size):
        self._labels = labels
        self._batch_size = batch_size
        self._length = len(labels)
        self._index = 0

    def _splits_image(self, position):
        """Tell whether the label just before `position` is not the blank label."""
        return reduce_label(self._labels[position - 1]) != blanc_label

    def _next(self):
        """Return the current start offset and advance to the following one."""
        batch_start = self._index
        self._index += self._batch_size
        # Make sure the next window starts on an image boundary (just after a blank label).
        while self._index < self._length and self._splits_image(self._index):
            self._index += 1
        # Not enough labels left for a full batch: start over from the beginning.
        if self._length - self._index < self._batch_size:
            self._index = 0
        return batch_start
        
# Every split is consumed in windows of 128 expanded samples.
train_batch_size = valid_batch_size = test_batch_size = 128

# One offset generator per split, each walking over that split's expanded labels.
train_index_generator = IndexGenerator(expanded_train_labels, train_batch_size)
valid_index_generator = IndexGenerator(expanded_valid_labels, valid_batch_size)
test_index_generator = IndexGenerator(expanded_test_labels, test_batch_size)


## Build the graph

In [13]:
# Convolution stack: square kernels of side patch_size, with the number of
# output channels of the first, second and third convolution.
patch_size = 5
conv1_size, conv2_size, conv3_size = 48, 64, 128

# Size of the feature vector produced by the convolution stack.
num_hidden = 64
# Number of LSTM nodes.
num_nodes = 64

graph = tf.Graph()
with graph.as_default():

    # Convolution parameters: three patch_size x patch_size kernels with their
    # biases, followed by a fully connected layer producing num_hidden features.
    conv1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, conv1_size], stddev=0.1))
    conv1_biases = tf.Variable(tf.zeros([conv1_size]))
    conv2_weights = tf.Variable(tf.truncated_normal(
          [patch_size, patch_size, conv1_size, conv2_size], stddev=0.1))
    conv2_biases = tf.Variable(tf.constant(1.0, shape=[conv2_size]))
    conv3_weights = tf.Variable(tf.truncated_normal(
          [patch_size, patch_size, conv2_size, conv3_size], stddev=0.1))
    conv3_biases = tf.Variable(tf.zeros([conv3_size]))
    # conv() applies three stride-2 'SAME' max-pools, so each spatial side is
    # ceil(img_size / 8) and the flattened feature map has
    # side * side * conv3_size values. The previous expression
    # (img_size // 4 * img_size // 8 * num_hidden) produced the same number
    # (2048) only for the current hyper-parameters.
    pooled_size = (img_size + 7) // 8
    layer4_weights = tf.Variable(tf.truncated_normal(
            [pooled_size * pooled_size * conv3_size, num_hidden], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    
    def conv(data):
        """Run the three conv / ReLU / max-pool stages on `data`, then the dense ReLU layer.

        The output of the last pooling stage is flattened to two dimensions
        before being multiplied with layer4_weights.
        """
        activations = data
        for kernel, bias in ((conv1_weights, conv1_biases),
                             (conv2_weights, conv2_biases),
                             (conv3_weights, conv3_biases)):
            convolved = tf.nn.conv2d(activations, kernel, [1, 1, 1, 1], padding='SAME')
            rectified = tf.nn.relu(convolved + bias)
            # Each pooling stage halves both spatial dimensions.
            activations = tf.nn.max_pool(rectified, ksize=[1, 2, 2, 1],
                                         strides=[1, 2, 2, 1], padding='SAME')
        batch, height, width, depth = activations.get_shape().as_list()
        flattened = tf.reshape(activations, [batch, height * width * depth])

        return tf.nn.relu(tf.matmul(flattened, layer4_weights) + layer4_biases)
        
    
    # Cell parameters. Every gate owns an input projection
    # (num_hidden x num_nodes), a projection of the previous output
    # (num_nodes x num_nodes) and a bias row initialised to ones.
    def _gate_parameters():
        """Create the (input weights, previous-output weights, bias) variables of one gate."""
        input_weights = tf.Variable(tf.truncated_normal([num_hidden, num_nodes], stddev=0.1))
        recurrent_weights = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.1))
        bias = tf.Variable(tf.ones([1, num_nodes]))
        return input_weights, recurrent_weights, bias

    ix, im, ib = _gate_parameters()  # input gate
    fx, fm, fb = _gate_parameters()  # forget gate
    cx, cm, cb = _gate_parameters()  # memory cell update
    ox, om, ob = _gate_parameters()  # output gate

    # Classifier weights and biases: map the LSTM output to one score per label.
    weights = tf.Variable(tf.truncated_normal([num_nodes, distinct_labels_size], stddev=0.1))
    biases = tf.Variable(tf.ones([distinct_labels_size]))
        
    # Definition of the cell computation.
    
    def lstm_cell(i, o, state):
        """Compute one LSTM step. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf

        As in the original formulation used here, the connections between the
        previous state and the gates are omitted.

        Args:
            i: input of this step.
            o: output of the previous step.
            state: cell state of the previous step.

        Returns:
            (output, state) of this step.
        """
        def affine(input_weights, recurrent_weights, bias):
            # Shared pre-activation of all gates: i * Wx + o * Wm + b
            return tf.matmul(i, input_weights) + tf.matmul(o, recurrent_weights) + bias

        input_gate = tf.sigmoid(affine(ix, im, ib))
        forget_gate = tf.sigmoid(affine(fx, fm, fb))
        candidate = affine(cx, cm, cb)
        next_state = forget_gate * state + input_gate * tf.tanh(candidate)
        output_gate = tf.sigmoid(affine(ox, om, ob))
        return output_gate * tf.tanh(next_state), next_state
    
    # All-zero LSTM output and state that every digit sequence starts from;
    # both are created as non-trainable variables (trainable=False).
    initial_output = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
    initial_state = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
    
    # Encoded blank label as a graph constant; lstm_cell_data compares each
    # step's label against it to detect the end of an image.
    tf_blanc_label=tf.constant(reshape_label(blanc_label))
    
    def lstm_cell_data(input_data,input_labels):
        """Unroll the LSTM over the given steps and return the list of per-step outputs.

        Every step first runs its input through conv() and then through
        lstm_cell(). When the step's label equals the blank label (end of an
        image), the output and state carried to the next step are reset to
        initial_output / initial_state.
        """
        step_outputs = []
        carried_output, carried_state = initial_output, initial_state
        for step_input, step_label in zip(input_data, input_labels):
            # Apply the convolutions before running the LSTM cell.
            features = conv(step_input)
            new_output, new_state = lstm_cell(features, carried_output, carried_state)
            step_outputs.append(new_output)
            # Start from the initial output/state again after the end of an image.
            image_ended = tf.reduce_all(tf.equal(tf_blanc_label, step_label))
            carried_output = tf.cond(image_ended, lambda: initial_output, lambda: new_output)
            carried_state = tf.cond(image_ended, lambda: initial_state, lambda: new_state)
        return step_outputs
       
    def model(data):
        """Apply the classifier layer: multiply `data` with weights and add biases."""
        projected = tf.matmul(data, weights)
        return projected + biases
    
    def make_place_holder_list(size,shape,name):
        """Create `size` float32 placeholders of the given shape, named name_0 ... name_<size-1>."""
        return [tf.placeholder(tf.float32, shape=shape, name=name + "_" + str(position))
                for position in range(size)]
            
    # One placeholder per unrolled step: a single image and its encoded label.
    image_shape = [1, img_size, img_size, num_channels]
    label_shape = [1, distinct_labels_size]
    tf_train_data = make_place_holder_list(train_batch_size, image_shape, "tf_train_data")
    tf_train_labels = make_place_holder_list(train_batch_size, label_shape, "tf_train_labels")
    tf_valid_data = make_place_holder_list(valid_batch_size, image_shape, "tf_valid_data")
    tf_valid_labels = make_place_holder_list(valid_batch_size, label_shape, "tf_valid_labels")
    tf_test_data = make_place_holder_list(test_batch_size, image_shape, "tf_test_data")
    tf_test_labels = make_place_holder_list(test_batch_size, label_shape, "tf_test_labels")

    # Unrolled LSTM outputs for each split; all three share the same variables.
    tf_train_outputs = lstm_cell_data(tf_train_data, tf_train_labels)
    tf_valid_outputs = lstm_cell_data(tf_valid_data, tf_valid_labels)
    tf_test_outputs = lstm_cell_data(tf_test_data, tf_test_labels)
    
    # Classifier: stack the per-step labels and LSTM outputs of the training
    # batch and average the softmax cross-entropy over all steps.
    tf_used_labels = tf.concat(0, tf_train_labels)
    train_outputs = tf.concat(0, tf_train_outputs)
    logits = model(train_outputs)
    per_step_loss = tf.nn.softmax_cross_entropy_with_logits(logits, tf_used_labels)
    loss = tf.reduce_mean(per_step_loss)

    # Optimizer: plain gradient descent with a learning rate of 0.5.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions: class probabilities for the three splits.
    train_prediction = tf.nn.softmax(logits)
    valid_logits = model(tf.concat(0, tf_valid_outputs))
    valid_prediction = tf.nn.softmax(valid_logits)
    test_logits = model(tf.concat(0, tf_test_outputs))
    test_prediction = tf.nn.softmax(test_logits)
  
    # TODO: sampling and validation eval (batch size 1, no unrolling).

In [15]:
# Upper bound on the number of training steps. The training loop in this cell
# handles KeyboardInterrupt, so it can also be stopped by interrupting the kernel.
num_steps = 1000001

def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max prediction matches the arg-max label.

    Both arguments are first flattened to float32 arrays with
    distinct_labels_size columns, so nested lists of per-step arrays are accepted.
    """
    flat_shape = (-1, distinct_labels_size)
    flat_predictions = np.reshape(np.array(predictions), flat_shape).astype(np.float32)
    flat_labels = np.reshape(np.array(labels), flat_shape).astype(np.float32)
    predicted_classes = np.argmax(flat_predictions, 1)
    expected_classes = np.argmax(flat_labels, 1)
    hits = np.sum(predicted_classes == expected_classes)
    return (100.0 * hits
            / flat_predictions.shape[0])

def populate_feed_dict(feed_dict,data,labels,index,size,data_variable,labels_variable):
    """Fill `feed_dict` with `size` consecutive samples starting at `index`.

    The k-th entry of data_variable / labels_variable is used as the key for
    data[index + k] / labels[index + k]. The dictionary is modified in place
    and also returned.
    """
    for offset in range(size):
        position = index + offset
        feed_dict[data_variable[offset]] = data[position]
        feed_dict[labels_variable[offset]] = labels[position]
    return feed_dict

# Train until num_steps is reached or the kernel is interrupted, reporting the
# loss plus training/validation accuracy of one batch every 10 steps, then
# evaluate one batch of the test data.
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    try:
        for step in range(num_steps):
            train_index = train_index_generator._next()
            train_feed_dict = populate_feed_dict(dict(), expanded_train_data, expanded_train_labels, train_index,
                                                 train_batch_size, tf_train_data, tf_train_labels)
            _, batch_loss, predictions = session.run([optimizer, loss, train_prediction],
                                                     feed_dict=train_feed_dict)
            if (step % 10 == 0):
                # Batch-local names: the notebook-level train_labels / valid_labels /
                # test_labels loaded earlier must not be overwritten here, or
                # re-running the expansion cell would silently use these slices.
                train_batch_labels = expanded_train_labels[train_index:train_index + train_batch_size]
                print('Loss at step %d: %f' % (step, batch_loss))
                print('Training accuracy: %.1f%%' % accuracy(
                        predictions, train_batch_labels))
                # Running valid_prediction recomputes all of its graph dependencies
                # for the validation batch that is fed in.
                valid_index = valid_index_generator._next()
                valid_feed_dict = populate_feed_dict(dict(), expanded_valid_data, expanded_valid_labels, valid_index,
                                                     valid_batch_size, tf_valid_data, tf_valid_labels)
                valid_batch_labels = expanded_valid_labels[valid_index:valid_index + valid_batch_size]
                print('Validation accuracy: %.1f%%' % accuracy(
                        session.run([valid_prediction], feed_dict=valid_feed_dict), valid_batch_labels))
    except KeyboardInterrupt:
        # Interrupting the kernel is the intended way to stop a long run early;
        # fall through to the test evaluation below.
        pass
    # Evaluate one test batch, whether training finished normally or was
    # interrupted (previously this only happened after an interrupt).
    test_index = test_index_generator._next()
    test_feed_dict = populate_feed_dict(dict(), expanded_test_data, expanded_test_labels, test_index,
                                        test_batch_size, tf_test_data, tf_test_labels)
    test_batch_labels = expanded_test_labels[test_index:test_index + test_batch_size]
    print('testation accuracy: %.1f%%' % accuracy(
            session.run([test_prediction], feed_dict=test_feed_dict), test_batch_labels))

Initialized
Loss at step 0: 2.422327
Training accuracy: 8.6%
Validation accuracy: 18.0%
Loss at step 10: 2.106087
Training accuracy: 32.8%
Validation accuracy: 31.2%
testation accuracy: 33.6%


In [None]:
# Scratch cell: prints the documentation of tf.nn.conv2d.
help(tf.nn.conv2d)

In [None]:
# Scratch demo of tf.Print: the message is emitted whenever the wrapped
# tensor is evaluated.
sess = tf.InteractiveSession()
try:
    # Some tensor we want to print the value of
    a = tf.constant([1.0, 3.0])

    # Add print operation
    a = tf.Print(a, [a], message="printing a")

    # Add more elements of the graph using a
    b = tf.add(a, a).eval()
    print(a.eval())
finally:
    # Release the session even if building or evaluating the graph raised.
    sess.close()

In [None]:
# Scratch cell: prints the documentation of tf.Print.
help(tf.Print)