In [1]:
import os
import sys
import tarfile
import tensorflow as tf
import numpy as np
from scipy import ndimage
from six.moves import cPickle as pickle
from six.moves.urllib.request import urlretrieve
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
top_data_folder = './data/notMNIST/'
dataset_small = 'notMNIST_small'
dataset_large = 'notMNIST_large'

logs_folder = './logs'

if not os.path.exists(top_data_folder):
    os.makedirs(top_data_folder)
    
def path_to(f):
    return os.path.join(top_data_folder, f)
    
if not os.path.exists(logs_folder):
    os.makedirs(logs_folder)

In [3]:
class DownloadProgress:
    def __init__(self):
        self.last_percent_reported = None

    def __call__(self, count, blockSize, totalSize):
        percent = int(count * blockSize * 100 / totalSize)

        if self.last_percent_reported != percent:
            if percent % 5 == 0:
                sys.stdout.write("%s%%" % percent)
                sys.stdout.flush()
            else:
                sys.stdout.write(".")
                sys.stdout.flush()
      
            self.last_percent_reported = percent

In [4]:
def download_and_extract(dataset_name):
    dataset_archive = dataset_name + '.tar.gz'
    filename = path_to(dataset_archive)
    if not os.path.exists(filename):
        url = 'http://commondatastorage.googleapis.com/books1000/' + dataset_archive
        urlretrieve(url, filename, reporthook=DownloadProgress())

    letters_folder = path_to(dataset_name)

    if not os.path.exists(letters_folder):
        with tarfile.open(filename) as tar:
            tar.extractall(path=top_data_folder)

In [5]:
download_and_extract(dataset_small)

In [6]:
download_and_extract(dataset_large)

In [7]:
image_size = 28
class_count = 10
image_channels = 1

def load_folder(folder):
    image_filenames = [f for f in os.listdir(folder) if not os.path.isdir(os.path.join(folder, f))]
    image_count = len(image_filenames)
    
    dataset = np.ndarray(shape=(image_count, image_size, image_size, image_channels), dtype=np.float32)
    
    image_counter = 0
    
    for f in image_filenames:
        image_filename = os.path.join(folder, f)
        try:
            image_data = np.expand_dims(ndimage.imread(image_filename).astype(float), axis=2)
            dataset[image_counter, :, :] = image_data
            image_counter += 1
        except IOError as e:
            # There are plenty of images, I skip this one
            pass
        
    dataset = dataset[0:image_counter, :, :]
    
    print('Full dataset tensor for %s:' % folder, dataset.shape)
    
    return dataset

In [8]:
def get_complete_dataset(dataset_name):
    image_depth = 256
    
    pickle_file = path_to(dataset_name + '_conv.pickle')
    
    if not os.path.exists(pickle_file):
        image_pixels = []
        image_labels = []
        image_labels_one_hot = []
        
        for i, letter in enumerate("ABCDEFGHIJ"):
            letter_dataset = (load_folder(path_to(os.path.join(dataset_name, letter))) * 2 - image_depth) / image_depth
            
            letter_image_count = len(letter_dataset)
            
            letter_labels = np.ones(shape=(letter_image_count,), dtype=np.float32) * i
            letter_labels_one_hot = np.ndarray(shape=(letter_image_count, class_count), dtype=np.float32)
            
            label_one_hot = np.zeros(shape = (class_count,), dtype=np.float32)
            label_one_hot[i] = 1.0
            letter_labels_one_hot[:] = label_one_hot
            
            image_pixels.append(letter_dataset)
            image_labels.append(letter_labels)
            image_labels_one_hot.append(letter_labels_one_hot)
            
        dataset = { 'pixels': np.concatenate(image_pixels), 
                    'labels': np.concatenate(image_labels),
                    'labels_one_hot': np.concatenate(image_labels_one_hot) }
            
        with open(pickle_file, 'wb') as f:
            pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
            
        return dataset
    else:
        with open(pickle_file, 'rb') as f:
            return pickle.load(f)

In [9]:
def split(dataset_name, test_size=0.2, validate_size=0.25, random_state=42):
    dataset = get_complete_dataset(dataset_name)
    answer = {}
    
    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_validate_indices, test_indices = list(sss1.split(dataset['pixels'], dataset['labels']))[0]

    answer['X_test'] = dataset['pixels'][test_indices]
    answer['y_test'] = dataset['labels_one_hot'][test_indices]

    train_validate_pixels = dataset['pixels'][train_validate_indices]
    train_validate_labels = dataset['labels'][train_validate_indices]
    train_validate_labels_one_hot = dataset['labels_one_hot'][train_validate_indices]
    
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=validate_size, random_state=random_state * random_state)
    train_indices, validate_indices = list(sss2.split(train_validate_pixels, train_validate_labels_one_hot))[0]
    
    answer['X_train'] = train_validate_pixels[train_indices]
    answer['y_train'] = train_validate_labels_one_hot[train_indices]
    
    answer['X_validate'] = train_validate_pixels[validate_indices]
    answer['y_validate'] = train_validate_labels_one_hot[validate_indices]
    
    return answer    

In [10]:
split_dataset = split(dataset_small)
#split_dataset = split(dataset_large)d

In [11]:
for k in split_dataset.keys():
    print(k, split_dataset[k].shape)

('X_test', (3745, 28, 28, 1))
('X_train', (11234, 28, 28, 1))
('X_validate', (3745, 28, 28, 1))
('y_validate', (3745, 10))
('y_train', (11234, 10))
('y_test', (3745, 10))


In [12]:
# I try to make it overfit
batch_size = split_dataset['X_train'].shape[0] / 2
learning_rate = 0.8

patch_size_1 = 7
depth_1 = 16

patch_size_2 = 5
depth_2 = 8

stride_1 = 2
strides_1 = [1, stride_1, stride_1, 1]

stride_2 = 2
strides_2 = [1, stride_2, stride_2, 1]

hidden_node_count= 64

In [13]:
graph = tf.Graph()

with graph.as_default():
    # Placeholders for train datasets/labels
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size, image_size, image_channels),
                                     name='train_dataset')
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, class_count), name='train_labels')
    
    # Constants for validate/test data
    tf_validate_dataset = tf.constant(split_dataset['X_validate'], name='validate_dataset')
    tf_validate_labels = tf.constant(split_dataset['y_validate'], name='validate_labels')
    tf_test_dataset = tf.constant(split_dataset['X_test'], name='test_dataset')
    tf_test_labels = tf.constant(split_dataset['y_test'], name='test_labels')
    
    dropout_keep_prob = tf.placeholder(tf.float32) 
    
    # Variables
    layer_1_conv_filter = tf.Variable(tf.truncated_normal([patch_size_1, patch_size_1, image_channels, depth_1], stddev=0.1),
                                     name='layer_1_conv_filter')
    layer_1_biases = tf.Variable(tf.zeros([depth_1]), name='layer_1_biases')
    layer_2_conv_filter = tf.Variable(tf.truncated_normal([patch_size_2, patch_size_2, depth_1, depth_2], stddev=0.1), 
                                      name='layer_2_conv_filter')
    layer_2_biases = tf.Variable(tf.ones([depth_2]), name='layer_2_biases')
    
    layer_1_output_size = image_size // stride_1
    layer_2_output_size = layer_1_output_size // stride_2
    
    layer_3_weights = tf.Variable(tf.truncated_normal([layer_2_output_size * layer_2_output_size * depth_2, hidden_node_count],
                                                     stddev=0.1), name='layer_3_weights')
    layer_3_biases = tf.Variable(tf.ones([hidden_node_count]), name='layer_3_biases')
    
    layer_4_weights = tf.Variable(tf.truncated_normal([hidden_node_count, class_count], stddev=0.1), name='layer_4_weights')
    layer_4_biases = tf.Variable(tf.ones(class_count), name='layer_4_biases')
    
    one = tf.constant(1.0)
    
    # Model
    def model(data, keep_prob=one):
        conv1 = tf.nn.conv2d(data, layer_1_conv_filter, strides_1, padding='SAME')
        hidden1 = tf.nn.relu(conv1 + layer_1_biases)
        
        hidden1 = tf.cond(tf.less(keep_prob, one), lambda: tf.nn.dropout(hidden1, keep_prob), lambda: hidden1)
        
        conv2 = tf.nn.conv2d(hidden1, layer_2_conv_filter, strides_2, padding='SAME')
        hidden2 = tf.nn.relu(conv2 + layer_2_biases)
        
        hidden2 = tf.cond(tf.less(keep_prob, one), lambda: tf.nn.dropout(hidden2, keep_prob), lambda: hidden2)
        
        hidden2_shape = hidden2.get_shape().as_list()
        hidden2_reshaped = tf.reshape(hidden2, [hidden2_shape[0], hidden2_shape[1] * hidden2_shape[2] * hidden2_shape[3]])
        hidden3 = tf.nn.relu(tf.matmul(hidden2_reshaped, layer_3_weights) + layer_3_biases)
        hidden3 = tf.cond(tf.less(keep_prob, one), lambda: tf.nn.dropout(hidden3, keep_prob), lambda: hidden3)
            
        output = tf.nn.relu(tf.matmul(hidden3, layer_4_weights) + layer_4_biases)
        return output
       
    # Train
    logits_train = model(tf_train_dataset, dropout_keep_prob)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits_train, labels=tf_train_labels))

    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
  
    # Predictions
    prediction_train = tf.nn.softmax(logits_train)
    prediction_validate = tf.nn.softmax(model(tf_validate_dataset))
    prediction_test = tf.nn.softmax(model(tf_test_dataset))
    
    def accuracy(predictions, labels):
        correct_predictions = tf.equal(tf.argmax(predictions, 1), tf.argmax(labels, 1))
        return 100.0 * tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
    
    # Accuracies
    accuracy_train = accuracy(prediction_train, tf_train_labels)
    accuracy_validate = accuracy(prediction_validate, tf_validate_labels)
    accuracy_test = accuracy(prediction_test, tf_test_labels)
    
    # Summaries
    tf.scalar_summary("Minibatch Loss", loss)
    tf.scalar_summary("Minibatch Accuracy", accuracy_validate)
    tf.scalar_summary("Validate Accuracy", accuracy_validate)
    tf.scalar_summary("Test Accuracy", accuracy_test)
    
    summary_op = tf.merge_all_summaries()

In [14]:
num_steps = 1001

def train(keep_prob=1.0):
    with tf.Session(graph=graph) as session:
        writer = tf.train.SummaryWriter(logs_folder, graph=graph)
    
        session.run(tf.initialize_all_variables())
        print("Initialized")
    
        for step in range(num_steps):
            offset = (step * batch_size) % (split_dataset['y_train'].shape[0] - batch_size)
    
            batch_data = split_dataset['X_train'][offset:(offset + batch_size), :]
            batch_labels = split_dataset['y_train'][offset:(offset + batch_size), :]
    
            feed_dict = {tf_train_dataset : batch_data, 
                         tf_train_labels : batch_labels,
                         dropout_keep_prob : keep_prob}
            _, l, summary, acc_tr, acc_val, acc_te = session.run(
                [optimizer, loss, summary_op, 
                 accuracy_train, accuracy_validate, accuracy_test], 
                feed_dict=feed_dict)
        
            writer.add_summary(summary, step)
        
            if (step % 100 == 0):
                print("Minibatch loss at step %d: %f" % (step, l))
                print("Minibatch accuracy: %.2f%%" % acc_tr)
                print("Validation accuracy: %.2f%%" % acc_val)
        print("Test accuracy: %.2f%%" % acc_te)

In [15]:
train(0.9)

Initialized
Minibatch loss at step 0: 2.861829
Minibatch accuracy: 8.88%
Validation accuracy: 8.49%
Minibatch loss at step 100: 2.302563
Minibatch accuracy: 10.33%
Validation accuracy: 10.28%
Minibatch loss at step 200: 2.302563
Minibatch accuracy: 10.33%
Validation accuracy: 10.28%
Minibatch loss at step 300: 2.302563
Minibatch accuracy: 10.33%
Validation accuracy: 10.28%
Minibatch loss at step 400: 2.302563
Minibatch accuracy: 10.33%
Validation accuracy: 10.28%
Minibatch loss at step 500: 2.302563
Minibatch accuracy: 10.33%
Validation accuracy: 10.28%
Minibatch loss at step 600: 2.302569
Minibatch accuracy: 10.33%
Validation accuracy: 10.28%
Minibatch loss at step 700: 2.302563
Minibatch accuracy: 10.33%
Validation accuracy: 10.28%
Minibatch loss at step 800: 2.302563
Minibatch accuracy: 10.33%
Validation accuracy: 10.28%
Minibatch loss at step 900: 2.302563
Minibatch accuracy: 10.33%
Validation accuracy: 10.28%
Minibatch loss at step 1000: 2.302563
Minibatch accuracy: 10.33%
Validat

In [16]:
train(0.5)

Initialized
Minibatch loss at step 0: 4.767598
Minibatch accuracy: 9.77%
Validation accuracy: 9.16%
Minibatch loss at step 100: 2.303085
Minibatch accuracy: 10.61%
Validation accuracy: 10.28%
Minibatch loss at step 200: 2.259469
Minibatch accuracy: 16.54%
Validation accuracy: 18.10%
Minibatch loss at step 300: 1.692468
Minibatch accuracy: 40.63%
Validation accuracy: 46.09%
Minibatch loss at step 400: 1.626614
Minibatch accuracy: 42.94%
Validation accuracy: 46.92%
Minibatch loss at step 500: 1.594313
Minibatch accuracy: 43.94%
Validation accuracy: 47.32%
Minibatch loss at step 600: 1.569720
Minibatch accuracy: 44.53%
Validation accuracy: 47.61%
Minibatch loss at step 700: 1.567476
Minibatch accuracy: 44.83%
Validation accuracy: 47.93%
Minibatch loss at step 800: 1.549823
Minibatch accuracy: 45.49%
Validation accuracy: 48.09%
Minibatch loss at step 900: 1.535314
Minibatch accuracy: 45.54%
Validation accuracy: 48.44%
Minibatch loss at step 1000: 1.529958
Minibatch accuracy: 45.84%
Validat

In [17]:
train(0.1)

Initialized
Minibatch loss at step 0: 20.313150
Minibatch accuracy: 10.29%
Validation accuracy: 9.37%
Minibatch loss at step 100: 2.302680
Minibatch accuracy: 10.20%
Validation accuracy: 10.28%
Minibatch loss at step 200: 2.302757
Minibatch accuracy: 10.47%
Validation accuracy: 10.28%
Minibatch loss at step 300: 2.302331
Minibatch accuracy: 10.40%
Validation accuracy: 10.28%
Minibatch loss at step 400: 2.302730
Minibatch accuracy: 10.29%
Validation accuracy: 10.28%
Minibatch loss at step 500: 2.302505
Minibatch accuracy: 10.49%
Validation accuracy: 10.28%
Minibatch loss at step 600: 2.302767
Minibatch accuracy: 10.24%
Validation accuracy: 10.28%
Minibatch loss at step 700: 2.302639
Minibatch accuracy: 10.27%
Validation accuracy: 10.28%
Minibatch loss at step 800: 2.302946
Minibatch accuracy: 10.20%
Validation accuracy: 10.28%
Minibatch loss at step 900: 2.302620
Minibatch accuracy: 10.34%
Validation accuracy: 10.28%
Minibatch loss at step 1000: 2.302437
Minibatch accuracy: 10.33%
Valid