In [1]:
import os
import sys
import tarfile
import tensorflow as tf
import numpy as np
from scipy import ndimage
from six.moves import cPickle as pickle
from six.moves.urllib.request import urlretrieve
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Names shared by the notMNIST archives and the folders they extract to.
dataset_small, dataset_large = 'notMNIST_small', 'notMNIST_large'

# Root folder for everything this notebook downloads, extracts or caches.
top_data_folder = './data/notMNIST/'

# First run only: create the data folder (including missing parents).
if not os.path.exists(top_data_folder):
    os.makedirs(top_data_folder)

def path_to(f):
    """Return the path of ``f`` joined onto the top-level data folder."""
    full_path = os.path.join(top_data_folder, f)
    return full_path

In [3]:
class DownloadProgress:
    """Report hook for ``urlretrieve`` that prints coarse download progress.

    Each time the integer percentage changes, either the percentage itself
    (when it is a multiple of 5) or a single dot is written to stdout.
    """

    def __init__(self):
        # Last percentage written; repeated callbacks at the same value stay silent.
        self.last_percent_reported = None

    def __call__(self, count, blockSize, totalSize):
        """Handle one progress callback.

        Args:
            count: number of blocks transferred so far.
            blockSize: size of one block in bytes.
            totalSize: total size in bytes; ``urlretrieve`` may pass a
                non-positive value when the server does not report a size.
        """
        if totalSize <= 0:
            # Unknown total size: a percentage cannot be computed (and would
            # divide by zero), so report nothing.
            return

        # The last block usually overshoots the file size; clamp to 100%.
        percent = min(100, int(count * blockSize * 100 / totalSize))

        if self.last_percent_reported != percent:
            sys.stdout.write("%s%%" % percent if percent % 5 == 0 else ".")
            sys.stdout.flush()
            self.last_percent_reported = percent

In [4]:
def _safe_tar_members(tar, destination):
    """Yield the members of ``tar``, refusing any whose path escapes ``destination``."""
    root = os.path.abspath(destination)
    for member in tar:
        target = os.path.abspath(os.path.join(root, member.name))
        if target != root and not target.startswith(root + os.sep):
            raise ValueError('Refusing to extract %r outside of %r' % (member.name, destination))
        yield member


def download_and_extract(dataset_name):
    """Download ``<dataset_name>.tar.gz`` if missing and extract it if needed.

    The archive is stored in the top-level data folder and extracted there.
    Both steps are skipped when their result (the archive file, respectively
    the ``dataset_name`` folder) already exists.

    Raises:
        ValueError: if the archive contains a member whose path would land
            outside the data folder.
    """
    dataset_archive = dataset_name + '.tar.gz'
    filename = path_to(dataset_archive)
    if not os.path.exists(filename):
        url = 'http://commondatastorage.googleapis.com/books1000/' + dataset_archive
        # Download under a temporary name so that an interrupted transfer is
        # never mistaken for a complete archive on the next run.
        partial_filename = filename + '.part'
        try:
            urlretrieve(url, partial_filename, reporthook=DownloadProgress())
            os.rename(partial_filename, filename)
        finally:
            # Only still present when the download or the rename failed.
            if os.path.exists(partial_filename):
                os.remove(partial_filename)

    letters_folder = path_to(dataset_name)

    if not os.path.exists(letters_folder):
        with tarfile.open(filename) as tar:
            # The archive comes from the network: validate member paths while
            # extracting (single pass). NOTE: this checks names only; link
            # members inside the archive are not inspected.
            tar.extractall(path=top_data_folder,
                           members=_safe_tar_members(tar, top_data_folder))

In [5]:
# Fetch and unpack the small notMNIST archive (skipped when already on disk).
download_and_extract(dataset_small)

In [6]:
# Fetch and unpack the large notMNIST archive (skipped when already on disk).
download_and_extract(dataset_large)

In [7]:
image_size = 28                          # side length, in pixels, of the square images
image_pixels = image_size * image_size   # length of one flattened image row
class_count = 10                         # number of label classes

def load_folder(folder, read_image=None):
    """Load every image file directly inside ``folder`` into one 2-D array.

    Args:
        folder: directory whose regular files are read as images;
            sub-directories are ignored.
        read_image: optional callable taking a file path and returning the
            image as an array. Defaults to ``ndimage.imread``; pass another
            reader when running against a SciPy release that no longer
            provides that function.

    Returns:
        A float32 array of shape ``(n_loaded, image_pixels)`` with one
        flattened image per row, in sorted filename order.

    Files that cannot be read, or whose flattened size is not
    ``image_pixels``, are reported on stdout and skipped.
    """
    if read_image is None:
        # Looked up lazily so that a custom reader works even when
        # ndimage.imread is unavailable.
        read_image = ndimage.imread

    # os.listdir order is arbitrary; sort so the row order is reproducible.
    image_filenames = sorted(
        f for f in os.listdir(folder) if not os.path.isdir(os.path.join(folder, f)))

    # Allocated for the best case (every file loads) and trimmed afterwards.
    dataset = np.ndarray(shape=(len(image_filenames), image_pixels), dtype=np.float32)

    image_counter = 0

    for f in image_filenames:
        image_filename = os.path.join(folder, f)
        try:
            image_data = np.asarray(read_image(image_filename)).astype(float).ravel()
            # Raises ValueError when the image does not have image_pixels values.
            dataset[image_counter, :] = image_data
        except (IOError, ValueError) as e:
            # There are plenty of images: skip this one, but say so.
            print('Skipping %s: %s' % (image_filename, e))
            continue
        image_counter += 1

    # Drop the unused rows left behind by skipped files.
    dataset = dataset[0:image_counter, :]

    print('Full dataset tensor for %s:' % folder, dataset.shape)

    return dataset

In [8]:
def get_complete_dataset(dataset_name):
    """Return the full labelled dataset for ``dataset_name``, cached on disk.

    The result is a dict with three arrays, concatenated over the letter
    folders A..J in that order:
      'pixels'          rescaled flattened images,
      'labels'          the class index of each image (float32),
      'labels_one_hot'  the same labels as one-hot rows of width class_count.

    The first call builds the dict from the image folders and pickles it
    next to the data; later calls just load that pickle.
    """
    pickle_file = path_to(dataset_name + '.pickle')

    # Fast path: a previous run already built and cached this dataset.
    # NOTE: pickle.load is only appropriate because this file is written
    # by this function itself; never point it at an untrusted file.
    if os.path.exists(pickle_file):
        with open(pickle_file, 'rb') as cache:
            return pickle.load(cache)

    image_depth = 256

    pixel_chunks, label_chunks, one_hot_chunks = [], [], []

    for class_index, letter in enumerate("ABCDEFGHIJ"):
        raw_pixels = load_folder(path_to(os.path.join(dataset_name, letter)))
        # Rescale: a raw value v becomes (2v - 256) / 256, i.e. roughly
        # [-1, 1) assuming 8-bit pixel values.
        scaled_pixels = (raw_pixels * 2 - image_depth) / image_depth

        sample_count = len(scaled_pixels)

        # Every image in this folder carries the same class index.
        class_labels = np.ones(shape=(sample_count,), dtype=np.float32) * class_index

        # One-hot rows: all zeros except a 1 in this class's column.
        one_hot_rows = np.zeros(shape=(sample_count, class_count), dtype=np.float32)
        one_hot_rows[:, class_index] = 1.0

        pixel_chunks.append(scaled_pixels)
        label_chunks.append(class_labels)
        one_hot_chunks.append(one_hot_rows)

    dataset = {
        'pixels': np.concatenate(pixel_chunks),
        'labels': np.concatenate(label_chunks),
        'labels_one_hot': np.concatenate(one_hot_chunks),
    }

    with open(pickle_file, 'wb') as cache:
        pickle.dump(dataset, cache, pickle.HIGHEST_PROTOCOL)

    return dataset

In [9]:
def split(dataset_name, test_size=0.2, validate_size=0.25, random_state=42):
    """Split a dataset into stratified train / validate / test subsets.

    Args:
        dataset_name: dataset to load through ``get_complete_dataset``.
        test_size: test share, passed as ``test_size`` to the first
            ``StratifiedShuffleSplit`` over the whole dataset.
        validate_size: validation share, passed as ``test_size`` to the
            second split, which runs on the samples left after the test
            set is removed.
        random_state: seed for the first split; an integer seed is squared
            (modulo 2**32) to seed the second split. ``None`` or a
            ``RandomState`` instance is passed to both splits unchanged.

    Returns:
        dict with keys 'X_train', 'y_train', 'X_validate', 'y_validate',
        'X_test', 'y_test'; the ``y_*`` arrays hold one-hot labels.
    """
    dataset = get_complete_dataset(dataset_name)
    answer = {}

    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_validate_indices, test_indices = next(sss1.split(dataset['pixels'], dataset['labels']))

    answer['X_test'] = dataset['pixels'][test_indices]
    answer['y_test'] = dataset['labels_one_hot'][test_indices]

    train_validate_pixels = dataset['pixels'][train_validate_indices]
    train_validate_labels = dataset['labels'][train_validate_indices]
    train_validate_labels_one_hot = dataset['labels_one_hot'][train_validate_indices]

    if isinstance(random_state, (int, np.integer)):
        # Derive a different seed for the second split. int() avoids
        # fixed-width overflow for NumPy integers, and the modulo keeps the
        # result inside the 32-bit range NumPy accepts as a seed.
        seed = int(random_state)
        second_random_state = (seed * seed) % (2 ** 32)
    else:
        # None or a RandomState instance cannot be squared; pass it through.
        second_random_state = random_state

    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=validate_size, random_state=second_random_state)
    # Stratify on the 1-D class labels, like the first split, rather than on
    # the 2-D one-hot matrix.
    train_indices, validate_indices = next(sss2.split(train_validate_pixels, train_validate_labels))

    answer['X_train'] = train_validate_pixels[train_indices]
    answer['y_train'] = train_validate_labels_one_hot[train_indices]

    answer['X_validate'] = train_validate_pixels[validate_indices]
    answer['y_validate'] = train_validate_labels_one_hot[validate_indices]

    return answer

In [10]:
# Alternative (presumably quicker) run on the small dataset: split_dataset = split(dataset_small)
split_dataset = split(dataset_large)

In [11]:
# Show the shape of every array produced by the split.
for k, v in split_dataset.items():
    print(k, v.shape)

('X_test', (105823, 784))
('X_train', (317468, 784))
('X_validate', (105823, 784))
('y_validate', (105823, 10))
('y_train', (317468, 10))
('y_test', (105823, 10))


In [12]:
batch_size = 128            # samples per training step
hidden_node_count = 1024    # width of the single hidden layer
learning_rate = 0.01        # step size for gradient descent

graph = tf.Graph()

with graph.as_default():
    # Training batches are fed at run time; the validation and test sets are
    # embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_pixels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, class_count))
    tf_validate_dataset = tf.constant(split_dataset['X_validate'])
    tf_test_dataset = tf.constant(split_dataset['X_test'])

    # Parameters: input -> hidden (weights_0, biases_0), hidden -> output (weights_1, biases_1).
    weights_0 = tf.Variable(tf.truncated_normal([image_pixels, hidden_node_count]))
    biases_0 = tf.Variable(tf.zeros([hidden_node_count]))
    weights_1 = tf.Variable(tf.truncated_normal([hidden_node_count, class_count]))
    biases_1 = tf.Variable(tf.zeros([class_count]))

    def _forward_pass(inputs):
        """Apply the shared ReLU hidden layer and output layer; return (hidden, logits)."""
        hidden = tf.nn.relu(tf.matmul(inputs, weights_0) + biases_0)
        return hidden, tf.matmul(hidden, weights_1) + biases_1

    hidden_train, logits_train = _forward_pass(tf_train_dataset)
    # Keyword arguments: newer TensorFlow releases reject the positional
    # (logits, labels) form, while older ones accept these names as well.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits_train, labels=tf_train_labels))

    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

    prediction_train = tf.nn.softmax(logits_train)

    # The same weights applied to the validation and test sets.
    hidden_validate, logits_validate = _forward_pass(tf_validate_dataset)
    prediction_validate = tf.nn.softmax(logits_validate)

    hidden_test, logits_test = _forward_pass(tf_test_dataset)
    prediction_test = tf.nn.softmax(logits_test)

In [13]:
# Number of minibatch training steps. Progress is printed when step % 1000 == 0,
# so 15001 steps make the final step (15000) print as well.
num_steps = 15001

def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max column agrees.

    Both arguments are 2-D arrays with one row per sample; a row counts as
    correct when ``predictions`` and ``labels`` have their largest value in
    the same column.
    """
    predicted_classes = np.argmax(predictions, 1)
    true_classes = np.argmax(labels, 1)
    hits = np.sum(predicted_classes == true_classes)
    return 100.0 * hits / predictions.shape[0]

with tf.Session(graph=graph) as session:
    # tf.initialize_all_variables is deprecated in favour of
    # tf.global_variables_initializer; use the newer name when this
    # TensorFlow version has it and fall back to the old one otherwise.
    init_op_factory = getattr(tf, 'global_variables_initializer', None) or tf.initialize_all_variables
    session.run(init_op_factory())
    print("Initialized")

    # Loop-invariant: number of training samples.
    train_size = split_dataset['y_train'].shape[0]

    for step in range(num_steps):
        # Walk through the training set in consecutive batches, wrapping
        # around so that a full batch always fits.
        offset = (step * batch_size) % (train_size - batch_size)

        batch_data = split_dataset['X_train'][offset:(offset + batch_size), :]
        batch_labels = split_dataset['y_train'][offset:(offset + batch_size), :]

        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        # One optimisation step; also fetch the batch loss and predictions.
        _, l, predictions = session.run([optimizer, loss, prediction_train], feed_dict=feed_dict)

        if (step % 1000 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(prediction_validate.eval(), split_dataset['y_validate']))
    # Final evaluation on the held-out test set, once training has finished.
    print("Test accuracy: %.1f%%" % accuracy(prediction_test.eval(), split_dataset['y_test']))

Initialized
Minibatch loss at step 0: 783.856262
Minibatch accuracy: 11.7%
Validation accuracy: 14.4%
Minibatch loss at step 1000: 69.177742
Minibatch accuracy: 76.6%
Validation accuracy: 79.3%
Minibatch loss at step 2000: 53.349903
Minibatch accuracy: 74.2%
Validation accuracy: 80.8%
Minibatch loss at step 3000: 31.755339
Minibatch accuracy: 82.8%
Validation accuracy: 81.3%
Minibatch loss at step 4000: 23.094814
Minibatch accuracy: 90.6%
Validation accuracy: 82.4%
Minibatch loss at step 5000: 20.160719
Minibatch accuracy: 82.8%
Validation accuracy: 83.3%
Minibatch loss at step 6000: 15.451889
Minibatch accuracy: 84.4%
Validation accuracy: 83.1%
Minibatch loss at step 7000: 16.480713
Minibatch accuracy: 84.4%
Validation accuracy: 83.8%
Minibatch loss at step 8000: 12.578831
Minibatch accuracy: 85.2%
Validation accuracy: 84.2%
Minibatch loss at step 9000: 12.746622
Minibatch accuracy: 89.8%
Validation accuracy: 84.7%
Minibatch loss at step 10000: 11.533239
Minibatch accuracy: 84.4%
Vali