In [1]:
import os
import sys
import tarfile
import tensorflow as tf
import numpy as np
from scipy import ndimage
from six.moves import cPickle as pickle
from six.moves.urllib.request import urlretrieve
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
top_data_folder = './data/notMNIST/'
dataset_small = 'notMNIST_small'
dataset_large = 'notMNIST_large'

if not os.path.exists(top_data_folder):
    os.makedirs(top_data_folder)

def path_to(f):
    return os.path.join(top_data_folder, f)

In [3]:
class DownloadProgress:
    def __init__(self):
        self.last_percent_reported = None

    def __call__(self, count, blockSize, totalSize):
        percent = int(count * blockSize * 100 / totalSize)

        if self.last_percent_reported != percent:
            if percent % 5 == 0:
                sys.stdout.write("%s%%" % percent)
                sys.stdout.flush()
            else:
                sys.stdout.write(".")
                sys.stdout.flush()
      
            self.last_percent_reported = percent

In [4]:
def download_and_extract(dataset_name):
    dataset_archive = dataset_name + '.tar.gz'
    filename = path_to(dataset_archive)
    if not os.path.exists(filename):
        url = 'http://commondatastorage.googleapis.com/books1000/' + dataset_archive
        urlretrieve(url, filename, reporthook=DownloadProgress())

    letters_folder = path_to(dataset_name)

    if not os.path.exists(letters_folder):
        with tarfile.open(filename) as tar:
            tar.extractall(path=top_data_folder)

In [5]:
download_and_extract(dataset_large)

0%....5%....10%....15%....20%....25%....30%....35%....40%....45%....50%....55%....60%....65%....70%....75%....80%....85%....90%....95%....100%

In [6]:
image_size = 28
image_pixels = image_size * image_size
class_count = 10

def load_folder(folder):
    image_filenames = [f for f in os.listdir(folder) if not os.path.isdir(os.path.join(folder, f))]
    image_count = len(image_filenames)
    
    dataset = np.ndarray(shape=(image_count, image_pixels), dtype=np.float32)
    
    image_counter = 0
    
    for f in image_filenames:
        image_filename = os.path.join(folder, f)
        try:
            image_data = np.ndarray.flatten(ndimage.imread(image_filename).astype(float))
            dataset[image_counter, :] = image_data
            image_counter += 1
        except IOError as e:
            # There are plenty of images, I skip this one
            pass
        
    dataset = dataset[0:image_counter, :]
    
    print('Full dataset tensor for %s:' % folder, dataset.shape)
    
    return dataset

In [7]:
def get_complete_dataset(dataset_name):
    image_depth = 256
    
    pickle_file = path_to(dataset_name + '.pickle')
    
    if not os.path.exists(pickle_file):
        image_pixels = []
        image_labels = []
        image_labels_one_hot = []
        
        for i, letter in enumerate("ABCDEFGHIJ"):
            letter_dataset = (load_folder(path_to(os.path.join(dataset_name, letter))) * 2 - image_depth) / image_depth
            
            letter_image_count = len(letter_dataset)
            
            letter_labels = np.ones(shape=(letter_image_count,), dtype=np.float32) * i
            letter_labels_one_hot = np.ndarray(shape=(letter_image_count, class_count), dtype=np.float32)
            
            label_one_hot = np.zeros(shape = (class_count,), dtype=np.float32)
            label_one_hot[i] = 1.0
            letter_labels_one_hot[:] = label_one_hot
            
            image_pixels.append(letter_dataset)
            image_labels.append(letter_labels)
            image_labels_one_hot.append(letter_labels_one_hot)
            
        dataset = { 'pixels': np.concatenate(image_pixels), 
                    'labels': np.concatenate(image_labels),
                    'labels_one_hot': np.concatenate(image_labels_one_hot) }
            
        with open(pickle_file, 'wb') as f:
            pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
            
        return dataset
    else:
        with open(pickle_file, 'rb') as f:
            return pickle.load(f)

In [8]:
def split(dataset_name, test_size=0.2, validate_size=0.25, random_state=42):
    dataset = get_complete_dataset(dataset_name)
    answer = {}
    
    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_validate_indices, test_indices = list(sss1.split(dataset['pixels'], dataset['labels']))[0]

    answer['X_test'] = dataset['pixels'][test_indices]
    answer['y_test'] = dataset['labels_one_hot'][test_indices]

    train_validate_pixels = dataset['pixels'][train_validate_indices]
    train_validate_labels = dataset['labels'][train_validate_indices]
    train_validate_labels_one_hot = dataset['labels_one_hot'][train_validate_indices]
    
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=validate_size, random_state=random_state * random_state)
    train_indices, validate_indices = list(sss2.split(train_validate_pixels, train_validate_labels_one_hot))[0]
    
    answer['X_train'] = train_validate_pixels[train_indices]
    answer['y_train'] = train_validate_labels_one_hot[train_indices]
    
    answer['X_validate'] = train_validate_pixels[validate_indices]
    answer['y_validate'] = train_validate_labels_one_hot[validate_indices]
    
    return answer    

In [9]:
split_dataset = split(dataset_large)

('Full dataset tensor for ./data/notMNIST/notMNIST_large/A:', (52909, 784))
('Full dataset tensor for ./data/notMNIST/notMNIST_large/B:', (52911, 784))
('Full dataset tensor for ./data/notMNIST/notMNIST_large/C:', (52912, 784))
('Full dataset tensor for ./data/notMNIST/notMNIST_large/D:', (52911, 784))
('Full dataset tensor for ./data/notMNIST/notMNIST_large/E:', (52912, 784))
('Full dataset tensor for ./data/notMNIST/notMNIST_large/F:', (52912, 784))
('Full dataset tensor for ./data/notMNIST/notMNIST_large/G:', (52912, 784))
('Full dataset tensor for ./data/notMNIST/notMNIST_large/H:', (52912, 784))
('Full dataset tensor for ./data/notMNIST/notMNIST_large/I:', (52912, 784))
('Full dataset tensor for ./data/notMNIST/notMNIST_large/J:', (52911, 784))


In [10]:
for k in split_dataset.keys():
    print(k, split_dataset[k].shape)

('X_test', (105823, 784))
('X_train', (317468, 784))
('X_validate', (105823, 784))
('y_validate', (105823, 10))
('y_train', (317468, 10))
('y_test', (105823, 10))


In [11]:
learning_rate = 0.10
training_epochs = 20
batch_size = 1000
display_step = 1

In [12]:
x = tf.placeholder(tf.float32, [None, image_pixels]) 
y = tf.placeholder(tf.float32, [None, class_count])

In [13]:
W = tf.Variable(tf.zeros([image_pixels, class_count]))
b = tf.Variable(tf.zeros([class_count]))

In [14]:
pred = tf.nn.softmax(tf.matmul(x, W) + b)

In [15]:
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred), reduction_indices=1))

In [16]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

In [17]:
init = tf.initialize_all_variables()

In [18]:
with tf.Session() as sess:
    sess.run(init)

    X_train = split_dataset['X_train']
    y_train = split_dataset['y_train']
    
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(len(X_train)/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_indices = range(i * batch_size, min(len(X_train), (i+1) * batch_size))
            batch_xs = X_train[batch_indices] 
            batch_ys = y_train[batch_indices]
            # Fit training using batch data
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs, y: batch_ys})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print "Epoch:", '%03d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost)

    print "Optimization Finished!"

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy for 3000 examples
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print "Accuracy:", accuracy.eval({x: split_dataset['X_test'], y: split_dataset['y_test']})

Epoch: 001 cost= 0.692705597
Epoch: 002 cost= 0.644977921
Epoch: 003 cost= 0.636006793
Epoch: 004 cost= 0.630900176
Epoch: 005 cost= 0.627562904
Epoch: 006 cost= 0.625197656
Epoch: 007 cost= 0.623418302
Epoch: 008 cost= 0.622017868
Epoch: 009 cost= 0.620877333
Epoch: 010 cost= 0.619923928
Epoch: 011 cost= 0.619110656
Epoch: 012 cost= 0.618405733
Epoch: 013 cost= 0.617786754
Epoch: 014 cost= 0.617237339
Epoch: 015 cost= 0.616745241
Epoch: 016 cost= 0.616301037
Epoch: 017 cost= 0.615897387
Epoch: 018 cost= 0.615528396
Epoch: 019 cost= 0.615189377
Epoch: 020 cost= 0.614876428
Optimization Finished!
Accuracy: 0.83291
