# Load MNIST dataset
Use the `load_mnist()` function to load the dataset.<br>
It returns a tuple `(training_data, validation_data, test_data)`.

**training_data** is a list of 50,000 2-tuples `(x, y)`.<br>
`x` is a 784-dimensional numpy.ndarray containing the input image.<br>
`y` is a 10-dimensional numpy.ndarray representing the unit vector corresponding to the correct digit for ``x``.

**validation_data** and **test_data** are lists containing 10,000 2-tuples `(x, y)`. <br>
`x` is a 784-dimensional numpy.ndarray containing the input image.<br>
`y` is the corresponding classification, i.e., the digit values (integers) corresponding to ``x``.

In [1]:
import os
import urllib2
import cPickle
import gzip
import numpy as np

In [2]:
# Directory (relative to the current working directory) where the dataset is cached.
DATA_PATH = '../.tmp'
# Full path of the cached gzipped-pickle MNIST archive inside DATA_PATH.
DATA_FILE = DATA_PATH + '/mnist.pkl.gz'

def dl_mnist():
    '''
    Download the MNIST dataset archive to DATA_FILE.

    Creates DATA_PATH when it does not exist yet, then streams the
    gzipped pickle from GitHub into DATA_FILE in 4096-byte chunks and
    prints a confirmation line once the transfer has finished.

    If the transfer is interrupted, the incomplete DATA_FILE is removed
    before the error propagates, so a later existence check on DATA_FILE
    does not mistake a truncated archive for a valid one.

    Raises whatever urllib2.urlopen / file I/O raise (e.g. urllib2.URLError,
    IOError, OSError).
    '''
    # create DATA dir if not exist
    if not os.path.exists(DATA_PATH):
        os.makedirs(DATA_PATH)

    # download data to DATA dir
    url = 'https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz'
    # Open the connection first: if this fails, any existing DATA_FILE is untouched.
    mnist = urllib2.urlopen(url)
    complete = False
    try:
        with open(DATA_FILE, 'wb') as output:
            while True:
                data = mnist.read(4096)
                if not data:
                    break
                output.write(data)
        complete = True
    finally:
        # Always release the HTTP connection, even on failure.
        mnist.close()
        # The output file is already closed here, so it is safe to delete
        # a partial download (also on platforms that lock open files).
        if not complete and os.path.exists(DATA_FILE):
            os.remove(DATA_FILE)

    print('Downloaded: ' + DATA_FILE)
    
def load_mnist():
    '''
    Load MNIST dataset into:
    (1) Training data: 50,000 images
    (2) Validation data: 10,000 images
    (3) Test data: 10,000 images

    The archive is downloaded with dl_mnist() first when DATA_FILE does
    not exist.

    Returns a tuple (training_data, validation_data, test_data).
    Each element is a list of (x, y) pairs, where:
      * x is the image data: np.ndarray reshaped to (784, 1), i.e. 28*28 pixels
      * y is the label: a (10, 1) one-hot np.ndarray for training data,
        and the plain label value as stored in the archive for the
        validation and test data
    '''
    # download Dataset if not exist
    if not os.path.exists(DATA_FILE):
        dl_mnist()

    # NOTE(security): unpickling can execute arbitrary code. DATA_FILE is
    # trusted here because it comes from the fixed URL in dl_mnist(); do not
    # point DATA_FILE at data from an untrusted source.
    # The with-block closes the gzip handle even if unpickling fails.
    with gzip.open(DATA_FILE, 'rb') as f:
        training_data, validation_data, test_data = cPickle.load(f)

    # encode result
    # training data: column-vector images paired with one-hot labels
    inputs = [np.reshape(x, (784, 1)) for x in training_data[0]]
    labels = [vectorized_label(y) for y in training_data[1]]
    # list(...) keeps the result a real list even where zip() is lazy
    training_data = list(zip(inputs, labels))
    # validation data: column-vector images paired with raw labels
    inputs = [np.reshape(x, (784, 1)) for x in validation_data[0]]
    validation_data = list(zip(inputs, validation_data[1]))
    # test data: column-vector images paired with raw labels
    inputs = [np.reshape(x, (784, 1)) for x in test_data[0]]
    test_data = list(zip(inputs, test_data[1]))

    return (training_data, validation_data, test_data)

def vectorized_label(label):
    '''
    Build the one-hot column vector for a label.

    Returns a (10, 1) float np.ndarray that is 0.0 everywhere except
    at index `label`, which is set to 1.0.
    '''
    shape = (10, 1)
    one_hot = np.zeros(shape)
    # mark the position of the given label
    one_hot[label] = 1.0
    return one_hot