# Theano basics and logistic regression on MNIST

Sourced from: 
- Fundamentals of Deep Learning<br>
by Nikhil Buduma<br>
Published by O'Reilly Media, Inc., 2016

In [1]:
import numpy as np

In [3]:
import theano.tensor as T
import theano as th
from theano.printing import pydotprint

# Two symbolic double-precision vectors act as the inputs of the graph
a, b = T.dvector('a'), T.dvector('b')

# Element-wise sum of cubes; nothing is computed yet, this is still symbolic
c = a ** 3 + b ** 3

# Compile the graph into a callable and evaluate it on concrete values
f = th.function(
    inputs=[a, b],
    outputs=c,
)
f([1, 2, 3], [2, 3, 4])

array([  9.,  35.,  91.])

In [6]:
import theano.tensor as T 
from theano import function
from theano import shared

# Shared counter; the compiled function's update rule adds one to it per call
state = shared(0)

# Symbolic placeholders for the query and for the weights it is scored with
query = T.dvector('query')
W = T.dvector('W')  # model parameter vector

# The output is the truth value of "the weighted sum of the query is positive"
score = T.dot(W, query)
result = score > 0

# W is not listed as an input: `givens` substitutes a fixed weight vector for it
fixed_weights = np.array([1, -2, 3, -0.5, 1])
sentiment = function(
    inputs=[query],
    outputs=result,
    givens={W: fixed_weights},
    updates=[(state, state + 1)],
)

In [8]:
state.get_value()

array(2)

In [9]:
sentiment([1, 6, 0, 9, 0])

array(0, dtype=int8)

In [10]:
state.get_value()

array(3)

In [11]:
sentiment([1, -6, 0, -9, 0])

array(1, dtype=int8)

In [12]:
state.get_value()

array(4)

# Logistic regression (FDL)

In [13]:
def __init__(self, input, input_dim, output_dim):
    """
    Create the parameters of the model and the symbolic expressions for its
    class probabilities and its predictions.

    PARAM input : theano.tensor.TensorType
    A symbolic variable that we'll use to represent one minibatch of our
    dataset

    PARAM input_dim : int
    This will represent the number of input neurons in our model

    PARAM output_dim : int
    This will represent the number of neurons in the output layer (i.e.
    the number of possible classifications for the input)
    """

    # We initialize the weight matrix W of size (input_dim, output_dim)
    # with zeros
    self.W = theano.shared(
            value=np.zeros((input_dim, output_dim)),
            name='W',
            borrow=True
        )

    # We initialize a zero bias vector for the neurons of the output layer.
    # borrow is a boolean flag, exactly as for W above (not the string 'True')
    self.b = theano.shared(
            value=np.zeros(output_dim),
            name='b',
            borrow=True
        )

    # Symbolic description of how to compute class membership probabilities:
    # a softmax over the affine transformation of the input
    self.output = T.nnet.softmax(T.dot(input, self.W) + self.b)

    # Symbolic description of the final prediction: for every row of the
    # output, the index of the most probable class
    self.predicted = T.argmax(self.output, axis=1)

In [14]:
def logistic_network_cost(self, y, lambda_l2=0):
    """
    Build the symbolic training cost: the mean negative log likelihood of
    the correct labels plus an L2 penalty on the weights

    PARAM y : theano.tensor.TensorType
    These are the correct answers for the minibatch. y is used to pick one
    column of the softmax output per row, so it holds one integer class
    label per example

    PARAM lambda_l2 : float
    This is the L2 regularization parameter that we use to penalize large
    values for components of W, thus discouraging potential overfitting
    """
    # One row index per example in the minibatch
    example_rows = T.arange(y.shape[0])

    # Log probability that the model assigns to the correct class of each example
    correct_class_log_probs = T.log(self.output)[example_rows, y]

    # Negating the average gives the negative log likelihood
    nll = -T.mean(correct_class_log_probs)

    # L2 penalty: the scaled sum of the squared weights
    weight_penalty = lambda_l2 * (self.W ** 2).sum()

    # Symbolic description of the complete cost function
    return nll + weight_penalty

In [15]:
def error_rate(self, y):
    """
    Build the symbolic error rate of the model over a set of given labels
    (perhaps in a minibatch, in the validation set, or the test set), i.e.
    the fraction of predictions that differ from the label

    PARAM y : theano.tensor.TensorType
    These are the correct answers. y must have as many dimensions as
    self.predicted and an integer dtype; both are checked below
    """

    # The labels and the predictions must have the same number of dimensions
    assert self.predicted.ndim == y.ndim

    # The labels must be stored with an integer data type
    assert y.dtype.startswith('int')

    # 1 where the prediction differs from the label, 0 where they agree
    misclassified = T.neq(self.predicted, y)

    # The mean of that indicator is the error rate on the data
    return T.mean(misclassified)

In [16]:
"""
We will use this class to represent a simple logistic regression
classifier. We'll represent this in Theano as a neural network 
with no hidden layers. This is our first attempt at building a 
neural network model to solve interesting problems. Here, we'll
use this class to crack the MNIST handwritten digit dataset problem,
but this class has been constructed so that it can be reappropriated
to any use!

References:
    - textbooks: "Pattern Recognition and Machine Learning", Christopher M. Bishop, section 4.3.2
    - websites: http://deeplearning.net/tutorial, Lisa Lab
"""

import numpy as np
import theano.tensor as T 
import theano

class LogisticNetwork(object):
    """
    The logistic regression class is described by two parameters (which
    we will want to learn). The first is a weight matrix. We'll refer to
    this weight matrix as W. The second is a bias vector b. Both are kept
    in Theano shared variables, and the class exposes symbolic expressions
    for the class probabilities, the prediction, the cost and the error rate.
    """

    def __init__(self, input, input_dim, output_dim):
        """
        We first initialize the logistic network object with some important
        information.

        PARAM input : theano.tensor.TensorType
        A symbolic variable that we'll use to represent one minibatch of our
        dataset

        PARAM input_dim : int
        This will represent the number of input neurons in our model

        PARAM output_dim : int
        This will represent the number of neurons in the output layer (i.e.
        the number of possible classifications for the input)
        """

        # We initialize the weight matrix W of size (input_dim, output_dim)
        # with zeros
        self.W = theano.shared(
                value=np.zeros((input_dim, output_dim)),
                name='W',
                borrow=True
            )

        # We initialize a zero bias vector for the neurons of the output layer.
        # borrow is a boolean flag, exactly as for W above (not the string 'True')
        self.b = theano.shared(
                value=np.zeros(output_dim),
                name='b',
                borrow=True
            )

        # Symbolic description of how to compute class membership probabilities:
        # a softmax over the affine transformation of the input
        self.output = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # Symbolic description of the final prediction: for every row of the
        # output, the index of the most probable class
        self.predicted = T.argmax(self.output, axis=1)

    def logistic_network_cost(self, y, lambda_l2=0):
        """
        Here we express the cost incurred over a minibatch given the correct
        labels: the mean negative log likelihood plus an L2 penalty on W

        PARAM y : theano.tensor.TensorType
        These are the correct answers, and we compute the cost with
        respect to this ground truth (over the entire minibatch). y is used
        to pick one column of the softmax output per row, so it holds one
        integer class label per example

        PARAM lambda_l2 : float
        This is the L2 regularization parameter that we use to penalize large
        values for components of W, thus discouraging potential overfitting
        """
        # Calculate the log probabilities of the softmax output
        log_probabilities = T.log(self.output)

        # For every example keep only the log probability of its correct
        # class, then average and negate to get the negative log likelihood
        negative_log_likelihood = -T.mean(log_probabilities[T.arange(y.shape[0]), y])

        # Compute the L2 regularization component of the cost function
        l2_regularization = lambda_l2 * (self.W ** 2).sum()

        # Return a symbolic description of the cost function
        return negative_log_likelihood + l2_regularization

    def error_rate(self, y):
        """
        Here we return the error rate of the model over a set of given labels
        (perhaps in a minibatch, in the validation set, or the test set)

        PARAM y : theano.tensor.TensorType
        These are the correct answers. y must have as many dimensions as
        self.predicted and an integer dtype; both are checked below
        """

        # Make sure y is of the correct dimension
        assert y.ndim == self.predicted.ndim

        # Make sure that y contains values of the correct data type (ints)
        assert y.dtype.startswith('int')

        # Return the error rate on the data: the fraction of predictions
        # that differ from the label
        return T.mean(T.neq(self.predicted, y))

In [17]:
def shared_dataset(data_xy):
    """
    Wrap an (inputs, labels) pair in Theano shared variables and return them
    as (shared_x, shared_y).

    We store the data in a shared variable because it allows Theano to copy it
    into GPU memory (if GPU utilization is enabled). By default, if a variable is
    not shared, it is moved to GPU at every use. This results in a big performance
    hit because that means the data will be copied one minibatch at a time. Instead,
    if we use shared variables, we don't have to worry about copying data
    repeatedly.
    """

    features, labels = data_xy

    # Inputs are stored with Theano's configured float type
    features_array = np.asarray(features, dtype=config.floatX)
    shared_features = shared(features_array, borrow=True)

    # Labels are stored as 32-bit integers
    labels_array = np.asarray(labels, dtype='int32')
    shared_labels = shared(labels_array, borrow=True)

    return shared_features, shared_labels

# Wrap each split (training, validation, test) in a pair of shared variables
training_set_x, training_set_y = shared_dataset(data_xy=training_set)
validation_set_x, validation_set_y = shared_dataset(data_xy=validation_set)
test_set_x, test_set_y = shared_dataset(data_xy=test_set)

NameError: name 'training_set' is not defined

In [18]:
# Let's compute the number of minibatches for training, validation, and testing.
# Floor division keeps the counts integral on both Python 2 and Python 3, so
# they can be used as loop bounds; a trailing partial minibatch is dropped
n_training_batches = training_set_x.get_value(borrow=True).shape[0] // BATCH_SIZE
n_validation_batches = validation_set_x.get_value(borrow=True).shape[0] // BATCH_SIZE
n_test_batches = test_set_x.get_value(borrow=True).shape[0] // BATCH_SIZE

# Now it's time for us to build the model!
# Let's start off with an index to the minibatch we're using
index = T.lscalar()

# Generate symbolic variables for the input (a minibatch) and its labels
x = T.dmatrix('x')
y = T.ivector('y')

# Construct the logistic network model
# Keep in mind an MNIST image is of size (28, 28)
# Also the number of output classes is 10 (digits 0, 1, ..., 9)
model = logistic_network.LogisticNetwork(input=x, input_dim=28*28, output_dim=10)

# Obtain a symbolic expression for the objective function
# EXPERIMENT!!! Play around with the L2 regularization parameter!
objective = model.logistic_network_cost(y, lambda_l2=0.0001)

# Obtain a symbolic expression for the error incurred
error = model.error_rate(y)

# Compute symbolic gradients of objective with respect to model parameters
dW, db = T.grad(objective, model.W), T.grad(objective, model.b)

NameError: name 'training_set_x' is not defined

In [19]:
# Symbolic slice that selects minibatch number `index` out of a shared dataset
minibatch = slice(index * BATCH_SIZE, (index + 1) * BATCH_SIZE)

# One gradient descent step on each of the two model parameters
sgd_updates = [
    (model.W, model.W - LEARNING_RATE * dW),
    (model.b, model.b - LEARNING_RATE * db),
]

# Training: returns the objective on the chosen minibatch and applies the step
train_model = function(
    inputs=[index],
    outputs=objective,
    updates=sgd_updates,
    givens={
        x: training_set_x[minibatch],
        y: training_set_y[minibatch],
    },
)

# Validation: returns the error rate on the chosen minibatch, no updates
validate_model = function(
    inputs=[index],
    outputs=error,
    givens={
        x: validation_set_x[minibatch],
        y: validation_set_y[minibatch],
    },
)

# Testing: same as validation, but reading from the test set
test_model = function(
    inputs=[index],
    outputs=error,
    givens={
        x: test_set_x[minibatch],
        y: test_set_y[minibatch],
    },
)

NameError: name 'index' is not defined

In [21]:
# Let's set up the early stopping parameters (based on the validation set)
# Train for at least this many minibatch iterations no matter what
patience = 5000

# When a significantly better validation loss is found, patience is raised to
# at least this multiple of the current iteration
patience_increase = 2

# An improvement is significant when the new loss is below this fraction of
# the best loss so far
improvement_threshold = 0.995

# We go through this number of minibatches before we check on the validation
# set (floor division keeps it an integer on Python 2 and Python 3 alike)
validation_freq = min(n_training_batches, patience // 2)

# We keep track of the best loss on the validation set here
best_loss = np.inf

# We also keep track of the epoch we are in
epoch = 0

# A boolean flag that propagates when patience has been exceeded
exceeded_patience = False

# Now we're ready to start training the model
print("... TRAINING MODEL ...")
# time.clock() no longer exists on Python 3.8+; time.time() is available on
# every version and reports elapsed wall-clock seconds
start_time = time.time()
while (epoch < N_EPOCHS) and not exceeded_patience:
    epoch = epoch + 1
    # range works on both Python 2 and Python 3 (xrange is Python 2 only)
    for minibatch_index in range(n_training_batches):
        minibatch_objective = train_model(minibatch_index)

        # Number of minibatches processed before this one, over all epochs
        iteration = (epoch - 1) * n_training_batches + minibatch_index

        if (iteration + 1) % validation_freq == 0:
            # Compute loss on validation set
            validation_losses = [validate_model(i) for i in range(n_validation_batches)]
            validation_loss = np.mean(validation_losses)

            print('epoch %i, minibatch %i/%i, validation error: %f %%' % (
                    epoch,
                    minibatch_index + 1,
                    n_training_batches,
                    validation_loss * 100
                ))

            if validation_loss < best_loss:
                # Only a significant improvement buys more training time
                if validation_loss < best_loss * improvement_threshold:
                    patience = max(patience, iteration * patience_increase)
                best_loss = validation_loss

        # Stop as soon as we have run out of patience
        if patience <= iteration:
            exceeded_patience = True
            break
end_time = time.time()

# Let's compute how well we do on the test set
test_losses = [test_model(i) for i in range(n_test_batches)]
test_loss = np.mean(test_losses)

NameError: name 'n_training_batches' is not defined

In [2]:
"""
We'll now use the LogisticNetwork object we built in logistic_network.py in 
order to tackle the MNIST dataset challenge. We will use minibatch gradient
descent to train this simplistic network model. 

References:
    - textbooks: "Pattern Recognition and Machine Learning", Christopher M. Bishop, section 4.3.2
    - websites: http://deeplearning.net/tutorial, Lisa Lab
"""

# Docstring markup declaration read by documentation tools (PEP 258 form:
# "<format> <language>")
__docformat__ = 'restructuredtext en'

#import cPickle
import gzip
import os
import time
import urllib
from theano import function, shared, config
import theano.tensor as T 
import numpy as np
import logistic_network

In [None]:


# Let's start off by defining some constants
# EXPERIMENT!!! Play around with the learning rate!
LEARNING_RATE = 0.2
N_EPOCHS = 1000
DATASET = 'mnist.pkl.gz'
BATCH_SIZE = 600

# Time to check if we have the data and if we don't, let's download it
print("... LOADING DATA ...")

# The archive is looked up at ../data/<DATASET>, relative to this file's directory
# NOTE(review): __file__ is not defined in an interactive session or notebook -
# confirm this cell is meant to run as a script
data_path = os.path.join(
        os.path.split(__file__)[0],
        "..",
        "data",
        DATASET
    )

if (not os.path.isfile(data_path)):
    # urlretrieve lives in urllib.request on Python 3 and directly in urllib
    # on Python 2; resolve it here so the download works on either one
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve
    origin = (
            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        )
    print('Downloading data from %s' % origin)
    urlretrieve(origin, data_path)


In [None]:

# Time to build our models
print("... BUILDING MODEL ...")

# cPickle only exists on Python 2; on Python 3 the pickle module replaces it.
# On Python 3 we also ask for latin1 decoding of byte strings, which is needed
# to read a pickle written by Python 2 and is harmless otherwise.
# NOTE(review): mnist.pkl.gz is presumably a Python 2 pickle - confirm
try:
    import cPickle as pickle
    pickle_load_kwargs = {}
except ImportError:
    import pickle
    pickle_load_kwargs = {'encoding': 'latin1'}

# Load the dataset; the with-block closes the file even if loading fails
# NOTE: unpickling can execute arbitrary code, so only load an archive that
# comes from a source you trust
with gzip.open(data_path, 'rb') as data_file:
    training_set, validation_set, test_set = pickle.load(data_file, **pickle_load_kwargs)

# Define a quick function to establish a shared dataset (for efficiency)

def shared_dataset(data_xy):
    """
    Wrap an (inputs, labels) pair in Theano shared variables and return them
    as (shared_x, shared_y).

    We store the data in a shared variable because it allows Theano to copy it
    into GPU memory (if GPU utilization is enabled). By default, if a variable is
    not shared, it is moved to GPU at every use. This results in a big performance
    hit because that means the data will be copied one minibatch at a time. Instead,
    if we use shared variables, we don't have to worry about copying data
    repeatedly.
    """

    features, labels = data_xy

    # Inputs are stored with Theano's configured float type
    features_array = np.asarray(features, dtype=config.floatX)
    shared_features = shared(features_array, borrow=True)

    # Labels are stored as 32-bit integers
    labels_array = np.asarray(labels, dtype='int32')
    shared_labels = shared(labels_array, borrow=True)

    return shared_features, shared_labels

# Wrap each split (training, validation, test) in a pair of shared variables
training_set_x, training_set_y = shared_dataset(data_xy=training_set)
validation_set_x, validation_set_y = shared_dataset(data_xy=validation_set)
test_set_x, test_set_y = shared_dataset(data_xy=test_set)

# Let's compute the number of minibatches for training, validation, and testing.
# Floor division keeps the counts integral on both Python 2 and Python 3, so
# they can be used as loop bounds; a trailing partial minibatch is dropped
n_training_batches = training_set_x.get_value(borrow=True).shape[0] // BATCH_SIZE
n_validation_batches = validation_set_x.get_value(borrow=True).shape[0] // BATCH_SIZE
n_test_batches = test_set_x.get_value(borrow=True).shape[0] // BATCH_SIZE

# Now it's time for us to build the model!
# Let's start off with an index to the minibatch we're using
index = T.lscalar()

# Generate symbolic variables for the input (a minibatch) and its labels
x = T.dmatrix('x')
y = T.ivector('y')

# Construct the logistic network model
# Keep in mind an MNIST image is of size (28, 28)
# Also the number of output classes is 10 (digits 0, 1, ..., 9)
# When the class is imported from the logistic_network module instead, use:
#model = logistic_network.LogisticNetwork(input=x, input_dim=28*28, output_dim=10)
model = LogisticNetwork(input=x, input_dim=28*28, output_dim=10)

# Obtain a symbolic expression for the objective function
# EXPERIMENT!!! Play around with the L2 regularization parameter!
objective = model.logistic_network_cost(y, lambda_l2=0.0001)

# Obtain a symbolic expression for the error incurred
error = model.error_rate(y)

# Compute symbolic gradients of objective with respect to model parameters
dW, db = T.grad(objective, model.W), T.grad(objective, model.b)

# Symbolic slice that selects minibatch number `index` out of a shared dataset
minibatch = slice(index * BATCH_SIZE, (index + 1) * BATCH_SIZE)

# One gradient descent step on each of the two model parameters
sgd_updates = [
    (model.W, model.W - LEARNING_RATE * dW),
    (model.b, model.b - LEARNING_RATE * db),
]

# Training: returns the objective on the chosen minibatch and applies the step
train_model = function(
    inputs=[index],
    outputs=objective,
    updates=sgd_updates,
    givens={
        x: training_set_x[minibatch],
        y: training_set_y[minibatch],
    },
)

# Validation: returns the error rate on the chosen minibatch, no updates
validate_model = function(
    inputs=[index],
    outputs=error,
    givens={
        x: validation_set_x[minibatch],
        y: validation_set_y[minibatch],
    },
)

# Testing: same as validation, but reading from the test set
test_model = function(
    inputs=[index],
    outputs=error,
    givens={
        x: test_set_x[minibatch],
        y: test_set_y[minibatch],
    },
)

# Let's set up the early stopping parameters (based on the validation set)

# Train for at least this many minibatch iterations no matter what
patience = 5000

# When a significantly better validation loss is found, patience is raised to
# at least this multiple of the current iteration
patience_increase = 2

# An improvement is significant when the new loss is below this fraction of
# the best loss so far
improvement_threshold = 0.995

# We go through this number of minibatches before we check on the validation
# set (floor division keeps it an integer on Python 2 and Python 3 alike)
validation_freq = min(n_training_batches, patience // 2)

# We keep track of the best loss on the validation set here
best_loss = np.inf

# We also keep track of the epoch we are in
epoch = 0

# A boolean flag that propagates when patience has been exceeded
exceeded_patience = False

# Now we're ready to start training the model
print("... TRAINING MODEL ...")
# time.clock() no longer exists on Python 3.8+; time.time() is available on
# every version and reports elapsed wall-clock seconds
start_time = time.time()
while (epoch < N_EPOCHS) and not exceeded_patience:
    epoch = epoch + 1
    # range works on both Python 2 and Python 3 (xrange is Python 2 only)
    for minibatch_index in range(n_training_batches):
        minibatch_objective = train_model(minibatch_index)

        # Number of minibatches processed before this one, over all epochs
        iteration = (epoch - 1) * n_training_batches + minibatch_index

        if (iteration + 1) % validation_freq == 0:
            # Compute loss on validation set
            validation_losses = [validate_model(i) for i in range(n_validation_batches)]
            validation_loss = np.mean(validation_losses)

            print('epoch %i, minibatch %i/%i, validation error: %f %%' % (
                    epoch,
                    minibatch_index + 1,
                    n_training_batches,
                    validation_loss * 100
                ))

            if validation_loss < best_loss:
                # Only a significant improvement buys more training time
                if validation_loss < best_loss * improvement_threshold:
                    patience = max(patience, iteration * patience_increase)
                best_loss = validation_loss

        # Stop as soon as we have run out of patience
        if patience <= iteration:
            exceeded_patience = True
            break
end_time = time.time()

# Let's compute how well we do on the test set
test_losses = [test_model(i) for i in range(n_test_batches)]
test_loss = np.mean(test_losses)

# Print out the results!
print('\n')
print('Optimization complete with best validation score of %f %%' % (best_loss * 100))
print('And with a test score of %f %%' % (test_loss * 100))
print('\n')
print('The code ran for %d epochs and for a total time of %.1f seconds' % (epoch, end_time - start_time))
print('\n')