In [1]:
import six.moves.cPickle as pickle
import gzip
import os
import sys
import timeit

import numpy
from theano import *
import theano.tensor as T

Using cuDNN version 5103 on context None
Preallocating 10867/11439 Mb (0.950000) on cuda
Mapped name None to device cuda: Tesla K40c (0000:81:00.0)


We want to implement a probabilistic, linear classifier parameterized by a weight matrix $W$ and a bias vector $b$. \newline
The probability that an input vector $x$ is a member of a class $i$, which we will call the value of a stochastic variable $Y$, is
$$ P(Y = i \vert x,W,b) = softmax(Wx + b) $$
$$ = \frac{e^{W_ix + b_i}}{\sum_j e^{W_j+b_j}} $$

The model's prediction $y_{pred}$ is the class that has the highest probability, or:
$$ y_{pred} = argmax_iP(Y = i \vert x,W,b) $$

We will define a likelihood $\mathcal{L}$ and loss $l$:
$$ \mathcal{L}(\theta = \{ W,b \}, \mathcal{D}) = \sum_{i=0}^{\vert \mathcal{D} \vert} \log(P(Y = y^{(i)} \vert x^{(i)},W,b)) $$
$$ l(\theta = \{ W,b \},\mathcal{D}) = -\mathcal{L}(\theta = \{ W,b \}, \mathcal{D}) $$
We will use the SGD algorithm w/ mini-batches.

In [2]:
class LogisticRegression(object):
    def __init__(self, input, n_in, n_out):
        """
        initialize parameters:
        input: theano.tensor.TensorType
        input param: symbolic var describing input (1 minibatch)
        
        type n_in: int
        param n_in: # of input units, dimension of space of datapoints
        
        type n_out: int
        param n_out: # of output units
        """
        
        #initialize weights W as a matrix of shape (n_in, n_out) w/ 0s
        self.W = theano.shared(
            value=numpy.zeros(
                (n_in,n_out),
                dtype = theano.config.floatX
            ),
            name='W',
            borrow=True
        )
        
        #initialize biases as vector of n_out zeros
        self.b = theano.shared(
            value=numpy.zeros(
                (n_out,),
                dtype=theano.config.floatX
            ),
            name='b',
            borrow=True
        )
        
        
        #now we will compute matrix of class-membership probabilities where
        # W is a mtrix where column-k is separation hyperplane for class k
        # x is a matrix where row-j is input trainign sample j
        # b is a vector where element k is free parameter of hyperplane k
        self.p_y_given_x = T.nnet.softmax(T.dot(input,self.W + self.b))
        
        # the actual prediction is the class w/ greatest probability
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        
        # model parameters
        self.params = [self.W,self.b]
        
        #keep track of input
        self.input = input
        
    def negative_log_likelihood(self,y):
        """
        Return mean of the negative log likelihood of prediction of this model under given target distribution
        """
        
        # y.shape[0] is the number of rows in y, e.g. number of examples in minibatch
        # T.arange(y.shape[0]) is a vector which is a matrix of log probabilities
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
    
    def errors(self,y):
        """
        return float w/ num of errors in minibatch over total # examples in minibatch
        """
        
        #check y vs y_pred dimensions
        if y.ndim != self.y_pred.ndim:
            raise TypeError(
                'y should have the same shape as self.y_pred',
                ('y',y.type,'y_pred',self.y_pred.type)
            )
            
        #check y is correct type
        if y.dtype.startswith('int'):
            return T.mean(T.neq(self.y_pred,y))
        else:
            raise NotImplementedError()

In [None]:
def load_data(dataset):
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        new_path = os.path.join(
            os.path.split(__file__)[0],
            "..",
            "data",
            dataset
        )
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            from six.moves import urllib
            origin = (
            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
            )
            print('Downloading data from %s' % origin)
            urllib.request.urlretrieve(origin, dataset)
        
        print('... loading data')
        
        #load dataset
        with gzip.open(dataset,'rb') as f:
            try:
                train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
            except:
                train_set,valid_set,test_set = pickle.load(f)
                
        def shared_dataset(data_xy, borrow=True):
            #load dataset into shared vars.
            data_x,data_y = data_xy
            shared_x = theano.shared(numpy.asarray(data_x,
                                                dtype=theano.config.floatX),
                                   borrow=borrow)
            shared_y = theano.shared(numpy.asarray(data_y,
                                                   dtype=theano.config.floatY),
                                     borrow=borrow)
            return shared_x, T.cast(shared_y, 'int32')
        
        test_set_x, test_set_y = shared_dataset(test_set)
        valid_set_x, valid_set_y = shared_dataset(valid_set)
        train_set_x, train_set_y = shared_dataset(train_set)
        
        rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
               (test_set_x, test_set_y)]
        return rval

Now we instantiate the class:

In [3]:
x = T.matrix('x')
y = T.ivector('y') # labels as 1D vector of [int] labels

# cosntruct log reg class
# each MNIST image has size 28x28
classifier = LogisticRegression(input=x, n_in = 28*28, n_out = 10)

And define a cost var to minimize

In [4]:
cost = classifier.negative_log_likelihood(y)

Now we learn the model:

In [5]:
g_W = T.grad(cost=cost, wrt=classifier.W)
g_b = T.grad(cost=cost, wrt=classifier.b)

Now we'll write a function to perform one step of gradient descent:

In [11]:
learning_rate = 0.0001
index = T.iscalar('index')

updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

# compiling a Theano function `train_model` that returns the cost, but in
# the same time updates the parameter of the model based on the rules
# defined in `updates`
train_model = theano.function(
    inputs=[index],
    outputs=cost,
    updates=updates,
    givens={
        x: train_set_x[index * batch_size: (index + 1) * batch_size],
        y: train_set_y[index * batch_size: (index + 1) * batch_size]
    }
)

NameError: name 'train_set_x' is not defined