In [2]:
import theano
import time
import theano.tensor as T
import numpy as np
from sklearn import datasets, linear_model
import matplotlib.pyplot as plt

Using gpu device 0: GeForce 820M (CNMeM is enabled with initial size: 95.0% of memory, cuDNN not available)


In [9]:
class Test:
    """Tiny Theano smoke test: compiles a function that doubles a float32 matrix."""

    def __init__(self, train_x):
        # NOTE: `train_x` is accepted but never read by this class.
        symbolic_in = T.fmatrix('x')
        symbolic_out = symbolic_in * 2

        # Keep the symbolic graph and the compiled callable on the instance.
        self.x = symbolic_in
        self.y = symbolic_out
        self.pred_f = theano.function([symbolic_in], symbolic_out)

    def predict(self, x):
        """Cast `x` with np.float32 and return the result of the compiled function."""
        as_float32 = np.float32(x)
        return self.pred_f(as_float32)
        
# Smoke test: build the class and call its compiled function directly.
t = Test(11)
print(t.pred_f(np.float32([[18, 1], [32, 2]])))

# Same doubling graph built at top level, compiled with an explicit
# ProfileStats object so that a profiling summary can be printed afterwards.
x = T.fmatrix('x')
y = x * 2
profile = theano.compile.ProfileStats()
f = theano.function(inputs=[x], outputs=y, profile=profile)
f(np.float32([[1, 2], [3, 4]]))
f.profile.summary()

[[ 36.   2.]
 [ 64.   4.]]


Function profiling
  Message: None
  Time in 1 calls to Function.__call__: 0.000000e+00s
  Total compile time: 5.699992e-02s
    Number of Apply nodes: 3
    Theano Optimizer time: 2.400017e-02s
       Theano validate time: 0.000000e+00s
    Theano Linker time (includes C, CUDA code generation/compiling): 8.999825e-03s
       Import time 0.000000e+00s

Time in all call to theano.grad() 0.000000e+00s
Time since theano import 711.089s
  No execution time accumulated (hint: try config profiling.time_thunks=1)
Here are tips to potentially make your code run faster
                 (if you think of new ones, suggest them on the mailing list).
                 Test them first, as they are not guaranteed to always provide a speedup.
  Sorry, no tip for today.


In [64]:
class NNModel:
    """Softmax classifier with one tanh hidden layer, trained with Theano.

    The parameters (W1, b1, W2, b2) live in Theano shared variables, so the
    compiled training step and the compiled prediction function operate on
    the same storage: updates made by `gradient_step` are seen by
    `predictExecute` without recompiling.
    """

    def __init__(self, layers, epsilon = 0.01, reg_lambda = 0.01, random_seed=None):
        """Create and randomly initialize the network parameters.

        layers      -- exactly three sizes: [n_inputs, n_hidden, n_outputs]
        epsilon     -- learning rate for gradient descent
        reg_lambda  -- regularization strength
        random_seed -- optional seed for the weight initialization; with None
                       a fresh, unpredictable seed is used

        Raises ValueError if `layers` does not contain exactly three sizes.
        """
        # Fail loudly instead of calling exit(0): exiting would shut down the
        # whole interpreter/kernel and report success for an invalid argument.
        if len(layers) != 3:
            raise ValueError(
                'NNModel supports exactly 3 layer sizes '
                '(input, hidden, output); got %d' % len(layers))

        self.layers = layers # number of nodes in each layer
        self.epsilon = np.float32(epsilon) # learning rate for gradient descent
        self.reg_lambda = np.float32(reg_lambda) # regularization strength

        # Initialize the parameters (W and b) to random values. We need to learn these.
        # A private RandomState keeps the global NumPy RNG untouched.
        rng = np.random.RandomState(random_seed)
        n_in, n_hidden, n_out = layers
        self.W1 = theano.shared(rng.randn(n_in, n_hidden).astype('float32'))
        self.b1 = theano.shared(np.zeros(n_hidden).astype('float32'))
        self.W2 = theano.shared(rng.randn(n_hidden, n_out).astype('float32'))
        self.b2 = theano.shared(np.zeros(n_out).astype('float32'))

        # Compiled lazily by predictInit(); see predictExecute().
        self.predict_by_X = None

    def _forward(self, X):
        """Build the symbolic softmax output of the network for input `X`."""
        z1 = X.dot(self.W1) + self.b1
        a1 = T.tanh(z1)
        z2 = a1.dot(self.W2) + self.b2
        return T.nnet.softmax(z2)

    # This function learns parameters for the neural network from training dataset
    # - num_passes: Number of passes through the training data for gradient descent
    # - print_loss: If True, print the loss every 1000 iterations
    def train(self, train_X, train_y, num_passes=20000, print_loss=False):
        """Run `num_passes` full-batch gradient-descent steps on the data.

        train_X -- array of training inputs (converted to float32)
        train_y -- integer class labels; used to index rows of an identity
                   matrix, so they must be valid non-negative integer indices
        """
        num_class = np.max(train_y) + 1
        num_examples = len(train_X)
        train_y_onehot = np.eye(num_class)[train_y]

        # GPU NOTE: Conversion to float32 to store them on the GPU!
        X = theano.shared(train_X.astype('float32'))
        y = theano.shared(train_y_onehot.astype('float32'))

        # Forward propagation
        y_hat = self._forward(X)

        # Loss function: mean cross-entropy plus the squared-weight penalty.
        loss_reg = 1. / num_examples * self.reg_lambda / 2 * (T.sum(T.sqr(self.W1)) + T.sum(T.sqr(self.W2)))
        loss = T.nnet.categorical_crossentropy(y_hat, y).mean() + loss_reg

        # Gradients
        dW2 = T.grad(loss, self.W2)
        db2 = T.grad(loss, self.b2)
        dW1 = T.grad(loss, self.W1)
        db1 = T.grad(loss, self.b1)

        # GPU NOTE: Removed the input values to avoid copying data to the GPU.
        # The loss function is only compiled when it will actually be called.
        calculate_loss = theano.function([], loss) if print_loss else None

        # GPU NOTE: Removed the input values to avoid copying data to the GPU.
        self.gradient_step = theano.function(
            [],
            updates=((self.W2, self.W2 - self.epsilon * dW2),
                     (self.W1, self.W1 - self.epsilon * dW1),
                     (self.b2, self.b2 - self.epsilon * db2),
                     (self.b1, self.b1 - self.epsilon * db1)))

        # Gradient descent. Every pass uses the whole training set.
        for i in range(0, num_passes):
            # This will update our parameters Ws and bs!
            self.gradient_step()

            # Optionally print the loss.
            # This is expensive because it uses the whole dataset, so we don't want to do it too often.
            if print_loss and i % 1000 == 0:
                print("Loss after iteration %i: %f" % (i, calculate_loss()))

        # The prediction function reads the shared parameters, so it only has
        # to be compiled once.
        if self.predict_by_X is None:
            self.predictInit()

    def predictInit(self):
        """Compile `self.predict_by_X`, mapping a float32 matrix to class indices."""
        X = T.fmatrix('X')

        # Forward propagation
        y_hat = self._forward(X)

        prediction = T.argmax(y_hat, axis=1)

        self.predict_by_X = theano.function([X], prediction)

    def predictExecute(self, predict_X):
        """Return the predicted class index for every row of `predict_X`."""
        # Compile on first use so that predicting before train() does not fail
        # with a missing attribute.
        if self.predict_by_X is None:
            self.predictInit()
        return self.predict_by_X(predict_X.astype('float32'))

In [65]:
def generate_data(random_seed, n_samples):
    """Generate a two-moons dataset with noise 0.20.

    random_seed -- seed handed to make_moons through its `random_state`
                   argument, so the global NumPy random state is not modified
    n_samples   -- number of points to generate

    Returns the (X, y) pair produced by `datasets.make_moons`.
    """
    X, y = datasets.make_moons(n_samples, noise=0.20, random_state=random_seed)
    return X, y

def visualize(X, y, model):
    """Title the current figure and plot the decision regions of `model`.

    `model.predictExecute` is used as the prediction function passed to
    `plot_decision_boundary`, together with the data points X and labels y.
    """
    plt.title("tanh_cross_entropy_ann_classification")
    predict = model.predictExecute
    plot_decision_boundary(predict, X, y)

def plot_decision_boundary(pred_func, X, y, h=0.01, padding=0.5):
    """Plot the regions predicted by `pred_func` together with the points in X.

    pred_func -- callable taking an (n_points, 2) array and returning one
                 value per row
    X         -- 2-D array; columns 0 and 1 are used as plot coordinates
    y         -- per-point values used to color the scatter plot
    h         -- distance between neighbouring grid points
    padding   -- margin added around the data range on both axes
    """
    # Set min and max values and give it some padding
    x_min, x_max = X[:, 0].min() - padding, X[:, 0].max() + padding
    y_min, y_max = X[:, 1].min() - padding, X[:, 1].max() + padding
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = pred_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)
    plt.show()

In [66]:
class Config:
    """Settings for the experiment cell, grouped by purpose."""

    # --- data generation ---
    num_samples = 2000
    random_seed = 6

    # --- model ---
    layers = [2, 4, 2]  # number of nodes in each layer
    reg_lambda = 0.01   # regularization strength
    epsilon = 0.01      # learning rate for gradient descent

    # --- training loop ---
    print_loss = True
    num_passes = 20000

X, y = generate_data(Config.random_seed, Config.num_samples)
model = NNModel(Config.layers, Config.epsilon, Config.reg_lambda)
model.train(X, y, Config.num_passes, Config.print_loss)
visualize(X, y, model)
%timeit model.gradient_step()

Loss after iteration 0: 0.436574
Loss after iteration 1000: 0.291402
Loss after iteration 2000: 0.285377
Loss after iteration 3000: 0.281749
Loss after iteration 4000: 0.278547
Loss after iteration 5000: 0.275202
Loss after iteration 6000: 0.271164
Loss after iteration 7000: 0.265419
Loss after iteration 8000: 0.255923
Loss after iteration 9000: 0.240057
Loss after iteration 10000: 0.219436
Loss after iteration 11000: 0.200856
Loss after iteration 12000: 0.186399
Loss after iteration 13000: 0.174619
Loss after iteration 14000: 0.164550
Loss after iteration 15000: 0.155717
Loss after iteration 16000: 0.147826
Loss after iteration 17000: 0.140683
Loss after iteration 18000: 0.134160
Loss after iteration 19000: 0.128179
1000 loops, best of 3: 1.44 ms per loop
