In [39]:
import sys
import os
import theano
from theano import tensor as T
import lasagne
from lasagne.layers import *
from lasagne.init import *
from lasagne.updates import *
from lasagne.objectives import *
from lasagne.nonlinearities import *
# Imported after the star imports so none of them can shadow these names.
import math
import random
# `np` is used throughout the notebook; import it explicitly instead of
# relying on it leaking in from somewhere else.
import numpy as np
sys.path.append("../../modules/")
import helper as hp

For an MLP on MNIST with one hidden layer, we have this many params:<br />
$784h + h + 10h + 10$, where $h$ is the number of units in that layer.

Now, if we have a regression MLP on MNIST (a single output unit), the network capacity will be:<br />
$784m + m + 1m + 1$, where $m$ is the number of units in its hidden layer.

We would like to choose a regression MLP with similar capacity. We can do this by solving for $m$:

$784h + h + 10h + 10 = 784m + m + m + 1$

----

$784h + h + 10h + 10 = 784h + h + hm + m + 1m + 1$

In [2]:
def find_m(h, num_inputs=784, num_classes=10):
    """Return the hidden width of a regression MLP matching a classifier's capacity.

    A one-hidden-layer classifier with `h` hidden units has
    ``num_inputs*h + h + num_classes*h + num_classes`` parameters, while a
    one-hidden-layer regression net with `m` hidden units and a single output
    has ``num_inputs*m + m + m + 1``. Setting the two equal and solving for
    `m` gives the value returned here.

    Parameters
    ----------
    h : number of hidden units in the classifier.
    num_inputs : input dimensionality (784 for flattened MNIST).
    num_classes : number of classifier outputs (10 for MNIST).

    Returns
    -------
    float
        The (generally fractional) matching width; callers round it as needed.
    """
    # float(...) keeps the arithmetic in floating point even for integer
    # arguments, exactly like the original 784.0 literal did.
    classifier_params = float(num_inputs)*h + h + num_classes*h + num_classes
    # Regression net: (num_inputs + 2) * m + 1 parameters.
    return (classifier_params - 1) / (num_inputs + 2.0)

In [5]:
# Load MNIST through the project helper. The result is unpacked as three
# splits; the train and validation splits are unpacked into (inputs, labels) below.
train_set, valid_set, test_set = hp.load_mnist("../../data/mnist.pkl.gz")

In [29]:
# Split the train / validation sets into inputs and labels.
Xt, yt = train_set
Xv, yv = valid_set
# Collapse every example into a flat feature vector (one row per example).
Xt = Xt.reshape(Xt.shape[0], -1)
Xv = Xv.reshape(Xv.shape[0], -1)
# Labels as int32.
yt = yt.astype("int32")
yv = yv.astype("int32")

In [42]:
# Shape of the flattened training inputs.
Xt.shape

(50000, 784)

In [7]:
# Matched regression width for a classifier with 10 hidden units (fractional).
find_m(10)

10.125954198473282

In [17]:
# Parameter count of the classifier with h = 10: 784h + h + 10h + 10.
784*10 + 10 + 10*10 + 10

7960

In [26]:
# Parameter count of the regression net with m = find_m(10): 784m + m + m + 1.
# It should agree with the classifier count for h = 10 (up to float rounding).
784*10.125954198473282 + 10.125954198473282 + 10.125954198473282 + 1

7960.000000000001

In [151]:
def train_net(X_train, y_train, X_valid, y_valid, net, num_epochs, batch_size, shuffle=True, regression=False, num_classes=10):
    """Train `net` with mini-batches and print validation metrics after each epoch.

    Parameters
    ----------
    X_train, y_train : training inputs and labels, indexed along their first
        axis (one entry per example).
    X_valid, y_valid : validation inputs and integer labels.
    net : dict with the keys "train_fn", "loss_fn" and "predict_fn".
        ``train_fn(X_batch, y_batch)`` is called once per mini-batch and its
        return value is recorded as that batch's loss; ``loss_fn(X, y)`` and
        ``predict_fn(X)`` are called once per epoch on the validation set.
    num_epochs : number of passes over the training set.
    batch_size : examples per mini-batch (the last batch may be smaller).
    shuffle : if True, re-permute the training set with `random.shuffle`
        before every epoch.
    regression : if False, predictions are treated as per-class scores and
        arg-maxed; if True, they are treated as scalars that are rounded to
        the nearest integer and clipped into ``0 .. num_classes-1``.
    num_classes : number of label classes (previously hard-coded to 10).

    Prints one line per epoch: mean training loss, validation loss,
    validation accuracy and validation weighted kappa. Returns None.
    """
    train_fn, loss_fn, predict_fn = \
        net["train_fn"], net["loss_fn"], net["predict_fn"]
    num_train = X_train.shape[0]
    idxs = list(range(num_train))
    for epoch in range(num_epochs):
        if shuffle:
            random.shuffle(idxs)
            # Indexing with a list rebinds the local names to permuted
            # copies; inputs and labels use the same permutation.
            X_train = X_train[idxs]
            y_train = y_train[idxs]
        losses = []
        for start in range(0, num_train, batch_size):
            stop = start + batch_size
            losses.append(train_fn(X_train[start:stop], y_train[start:stop]))
        valid_loss = loss_fn(X_valid, y_valid)
        valid_preds = predict_fn(X_valid)
        if regression:
            # Round first, then clip, then cast. Accuracy and kappa are both
            # computed from these same integer labels.
            valid_labels = np.clip(
                np.round(valid_preds.flatten()), 0, num_classes-1).astype("int32")
        else:
            valid_labels = np.argmax(valid_preds, axis=1)
        valid_acc = 1.0*np.sum( valid_labels == y_valid ) / y_valid.shape[0]
        valid_kappa = hp.weighted_kappa(valid_labels, y_valid, num_classes=num_classes)
        # %s formatting prints the same text under Python 2 and Python 3.
        print("%s %s %s %s" % (np.mean(losses), valid_loss, valid_acc, valid_kappa))

In [75]:
def kappa_net(h):
    """Build and compile a one-hidden-layer softmax classifier for 784-d inputs.

    Architecture: input (None, 784) -> dense(h, rectify) -> dense(10, softmax).
    The loss is the mean categorical cross-entropy against integer labels,
    optimised with Nesterov momentum (learning rate 0.01, momentum 0.9).
    Prints every layer with its output shape and the total parameter count.

    Returns a dict with the output layer ("l_out") and the compiled Theano
    functions "train_fn" (X, y -> loss, applying the updates),
    "loss_fn" (X, y -> loss) and "predict_fn" (X -> network output).
    """
    net_in = InputLayer( (None, 784) )
    hidden = DenseLayer(net_in, num_units=h, nonlinearity=rectify)
    l_out = DenseLayer(hidden, num_units=10, nonlinearity=softmax)

    # Symbolic inputs: float matrix of examples, int vector of labels.
    X, y = T.fmatrix('X'), T.ivector('y')
    class_probs = get_output(l_out, X)
    loss = categorical_crossentropy(class_probs, y).mean()
    updates = nesterov_momentum(loss, get_all_params(l_out), 0.01, 0.9)

    train_fn = theano.function([X, y], loss, updates=updates)
    predict_fn = theano.function([X], class_probs)
    loss_fn = theano.function([X, y], loss)

    for layer in get_all_layers(l_out):
        print("%s %s" % (layer, layer.output_shape))
    print("num params: %i" % count_params(l_out))

    compiled = {}
    compiled["l_out"] = l_out
    compiled["loss_fn"] = loss_fn
    compiled["predict_fn"] = predict_fn
    compiled["train_fn"] = train_fn
    return compiled

In [170]:
def regression_net(h):
    """Build and compile a regression MLP with a single rectified output unit.

    Architecture: input (None, 784) -> dense(ceil(find_m(h)), rectify)
    -> dense(50, DenseLayer's default nonlinearity) -> dense(1, rectify).
    The loss is the mean squared error between the flattened output and the
    integer labels, optimised with Nesterov momentum (learning rate 0.01,
    momentum 0.9). Prints every layer with its output shape and the total
    parameter count.

    NOTE(review): find_m matches capacity for a net with ONE hidden layer;
    the additional 50-unit layer adds parameters on top of that (the recorded
    run reports 85271 parameters vs. 79510 for kappa_net(100)).

    Returns a dict with the output layer ("l_out") and the compiled Theano
    functions "train_fn", "loss_fn" and "predict_fn".
    """
    matched_width = int(math.ceil(find_m(h)))
    net_in = InputLayer( (None, 784) )
    hidden1 = DenseLayer(net_in, num_units=matched_width, nonlinearity=rectify)
    hidden2 = DenseLayer(hidden1, num_units=50)
    l_out = DenseLayer(hidden2, num_units=1, nonlinearity=rectify)

    # Symbolic inputs: float matrix of examples, int vector of targets.
    X, y = T.fmatrix('X'), T.ivector('y')
    raw_out = get_output(l_out, X)
    # Flatten the (batch, 1) output so it lines up with the target vector.
    loss = squared_error(raw_out.flatten(), y).mean()
    updates = nesterov_momentum(loss, get_all_params(l_out), 0.01, 0.9)

    train_fn = theano.function([X, y], loss, updates=updates)
    predict_fn = theano.function([X], raw_out)
    loss_fn = theano.function([X, y], loss)

    for layer in get_all_layers(l_out):
        print("%s %s" % (layer, layer.output_shape))
    print("num params: %i" % count_params(l_out))

    compiled = {}
    compiled["l_out"] = l_out
    compiled["loss_fn"] = loss_fn
    compiled["predict_fn"] = predict_fn
    compiled["train_fn"] = train_fn
    return compiled

In [77]:
# Seed NumPy's global RNG and Python's `random` (train_net shuffles with
# random.shuffle) before building the network, then train the softmax
# classifier with 100 hidden units for 10 epochs.
np.random.seed(0)
random.seed(0)
net1 = kappa_net(h=100)
train_net(Xt, yt, Xv, yv, net1, num_epochs=10, batch_size=128)

<lasagne.layers.input.InputLayer object at 0x116ef6b10> (None, 784)
<lasagne.layers.dense.DenseLayer object at 0x116ef6e50> (None, 100)
<lasagne.layers.dense.DenseLayer object at 0x10db509d0> (None, 10)
num params: 79510
0.527547673256 0.281596790733 0.9204 0.914195806339
0.275767977681 0.227222227246 0.9358 0.929149831255
0.226221666204 0.193686448013 0.9473 0.943246645645
0.194075181997 0.174644071854 0.9515 0.949400558054
0.170752735327 0.158337847591 0.9563 0.952048127169
0.152516940589 0.146896764836 0.9596 0.955028264402
0.137903819275 0.137545285882 0.9617 0.958068272781
0.125884998087 0.127995953336 0.9653 0.961882955465
0.115659199156 0.122573221227 0.9672 0.96287212872
0.107364979546 0.115932924497 0.9682 0.965002163947


In [171]:
# Same seeds as the classifier run, then train the regression network;
# regression=True makes train_net round and clip the scalar outputs before
# scoring them.
# NOTE: the recorded output below ends in a KeyboardInterrupt, so the
# requested 100 epochs were not all run.
np.random.seed(0)
random.seed(0)
net2 = regression_net(h=100)
train_net(Xt, yt, Xv, yv, net2, num_epochs=100, batch_size=128, regression=True)

<lasagne.layers.input.InputLayer object at 0x116fa5410> (None, 784)
<lasagne.layers.dense.DenseLayer object at 0x116fa5a90> (None, 102)
<lasagne.layers.dense.DenseLayer object at 0x116fa58d0> (None, 50)
<lasagne.layers.dense.DenseLayer object at 0x10df6b9d0> (None, 1)
num params: 85271
1.95547046366 0.864033297581 0.6657 0.941859224752
0.794919867854 0.667761066021 0.7072 0.956595150239
0.599707949842 0.595677632487 0.755 0.960501815188
0.500449393926 0.564038435128 0.7831 0.964147328511
0.422642527919 0.548872833469 0.8038 0.964828505383
0.355411596305 0.524059937415 0.8106 0.966771357743
0.321185620451 0.453460602907 0.8391 0.971873103402
0.280818066905 0.471797070507 0.8382 0.970213328916
0.257209942895 0.454267637244 0.8499 0.971819277842
0.230514491097 0.446139490961 0.8482 0.972143150609
0.209161169454 0.432824039682 0.8572 0.972866941029
0.194637222607 0.42308851331 0.8708 0.973973393745
0.180312535024 0.4662180237 0.8622 0.971412897637
0.173164784005 0.415833710732 0.8673 0.974

KeyboardInterrupt: 

In [122]:
# Shape of the flattened training inputs.
Xt.shape

(50000, 784)

In [172]:
# Integer label predictions of the regression net on the training inputs:
# round to the nearest label, clip into 0..9, and only then cast to int32.
# (Casting before rounding truncates the fraction and makes np.round a no-op.)
# NOTE(review): this predicts on Xt while the next cell displays yv - confirm
# which split was meant to be compared.
np.clip( np.round( net2["predict_fn"](Xt).flatten() ), 0, 9 ).astype("int32")

array([4, 0, 4, ..., 7, 4, 8], dtype=int32)

In [173]:
# Validation labels (int32).
yv

array([3, 8, 6, ..., 5, 6, 8], dtype=int32)

In [147]:
# Sanity check of np.clip: values outside [0, 4] are clamped to the bounds.
np.clip([-2,3,5], 0, 4)

array([0, 3, 4])