In [1]:
import theano
from theano import tensor as T
import lasagne
from lasagne.layers import *
from lasagne.nonlinearities import *
from lasagne.objectives import *
from lasagne.regularization import *
from lasagne.random import get_rng
from lasagne.updates import *
from lasagne.init import *
import numpy as np
import sys
sys.path.append("../../modules/")
import helper as hp

from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import matplotlib.pyplot as plt
%matplotlib inline

import os
import cPickle as pickle

from theano.tensor import TensorType

from theano.ifelse import ifelse

from time import time

%load_ext rpy2.ipython

from scipy import stats

import deep_residual_learning_CIFAR10

import math



Reference — Lasagne issue #531: https://github.com/Lasagne/Lasagne/issues/531

----

In [166]:
class BinomialDropLayer(Layer):
    """Randomly zero out the whole input of individual examples.

    In training mode every example of the minibatch gets its own 0/1 draw
    from a binomial sampler (n=1, probability ``p``) and the example's entire
    input is multiplied by that draw.  With ``deterministic=True`` nothing is
    sampled and the input is scaled by ``p`` instead.

    Parameters
    ----------
    incoming : Layer or shape tuple
        Forwarded to ``Layer.__init__``.
    nonlinearity : callable or None
        Stored as ``self.nonlinearity`` (``identity`` when None).  It is NOT
        applied anywhere in ``get_output_for``.
    p : float
        Probability handed to the binomial sampler, i.e. the chance that an
        example's input is kept.
    **kwargs
        Forwarded to ``Layer.__init__`` (e.g. ``name``).
    """

    def __init__(self, incoming, nonlinearity=rectify, p=0.5,
                 **kwargs):
        super(BinomialDropLayer, self).__init__(incoming, **kwargs)
        self.nonlinearity = (identity if nonlinearity is None
                             else nonlinearity)
        # Seed the layer's random stream from Lasagne's global RNG.
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
        self.p = p

    def get_output_for(self, input, deterministic=False, **kwargs):
        """Return ``p * input`` if deterministic, else ``mask * input``.

        The mask has one entry per example (``input.shape[0]``) and is
        broadcast over every remaining axis, so inputs of any rank >= 1 are
        accepted; for 4-D input this is exactly ``dimshuffle(0,'x','x','x')``.
        """
        if deterministic:
            return self.p*input

        # TODO (original author): same rand num for all examples in the minibatch
        mask = self._srng.binomial(n=1, p=(self.p), size=(input.shape[0],),
            dtype=input.dtype)
        # Keep the batch axis, add one broadcastable axis per remaining dim.
        pattern = (0,) + ('x',) * (input.ndim - 1)
        return mask.dimshuffle(*pattern)*input

In [None]:
            """
            return ifelse(
                T.lt(self._srng.uniform( (1,), 0, 1)[0], self.p),
                input,
                self.nonlinearity(input)
            )
            """

In [69]:
def stochastic_depth_block(incoming, p):
    """Sum a randomly dropped `incoming` with a skip path from an earlier layer.

    The skip source is the first layer, walking ``get_all_layers(incoming)``
    backwards and skipping `incoming` itself, whose name neither contains
    "ignore" nor equals "nonlinearity".  Unnamed layers (``name is None``)
    count as eligible.

    If the skip source and `incoming` have different output shapes, the skip
    path is average-pooled with `incoming`'s filter/pool size and stride, and,
    when it has fewer feature maps, zero-padded along axis 1 to match.

    Parameters
    ----------
    incoming : Layer
        Layer whose output goes through ``BinomialDropLayer``.  When shapes
        differ it must expose ``stride`` and either ``filter_size`` or
        ``pool_size``.
    p : float
        Probability passed to ``BinomialDropLayer``.

    Returns
    -------
    ElemwiseSumLayer
        Named "ignore_elemsum"; sums the dropped branch and the skip path.

    Raises
    ------
    Exception
        If no eligible earlier layer exists.
    """
    layer_before_incoming = None
    for prev_layer in get_all_layers(incoming)[::-1][1:]:
        # Layer names default to None; treat that as an ordinary (eligible) name.
        prev_name = prev_layer.name or ""
        if "ignore" not in prev_name and prev_name != "nonlinearity":
            layer_before_incoming = prev_layer
            break
    if layer_before_incoming is None:
        raise Exception("Cannot find an appropriate layer before layer: %s" % incoming.name)

    skip = layer_before_incoming
    if layer_before_incoming.output_shape != incoming.output_shape:
        # Reproduce incoming's spatial reduction on the skip path.
        skip = Pool2DLayer(
            layer_before_incoming,
            pool_size=incoming.filter_size if hasattr(incoming, "filter_size") else incoming.pool_size,
            stride=incoming.stride,
            mode="average_inc_pad",
            name="ignore_pool"
        )
        if layer_before_incoming.output_shape[1] < incoming.output_shape[1]:
            diff_in_fms = incoming.output_shape[1] - layer_before_incoming.output_shape[1]
            # Split the missing feature maps front/back; the front side gets
            # the extra one when the difference is odd.
            back = diff_in_fms // 2
            front = diff_in_fms - back
            skip = pad(
                skip,
                batch_ndim=1,
                width=((front, back),),
                name="ignore_fm_pad"
            )

    l_binom_drop = BinomialDropLayer(incoming, p=p, name="ignore_binom")
    return ElemwiseSumLayer([l_binom_drop, skip], name="ignore_elemsum")

In [59]:
def net1():
    """Build the three-conv / two-pool stochastic-depth network.

    Input shape is (None, 1, 28, 28).  Conv stages (3x3, no nonlinearity on
    the conv itself) are wrapped in a stochastic-depth block followed by a
    rectifier layer named "nonlinearity"; 2x2 max-pool stages are wrapped in a
    stochastic-depth block only.  A 10-unit softmax dense layer ends the net.

    Prints each layer's name and output shape, then the parameter count, and
    returns the output layer.
    """
    def conv_stage(bottom, num_filters, name):
        # conv -> stochastic depth -> ReLU
        conv = Conv2DLayer(bottom, num_filters=num_filters, filter_size=3, name=name, nonlinearity=None)
        block = stochastic_depth_block(conv, p=0.5)
        return NonlinearityLayer(block, nonlinearity=rectify, name="nonlinearity")

    def pool_stage(bottom, name):
        # max-pool -> stochastic depth
        pooled = MaxPool2DLayer(bottom, pool_size=(2,2), name=name)
        return stochastic_depth_block(pooled, p=0.5)

    net = InputLayer( (None, 1, 28, 28), name="input" )
    net = conv_stage(net, 8, "l_conv1")
    net = pool_stage(net, "l_mp1")
    net = conv_stage(net, 8, "l_conv2")
    net = pool_stage(net, "l_mp2")
    net = conv_stage(net, 16, "l_conv3")

    l_out = DenseLayer(net, num_units=10, nonlinearity=softmax, name="l_fc")

    # Same text as `print name, shape`, written so it also parses on Python 3.
    for layer in get_all_layers(l_out):
        print("%s %s" % (layer.name, layer.output_shape))
    print(count_params(l_out))

    return l_out

In [49]:
def net2():
    """Build the two-conv stochastic-depth network.

    Input shape is (None, 1, 28, 28).  A 3x3 conv and a 1x1 conv (8 filters
    each, no nonlinearity) are each wrapped in a stochastic-depth block; only
    the second block is followed by a rectifier layer named "nonlinearity".
    A 10-unit softmax dense layer ends the net.

    Prints each layer's name and output shape, then the parameter count, and
    returns the output layer.
    """
    shared_conv_args = dict(num_filters=8, nonlinearity=None)

    source = InputLayer( (None, 1, 28, 28), name="input" )

    first_conv = Conv2DLayer(source, filter_size=3, name="l_conv1", **shared_conv_args)
    first_block = stochastic_depth_block(first_conv, p=0.5)

    second_conv = Conv2DLayer(first_block, filter_size=1, stride=1, name="l_conv2", **shared_conv_args)
    second_block = stochastic_depth_block(second_conv, p=0.5)
    activated = NonlinearityLayer(second_block, nonlinearity=rectify, name="nonlinearity")

    l_out = DenseLayer(activated, num_units=10, nonlinearity=softmax, name="l_fc")

    # Same text as `print name, shape`, written so it also parses on Python 3.
    for layer in get_all_layers(l_out):
        print("%s %s" % (layer.name, layer.output_shape))
    print(count_params(l_out))

    return l_out

In [79]:
# Build net1 and keep its output layer in the global `l_out`; net1 itself
# prints the layer names/shapes and the parameter count.
l_out = net1()

input (None, 1, 28, 28)
l_conv1 (None, 8, 26, 26)
ignore_binom (None, 8, 26, 26)
ignore_pool (None, 1, 26, 26)
ignore_fm_pad (None, 8, 26, 26)
ignore_elemsum (None, 8, 26, 26)
nonlinearity (None, 8, 26, 26)
l_mp1 (None, 8, 13, 13)
ignore_binom (None, 8, 13, 13)
ignore_pool (None, 8, 13, 13)
ignore_elemsum (None, 8, 13, 13)
l_conv2 (None, 8, 11, 11)
ignore_binom (None, 8, 11, 11)
ignore_pool (None, 8, 11, 11)
ignore_elemsum (None, 8, 11, 11)
nonlinearity (None, 8, 11, 11)
l_mp2 (None, 8, 5, 5)
ignore_binom (None, 8, 5, 5)
ignore_pool (None, 8, 5, 5)
ignore_elemsum (None, 8, 5, 5)
l_conv3 (None, 16, 3, 3)
ignore_binom (None, 16, 3, 3)
ignore_pool (None, 8, 3, 3)
ignore_fm_pad (None, 16, 3, 3)
ignore_elemsum (None, 16, 3, 3)
nonlinearity (None, 16, 3, 3)
l_fc (None, 10)
3282


----

In [111]:
"""

nChannels, nOutChannels

conv @ 3x3 @ stride @ nChannels2->nOutChannels
batch norm
relu
conv @ 3x3 @ 1 @ nOutChannels->nOutChannels
batch norm

skip:

if stride > 1
  average pool @ 1x1 @ stride



"""

'\n\nnChannels, nOutChannels\n\nconv @ 3x3 @ stride @ nChannels2->nOutChannels\nbatch norm\nrelu\nconv @ 3x3 @ 1 @ nOutChannels->nOutChannels\nbatch norm\n\nskip:\n\nif stride > 1\n  average pool @ 1x1 @ stride\n\n\n\n'

Reference — Lasagne issue #531: https://github.com/Lasagne/Lasagne/issues/531

In [164]:
# Fixed-shape input layer, shape (1, 3, 28, 28), used to try out residual_block below.
l_in = InputLayer( (1, 3, 28, 28) )

def residual_block(layer, num_filters, filter_size=3, stride=1, num_layers=2):
    """Stack conv layers on `layer` and add a shortcut from `layer` itself.

    The shortcut is `layer` unchanged unless the channel count or the stride
    differs; then it is subsampled with a 1x1 max-pool at `stride` and
    zero-padded along axis 1 up to `num_filters` channels.  The last conv
    layer's nonlinearity is removed from that conv and applied after the sum.

    Parameters
    ----------
    layer : Layer
        Input layer; ``output_shape[1]`` is read as its channel count.
    num_filters : int
        Filters of every conv layer; must not be smaller than the input's
        channel count.
    filter_size : int or tuple
        Filter size of every conv layer (``pad='same'``).
    stride : int
        Stride of the first conv layer and of the shortcut pool; later conv
        layers use stride 1.
    num_layers : int
        Number of stacked conv layers, at least 1.

    Returns
    -------
    NonlinearityLayer
        The last conv's original nonlinearity applied to (convs + shortcut).

    Raises
    ------
    ValueError
        If ``num_layers < 1`` or ``num_filters`` is below the input's channel
        count.
    """
    # With zero conv layers the code below would strip the nonlinearity of the
    # caller's own layer, so reject that up front.
    if num_layers < 1:
        raise ValueError("num_layers must be at least 1, got %r" % (num_layers,))

    print("input = %s" % (layer.output_shape,))
    conv = layer
    if (num_filters != layer.output_shape[1]) or (stride != 1):
        diff = num_filters - layer.output_shape[1]
        if diff < 0:
            # Zero-padding can only add channels, never remove them.
            raise ValueError(
                "num_filters (%r) is smaller than the number of input channels (%r)"
                % (num_filters, layer.output_shape[1]))
        # Original author's notes: a 1x1 projection was the alternative,
        #   Conv2DLayer(layer, num_filters, filter_size=1, stride=stride, nonlinearity=None, b=None)
        # "this is equivalent to yu's code"
        layer = MaxPool2DLayer(layer, pool_size=1, stride=stride)
        # Split the extra channels front/back; the front side gets the odd one.
        back = diff // 2
        front = diff - back
        layer = pad(
            layer,
            batch_ndim=1,
            width=((front, back),)
        )
        print("layer = %s" % (layer.output_shape,))
    for _ in range(num_layers):
        conv = Conv2DLayer(conv, num_filters, filter_size, stride=stride, pad='same')
        print("conv = %s" % (conv.output_shape,))
        # Only the first conv of the block is strided.
        stride = 1
    # Move the final nonlinearity to after the sum.
    nonlinearity = conv.nonlinearity
    conv.nonlinearity = lasagne.nonlinearities.identity
    return NonlinearityLayer(ElemwiseSumLayer([conv, layer]), nonlinearity)
    
# Disabled variant: list every layer of a 3-filter, stride-2 block.
#for layer in get_all_layers(residual_block(l_in, num_filters=3, stride=2)):
#    print layer, layer.output_shape

# Try a 16-filter, stride-2 block on l_in and show its output shape, then the
# shape after a further 1x1, stride-2 max-pool.
# NOTE(review): this rebinds the global `l_out` that is also assigned from
# net1() and read by the training cells further down — in top-to-bottom order
# those cells would pick up this block instead. Confirm which is intended.
l_out = residual_block(l_in, num_filters=16, stride=2)
print l_out.output_shape
print MaxPool2DLayer(l_out, pool_size=1, stride=2).output_shape

input = (1, 3, 28, 28)
layer = (1, 16, 14, 14)
conv = (1, 16, 14, 14)
conv = (1, 16, 14, 14)
(1, 16, 14, 14)
(1, 16, 7, 7)


----

In [80]:
# Symbolic inputs: a 4-D tensor for the network input and an int32 vector
# of targets (presumably class indices — confirm against the data loader).
X = T.tensor4('X')
y = T.ivector('y')

# Training graph for the global `l_out`; `deterministic` is left at its default.
net_out = get_output(l_out, X)
loss = categorical_crossentropy(net_out, y).mean()
params = get_all_params(l_out, trainable=True)
grads = T.grad(loss, params)
updates = nesterov_momentum(grads, params, learning_rate=0.01, momentum=0.9)
# Each call applies `updates` once and returns the mean loss of the minibatch.
train_fn = theano.function(inputs=[X, y], outputs=loss, updates=updates)

In [81]:
# Inference function: deterministic=True is forwarded to every layer's
# get_output_for, so BinomialDropLayer scales by p instead of sampling a
# fresh random mask on each call.
fn_test = theano.function([X], get_output(l_out, X, deterministic=True))

In [82]:
# Smoke test: run the compiled forward pass on an all-ones array of shape (1, 1, 28, 28).
fn_test( np.ones((1, 1, 28, 28)) )

array([[ 0.07945322,  0.05831984,  0.08652536,  0.08706014,  0.18493246,
         0.14282879,  0.05461725,  0.0947271 ,  0.14081899,  0.07071684]])

In [95]:
sys.stderr.write("loading mnist...\n")
# hp is the local `helper` module; the third item it returns is discarded
# (presumably the test split — confirm in helper.load_mnist).
train_data, valid_data, _ = hp.load_mnist("../../data/mnist.pkl.gz")

loading mnist...


In [96]:
# Split the training pair into inputs and targets and cast them; int32 matches
# the T.ivector used for `y`.
# NOTE(review): train_fn takes a 4-D `X`, so this assumes hp.load_mnist already
# returns 4-D inputs — TODO confirm.
X_train, y_train = train_data
X_train = X_train.astype("float32")
y_train = y_train.astype("int32")

In [97]:
bs = 32  # minibatch size
# Floor division: a trailing partial minibatch is never visited.
n_batches = X_train.shape[0] // bs
num_epochs = 10
for epoch in range(0, num_epochs):
    train_losses = []
    for b in range(0, n_batches):
        # Consecutive slices in stored order; the data is not shuffled here.
        train_losses.append(train_fn(X_train[b*bs : (b+1)*bs], y_train[b*bs : (b+1)*bs]))
    # 1-based epoch number and the mean of that epoch's minibatch losses.
    print (epoch+1), np.mean(train_losses)

KeyboardInterrupt: 

In [87]:
# One training call on the first example only; this also applies a parameter update.
train_fn(X_train[0:1], y_train[0:1])

array(1.9055026730172253)