## [Chainer Deep Learning Framework](http://chainer.org/)

- It doesn't seem to support mulit-core cpu ...

In [1]:
import numpy as np
from chainer import Variable, FunctionSet
from chainer import functions, optimizers

In [2]:
import cPickle
(train_X, train_y), (valid_X, valid_y), (test_X, test_y) = cPickle.load(open("../data/mnist.pkl"))
print train_X.shape, train_y.shape, valid_X.shape, valid_y.shape, test_X.shape, test_y.shape

(50000, 784) (50000,) (10000, 784) (10000,) (10000, 784) (10000,)


### Fundamentals 
- Much like in theano, the "minions" in chainer are `Variable`s, which are wrappers of numpy.ndarray (so far only float32 supported due to cuda limit).
- forward/backward computation of `Variable`s
    - forward computation results can be retrived from `data` memeber of `Variable`
    - the variables record both its data and its "computation network"
    - backword computation happends by calling `backward()` on a variable, and the result is in `grad` member
- parameterized functions
    - ***Most functions in chainer accept mini-batch input, which are matrices of shape (N, d), where N is the batchs ize, and d is the input dimension of input vectors***
    - most of them are defined in `functions` module, and can be extended by inheritating `Function` class in chainer
    - it provides a way to calulate the gradient w.r.t to parameters (instead of just inputs)
    - the parameters in those functions are fixed by names, e.g., `f.W` or `f.b`, and their gradients are `f.gW` and `f.gb`
    - steps of calculating parameter gradients: see code below for details
- `FunctionSet` as neural networks - it is essentially a set of functions, which wraps up all parameters and their gradients in an interface that can be used with an optimzier. As a *benefit*, the parameters of the model can be automatically updated within one call.

In [3]:
## forward and backward caclulation of variables (including vectors)


x = Variable(np.array([[1, 2, 3], [4, 5, 6]], dtype = np.float32))
y = x**2 + 2*x + 1
print "forward computation of y"
print y.data
## ITS NECESSARY TO INITIALIZZE the OUTPUT graident for vector data
y.grad = np.ones((2, 3), dtype = np.float32)
y.backward()
print "gradient of x w.r.t y"
print x.grad

forward computation of y
[[  4.   9.  16.]
 [ 25.  36.  49.]]
gradient of x w.r.t y
[[  4.   6.   8.]
 [ 10.  12.  14.]]


In [4]:
## parameterized functions - forward and backward


f = functions.Linear(3, 2) ## inputsize = 3, outputsize = 2
## parameters W, b are initalized in specific way
print "initialized parameters"
print f.W
print f.b
## forward
y = f(x)
print y.data
## backward, w.r.t parameters
y.grad = np.ones(y.data.shape, dtype = np.float32)
f.gW.fill(0)
f.gb.fill(0)
y.backward()
print f.gW
print f.gb

initialized parameters
[[-0.38723999 -0.42784363 -0.77418971]
 [-0.16656621  0.7822451  -0.05404196]]
[ 0.  0.]
[[-3.56549644  1.23579812]
 [-8.3333168   2.92070913]]
[[ 5.  7.  9.]
 [ 5.  7.  9.]]
[ 2.  2.]


In [5]:
## set of functions - wrapping parameters in a unified interface with optimizers

model = FunctionSet(
    l1 = functions.Linear(4, 3),
    l2 = functions.Linear(3, 2)
)
## layers starting from l1, ...
model.l3 = functions.Linear(2, 2)
## design matrix representing minibatch data
x = Variable(np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype = np.float32))
## forward calculation, layer by layer
h1 = model.l1(x)
h2 = model.l2(h1)
y = model.l3(h2)
print y.data

[[ 1.93210864 -0.59107018]
 [ 4.9927206  -1.47177756]]


In [6]:
## model working with optimizers

## connect with parameters
optimizer = optimizers.SGD()
optimizer.setup(model.collect_parameters())
## zeroize every gradients via optimizer now
optimizer.zero_grads()

### MLP 

mlp with three hidden layers by ReLU activations, working on mnist classification

- same logic as with theano - wrapper objects (minions) around numpy/cuda array, which supports backpropagation via dependency network; as well as a set of functions that can be applied to those objects
- richer support for build-in functions
- a model is a chain of parameterized functions, and everythign, including inputs, outputs and parameters are chainer variables. optimizers decide the way of using those gradients.

In [7]:
import numpy
numpy.show_config()

lapack_opt_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    define_macros = [('HAVE_CBLAS', None)]
    language = c
    runtime_library_dirs = ['/usr/local/lib']
blas_opt_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    define_macros = [('HAVE_CBLAS', None)]
    language = c
    runtime_library_dirs = ['/usr/local/lib']
openblas_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    define_macros = [('HAVE_CBLAS', None)]
    language = c
    runtime_library_dirs = ['/usr/local/lib']
openblas_lapack_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    define_macros = [('HAVE_CBLAS', None)]
    language = c
    runtime_library_dirs = ['/usr/local/lib']
blas_mkl_info:
  NOT AVAILABLE


In [16]:
#%env OMP_NUM_THREADS=1

env: OMP_NUM_THREADS=1


In [18]:
from sklearn import utils

nunits = 128

## 1. define the arthitecuter of model
model = FunctionSet(
      l1 = functions.Linear(784, nunits) # 784 input, 100 hidden
    , l2 = functions.Linear(nunits, nunits) # another layer of 100 hidden
    , l3 = functions.Linear(nunits, 10) # 10 output - recommended to be always linear
)


## 2. you need to do the forward calculation manually, as a price of being flexible
## Note activation is not part of model in chainer, as they dont have any params
def forward(model, x_data, y_data, train = True):
    """
    x_data, y_data: numpy array (or cuda array), design matrix format
    """
    x = Variable(x_data.astype(np.float32))
    t = Variable(y_data.astype(np.int32))
    h1 = functions.leaky_relu(model.l1(x)) # no way of iterating all layers??
    h2 = functions.leaky_relu(model.l2(h1))
    y = model.l3(h2)
    #print y.data.dtype, t.data.dtype
    cost = functions.softmax_cross_entropy(y, t)
    return cost, functions.accuracy(y, t) ## both cost and accuracy work on SCORE of class

## 3. set an optimizer
opt= optimizers.SGD()#optimizers.Adam()#
opt.setup(model.collect_parameters())

## 4. learning loop with (1) forward cal, (2) backward cal, and (3) optimizer's update
batch_size = 100
for epoch in xrange(6):
    index = utils.shuffle(xrange(train_X.shape[0]))
    for b in xrange(0, train_X.shape[0], batch_size):
        batchx, batchy = train_X[b:b+batch_size, :], train_y[b:b+batch_size]
        ## forward calculation
        cost, acc = forward(model, batchx, batchy)
        ## backward calculation
        opt.zero_grads() ## preventing accumulating
        cost.backward() 
        ## parameter updates
        opt.update()
    if (epoch % 5 == 0):
        print 'epoch', epoch, 
        _, train_acc = forward(model, train_X, train_y)
        _, valid_acc = forward(model, valid_X, valid_y)
        print "train accuracy %g, validation accuracy %g" % (train_acc.data, valid_acc.data)
    
## prediction and test on new data
_, test_acc = forward(model, test_X, test_y, train = False)
print "accuracy on test data", test_acc.data

epoch 0 train accuracy 0.8248, validation accuracy 0.8487
epoch 5 train accuracy 0.91334, validation accuracy 0.9194
accuracy on test data 0.919799983501


In [9]:
x = Variable(test_X)
h1 = functions.leaky_relu(model.l1(x))
h2 = functions.leaky_relu(model.l2(h1))
y = model.l3(h2)

### Recurrent NN

Recurrent NN for variable length sequential modelling

text frrom ["a whiting and a snail" from "Alice in wonder land"](http://www.durrant.co.uk/alice/)

In [10]:
text = r"""‘Will you walk a little faster?’ said a whiting to a snail. ‘There’s a porpoise close behind us, and he’s treading on my tail. See how eagerly the lobsters and the turtles all advance! They are waiting on the shingle - will you come and join the dance? Will you, won’t you, will you, won’t you, will you join the dance? Will you, won’t you, will you, won’t you, won’t you join the dance?
‘You can really have no notion how delightful it will be When they take us up and throw us, with the lobsters, out to sea!’ But the snail replied ‘Too far, too far!’ and gave a look askance - Said he thanked the whiting kindly, but he would not join the dance. Would not, could not, would not, could not, would not join the dance. Would not, could not, would not, could not, could not join the dance.
‘What matters it how far we go?’ his scaly friend replied. ‘There is another shore, you know, upon the other side. The further off from England the nearer is to France - Then turn not pale, beloved snail, but come and join the dance. Will you, won’t you, will you, won’t you, will you join the dance? Will you, won’t you, will you, won’t you, won’t you join the dance?’
"""

text = r"""Mary had a little lamb,
Little lamb, little lamb,
Mary had a little lamb,
Its fleece was white as snow

And everywhere that Mary went,
Mary went, Mary went,
Everywhere that Mary went
The lamb was sure to go

It followed her to school one day
School one day, school one day
It followed her to school one day
Which was against the rules.

It made the children laugh and play,
Laugh and play, laugh and play,
It made the children laugh and play
To see a lamb at school

And so the teacher turned it out,
Turned it out, turned it out,
And so the teacher turned it out,
But still it lingered near

And waited patiently about,
Patiently about, patiently about,
And waited patiently about
Till Mary did appear

"Why does the lamb love Mary so?"
Love Mary so? Love Mary so?
"Why does the lamb love Mary so?"
The eager children cry

"Why, Mary loves the lamb, you know."
Loves the lamb, you know, loves the lamb, you know
"Why, Mary loves the lamb, you know."
The teacher did reply
"""
import re 
pat = re.compile("\w+")
word_seq = [w.lower() for w in pat.findall(text)] * 50
voc = np.unique(word_seq)
print len(voc), len(word_seq)

57 9150


In [11]:
## hashing (words to integers) by murmur hash
#import mmh3

w2i = dict([(w, i) for i, w in enumerate(voc)])
i2w = dict([(i, w) for w, i in w2i.items()])
hashed_word_seq = [w2i[w] for w in word_seq]

In [12]:
model = FunctionSet(
    embed = functions.EmbedID(len(voc), 20)
    , x_to_h = functions.Linear(20, 50)
    , h_to_h = functions.Linear(50, 50)
    , h_to_y = functions.Linear(50, len(voc))
)

def forward_one_step(model, h, cur_word, next_word):
    word = Variable(np.array([cur_word], dtype=np.int32))
    t = Variable(np.array([next_word], dtype=np.int32))
    x = functions.tanh(model.embed(word))
    h = functions.tanh(model.x_to_h(x) + model.h_to_h(h))
    y = model.h_to_y(h)
    cost = functions.softmax_cross_entropy(y, t)
    return h, cost, y

def forward(model, words):
    h = Variable(np.zeros(( 1, 50), dtype = np.float32))
    cost = 0
    ys = []
    for cur_word, next_word in zip(words[:-1], words[1:]):
        h, cur_cost, y = forward_one_step(model, h, cur_word, next_word)
        cost += cur_cost
        ys.append(y.data.argmax())
    return cost * 1. / len(words), ys ## averaged softmax cross entropy

def predict(model, start_word, nsteps):
    h = Variable(np.zeros((1, 50), dtype=np.float32))
    word = start_word#Variable(np.array([start_word], dtype=np.int32))
    predicted_words = []
    for i in xrange(nsteps):
        h, _, next_word_var = forward_one_step(model, h, word, word)
        next_word = next_word_var.data.argmax()
        predicted_words.append(next_word)
        word = next_word#Variable(np.array([next_word], dtype=np.int32))
    return predicted_words



In [13]:

optimizer = optimizers.SGD()
optimizer.setup(model.collect_parameters())

for epoch in xrange(50):
    optimizer.zero_grads()
    cost, ys = forward(model, hashed_word_seq)
    cost.backward()
    optimizer.update()
    if epoch % 20 == 0:
        print epoch, cost.data

0 4.27738209623


KeyboardInterrupt: 

In [17]:
" ".join([i2w[i] for i in predict(model, w2i["mary"], 100)])

'go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure that went go why loves sure'