Following the [`theano` Tutorial](http://deeplearning.net/software/theano/tutorial/index.html)

In [1]:
from theano import *
import theano.tensor as T

## Baby Steps - Algebra
### Adding two Scalars

In [2]:
import numpy
from theano import function

In [3]:
x = T.dscalar('x')
y = T.dscalar('y')
z = x+y
f = function([x,y],z)

In [7]:
print type(x), type(y), type(z), type(f) # good to know what these 
# new classes are in theano

<class 'theano.tensor.var.TensorVariable'> <class 'theano.tensor.var.TensorVariable'> <class 'theano.tensor.var.TensorVariable'> <class 'theano.compile.function_module.Function'>


In [8]:
f(2,3)

array(5.0)

In [9]:
numpy.allclose(f(16.3,12.1),28.4)

True

In [10]:
x.type

TensorType(float64, scalar)

In [11]:
T.dscalar

TensorType(float64, scalar)

In [12]:
x.type is T.dscalar

True

"Prefer constructors like `matrix, vector` and `scalar` to `dmatrix, dvector` and `dscalar` because the former will give you `float32` variables when `floatX=float32`." - cf. [Using the GPU Theano tutorial](http://deeplearning.net/software/theano/tutorial/using_gpu.html)

In [13]:
xf = T.scalar('xf')
yf = T.scalar('yf')
zf = xf + yf
ff = function([xf,yf],zf)

In [14]:
from theano import pp

In [15]:
print(pp(z))

(x + y)


In [16]:
print(pp(zf))

(xf + yf)


In [17]:
x = T.dmatrix('x')
y = T.dmatrix('y')
z = x + y 
f = function([x,y],z)

In [18]:
f([[1,2],[3,4]], [[10,20],[30,40]])

array([[ 11.,  22.],
       [ 33.,  44.]])

In [19]:
f(numpy.array([[1,2],[3,4]]),numpy.array([[10,20],[30,40]]))

array([[ 11.,  22.],
       [ 33.,  44.]])

In [20]:
xf = T.matrix('xf')
xy = T.matrix('yf')
zf = xf + yf
ff = function([xf,yf],zf)

In [22]:
ff([[1,2],[3,4]], [[10,20],[30,40]])

TypeError: ('Bad input argument to theano function with name "<ipython-input-20-f711db7fb599>:4"  at index 1(0-based)', 'Wrong number of dimensions: expected 0, got 2 with shape (2, 2).')

Adding exercise 1, cf. http://deeplearning.net/software/theano/tutorial/adding.html

In [25]:
a = theano.tensor.vector()
b = theano.tensor.vector()
out = a**2 + b**2 + 2 * a * b
f = theano.function([a,b],out)
print(f([1,2],[4,5]))

[ 25.  49.]


"At this point it would be wise to begin familiarizing yourself more systematically with Theano’s fundamental objects and operations by browsing this section of the library: [Basic Tensor Functionality](http://deeplearning.net/software/theano/library/tensor/basic.html#libdoc-basic-tensor)." cf. [More Examples](http://deeplearning.net/software/theano/tutorial/examples.html)

### Custom tensor types

In [28]:
dtensor5 = T.TensorType('float64', (False,)*5)

In [29]:
x = dtensor5()
z = dtensor5('z')

In [32]:
my_dmatrix = T.TensorType('float64', (False,)*2)
x = my_dmatrix()
my_dmatrix == T.dmatrix

True

### Converting from Python Objects

In [33]:
x = shared(numpy.random.randn(3,4))

Back to More Examples... http://deeplearning.net/software/theano/tutorial/examples.html

In [34]:
x = T.dmatrix('x')
s = 1 / ( 1 + T.exp(-x))
logistic = theano.function([x],s)
logistic([[0,1],[-1,-2]])

array([[ 0.5       ,  0.73105858],
       [ 0.26894142,  0.11920292]])

\begin{gathered}
s(x) = \frac{1}{1+\exp{-x} } = \frac{1+\tanh{(x/2) } }{2}
\end{gathered}

In [35]:
s2 = (1 + T.tanh(x/2))/2
logistic2 = theano.function([x],s2)
logistic2([[0,1],[-1,-2]])

array([[ 0.5       ,  0.73105858],
       [ 0.26894142,  0.11920292]])

## Computing More than one Thing at the Same Time (!!!)

In [36]:
a,b = T.dmatrices('a','b')
diff = a-b
abs_diff = abs(diff)
diff_squared = diff**2
f = theano.function([a,b],[diff,abs_diff,diff_squared])

In [37]:
f([[1,1],[1,1]], [[0,1],[2,3]])

[array([[ 1.,  0.],
        [-1., -2.]]), array([[ 1.,  0.],
        [ 1.,  2.]]), array([[ 1.,  0.],
        [ 1.,  4.]])]

## Setting a Default Value for an Argument

In [38]:
from theano import In
from theano import function
x,y = T.dscalars('x','y')
z = x + y
f= function([x,In(y,value=1)],z)

In [39]:
f(33)

array(34.0)

In [40]:
f(33,2)

array(35.0)

"Inputs with default values must follow inputs without default values (like Python’s functions). There can be multiple inputs with default values. These parameters can be set positionally or by name, as in standard Python:"

In [42]:
x,y,w = T.dscalars('x', 'y', 'w')
z = (x+y)*w
f = function([x,In(y,value=1),In(w,value=2,name='w_by_name')],z)
f(33)

array(68.0)

In [43]:
f(33,2)

array(70.0)

In [44]:
f(33,0,1)

array(33.0)

In [45]:
f(33,w_by_name=1)

array(34.0)

In [46]:
f(33,w_by_name=1,y=0)

array(33.0)

## Using Shared Variables

In [47]:
from theano import shared
state = shared(0)
inc = T.iscalar('inc')
accumulator = function([inc],state,updates=[(state,state+inc)])

In [48]:
print(state.get_value())

0


In [49]:
accumulator(1)

array(0)

In [50]:
print(state.get_value())

1


In [51]:
accumulator(300)

array(1)

In [52]:
print(state.get_value())

301


"It is possible to reset the state.  Just use the `.set_value()` method:"

In [53]:
state.set_value(-1)

In [54]:
accumulator(3)

array(-1)

In [55]:
print(state.get_value())

2


In [56]:
decrementor = function([inc],state, updates=[(state,state-inc)])

In [57]:
decrementor(2)

array(2)

In [58]:
print(state.get_value())

0


"Also, Theano has more control over where and how shared variables are allocated, which is one of the important elements of getting good performance on the GPU."

In [60]:
fn_of_state = state * 2 + inc
# The type of foo must match the shared variable we are replacing
# with the "givens"
foo = T.scalar(dtype=state.dtype)
skip_shared = function([inc,foo], fn_of_state, givens=[(state,foo)])
skip_shared(1,3)

array(7)

In [61]:
print(state.get_value())

0


## Copying functions

In [64]:
inc = T.iscalar('inc')
accumulator = theano.function([inc],state, updates=[(state,state+inc)])
accumulator(10)

array(0)

In [65]:
print(state.get_value())

10


"We can use `copy()` to create a similar accumulator but with its own internal state using the swap parameter, which is a dictionary of shared variables to exchange:"

In [66]:
new_state = theano.shared(0)
new_accumulator = accumulator.copy(swap={state:new_state})
new_accumulator(100)

[array(0)]

In [67]:
print(new_state.get_value())

100


In [68]:
print(state.get_value())

10


In [69]:
null_accumulator = accumulator.copy(delete_updates=True)
null_accumulator(9000)

UnusedInputError: theano.function was asked to create a function computing outputs given certain inputs, but the provided input variable at index 0 is not part of the computational graph needed to compute the outputs: inc.
To make this error into a warning, you can pass the parameter on_unused_input='warn' to theano.function. To disable it completely, use on_unused_input='ignore'.

In [70]:
print(state.get_value())

10


## Using Random Numbers

In [71]:
from theano.tensor.shared_randomstreams import RandomStreams
from theano import function
srng = RandomStreams(seed=234)
rv_u = srng.uniform((2,2)) # represents a random stream of 2x2 matrices
rv_n = srng.normal((2,2))
f = function([], rv_u)
g = function([], rv_n, no_default_updates=True) # Not updating rv_n.rng
nearly_zeros = function([],rv_u+rv_u - 2 * rv_u)

"The RandomStream only work on the CPU, MRG31k3p work on the CPU and GPU. CURAND only work on the GPU." cf. http://deeplearning.net/software/theano/tutorial/examples.html#other-implementations

In [74]:
from theano.sandbox.rng_mrg import MRG_RandomStreams

In [76]:
from theano.sandbox.cuda import CURAND_RandomStreams

In [77]:
f_val0 = f()

In [78]:
f_val1 = f()

"When we add the extra argument `no_default_updates=True` to function (as in `g`), then the random number generator state is not affected by calling the returned function. So, for example, calling `g` multiple times will return the same numbers."

In [79]:
g_val0 = g() # different numbers from f_val0 and f_val1

In [80]:
g_val1 = g()

"An important remark is that a random variable is drawn at most once during any single function execution. So the nearly_zeros function is guaranteed to return approximately 0 (except for rounding error) even though the `rv_u` random variable appears three times in the output expression."

### Seeding Streams

In [82]:
rng_val = rv_u.rng.get_value(borrow=True) # Get the ring for rv_u
rng_val.seed(89234) # seeds the generator
rv_u.rng.set_value(rng_val, borrow=True) # Assign back seeded rng

In [83]:
srng.seed(902340) # seeds rv_u and rv_n with different seeds each

### Sharing Streams Between Functions

In [84]:
state_after_v0 = rv_u.rng.get_value().get_state()

In [85]:
nearly_zeros()  # this affects rv_u's generator

array([[ 0.,  0.],
       [ 0.,  0.]])

In [86]:
v1 = f()
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_u.rng.set_value(rng,borrow=True)

In [87]:
v2 =f() # v2 != v1

In [88]:
v3=f() # v3 == v1

In [90]:
v2.view()

array([[ 0.33919835,  0.85344878],
       [ 0.14881562,  0.79659413]])

In [91]:
v1.view()

array([[ 0.5025809 ,  0.99544429],
       [ 0.75073355,  0.17926032]])

In [92]:
v3.view()

array([[ 0.5025809 ,  0.99544429],
       [ 0.75073355,  0.17926032]])

### Copying Random State Between Theano Graphs

In [93]:
from __future__ import print_function
from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.tensor.shared_randomstreams import RandomStreams

In [94]:
class Graph():
    def __init__(self, seed=123):
        self.rng = RandomStreams(seed)
        self.y = self.rng.uniform(size=(1,))

In [96]:
g1 = Graph(seed=123)
f1 = theano.function([], g1.y)
g2 = Graph(seed=987)
f2 = theano.function([], g2.y)

# By default, the two functions are out of sync.
f1()

array([ 0.72803009])

In [97]:
f2()

array([ 0.55056769])

In [98]:
def copy_random_state(g1,g2):
    if isinstance(g1.rng, MRG_RandomStreams):
        g2.rng.rstate = g1.rng.rstate
    for (su1, su2) in zip(g1.rng.state_updates, g2.rng.state_updates):
        su2[0].set_value(su1[0].get_value())

In [99]:
# We now copy the state of the theano random number generators.
copy_random_state(g1, g2)
f1()

array([ 0.59044123])

In [100]:
f2()

array([ 0.59044123])

### Other Random Distributions

are found here at [other distributions implemented](http://deeplearning.net/software/theano/library/tensor/raw_random.html#libdoc-tensor-raw-random)

# A Real Example: Logistic Regression


In [101]:
import numpy
import theano
import theano.tensor as T
rng = numpy.random

N = 400      # training sample size
feats = 784  # number of input variables

# generate a dataset: D = (input_values, target_class)
D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2))
training_steps = 10000

# Declare Theano symbolic variables
x = T.dmatrix("x")
y = T.dvector("y")

# initialize the weight vector w randomly
#
# this and the following bias variable b
# are shared so they keep their values
# between training iterations (updates)
w = theano.shared(rng.randn(feats), name="w")

# initialize the bias term
b = theano.shared(0., name="b")

print("Initial model:")
print(w.get_value())
print(b.get_value())

# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))  # Probability that target =  1
prediction = p_1 > 0.5                   # The prediction thresholded
xent = -y * T.log(p_1) - (1-y * T.log(1-p_1)) # Cross-entropy loss function
cost = xent.mean() + 0.01 * ( w** 2).sum() # The cost to minimize
gw, gb = T.grad(cost, [w,b])             # Compute the gradient of the cost
                                    # w.r.t weight vector w and 
                                     # bias term b
                                    # (we shall return to this in a 
                                    # following section of this tutorial)

# Compile
train = theano.function(
            inputs=[x,y],
            outputs=[prediction, xent],
            updates=((w,w-0.1 *gw), (b,b-0.1 * gb)))
predict = theano.function(inputs=[x], outputs=prediction)

# Train
for i in range(training_steps):
    pred, err = train(D[0], D[1])
    
print("Final model:")
print(w.get_value())
print(b.get_value())
print("target values for D:")
print(D[1])
print("prediction on D:")
print(predict(D[0]))

                
                


Initial model:
[  1.02090375e+00   8.73947142e-01   1.95614041e+00   7.07377428e-01
  -6.53816172e-01   7.42545996e-01  -9.80754517e-01   5.61128882e-01
   1.78775715e+00   2.55831279e-01  -8.67444321e-02   7.94001501e-01
  -6.70527935e-01  -1.96668538e+00   2.54200412e+00   8.89567452e-01
   9.74970709e-01   8.48005589e-01  -6.09636316e-01   8.50633037e-01
   1.68271257e+00   4.93346994e-01  -5.51691867e-02   2.37368331e-01
   1.62746146e-01   1.55918000e+00  -8.34594814e-01   1.38944861e+00
  -1.23368104e+00  -3.93048716e-01   1.05566395e+00  -1.31731246e+00
   7.08793200e-01  -2.69903059e-01   1.59489513e+00  -5.41472996e-01
  -1.51657778e+00  -1.74491648e+00  -4.87264078e-01  -1.40829581e+00
   9.16277590e-01   1.13288831e-01  -8.15545784e-01   3.63391354e-01
   4.59170798e-01  -4.92534498e-01   1.59924371e-01  -5.79592495e-01
   8.28797390e-01   1.01823000e+00   9.47734065e-01   1.07973174e+00
   4.30089731e-02  -8.06497075e-02   8.95636221e-01  -1.18478568e-02
  -5.81466361e-01  

## [Derivatives in Theano](http://deeplearning.net/software/theano/tutorial/gradients.html)

### Computing Gradients

In [1]:
import numpy
import theano
import theano.tensor as T
from theano import pp

For this 

$
\begin{gathered}
\frac{d (x^2) }{ dx} = 2 \cdot x     
\end{gathered}
$


In [2]:
x = T.dscalar('x')
y = x ** 2
gy = T.grad(y,x)
pp(gy) # print out the gradient prior to optimization

'((fill((x ** TensorConstant{2}), TensorConstant{1.0}) * TensorConstant{2}) * (x ** (TensorConstant{2} - TensorConstant{1})))'

`fill((x ** 2), 1.0)` means to make a matrix of the same shape as `x ** 2 ` and fill it with `1.0`

In [3]:
f = theano.function([x],gy)
f(4)

array(8.0)

In [4]:
numpy.allclose(f(94.2), 188.4)

True

### A plot of the gradient of the logistic function, with x on the x-axis and $ds(x)/dx$ on the y-axis

In [6]:
x = T.dmatrix('x')
s = T.sum(1 / (1 + T.exp(-x)))
gs = T.grad(s, x)
dlogistic = theano.function([x], gs)
dlogistic([[0, 1], [-1, -2]])

array([[ 0.25      ,  0.19661193],
       [ 0.19661193,  0.10499359]])

### Computing the Jacobian

In [7]:
x = T.dvector('x')
y = x ** 2
J, updates = theano.scan(lambda i, y, x: T.grad(y[i], x), sequences=T.arange(y.shape[0]), non_sequences=[y,x] )
f = theano.function([x], J, updates=updates)
f([4, 4])

array([[ 8.,  0.],
       [ 0.,  8.]])

### Computing the Hessian

In [8]:
x = T.dvector('x')
y = x ** 2
cost = y.sum()
gy = T.grad(cost, x)
H, updates = theano.scan(lambda i, gy, x : T.grad(gy[i], x), sequences=T.arange(gy.shape[0]), non_sequences=[gy, x] )
f = theano.function([x], H, updates=updates)
f([4,4])

array([[ 2.,  0.],
       [ 0.,  2.]])

### R-operator

In [9]:
W = T.dmatrix('W')
V = T.dmatrix('V')
x = T.dvector('x')
y = T.dot(x,W)
JV = T.Rop(y, W, V)
f = theano.function([W,V,x],JV)
f([[1,1], [1,1]],[[2,2],[2,2]],[0,1])

array([ 2.,  2.])

### L-operator

In [10]:
W = T.dmatrix('W')
v = T.dvector('v')
x = T.dvector('x')
y = T.dot(x,W)
VJ = T.Lop(y,W,v)
f = theano.function([v,x],VJ)
f([2,2],[0,1])

array([[ 0.,  0.],
       [ 2.,  2.]])

### Hessian times a Vector

In [11]:
x = T.dvector('x')
v = T.dvector('v')
y = T.sum(x ** 2)
gy = T.grad(y, x)
vH = T.grad(T.sum( gy * v), x)
f= theano.function([x,v], vH)
f([4,4], [2,2])

array([ 4.,  4.])

or, making use of the *R-operator*:

In [12]:
x = T.dvector('x')
v = T.dvector('v')
y = T.sum( x ** 2)
gy = T.grad(y,x)
Hv = T.Rop(gy, x, v)
f = theano.function([x,v], Hv)
f([4,4],[2,2])

array([ 4.,  4.])

## Conditions

In [13]:
from theano import tensor as T
from theano.ifelse import ifelse
import theano, time, numpy

a,b = T.scalars('a','b')
x,y = T.matrices('x','y')

In [14]:
z_switch = T.switch(T.lt(a,b), T.mean(x), T.mean(y))
z_lazy   = ifelse(T.lt(a, b), T.mean(x), T.mean(y))