## API tutorial

### Expression Building

(This tutorial is tested on DyNet 2.0.3+ and Python 2.7)

In [1]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import dynet_config
dynet_config.set(random_seed=0)
import dynet as dy

import numpy as np

## ==== Create a new computation graph
# (it is a singleton, we have one at each stage.
# dy.renew_cg() clears the current one and starts a new one)
dy.renew_cg();
# set random seed to have the same result each time


#### Create Expressions
Expressions are used as an interface to the various functions that can be used to build DyNet computation graphs. 

In [2]:
# create a scalar expression.
value = 5.0
x = dy.scalarInput(value)

In [3]:
# create a vector expression.
dimension = 3
v = dy.vecInput(dimension)
v.set([1,2,3])

In [4]:
# create a matrix expression from list
mat1 = dy.inputTensor([[1,2], [3,4]]) # Row major

# or, using numpy array
mat2 = dy.inputTensor(np.array([[1,2], [3,4]]))

mat3 = dy.inputTensor(np.zeros((2,3)))

In [5]:
## ==== We can take the value of an expression. 
# For complex expressions, this will run forward propagation.
print(mat1.value())   
print(mat1.npvalue())    # as numpy array
print(v.vec_value())     # as vector, if vector
print(x.scalar_value())  # as scalar, if scalar
print(x.value())         # choose the correct one

[[1. 2.]
 [3. 4.]]
[[1. 2.]
 [3. 4.]]
[1.0, 2.0, 3.0]
5.0
5.0


#### Create Parameters
Parameters are things that are optimized. in contrast to a system like Torch where computational modules may have their own parameters, in DyNet parameters are just parameters.

In [6]:
# Parameters are things we tune during training.
# Usually a matrix or a vector.

# First we create a parameter collection and add the parameters to it.
m = dy.ParameterCollection() 
W = m.add_parameters((8,8)) # an 8x8 matrix
b = m.add_parameters(8) # an 8x1 vector

It should be noticed that in DyNet 2.0+ and later version, the dy.parameters() is depecated so explicitly adding parameters to the computation graph is no longer necessary. Any used parameter will be added automatically.

In [7]:
# There are several ways to initial parameters
# Specifiying parameter initialization
scale = 1
mean = 0
stddev = 1

# Creates 3x5 matrix filled with 0 (or any other float)
p = m.add_parameters((3,5), init=0)
# Creates 3x5 matrix initialized with U([-scale, scale])
p = m.add_parameters((3,5), init='uniform', scale=scale)
# Creates 3x5 matrix initialized with N(mean, stddev)
p = m.add_parameters((3,5), init='normal', mean=mean, std=stddev)
# Creates 5x5 identity matrix
p = m.add_parameters((5,5), init='identity')
# Creates 3x5 matrix with glorot init
p = m.add_parameters((3,5), init='glorot')
p = m.add_parameters((3,5)) # By default, the init = 'glorot'
# Creates 3x5 matrix with he init
p = m.add_parameters((3,5), init='he')
# Creates 3x5 matrix from a numpy array (size is inferred)
p = m.add_parameters((3,5), np.ones((3,5)))
# Creates 3x5 matrix from a numpy array (size is inferred)

#### Create LookupParameters
LookupParameters represents a table of parameters. They are used to embed a set of discrete objects (e.g. word embeddings). These are sparsely updated.

In [8]:
## ===== Lookup parameters
# Similar to parameters, but are representing a "lookup table"
# that maps numbers to vectors.
# These are used for embedding matrices.
# for example, this will have VOCAB_SIZE rows, each of DIM dimensions.
VOCAB_SIZE = 100
DIM = 10
lp = m.add_lookup_parameters((VOCAB_SIZE, DIM))

In [9]:
# Ceate expressions from lookup parameters.
e5  = dy.lookup(lp, 5)   # create an Expression from row 5.
e5  = lp[5]              # same
e5c = dy.lookup(lp, 5, update=False)  # as before, but don't update when optimizing.

e45  = dy.lookup_batch(lp, [4, 5])   # create a batched Expression from rows 4 and 5.
e45  = lp.batch([4, 5])
print('e45 dim:', e45.dim())

e0_9 = dy.lookup_batch(lp, range(10))  # create a batched Expression from rows 0 to 9
e0_9 = lp.batch(range(10))
print('e0_9 dim:', e0_9.dim())

e5.set(10)  # now the e5 expression contains row 10
print('e5 dim after applying set method', e5.dim())
print(e5.value())

# We can check if it is actually containing row 10
e10 = lp[10]
print(e5.value() == e10.value())

e45 dim: ((10,), 2)
e0_9 dim: ((10,), 10)
e5 dim after applying set method ((10,), 1)
[0.034212298691272736, -0.10658015310764313, 0.07870276272296906, 0.14834508299827576, 0.05688869580626488, 0.16793382167816162, 0.20482590794563293, 0.21452514827251434, -0.021544886752963066, 0.20709896087646484]
True


In [10]:
# Similar to Parameters, we have several ways to
# initialize LookupParameters.
scale = 1
mean = 0
stddev = 1

# Creates 3x5 matrix filled with 0 (or any other float)
p = m.add_lookup_parameters((3,5), init=0)
# Creates 3x5 matrix initialized with U([-scale, scale])
p = m.add_lookup_parameters((3,5), init='uniform', scale=scale)
# Creates 3x5 matrix initialized with N(mean, stddev)
p = m.add_lookup_parameters((3,5), init='normal', mean=mean, std=stddev)
# Creates 5x5 identity matrix
p = m.add_lookup_parameters((5,5), init='identity')
# Creates 3x5 matrix with glorot init
p = m.add_lookup_parameters((3,5), init='glorot')
p = m.add_parameters((3,5)) # By default, the init = 'glorot'
# Creates 3x5 matrix with he init
p = m.add_lookup_parameters((3,5), init='he')
# Creates 3x5 matrix from a numpy array (size is inferred)
p = m.add_lookup_parameters((3,5), np.ones((3,5)))
# Creates 3x5 matrix from a numpy array (size is inferred)

#### Expression Manipulation
DyNet provides tons of operations on Expression. User can manipulate Expressions, build complex Expression easily.

In [11]:
# Fist we create some vector Expressions.
e1 = dy.vecInput(4)
e1.set([1, 2, 3, 4])

e2 = dy.vecInput(4)
e2.set([5, 6, 7, 8])

mat1 = dy.inputTensor(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]))  # A 4x2 matrix
mat2 = dy.inputTensor(np.array([[1, 0], [0, 1]]))  # A 4x4 matrix

In [12]:
# Math Operations

# Add
e = e1 + e2  # Element-wise addition
print(e.value())  # Should be [6.0, 8.0, 10.0, 12.0]

# Minus
e = e2 - e1 # Element-wise minus
print(e.value())  # Should be [4.0, 4.0, 4.0, 4.0]
# Negative
e = -e1  # Should be [-1.0, -2.0, -3.0, -4.0]
print(e.value())

# Multiply
e = e1 * dy.transpose(e1)  #It's Matrix multiplication (like e1.dot(e2) in numpy)
print(e.value())

mat = mat1 * mat2
print(mat.value())

# Dot product
e = dy.dot_product(e1, e2)  # dot product = sum(component-wise multiply)
print(e.value())

# Component-wise multiply
e = dy.cmult(e1, e2)
print(e.value())

# Component-wise division
e = dy.cdiv(e1, e2)
print(e.value())

# Column-wise addition
# colwise_add(x, y)
#  x:  An MxN matrix
#  y:  A length M vector
mat = dy.colwise_add(mat1, e1)  # column-wise addition
print(mat.value())

[6.0, 8.0, 10.0, 12.0]
[4.0, 4.0, 4.0, 4.0]
[-1.0, -2.0, -3.0, -4.0]
[[ 1.  2.  3.  4.]
 [ 2.  4.  6.  8.]
 [ 3.  6.  9. 12.]
 [ 4.  8. 12. 16.]]
[[1. 2.]
 [3. 4.]
 [5. 6.]
 [7. 8.]]
70.0
[5.0, 12.0, 21.0, 32.0]
[0.20000000298023224, 0.3333333432674408, 0.4285714328289032, 0.5]
[[ 2.  3.]
 [ 5.  6.]
 [ 8.  9.]
 [11. 12.]]


In [13]:
# Matrix Shapes

# Reshape
new_dimension = (2, 2)
e = dy.reshape(e1, new_dimension)  # Col major
print(e.value())

# Transpose
e = dy.transpose(e1)
print('e1 dimension:', e1.dim())
print('e1 transpose dimension', e.dim())

[[1. 3.]
 [2. 4.]]
e1 dimension: ((4,), 1)
e1 transpose dimension ((1, 4), 1)


In [14]:
# Per-element unary functions.

# exp()
e = dy.exp(e1)

# sin()
e = dy.sin(e1)

# cos()
e =dy.cos(e1)

# tan()
e = dy.tan(e1)

# asin()
e = dy.asin(e1)

# acos()
e = dy.acos(e1)

# atan()
e = dy.atan(e1)

# sinh()
e = dy.sinh(e1)

# cosh()
e = dy.cosh(e1)

# tanh()
e = dy.tanh(e1)

# log()
e = dy.log(e1)

# sigmoid()
e = dy.logistic(e1)   # Sigmoid(x)

# relu()
e = dy.rectify(e1)    # Relu (= max(x,0))

# softsign()
e = dy.softsign(e1)    # x/(1+|x|)

# softmax
e = dy.softmax(e1)
print(e.value())

# log_softmax
# logsoftmax = logits - log(reduce_sum(exp(logits), dim))
# restrict is a set of indices. if not empty, only entries 
# in restrict are part of softmax computation, others get -inf.
e_log_softmax = dy.log_softmax(e1)
e_log_softmax = dy.log_softmax(e1, restrict=[0,1,2])
print(e_log_softmax.value())

[0.032058604061603546, 0.08714432269334793, 0.23688283562660217, 0.6439142823219299]
[-2.4076058864593506, -1.4076058864593506, -0.4076058864593506, -inf]


In [15]:
# Picking values from vector expressions

k = 1
v = 3
# Pick one element from a vector or matrix
# similar to python's e1[k] for list.
# k can be negative, which has exactly the same behavior
# as it is in python
e = dy.pick(e1, k)
print('The {} element of vector is {}'.format(k+1, e.value())) # index starts from 0
# which is also equivalent to:
e = e1[k]
# k can be negative. -1 means the last element
e = e1[-1]
print(e.value())

mat = dy.pick(mat1, k)
print('The {} element of matrix mat1 is {}'.format(k+1, mat.value()))
# which is equivalent to:
mat = mat1[k]

# Pick several elements from a vector or matrix
# similar to python's e1[k:v] for lists. 
# e1 is an Expression, k, v are integers.
# Important: v should not exceed the e1's dimension.
e = dy.pickrange(e1, k, v)
print('Pick range[k, v) from a vector', e.value())
# which is also equivalent to:
e = e1[k:v]
e = e1[:v]  # similar to python, you can neglect k
e = e1[:]   # or even both k and v
print(e.value())
# ERROR: Don't try this
# e = e1[0:10], the v value should not exceed the dimension.

mat = dy.pickrange(mat1, k, v)
print('Pick range[k, v) from a matrix', mat.value())

# Pick negative log_softmax
# which is equivalent to: dy.pick(-dy.log(dy.softmax(e1)), k)
e = dy.pickneglogsoftmax(e1, k)
e_ = dy.pick(-dy.log(dy.softmax(e1)), k)
print(e.value())
print(e_.value())

The 2 element of vector is 2.0
4.0
The 2 element of matrix mat1 is [3.0, 4.0]
Pick range[k, v) from a vector [2.0, 3.0]
[1.0, 2.0, 3.0, 4.0]
Pick range[k, v) from a matrix [[3. 4.]
 [5. 6.]]
2.44018983841
2.44018959999


In [16]:
# Expressions concatenation & other useful manipuulations

# This performs an elementwise sum over all the expressions included.
# All expressions should have the same dimension.
e = dy.esum([e1, e2])
# which is equivalent to:
e_ = e1 + e2
assert e.value() == e_.value()

# This performs an elementwise average over all the expressions included.
# All expressions should have the same dimension.
e = dy.average([e1, e2])
# which is equivalent to:
e_ = (e1 + e2)/2
assert e.value() == e_.value()

# Concate vectors/matrix column-wise
# All expressions should have the same dimension.
# e1, e2,.. are column vectors. return a matrix. (sim to np.hstack([e1,e2,...])
e = dy.concatenate_cols([e1, e2])
print(e.value())

mat = dy.concatenate_cols([mat1, e2])
print(mat.value())

# Concate vectors/matrix
# All expressions should have the same dimension.
# e1, e2,.. are column vectors. return a matrix. (sim to np.hstack([e1,e2,...])
e = dy.concatenate([e1, e2])
print(e.value())

mat = dy.concatenate([mat2, mat2])
print(mat.value())

# affine transform
e0 = dy.vecInput(2)
e0.set([-1, 0])
e = dy.affine_transform([e1,mat1,e0])

print(e.value())

[[1. 5.]
 [2. 6.]
 [3. 7.]
 [4. 8.]]
[[1. 2. 5.]
 [3. 4. 6.]
 [5. 6. 7.]
 [7. 8. 8.]]
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
[[1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]]
[0.0, -1.0, -2.0, -3.0]


#### DyNet in Neural Networks
This part contains Neural Networks related issues.

In [17]:
# Noise and Dropout Expressions

# Add a noise to each element from a gausian distribution
# with standard-dev = stddev
stddev = 0.1
e = dy.noise(e1, stddev)
print(e.value())

# Apply dropout to the input expression
# There are two kinds of dropout methods 
# (http://cs231n.github.io/neural-networks-2)
# Dynet implement the Inverted dropout where dropout with prob p 
# and scaling others by 1/p at training time, and do not need 
# to do anything at test time. 
p = 0.5
e = dy.dropout(e1, p)    # apply dropout with probability p 
print(e.value()) # It should be [2.0, 4.0, 6.0, 0.0], the last element is dropped out and the rest are scaled

# If we set p=1, everything will be dropped out
e = dy.dropout(e1, 1)
print(e.value()) # Should be [nan, nan, ...]

# If we set p=0, everything will be kept
e = dy.dropout(e1, 0)
assert e.value() == e1.value()

[0.979181706905365, 1.8229671716690063, 2.9431982040405273, 4.195302963256836]
[2.0, 0.0, 6.0, 8.0]
[nan, nan, nan, nan]


In [18]:
# Loss Functions

# DyNet provides several ways to calculate "distance"
# between two expressions of the same dimension
# This is square_distance, defined as
# sum(square of(e1-e2)) for all elements
# in e1 and e2.
# Here e1 is a vector of [1,2,3,4]
# And e2 is a vector of [5,6,7,8]
# The square distance is sum((5-1)^2 + (6-2)^2+...)
e = dy.squared_distance(e1, e2)
print(e.value())

# This is the l1_distance, defined as 
# sum (abs(e1-e2)) for all elements in
# e1 and e2.
e = dy.l1_distance(e1, e2)
print(e.value())

# This is the huber_distance, definition 
# found here. (https://en.wikipedia.org/wiki/Huber_loss)
# The default threhold (delta) is 1.345.
# Here e1 is a vector of [1,2,3,4]
# And e2 is a vector of [5,6,7,8]
# because for each pair-wised element in
# e1 and e2, the abs(e1-e2)=4>delta=1.345,
# so the output is sum(delta*(abs(4)-1/2*delta))

e = dy.huber_distance(e1, e2, c=1.345)
print('huber distance:', e.value()) #TODO: has error here

# Binary logistic loss function
# This is similar to cross entropy loss function
# e1 must be a vector that takes values between 0 and 1
# ty must be a vector that takes values between 0 and 1
# e = -(ty * log(e1) + (1 - ty) * log(1 - e1))
ty = dy.vecInput(4)
ty.set([0, 0.5, 0.5, 1])
e_scale = ty = dy.vecInput(4)
e_scale.set([0.5, 0.5, 0.5, 0.5])
e = dy.binary_log_loss(e_scale, ty)
print(e.value())
# Te binary_log_loss is equivalent to the following:
e_equl = -(dy.dot_product(ty, dy.log(e_scale)) + dy.dot_product((dy.inputTensor([1,1,1,1]) - ty), dy.log(dy.inputTensor([1,1,1,1]) - e_scale)))
assert e_equl.value() == e.value()

# pairwise_rank_loss, a.k.a. Hinge loss
# e1 is row vector or scalar
# e2 is row vector or scalar
# m is number
# e = max(0, m - (e1 - e2))
e = dy.pairwise_rank_loss(dy.transpose(e1), dy.transpose(e2), m=1.0) # Row vector needed, so we transpose the vector.
print(e.value())  # Expect [[5. 5. 5. 5.]]

e = dy.pairwise_rank_loss(dy.transpose(e2), dy.transpose(e1), m=1.0) # Row vector needed, so we transpose the vector.
print(e.value())  # Expect [[0. 0. 0. 0.]]

64.0
16.0
huber distance: 35.8038978577
2.77258872986
[[5. 5. 5. 5.]]
[[0. 0. 0. 0.]]


In [19]:
# Convolutions

# DyNet can do convolutions similar to PyTorch.
# First we mock an image and a filter
# mat is a 3D tensor of dim{4,4,3}
# kernel is a 4d Tensor of shape {2,2,3,1}
mat = dy.inputTensor(np.array([[[1,2,1], [0,1,2], [0,0,1], [0,1,0]], [[1,0,2], [0,0,0], [1,1,1], [2,2,2]], [[0,1,2], [1,1,0], [0,0,1], [2,2,1]], [[2,2,0], [2,1,2], [2,2,1], [1,1,0]]]))
kernel = dy.inputTensor(np.array([[[[1], [0], [2]], [[1], [2], [0]]], [[[0], [1], [0]], [[2], [1], [1]]]]))
print(mat.dim(), kernel.dim())
# conv2d
# This is 2D convolution operator without bias parameters.
# dy.conv2d(Expression x, Expression f, vector[unsigned] stride, bool is_valid = True)
# x: The input feature maps: (H x W x Ci) x N (ColMaj), 3D tensor with an optional batch dimension
# f: 2D convolution filters: H x W x Ci x Co (ColMaj), 4D tensor
# stride: the row and column strides in a list
# is_valid: padding method. True for 'Valid' and False for 'Same'.
#     'Valid': output size shrinks by `filter_size - 1`, and the filters always sweep at valid 
#              positions inside the input maps. No padding needed.
#     'Same': output size is the same with input size. To do so, one needs to pad the input so 
#             the filter can sweep outside of the input maps.
e = dy.conv2d(mat, kernel, stride=[1, 1], is_valid=True)

# conv2d_bias
# This is 2D convolution operator with bias parameters.
# dy.conv2d_bias(Expression x, Expression f, Expression b, vector[unsigned] stride, bool is_valid = True)
# b: A vector representing bias. (Ci x 1)
b = dy.transpose(dy.inputTensor(np.array([-1, -1, -1])))
e = dy.conv2d_bias(mat, kernel, b, stride=[1, 1], is_valid=True)

((4, 4, 3), 1) ((2, 2, 3, 1), 1)


ValueError: Bad input dimensions in Conv2D: [{4,4,3} {2,2,3,1} {1,3}]

### Recipe

In [None]:
# Convolutions
# e1 \in R^{d x s} (input)
# e2 \in R^{d x m} (filter)
e = dy.conv1d_narrow(e1, e2) # e = e1 *conv e2
e = dy.conv1d_wide(e1, e2)   # e = e1 *conv e2
e = dy.filter1d_narrow(e1, e2) # e = e1 *filter e2

e = dy.kmax_pooling(e1, k) #  kmax-pooling operation (Kalchbrenner et al 2014)
e = dy.kmh_ngram(e1, k) # 
e = dy.fold_rows(e1, nrows=2) #

# create parameter collection
m = dy.ParameterCollection()

# add parameters to parameter collection
pW = m.add_parameters((10,30))
pB = m.add_parameters(10)
lookup = m.add_lookup_parameters((500, 10))
print "added"

# create trainer 
trainer = dy.SimpleSGDTrainer(m)

# Regularization is set via the --dynet-l2 commandline flag.
# Learning rate parameters can be passed to the trainer:
# alpha = 0.1  # learning rate
# trainer = dy.SimpleSGDTrainer(m, e0=alpha)

# function for graph creation
def create_network_return_loss(inputs, expected_output):
    """
    inputs is a list of numbers
    """
    dy.renew_cg()
    W = dy.parameter(pW) # from parameters to expressions
    b = dy.parameter(pB)
    emb_vectors = [lookup[i] for i in inputs]
    net_input = dy.concatenate(emb_vectors)
    net_output = dy.softmax( (W*net_input) + b)
    loss = -dy.log(dy.pick(net_output, expected_output))
    return loss

# function for prediction
def create_network_return_best(inputs):
    """
    inputs is a list of numbers
    """
    dy.renew_cg()
    W = dy.parameter(pW)
    b = dy.parameter(pB)
    emb_vectors = [lookup[i] for i in inputs]
    net_input = dy.concatenate(emb_vectors)
    net_output = dy.softmax( (W*net_input) + b)
    return np.argmax(net_output.npvalue())


# train network
for epoch in xrange(5):
    for inp,lbl in ( ([1,2,3],1), ([3,2,4],2) ):
        print inp, lbl
        loss = create_network_return_loss(inp, lbl)
        print loss.value() # need to run loss.value() for the forward prop
        loss.backward()
        trainer.update()

print create_network_return_best([1,2,3])


### Recipe (using classes)

In [None]:
import dynet as dy
# create parameter collection
m = dy.ParameterCollection()

# create a class encapsulating the network
class OurNetwork(object):
    # The init method adds parameters to the parameter collection.
    def __init__(self, pc):
        self.pW = pc.add_parameters((10,30))
        self.pB = pc.add_parameters(10)
        self.lookup = pc.add_lookup_parameters((500,10))
    
    # the __call__ method applies the network to an input
    def __call__(self, inputs):
        W = dy.parameter(self.pW)
        b = dy.parameter(self.pB)
        lookup = self.lookup
        emb_vectors = [lookup[i] for i in inputs]
        net_input = dy.concatenate(emb_vectors)
        net_output = dy.softmax( (W*net_input) + b)
        return net_output
    
    def create_network_return_loss(self, inputs, expected_output):
        dy.renew_cg()
        out = self(inputs)
        loss = -dy.log(dy.pick(out, expected_output))
        return loss
       
    def create_network_return_best(self, inputs):
        dy.renew_cg()
        out = self(inputs)
        return np.argmax(out.npvalue())
        
        
# create network
network = OurNetwork(m)

# create trainer 
trainer = dy.SimpleSGDTrainer(m)
   
# train network
for epoch in xrange(5):
    for inp,lbl in ( ([1,2,3],1), ([3,2,4],2) ):
        print inp, lbl
        loss = network.create_network_return_loss(inp, lbl)
        print loss.value() # need to run loss.value() for the forward prop
        loss.backward()
        trainer.update()

print
print network.create_network_return_best([1,2,3])


### or, alternatively, have the training outside of the network class

In [None]:
# create network
network = OurNetwork(m)

# create trainer 
trainer = dy.SimpleSGDTrainer(m)
   
# train network
for epoch in xrange(5):
    for inp,lbl in ( ([1,2,3],1), ([3,2,4],2) ):
        print inp, lbl
        dy.renew_cg()
        out = network(inp)
        loss = -dy.log(dy.pick(out, lbl))
        print loss.value() # need to run loss.value() for the forward prop
        loss.backward()
        trainer.update()

print
print np.argmax(network([1,2,3]).npvalue())