Copyright 2018 Chris Wendler

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
limitations under the License.

# Tensorflow with Sparse Tensors

This notebook implements a fully connected neural network layer parametrized by a sparse tensor. The layer takes a sparse tensor as input and outputs another sparse tensor. Note that for this toy implementation the nonzero indices of the input tensor must coincide with the ones of the weight tensor. The goal of this notebook is to show how to write custom operations with gradients for sparse tensors.

In [None]:
import numpy as np
import tensorflow as tf
#tf.enable_eager_execution()

## Dataset: XOR - problem

We create a dataset with sparse tensors (with fixed nonzero entries) as input and binary labels as output. Each label depends on the signs of the first two nonzero components of the sparse tensor: sample $x$ has label $[x_0 < 0] \mbox{ xor } [x_1 < 0]$.

In [2]:
model_dir = "/tmp/xor/model.ckpt"  # checkpoint path prefix handed to the Saver
NUM_EPOCHS = 100
BATCH_SIZE = 200
N = 5000  # number of samples
D = 1000  # dense dimensionality of every sample
# Positions of the 5 nonzero entries, shared by all samples. They are drawn
# without replacement and sorted: randint() could return the same position
# twice, which would produce duplicate sparse indices that the index matching
# in the custom dense op cannot handle. Stored as int64 because they are used
# as sparse tensor indices.
index = np.sort(np.random.choice(D, 5, replace=False)).reshape((5, 1)).astype(np.int64)
# One row of nonzero values per sample: the first four columns are uniform in
# [-1, 1), the last column keeps its initial value of 1.
values = np.ones((N, 5))
values[:, :4] = np.random.rand(N, 4) * 2 - 1
shape = [D]
# Label is 1 exactly when the signs of the first two values differ (XOR).
labels = ((values[:, 0] < 0) != (values[:, 1] < 0)).astype(np.int32)
print(values.shape, labels.shape, labels.sum() / N)

(5000, 5) (5000,) 0.5044


Now we wrap the data into a tf.data.Dataset, in order to feed it to our model later.

In [3]:
# The first 80% of the samples are used for training, the rest for testing.
n_train = int(0.8 * N)


def _one_hot(y):
    """Turn an integer class label into a 2-way one-hot vector."""
    return tf.one_hot(y, 2)


def _to_sparse(x, y):
    """Place one row of values at the fixed nonzero positions of a sparse tensor."""
    return tf.SparseTensor(index, x, shape), y


# train data
x_train = tf.data.Dataset.from_tensor_slices(values[:n_train])
y_train = tf.data.Dataset.from_tensor_slices(labels[:n_train]).map(_one_hot)
dataset = tf.data.Dataset.zip((x_train, y_train))
dataset = dataset.map(_to_sparse, 6)
dataset = dataset.shuffle(1000).repeat(NUM_EPOCHS).batch(BATCH_SIZE)
# test data
x_test = tf.data.Dataset.from_tensor_slices(values[n_train:])
y_test = tf.data.Dataset.from_tensor_slices(labels[n_train:]).map(_one_hot)
testset = tf.data.Dataset.zip((x_test, y_test))
testset = testset.map(_to_sparse, 6)
testset = testset.batch(BATCH_SIZE)
# data iterator: a single re-initializable iterator serves both datasets
iterator = tf.data.Iterator.from_structure(
    dataset.output_types,
    dataset.output_shapes,
    output_classes=(tf.SparseTensor, tf.Tensor),
)
next_element = iterator.get_next()
train_init_op = iterator.make_initializer(dataset)
test_init_op = iterator.make_initializer(testset)

## Custom Ops using Pyfunc

For any practical application this part should be implemented using a C++ kernel, however, for this toy example it suffices to implement the custom operations necessary for the implementation of the sparse tensor fully connected layer using tf.py_func.

In [None]:
from tensorflow.python.framework import ops

# Define custom py_func which takes also a grad op as argument:
def py_func(func, inp, Tout, stateful=True, name=None, grad=None):
    """Create a ``tf.py_func`` op with an optional custom gradient function.

    Args:
        func: Python function wrapped by the op.
        inp: list of input tensors handed to ``func``.
        Tout: output dtype(s) of the op.
        stateful: forwarded to ``tf.py_func``.
        name: optional name of the op.
        grad: gradient function to register for the op. When None, no
            gradient override is installed and a plain ``tf.py_func`` op is
            returned.

    Returns:
        The output(s) of ``tf.py_func``.
    """
    import uuid

    if grad is None:
        return tf.py_func(func, inp, Tout, stateful=stateful, name=name)

    # Gradient names must be unique per registration. A UUID avoids both
    # accidental collisions and repeated names after seeding the NumPy RNG,
    # and it does not consume numbers from the global NumPy random stream.
    rnd_name = 'PyFuncGrad' + uuid.uuid4().hex
    tf.RegisterGradient(rnd_name)(grad)
    g = tf.get_default_graph()
    # Stateless py_func ops have the op type "PyFuncStateless", so both op
    # types are mapped; otherwise the override is ignored for stateful=False.
    with g.gradient_override_map({"PyFunc": rnd_name, "PyFuncStateless": rnd_name}):
        return tf.py_func(func, inp, Tout, stateful=stateful, name=name)
    
def dense_np(w_idx, w_val, w_shape, x_idx, x_val, x_shape):
    """
    Computes a fully connected layer for sparse tensors:
    h[b, n] = sum over columns c of w[n, c] * x[b, c].

    A weight whose column has no entry in a sample contributes zero, and
    repeated input entries for the same position are summed.

    Args:
        w_idx ... indices of the sparse weight tensor, one [neuron, column] row per weight
        w_val ... weight values, aligned with the rows of w_idx
        w_shape ... dense shape of the weights; w_shape[0] is the number of neurons
        x_idx ... indices of the sparse input batch, one [sample, column] row per value
        x_val ... input values, aligned with the rows of x_idx
        x_shape ... dense shape of the input batch; x_shape[0] is the batch size
    Returns:
        (h_idx, h_vals, h_shape) describing the output with dense shape
        [batch_size, n_neurons]: int64 indices listing every (sample, neuron)
        pair in row-major order, float32 values and the int64 shape.

    Indices are assumed to lie within the respective dense shapes.
    """
    batch_size = int(x_shape[0])
    n_neurons = int(w_shape[0])
    h_shape = np.array([batch_size, n_neurons])
    samples = np.arange(batch_size)
    neurons = np.arange(n_neurons)
    h_idx = np.transpose([np.repeat(samples, n_neurons), np.tile(neurons, batch_size)])
    h_vals = np.zeros(batch_size * n_neurons)

    # Group the input entries by column once, so that every weight only
    # visits the input values it is actually multiplied with.
    x_by_col = {}
    for (bi, col), val in zip(x_idx, x_val):
        x_by_col.setdefault(int(col), []).append((int(bi), val))

    for (ni, col), weight in zip(w_idx, w_val):
        ni = int(ni)
        for bi, val in x_by_col.get(int(col), ()):
            h_vals[bi * n_neurons + ni] += weight * val
    return h_idx.astype(np.int64), h_vals.astype(np.float32), h_shape.astype(np.int64)

def dense_grad_np(w_idx, w_val, w_shape, x_idx, x_val, x_shape, grad):
    """
    Computes the gradients of the sparse fully connected layer with respect
    to the weight values and the input values.

    With h[b, n] = sum over columns c of w[n, c] * x[b, c]:
        w_grad[j] = sum over samples b of grad[b, n_j] * x[b, c_j]
        x_grad[k] = sum over neurons n of grad[b_k, n] * w[n, c_k]
    Positions without a matching entry contribute zero.

    Args:
        w_idx ... indices of the sparse weight tensor, one [neuron, column] row per weight
        w_val ... weight values, aligned with the rows of w_idx
        w_shape ... dense shape of the weights; w_shape[0] is the number of neurons
        x_idx ... indices of the sparse input batch, one [sample, column] row per value
        x_val ... input values, aligned with the rows of x_idx
        x_shape ... dense shape of the input batch (not needed for the computation)
        grad ... flat gradient of the layer output; entry b * n_neurons + n
                 belongs to sample b and neuron n
    Returns:
        (w_grad, x_grad) as float32 arrays with the shapes of w_val and x_val.

    Indices are assumed to lie within the respective dense shapes.
    """
    n_neurons = int(w_shape[0])
    w_grad = np.zeros(w_val.shape)
    x_grad = np.zeros(x_val.shape)

    # Group the positions of the input entries by column once; every
    # (weight, input) pair sharing a column feeds both gradients.
    x_by_col = {}
    for k, (bi, col) in enumerate(x_idx):
        x_by_col.setdefault(int(col), []).append((k, int(bi)))

    for j, (ni, col) in enumerate(w_idx):
        ni = int(ni)
        for k, bi in x_by_col.get(int(col), ()):
            out_grad = grad[bi * n_neurons + ni]
            w_grad[j] += out_grad * x_val[k]
            x_grad[k] += out_grad * w_val[j]

    return w_grad.astype(np.float32), x_grad.astype(np.float32)

def dense_grad_pyfunc(w_idx, w_val, w_shape, x_idx, x_val, x_shape, grad, name=None):
    """Wrap ``dense_grad_np`` into a graph op.

    Takes the components of the sparse weight and input tensors plus the
    incoming gradient, and returns two float32 tensors: the gradient for the
    weight values and the gradient for the input values.
    """
    op_inputs = [w_idx, w_val, w_shape, x_idx, x_val, x_shape, grad]
    grad_dtypes = [tf.float32, tf.float32]
    with ops.name_scope(name, "dense_grad_pyfunc", op_inputs) as scope_name:
        return tf.py_func(dense_grad_np, op_inputs, grad_dtypes, name=scope_name)

def dense_grad(op, grad1, grad2, grad3):
    """Gradient function for the op created by ``dense_op``.

    Args:
        op: the forward op; its six inputs are the indices, values and shape
            of the weights followed by those of the input batch.
        grad1, grad2, grad3: incoming gradients for the three op outputs.
            Only ``grad2`` is used here.

    Returns:
        One entry per op input: gradients for the weight values and the
        input values, None for all indices and shapes.
    """
    w_idx, w_val, w_shape, x_idx, x_val, x_shape = (op.inputs[pos] for pos in range(6))
    d_w_val, d_x_val = dense_grad_pyfunc(w_idx, w_val, w_shape, x_idx, x_val, x_shape, grad2)
    return None, d_w_val, None, None, d_x_val, None

def dense_op(w_idx, w_val, w_shape, x_idx, x_val, x_shape, name=None):
    """Graph op for the sparse fully connected layer.

    Wraps ``dense_np`` with ``py_func`` and attaches ``dense_grad`` as its
    gradient. Returns the int64 indices, float32 values and int64 shape of
    the output.
    """
    op_inputs = [w_idx, w_val, w_shape, x_idx, x_val, x_shape]
    out_dtypes = [tf.int64, tf.float32, tf.int64]
    with ops.name_scope(name, "dense_pyfunc", op_inputs) as scope_name:
        return py_func(dense_np, op_inputs, out_dtypes, name=scope_name, grad=dense_grad)
    


## Implementation of the Custom Layer

In [11]:
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import base
from tensorflow.python.layers import utils
class SparseFullyConnected(base.Layer):
    """Fully connected layer whose weight matrix is stored as a sparse tensor.

    The weights have dense shape [units, units_in] and hold one trainable
    value for every pair of an output unit and an entry of ``in_indices``.
    The layer consumes a sparse tensor and returns a sparse tensor, or a
    dense one when ``to_dense`` is set.
    """

    def __init__(self, units, units_in, in_indices, activation, to_dense=False, name=None, trainable=True, **kwargs):
        """Store the layer configuration.

        Args:
            units: number of output units.
            units_in: dense size of the input dimension.
            in_indices: input positions that carry a weight.
            activation: function applied to the output values.
            to_dense: whether ``call`` returns a dense tensor.
            name, trainable, **kwargs: forwarded to ``base.Layer``.
        """
        super(SparseFullyConnected, self).__init__(trainable=trainable, name=name, **kwargs)
        self.n_units_in = units_in
        self.n_units_out = units
        self.in_indices = in_indices
        self.activation = activation
        self.to_dense = to_dense

    def build(self, input_shape):
        """Create the weight values and assemble the sparse weight tensor."""
        # expected input format: [None, dim]
        input_cols = self.in_indices
        n_nonzero = len(input_cols)
        with tf.variable_scope(self.name, reuse=True):
            self.w_values = self.add_variable(
                'W',
                shape=[self.n_units_out * n_nonzero],
                dtype=tf.float32,
                initializer=tf.glorot_uniform_initializer(),
                trainable=True,
            )
        self.w_shape = [self.n_units_out, self.n_units_in]
        # Every output unit is paired with every input position, unit by unit.
        unit_ids = np.arange(self.n_units_out)
        unit_column = np.repeat(unit_ids, n_nonzero)
        input_column = np.tile(input_cols, len(unit_ids))
        self.w_indices = np.transpose([unit_column, input_column])
        self.W = tf.SparseTensor(self.w_indices, self.w_values, self.w_shape)
        self.built = True

    def call(self, inputs):
        """Apply the layer to a sparse input and return the activated output."""
        weights = self.W
        h_idx, h_val, h_shape = dense_op(
            weights.indices, weights.values, weights.dense_shape,
            inputs.indices, inputs.values, inputs.dense_shape)
        activated = self.activation(h_val)
        h = tf.SparseTensor(h_idx, activated, h_shape)
        if not self.to_dense:
            return h
        # Adding the sparse result to a dense zero tensor yields a dense tensor.
        return tf.sparse_add(tf.zeros(h_shape), h)
        
def sparsefc(inputs, units, units_in, in_indices, activation, to_dense=False, name=None, reuse=None):
    """Functional interface: create a SparseFullyConnected layer and apply it to ``inputs``."""
    layer_options = dict(
        to_dense=to_dense,
        name=name,
        dtype=inputs.dtype.base_dtype,
        _scope=name,
        _reuse=reuse,
    )
    fc_layer = SparseFullyConnected(units, units_in, in_indices, activation, **layer_options)
    return fc_layer.apply(inputs)

## Definition of the Model and Training/Test Loop

In [12]:
def nn_model(in_data):
    """Stack three sparse fully connected ReLU layers (10, 10 and 2 units).

    The first layer reads the dataset's nonzero positions; the following
    layers read all 10 outputs of their predecessor. The last layer returns
    a dense tensor.
    """
    data_cols = index.reshape((-1))
    hidden_cols = np.arange(10)
    hidden1 = sparsefc(in_data, 10, D, data_cols, tf.nn.relu)
    hidden2 = sparsefc(hidden1, 10, 10, hidden_cols, tf.nn.relu)
    return sparsefc(hidden2, 2, 10, hidden_cols, tf.nn.relu, to_dense=True)

In [13]:
batch_inputs, batch_labels = next_element
logits = nn_model(batch_inputs)
# loss: cross entropy summed over the batch
per_sample_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=batch_labels, logits=logits)
loss = tf.reduce_sum(per_sample_loss)
# Evaluate the loss once and write the graph to /tmp/xor before adding the optimizer.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_init_op)
    writer = tf.summary.FileWriter("/tmp/xor", sess.graph)
    sess.run(loss)
    writer.close()
# add the optimizer
optimizer = tf.train.AdamOptimizer().minimize(loss)
# accuracy: fraction of samples whose predicted class equals the labelled class
prediction = tf.argmax(logits, 1)
equality = tf.equal(prediction, tf.argmax(batch_labels, 1))
accuracy = tf.reduce_mean(tf.cast(equality, tf.float32))
init_op = tf.global_variables_initializer()

In [14]:
# List every global variable of the graph (layer weights and optimizer state).
global_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
for var in global_vars:
    print(var)

<tf.Variable 'sparse_fully_connected/W:0' shape=(50,) dtype=float32_ref>
<tf.Variable 'sparse_fully_connected_1/W:0' shape=(100,) dtype=float32_ref>
<tf.Variable 'sparse_fully_connected_2/W:0' shape=(20,) dtype=float32_ref>
<tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>
<tf.Variable 'beta2_power:0' shape=() dtype=float32_ref>
<tf.Variable 'sparse_fully_connected/W/Adam:0' shape=(50,) dtype=float32_ref>
<tf.Variable 'sparse_fully_connected/W/Adam_1:0' shape=(50,) dtype=float32_ref>
<tf.Variable 'sparse_fully_connected_1/W/Adam:0' shape=(100,) dtype=float32_ref>
<tf.Variable 'sparse_fully_connected_1/W/Adam_1:0' shape=(100,) dtype=float32_ref>
<tf.Variable 'sparse_fully_connected_2/W/Adam:0' shape=(20,) dtype=float32_ref>
<tf.Variable 'sparse_fully_connected_2/W/Adam_1:0' shape=(20,) dtype=float32_ref>


In [None]:
# Fetch the weight values of the first layer to monitor how much they change.
with tf.variable_scope('sparse_fully_connected', reuse=True):
    W = tf.get_variable('W', shape=[50])


# run the training
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(init_op)
    W0 = W.eval()
    sess.run(train_init_op)
    i = 0
    while True:
        # Only the training step can exhaust the iterator, so only it is guarded.
        try:
            l, _, acc = sess.run([loss, optimizer, accuracy])
        except tf.errors.OutOfRangeError:
            break
        if i % 10 == 0:
            W1 = W.eval()
            print("Batch: {}, loss: {:.3f}, training accuracy: {:.2f}%".format(i, l, acc * 100))
            print("W change: ", np.sum((W0-W1)**2))
            W0 = W1
            # Passing the step makes the checkpoint name "<model_dir>-<i>".
            save_path = saver.save(sess, model_dir, i)
            print("Model saved in path:", save_path)
        i += 1
    # now setup the validation run:
    # re-initialize the iterator, but this time with validation data
    sess.run(test_init_op)
    avg_acc = 0
    valid_iters = 0
    while True:
        try:
            acc = sess.run([accuracy])
        except tf.errors.OutOfRangeError:
            break
        avg_acc += acc[0]
        valid_iters += 1
    # Guard against an empty validation set instead of dividing by zero.
    if valid_iters:
        print("Average validation set accuracy over {} iterations is {:.2f}%".format(valid_iters, (avg_acc / valid_iters) * 100))
    else:
        print("Validation set yielded no batches; no accuracy to report")

Batch: 0, loss: 138.608, training accuracy: 48.00%
W change:  4.9998966e-05
Model saved in path: /tmp/xor/model.ckpt-0
Batch: 10, loss: 138.493, training accuracy: 48.00%
W change:  0.0024471374
Model saved in path: /tmp/xor/model.ckpt-10
Batch: 20, loss: 138.073, training accuracy: 53.00%
W change:  0.0024763562
Model saved in path: /tmp/xor/model.ckpt-20
Batch: 30, loss: 138.208, training accuracy: 53.50%
W change:  0.0025087628
Model saved in path: /tmp/xor/model.ckpt-30
Batch: 40, loss: 138.094, training accuracy: 54.50%
W change:  0.0027642709
Model saved in path: /tmp/xor/model.ckpt-40
Batch: 50, loss: 137.687, training accuracy: 56.50%
W change:  0.0031841423
Model saved in path: /tmp/xor/model.ckpt-50
Batch: 60, loss: 136.970, training accuracy: 59.00%
W change:  0.0039656097
Model saved in path: /tmp/xor/model.ckpt-60
Batch: 70, loss: 136.564, training accuracy: 64.50%
W change:  0.005355005
Model saved in path: /tmp/xor/model.ckpt-70
Batch: 80, loss: 135.973, training accurac

Batch: 680, loss: 21.063, training accuracy: 99.50%
W change:  0.0002719172
Model saved in path: /tmp/xor/model.ckpt-680
Batch: 690, loss: 16.920, training accuracy: 99.50%
W change:  0.00024256074
Model saved in path: /tmp/xor/model.ckpt-690
Batch: 700, loss: 18.221, training accuracy: 100.00%
W change:  0.0002848062
Model saved in path: /tmp/xor/model.ckpt-700
Batch: 710, loss: 17.502, training accuracy: 99.50%
W change:  0.00021081
Model saved in path: /tmp/xor/model.ckpt-710
Batch: 720, loss: 16.036, training accuracy: 97.50%
W change:  0.00026090638
Model saved in path: /tmp/xor/model.ckpt-720
Batch: 730, loss: 12.693, training accuracy: 100.00%
W change:  0.00021241183
Model saved in path: /tmp/xor/model.ckpt-730


In [None]:
import os

with tf.variable_scope('sparse_fully_connected', reuse=True):
    W = tf.get_variable('W', shape=[50])
with tf.Session() as sess:
    # The training loop saves checkpoints with a step suffix
    # ("model.ckpt-<step>"), so the bare prefix in model_dir does not name a
    # checkpoint. Look up the most recent one in the checkpoint directory.
    checkpoint_dir = os.path.dirname(model_dir)
    checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
    if checkpoint_path is None:
        raise IOError("No checkpoint found in {}".format(checkpoint_dir))
    saver.restore(sess, checkpoint_path)
    print(W.eval())