###### Adapted from the Easy-TensorFlow noise-removal autoencoder tutorial: http://www.easy-tensorflow.com/autoencoders/noise-removal

In [2]:
import numpy as np
import scipy.sparse as ss
import tensorflow as tf
import matplotlib.pyplot as plt
import pickle

In [6]:
# Synthetic training matrix: 1000 rows x 23,000,000 columns stored as CSR,
# with 0.3% of the entries non-zero. The seed is fixed for reproducibility.
data = ss.random(
    1000,
    23000000,
    density=0.003,
    format='csr',
    dtype=np.float64,
    random_state=10,
)

In [7]:
# Held-out evaluation matrix: 300 rows, same column count and density as the
# training matrix, drawn with a different seed.
# Bind it to `test` only. A chained `test = data = ...` would also rebind
# `data`, silently replacing the 1000-row training matrix with these 300 rows
# (the model would then train and validate on the same data).
test = ss.random(300, 23000000, density=0.003, format='csr', dtype=np.float64, random_state=20)

In [8]:
# --- Training hyper-parameters ---
logs_path = "./logs/noiseRemoval_test_csr_matrix"  # folder receiving the TensorBoard logs
learning_rate = 0.001  # optimizer learning rate
epochs = 10            # total number of training epochs
batch_size = 100       # rows drawn for each training step
display_freq = 100     # the batch loss is printed every `display_freq` iterations
load_batch_size = 100  # NOTE(review): not referenced by the cells visible here

# --- Network dimensions ---
# Input (and reconstruction) width: one feature per column of the sparse matrix.
num_features = data.shape[1]
# Number of units in the hidden layer.
h1 = 1000

# The noise level used by the original tutorial (0.6) is intentionally not
# defined: no noise is injected in this notebook.

In [9]:
# weight and bias wrappers
def weight_variable(name, shape):
    """
    Get a float32 weight variable called ``'W_' + name``.

    The variable is initialized from a truncated normal distribution with a
    standard deviation of 0.01.
    :param name: suffix appended to 'W_' to form the variable name
    :param shape: shape of the weight tensor
    :return: the variable returned by tf.get_variable
    """
    return tf.get_variable(
        name='W_' + name,
        shape=shape,
        dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.01),
    )

def bias_variable(name, shape):
    """
    Get a float32 bias variable called ``'b_' + name``, initialized to zeros.

    :param name: suffix appended to 'b_' to form the variable name
    :param shape: shape of the bias tensor
    :return: the variable returned by tf.get_variable
    """
    # The variable's shape is taken from this constant initial value.
    zeros = tf.constant(value=0., dtype=tf.float32, shape=shape)
    return tf.get_variable(name='b_' + name, initializer=zeros, dtype=tf.float32)

def fc_layer(x, num_units, name, use_relu=True):
    """
    Build a fully-connected layer ``x @ W + b`` inside variable scope `name`.

    Histogram summaries tagged 'W' and 'b' are registered for the layer's
    weights and biases.
    :param x: 2-D input tensor; its second static dimension sets the weight height
    :param num_units: number of output units of the layer
    :param name: variable-scope name, also used to name the layer's variables
    :param use_relu: apply a ReLU to the affine output when True
    :return: the layer output tensor
    """
    with tf.variable_scope(name):
        input_width = x.get_shape()[1]

        weights = weight_variable(name, shape=[input_width, num_units])
        tf.summary.histogram('W', weights)

        biases = bias_variable(name, [num_units])
        tf.summary.histogram('b', biases)

        pre_activation = tf.matmul(x, weights)
        pre_activation = pre_activation + biases

        if not use_relu:
            return pre_activation
        return tf.nn.relu(pre_activation)

In [10]:
# Build the autoencoder graph: input -> ReLU hidden layer -> linear reconstruction.
with tf.variable_scope('Input'):
    # Dense float32 placeholder: any number of rows, num_features values per row.
    x_original = tf.placeholder(dtype=tf.float32,
                                shape=[None, num_features],
                                name='X_original')

# Hidden layer with h1 ReLU units, then a linear layer back to num_features outputs.
fc1 = fc_layer(x_original, num_units=h1, name='Hidden_layer', use_relu=True)
out = fc_layer(fc1, num_units=num_features, name='Output_layer', use_relu=False)

# Reconstruction loss (mean squared error against the input) and its optimizer.
with tf.variable_scope('Train'):
    with tf.variable_scope('Loss'):
        loss = tf.reduce_mean(
            tf.losses.mean_squared_error(labels=x_original, predictions=out),
            name='loss')
        tf.summary.scalar('loss', loss)
    with tf.variable_scope('Optimizer'):
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate, name='Adam-op'
        ).minimize(loss)

# Op that initializes every variable created above.
init = tf.global_variables_initializer()

In [11]:
def next_batch_data(batch_size):
    '''
    Draw up to `batch_size` distinct random rows from the module-level CSR
    matrix `data` and return them as a tf.SparseTensor.
    '''
    # permutation(n) shuffles arange(n); keeping the first entries selects
    # rows without replacement.
    chosen_rows = np.random.permutation(data.shape[0])[:batch_size]

    batch_coo = data[chosen_rows].tocoo()
    # One (row, col) pair per stored entry.
    entry_positions = np.column_stack((batch_coo.row, batch_coo.col))

    return tf.SparseTensor.from_value(
        tf.SparseTensorValue(indices=entry_positions,
                             values=batch_coo.data,
                             dense_shape=batch_coo.shape))

In [12]:
%%time
# Smoke-test the sampler on a 5-row batch.
C = next_batch_data(5)

CPU times: user 78.6 ms, sys: 58.9 ms, total: 137 ms
Wall time: 137 ms


In [13]:
# Check which type the sampler returned.
type(C)

tensorflow.python.framework.sparse_tensor.SparseTensor

In [14]:
# Static shape of the sampled batch.
C.get_shape()

TensorShape([Dimension(5), Dimension(23000000)])

### Noise injection is skipped for now. Realistic noise should be able to perturb the zero elements as well as the non-zero ones; it is noise, after all, and adding it only to the stored (non-zero) elements would not be realistic.

#### prepare the test tensor

In [15]:
%%time
# Seed NumPy so the same 200 evaluation rows are selected on every run.
np.random.seed(10)
idx = np.random.permutation(test.shape[0])[:200]

# Convert the selected rows to COO and wrap them in a tf.SparseTensor.
test_coo_matrix = test[idx].tocoo()
test_tf_coo_matrix = tf.SparseTensorValue(
    indices=np.column_stack((test_coo_matrix.row, test_coo_matrix.col)),
    values=test_coo_matrix.data,
    dense_shape=test_coo_matrix.shape,
)

test_sp_tensor = tf.SparseTensor.from_value(test_tf_coo_matrix)

CPU times: user 1.8 s, sys: 387 ms, total: 2.19 s
Wall time: 2.19 s


In [None]:
# Static shape of the evaluation tensor.
test_sp_tensor.get_shape()

TensorShape([Dimension(200), Dimension(23000000)])

In [None]:
# Train the autoencoder and report the reconstruction loss on the held-out
# rows after every epoch.

def _dense_feed(session, sparse_batch):
    """
    Turn a tf.SparseTensor into a dense float32 ndarray usable in feed_dict.

    `x_original` is a dense placeholder, and a tf.SparseTensor object is not a
    valid feed value for it. The tensor is therefore evaluated to a
    SparseTensorValue (plain NumPy arrays) and densified with SciPy.
    :param session: session used to evaluate the sparse tensor
    :param sparse_batch: rank-2 tf.SparseTensor
    :return: 2-D float32 ndarray with the tensor's dense shape
    """
    batch_value = session.run(sparse_batch)
    rows, cols = batch_value.indices[:, 0], batch_value.indices[:, 1]
    return ss.coo_matrix(
        (batch_value.values.astype(np.float32), (rows, cols)),
        shape=tuple(batch_value.dense_shape)).toarray()


# Combine every summary registered while building the graph into one op.
# Created before the FileWriter so the op is part of the graph it records.
merged = tf.summary.merge_all()

# Launch the graph (session). The session is deliberately left open so the
# trained network can be inspected from later cells.
sess = tf.Session()
sess.run(init)
train_writer = tf.summary.FileWriter(logs_path, sess.graph)

# Number of optimization steps per epoch, derived from the training matrix.
num_train = data.shape[0]
num_tr_iter = num_train // batch_size

# The validation rows never change, so build their feed once.
# NOTE(review): dense batches cost rows * num_features * 4 bytes, and every
# next_batch_data() call adds new constants to the graph. Consider a
# tf.sparse_placeholder fed with SparseTensorValue objects instead.
x_valid_original = _dense_feed(sess, test_sp_tensor)
feed_dict_valid = {x_original: x_valid_original}

global_step = 0
for epoch in range(epochs):
    print('Training epoch: {}'.format(epoch + 1))
    for iteration in range(num_tr_iter):
        batch_x = _dense_feed(sess, next_batch_data(batch_size))
        global_step += 1

        # Run optimization op (backprop) and log the summaries for this step
        feed_dict_batch = {x_original: batch_x}
        _, summary_tr = sess.run([optimizer, merged], feed_dict=feed_dict_batch)
        train_writer.add_summary(summary_tr, global_step)

        if iteration % display_freq == 0:
            # Calculate and display the batch loss
            loss_batch = sess.run(loss,
                                  feed_dict=feed_dict_batch)
            print("iter {0:3d}:\t Reconstruction loss={1:.3f}".
                  format(iteration, loss_batch))

    # Run validation after every epoch
    loss_valid = sess.run(loss, feed_dict=feed_dict_valid)
    print('---------------------------------------------------------')
    print("Epoch: {0}, validation loss: {1:.3f}".
          format(epoch + 1, loss_valid))
    print('---------------------------------------------------------')

# Make sure all buffered summaries reach disk.
train_writer.flush()
 