In [1]:
import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.math as tm
import numpy as np
import random
import time
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import initializers
from tensorflow.keras import Model
from tensorflow.keras.layers import Flatten, Dense
from sklearn.metrics import accuracy_score

In [2]:
# Load MNIST (images come back as uint8 arrays; labels as integer class ids).
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Keep only the two classes listed in label_sub, flattening each image to 1-D
# and turning each scalar label into a length-1 vector.
# NOTE(review): pixel values are passed on unscaled (no division by 255) -
# confirm this is intended for the models below.
label_sub = [0,1]
train_keep = [label in label_sub for label in y_train]
test_keep = [label in label_sub for label in y_test]

x_train_sub = [img.reshape(-1) for img, keep in zip(x_train, train_keep) if keep]
y_train_sub = [label.reshape(-1) for label, keep in zip(y_train, train_keep) if keep]
x_test_sub = [img.reshape(-1) for img, keep in zip(x_test, test_keep) if keep]
y_test_sub = [label.reshape(-1) for label, keep in zip(y_test, test_keep) if keep]


print('There are', len(x_train_sub), 'training images.')
print('There are', len(x_test_sub), 'test images.')

There are 12665 training images.
There are 2115 test images.


In [3]:
# Work with only the first 100 filtered training examples (images and labels
# are truncated together so they stay aligned).
x_train_sub, y_train_sub = x_train_sub[:100], y_train_sub[:100]

print('There are', len(x_train_sub), 'training images.')

There are 100 training images.


In [4]:
# Batched tf.data pipelines for training and evaluation.
#
# The training set is shuffled ONCE (reshuffle_each_iteration=False) so that
# every pass over train_ds yields the same batches in the same order. Later
# cells keep per-batch state (sampled hidden units, HMC kernels) in lists that
# are indexed by batch position and zipped against fresh passes over train_ds;
# with the default per-iteration reshuffle that state would be paired with
# different examples on every pass.
train_ds = (
    tf.data.Dataset.from_tensor_slices((x_train_sub, y_train_sub))
    .shuffle(10000, reshuffle_each_iteration=False)
    .batch(32)
)
test_ds = tf.data.Dataset.from_tensor_slices((x_test_sub, y_test_sub)).batch(32)

2021-10-11 22:00:56.129151: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Standard BP
# Baseline: a deterministic 784-32-1 sigmoid MLP trained with ordinary
# backpropagation, timed for comparison.
model_bp = keras.Sequential(
    [
        keras.Input(shape=(784,)),
        layers.Dense(32, activation = "sigmoid"),
        layers.Dense(1, activation = "sigmoid")
    ]
)

# Kept for reference only: train_ds is already batched, and Keras documents
# that `batch_size` must not be passed to fit() when the input is a dataset,
# so it is no longer forwarded below.
batch_size = 32
epochs = 200
#opt = tf.keras.optimizers.SGD(learning_rate=.1)
opt = tf.keras.optimizers.Adam(learning_rate=.01)
st = time.time()
model_bp.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy", "AUC"])
history = model_bp.fit(train_ds, epochs=epochs)
# Wall-clock seconds spent in compile + fit.
print(time.time() - st)

In [21]:
def convert2_zero_one(x):
    """Squash every element of an iterable into (0, 1) with a sigmoid.

    Args:
        x: iterable of tensors (or values accepted by ``tf.math.sigmoid``).

    Returns:
        A new list holding ``tf.math.sigmoid(item)`` for each item of ``x``,
        in the same order.
    """
    squashed = []
    for item in x:
        squashed.append(tf.math.sigmoid(item))
    return squashed

def cont_bern_log_norm(lam, l_lim=0.49, u_lim=0.51):
    '''
    Log normalizing constant of a continuous Bernoulli distribution, computed
    in a numerically stable way.

    For lam in (0, l_lim) U (u_lim, 1) the closed-form expression is returned;
    for lam in [l_lim, u_lim] a Taylor approximation around 0.5 is used.

    The clamped copy of lam (safe_lam) may look redundant, but tf.where
    evaluates both branches regardless of the condition, so the closed form
    must never be evaluated near 0.5; inside the band it is evaluated at
    l_lim instead and that value is then discarded.

    Args:
        lam: tensor of distribution parameters.
        l_lim: lower edge of the band where the Taylor approximation is used.
        u_lim: upper edge of that band.

    Returns:
        Tensor with the same shape as lam.
    '''
    outside_band = tm.logical_or(tm.less(lam, l_lim), tm.greater(lam, u_lim))

    # Closed form, evaluated only at values safely away from 0.5.
    safe_lam = tf.where(outside_band, lam, l_lim * tf.ones_like(lam))
    one_minus_two_lam = 1 - 2.0 * safe_lam
    exact = tm.log(tm.abs(2.0 * tm.atanh(one_minus_two_lam))) - tm.log(tm.abs(one_minus_two_lam))

    # Fourth-order Taylor expansion about lam = 0.5.
    centered = lam - 0.5
    approx = tm.log(2.0) + 4.0 / 3.0 * tm.pow(centered, 2) + 104.0 / 45.0 * tm.pow(centered, 4)

    return tf.where(outside_band, exact, approx)

In [33]:
# MLP model
class StochasticMLP(Model):
    """MLP whose hidden units are sampled, with HMC used to update their state.

    The hidden layers are Dense layers producing logits; `call` draws Bernoulli
    samples from those logits, while the log-probability methods treat
    sigmoid-squashed hidden values as targets of the layer that feeds them.
    """

    def __init__(self, hidden_layer_sizes=None, n_outputs=10):
        """Create the hidden and output Dense layers.

        Args:
            hidden_layer_sizes: iterable with the width of each hidden layer.
                Defaults to a single hidden layer of 100 units.
            n_outputs: number of output logits.
        """
        super(StochasticMLP, self).__init__()

        # Avoid a shared mutable default argument; copy so the caller's
        # sequence is never aliased.
        sizes = [100] if hidden_layer_sizes is None else list(hidden_layer_sizes)
        self.hidden_layer_sizes = sizes

        initializer = initializers.RandomUniform(minval=-0.001, maxval=0.001, seed=None)
        self.fc_layers = [Dense(layer_size, kernel_initializer = initializer) for layer_size in sizes]

        self.output_layer = Dense(n_outputs, kernel_initializer = initializer)

        # Optimizer used by update_weights; created lazily and then reused so
        # that its internal state survives across calls.
        self._weight_optimizer = None
        self._weight_optimizer_lr = None

    def call(self, x):
        """Sample the hidden layers once, feeding each sample to the next layer.

        Args:
            x: input batch.

        Returns:
            List with one Bernoulli sample tensor per hidden layer. The output
            layer is invoked but its result is not returned.
        """
        network = []

        for layer in self.fc_layers:
            logits = layer(x)
            x = tfp.distributions.Bernoulli(logits=logits).sample()
            network.append(x)

        # Invoked only so that the output layer's weights get created here.
        self.output_layer(x)

        return network

    def _neg_log_prob(self, x, h_current, y):
        """Negative log-probability shared by both target_log_prob variants.

        Args:
            x: input batch.
            h_current: list of hidden values in (0, 1), one tensor per layer.
            y: labels for the output layer.

        Returns:
            Per-example tensor: summed hidden-layer terms plus the output
            cross-entropy.
        """
        # Each hidden layer is driven by the previous layer's values; the
        # first one is driven by the input itself.
        h_previous = [x] + h_current[:-1]

        nlog_prob = 0. # negative log probability

        for cv, pv, layer in zip(h_current, h_previous, self.fc_layers):
            logits = layer(pv)
            ce = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=cv, logits=logits)

            # NOTE(review): the log-normalizer is ADDED to the cross-entropy.
            # For a continuous Bernoulli density the log-normalizer is added
            # to the log-likelihood, which would mean subtracting it here -
            # confirm the sign is intended.
            ce += cont_bern_log_norm(tf.nn.sigmoid(logits))

            nlog_prob += tf.reduce_sum(ce, axis = -1)

        fce = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=tf.cast(y, tf.float32), logits=self.output_layer(h_current[-1]))
        nlog_prob += tf.reduce_sum(fce, axis = -1)

        return nlog_prob

    def target_log_prob(self, x, h, y):
        """Log-probability of hidden state `h` given as a list of tensors.

        Args:
            x: input batch.
            h: list with one tensor per hidden layer (cast to float32 and
                passed through a sigmoid before use).
            y: labels.

        Returns:
            Per-example log-probability tensor.
        """
        h_current = convert2_zero_one([tf.cast(h_i, dtype=tf.float32) for h_i in h])
        return -1 * self._neg_log_prob(x, h_current, y)

    def target_log_prob2(self, x, h, y):
        """Log-probability of hidden state `h` given as one concatenated tensor.

        Args:
            x: input batch.
            h: single tensor holding all hidden layers side by side along
                axis 1; it is split according to `hidden_layer_sizes`.
            y: labels.

        Returns:
            Per-example log-probability tensor.
        """
        h_current = convert2_zero_one(tf.split(h, self.hidden_layer_sizes, axis = 1))
        return -1 * self._neg_log_prob(x, h_current, y)

    def generate_hmc_kernel(self, x, y, step_size = pow(1000, -1/4)):
        """Build a step-size-adapting HMC kernel targeting the hidden state.

        Args:
            x: input batch the kernel is bound to.
            y: labels the kernel is bound to.
            step_size: initial HMC step size.

        Returns:
            A `SimpleStepSizeAdaptation` kernel wrapping a 2-leapfrog-step
            `HamiltonianMonteCarlo` kernel over `target_log_prob2`.
        """
        hmc = tfp.mcmc.HamiltonianMonteCarlo(
            target_log_prob_fn = lambda v: self.target_log_prob2(x, v, y),
            num_leapfrog_steps = 2,
            step_size = step_size)

        adaptive_hmc = tfp.mcmc.SimpleStepSizeAdaptation(
            hmc,
            num_adaptation_steps=int(100 * 0.8))

        return adaptive_hmc

    # new proposing-state method with HamiltonianMonteCarlo
    def propose_new_state_hamiltonian(self, x, h, y, hmc_kernel, is_update_kernel = True):
        """Advance the hidden state by one HMC transition.

        Args:
            x: input batch.
            h: list of current hidden-state tensors, one per hidden layer.
            y: labels.
            hmc_kernel: kernel produced by `generate_hmc_kernel`.
            is_update_kernel: when True, also build a fresh kernel that starts
                from the step size reported by this transition.

        Returns:
            `(h_new, new_kernel)` when `is_update_kernel` is True, otherwise
            just `h_new`; `h_new` is a list with one tensor per hidden layer.
        """
        # The chain operates on a single tensor: all layers concatenated
        # along the feature axis.
        h_current = tf.concat([tf.cast(h_i, dtype=tf.float32) for h_i in h], axis = 1)

        # Exactly one transition, no burn-in.
        num_burnin_steps = 0
        num_results = 1

        samples = tfp.mcmc.sample_chain(
            num_results = num_results,
            num_burnin_steps = num_burnin_steps,
            current_state = h_current, # may need to be reshaped
            kernel = hmc_kernel,
            trace_fn = None,
            return_final_kernel_results = True)

        # Take the single sampled state and split it back into layers.
        h_state = samples.all_states[0]
        h_new = tf.split(h_state, self.hidden_layer_sizes, axis = 1)

        # Update the kernel if necessary
        if is_update_kernel:
            new_step_size = samples.final_kernel_results.new_step_size.numpy()
            ker_new = self.generate_hmc_kernel(x, y, new_step_size)
            return(h_new, ker_new)
        else:
            return h_new

    def update_weights(self, x, h, y, lr = 0.0001):
        """Take one Adam step on the mean negative log-probability.

        The optimizer is created on first use and reused afterwards so that
        Adam's moment estimates and step count persist between calls; it is
        rebuilt only if a different `lr` is requested.

        Args:
            x: input batch.
            h: list of hidden-state tensors (as accepted by
                `target_log_prob`).
            y: labels.
            lr: Adam learning rate.
        """
        if self._weight_optimizer is None or self._weight_optimizer_lr != lr:
            #optimizer = tf.keras.optimizers.SGD(learning_rate = lr)
            self._weight_optimizer = tf.keras.optimizers.Adam(learning_rate = lr)
            self._weight_optimizer_lr = lr

        with tf.GradientTape() as tape:
            loss = -1 * tf.reduce_mean(self.target_log_prob(x, h, y))

        grads = tape.gradient(loss, self.trainable_weights)
        self._weight_optimizer.apply_gradients(zip(grads, self.trainable_weights))

    def get_predictions(self, x):
        """Deterministic forward pass returning hard 0/1 predictions.

        Hidden layers use their sigmoid activations (no sampling).

        Args:
            x: input batch.

        Returns:
            int32 tensor that is 1 where the output probability exceeds 0.5
            and 0 elsewhere.
        """
        for layer in self.fc_layers:
            x = tm.sigmoid(layer(x))

        probs = tm.sigmoid(self.output_layer(x))
        labels = tf.cast(tm.greater(probs, 0.5), tf.int32)

        return labels

In [34]:
# Stochastic network with one hidden layer of 32 units and a single output logit.
model = StochasticMLP(hidden_layer_sizes = [32], n_outputs = 1)

In [35]:
# Initial hidden state: one list of sampled hidden-layer tensors per training
# batch, stored in the order the batches come out of this pass over train_ds.
# NOTE(review): later cells index this list by batch position in *other*
# passes over train_ds - that only lines up if the dataset yields batches in
# the same order every pass.
network = [model.call(batch_images) for batch_images, _ in train_ds]

In [36]:
# One HMC kernel per training batch, each bound to that batch's inputs and
# labels and stored in the order the batches come out of this pass.
kernels = [
    model.generate_hmc_kernel(batch_images, batch_labels)
    for batch_images, batch_labels in train_ds
]

In [37]:
# Keep a reference to the first batch of a pass over train_ds as (xx, yy).
for bs, (x, y) in enumerate(train_ds.take(1)):
    xx, yy = x, y

In [38]:
# Display the kernel and bias arrays of the first hidden Dense layer.
model.fc_layers[0].get_weights()

[array([[ 3.4396478e-04, -7.6621299e-04,  7.5266813e-04, ...,
          6.5700873e-04,  2.2961758e-06, -5.1512721e-04],
        [ 5.5426860e-04,  1.0352512e-04,  7.9948548e-04, ...,
          5.8831181e-05, -8.8455470e-04,  4.6112970e-04],
        [ 8.6348515e-04, -8.7413029e-04,  4.6454952e-05, ...,
         -8.8304403e-04, -8.4893347e-04, -8.3617930e-04],
        ...,
        [-7.7435141e-04,  7.2159490e-04, -2.8592360e-05, ...,
         -7.2757585e-04,  2.2524153e-05,  8.5306750e-04],
        [ 3.2542017e-04,  2.9347953e-05,  6.5111357e-04, ...,
          3.2382656e-04, -5.5936724e-04, -8.4990839e-04],
        [ 8.7197591e-04,  2.6409863e-04, -8.5072807e-04, ...,
          7.7586086e-04, -5.6392339e-04,  5.2133226e-04]], dtype=float32),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=float32)]

In [39]:
# Burn-in
# Run `burnin` rounds in which every batch's hidden state takes one HMC
# transition and its kernel is rebuilt with the reported step size.
burnin = 50

for i in range(burnin):

    if i % 10 == 0:
        print("Step %d" % i)

    res = [
        model.propose_new_state_hamiltonian(x, net, y, ker)
        for (x, y), net, ker in zip(train_ds, network, kernels)
    ]

    # Each result is a (hidden_state, kernel) pair; unzip into two tuples.
    network_new, kernels_new = zip(*res)
    network, kernels = network_new, kernels_new

Step 0
Step 10
Step 20
Step 30
Step 40


In [40]:
# Print the per-example log-probability of the current hidden state, batch by batch.
for bs, (x, y) in enumerate(train_ds):
    batch_log_prob = model.target_log_prob(x, network[bs], y)
    print(batch_log_prob)

tf.Tensor(
[-45.51847  -45.77223  -47.125553 -53.94017  -53.374493 -49.632946
 -50.803955 -56.66916  -51.22321  -60.287422 -47.516476 -56.629993
 -54.50511  -48.16477  -51.416534 -47.318283 -56.720867 -49.27162
 -49.97294  -51.43395  -53.060726 -49.52051  -46.25586  -51.738476
 -57.953243 -45.689724 -51.1096   -48.981056 -48.100815 -48.491417
 -49.12381  -55.9877  ], shape=(32,), dtype=float32)
tf.Tensor(
[-52.47387  -47.216652 -48.447617 -57.324703 -46.34085  -51.767185
 -60.9736   -52.030544 -53.629562 -48.82621  -56.07774  -48.939644
 -58.93603  -52.572697 -48.59926  -55.73465  -56.79461  -48.856083
 -48.28761  -53.505035 -50.4091   -48.39776  -51.017643 -48.16271
 -58.02049  -67.630714 -57.618393 -46.710663 -52.12215  -56.521996
 -50.296543 -52.649612], shape=(32,), dtype=float32)
tf.Tensor(
[-44.641747 -50.371323 -58.580444 -45.97592  -59.742413 -49.370155
 -51.410183 -59.769863 -47.340805 -67.61995  -50.1108   -52.36943
 -51.186913 -48.359848 -46.741077 -45.44779  -45.280865 -49.

In [None]:
# Disabled plotting snippet, kept as a bare string literal so it never runs.
# NOTE(review): it refers to `net_values`, which is not defined in any cell
# shown here, and writes under 'plots/100_50_50/' - it would need both before
# being re-enabled.
'''
# plot values of nodes
import matplotlib.pyplot as plt

#units = list(np.random.randint(100, size = 10))

for j in range(200):

    print(j)
    k = j % 10
    if(k == 0):
        fig, ax = plt.subplots(nrows=5, ncols=2, figsize=(14, 20))
    
    ax[k // 2, k % 2].plot(np.arange(20000), [net_values[i][j] for i in range(20000)])
    ax[k // 2, k % 2].set_title('Unit %i' % j)
    ax[k // 2, k % 2].set_ylim([-6, 6])

    if(k == 9):
        plt.tight_layout()
        plt.savefig('plots/100_50_50/merge_20k_%d' % (j//10))
        plt.close()

'''

In [41]:
# Training
# Each epoch alternates, per mini-batch, one weight update with one HMC
# transition of the hidden state, then measures training accuracy.
epochs = 200
loss_ls = []
acc_ls = []
start_time = time.time()

for epoch in range(epochs):

    loss = 0.0
    for bs, (x, y) in enumerate(train_ds):

        # Gradient step on the weights using this batch's current hidden state.
        model.update_weights(x, network[bs], y)

        # NOTE(review): this re-proposes the hidden state of EVERY batch after
        # each single-batch weight update (the original comment here read
        # "only one mini-batch") - confirm that is the intended schedule.
        res = [model.propose_new_state_hamiltonian(x, net, y, ker, is_update_kernel = False)
               for (x, y), net, ker in zip(train_ds, network, kernels)]
        network = res
        loss += -1 * tf.reduce_sum(model.target_log_prob(x, network[bs], y))

    # Collect predictions and their labels in the SAME pass over train_ds:
    # the dataset is shuffled, so its order does not match y_train_sub.
    preds = []
    targets = []
    for images, labels in train_ds:
        preds.append(model.get_predictions(images).numpy())
        targets.append(labels.numpy())
    train_acc = accuracy_score(np.concatenate(targets).ravel(), np.concatenate(preds).ravel())

    # Store a plain Python float rather than a tensor.
    epoch_loss = float(loss) / len(x_train_sub)
    loss_ls.append(epoch_loss)
    acc_ls.append(train_acc)

    print("Epoch %d/%d: - %.4fs/step - loss: %.4f - accuracy: %.4f"
          % (epoch + 1, epochs, (time.time() - start_time) / (epoch + 1), epoch_loss, train_acc))

print("Time of HMC: ", time.time() - start_time)

Epoch 1/200: - 0.4226s/step - loss: 56.7870 - accuracy: 0.5700
Epoch 2/200: - 0.4234s/step - loss: 56.1462 - accuracy: 0.5700
Epoch 3/200: - 0.4278s/step - loss: 56.3811 - accuracy: 0.5700
Epoch 4/200: - 0.4271s/step - loss: 56.9586 - accuracy: 0.5700
Epoch 5/200: - 0.4370s/step - loss: 56.3652 - accuracy: 0.5700
Epoch 6/200: - 0.4314s/step - loss: 57.4679 - accuracy: 0.5700
Epoch 7/200: - 0.4281s/step - loss: 56.0944 - accuracy: 0.5700
Epoch 8/200: - 0.4254s/step - loss: 55.4469 - accuracy: 0.5700
Epoch 9/200: - 0.4230s/step - loss: 55.2860 - accuracy: 0.5700
Epoch 10/200: - 0.4214s/step - loss: 56.0765 - accuracy: 0.5700
Epoch 11/200: - 0.4208s/step - loss: 55.3205 - accuracy: 0.5700
Epoch 12/200: - 0.4198s/step - loss: 55.7927 - accuracy: 0.5700
Epoch 13/200: - 0.4232s/step - loss: 55.6413 - accuracy: 0.5700
Epoch 14/200: - 0.4222s/step - loss: 56.7844 - accuracy: 0.5700
Epoch 15/200: - 0.4211s/step - loss: 56.3408 - accuracy: 0.5700
Epoch 16/200: - 0.4202s/step - loss: 55.8783 - ac

Epoch 129/200: - 0.4148s/step - loss: 54.2274 - accuracy: 0.5700
Epoch 130/200: - 0.4150s/step - loss: 55.0305 - accuracy: 0.5700
Epoch 131/200: - 0.4149s/step - loss: 56.4043 - accuracy: 0.5700
Epoch 132/200: - 0.4148s/step - loss: 55.0844 - accuracy: 0.5700
Epoch 133/200: - 0.4150s/step - loss: 53.7010 - accuracy: 0.5700
Epoch 134/200: - 0.4153s/step - loss: 54.9485 - accuracy: 0.5700
Epoch 135/200: - 0.4156s/step - loss: 54.6202 - accuracy: 0.5700
Epoch 136/200: - 0.4155s/step - loss: 55.2389 - accuracy: 0.5700
Epoch 137/200: - 0.4154s/step - loss: 53.8436 - accuracy: 0.5700
Epoch 138/200: - 0.4154s/step - loss: 54.8221 - accuracy: 0.5700
Epoch 139/200: - 0.4155s/step - loss: 55.3510 - accuracy: 0.5700
Epoch 140/200: - 0.4156s/step - loss: 55.6311 - accuracy: 0.5700
Epoch 141/200: - 0.4158s/step - loss: 54.3086 - accuracy: 0.5700
Epoch 142/200: - 0.4160s/step - loss: 54.7205 - accuracy: 0.5700
Epoch 143/200: - 0.4161s/step - loss: 56.0715 - accuracy: 0.5700
Epoch 144/200: - 0.4162s/

In [32]:
# Sanity check: number of training examples currently in use.
len(x_train_sub)

100