In [1]:
import numpy as np

In [2]:
"""
1d input x_i and x_j, alias to x and x' in the paper
for 2d input we should use np.meshgrid or np.mgrid
"""

num_input = 5

x_i = np.linspace(0.1, 0.5, num=num_input)
x_j = np.linspace(0.6, 1.0, num=num_input)

display(x_i)
display(x_j)


array([0.1, 0.2, 0.3, 0.4, 0.5])

array([0.6, 0.7, 0.8, 0.9, 1. ])

In [3]:
# take outer product of x_i and x_j to get 100 x 100 matrix
# in the paper sigma was defined entry-wise, but we want to compute the matrix at once
n_0 = 1
beta = 0.1

def calc_sigma_1(x_i, x_j):
    return np.outer(x_i, x_j) / n_0 + beta**2

display(calc_sigma_1(x_i, x_j))

array([[0.07, 0.08, 0.09, 0.1 , 0.11],
       [0.13, 0.15, 0.17, 0.19, 0.21],
       [0.19, 0.22, 0.25, 0.28, 0.31],
       [0.25, 0.29, 0.33, 0.37, 0.41],
       [0.31, 0.36, 0.41, 0.46, 0.51]])

In [4]:
"""
f(x_i) and f(x_j) follow N(0, cov) where cov is a contatenation of 4 smaller matrices
"""

cov_ii = calc_sigma_1(x_i, x_i)
cov_ij = calc_sigma_1(x_i, x_j)
cov_ji = calc_sigma_1(x_j, x_i)
cov_jj = calc_sigma_1(x_j, x_j)

# cov_ij == cov_ji.T

In [5]:
cov_ij == cov_ji.T

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [6]:
# concatenate the 4 sigmas

v1 = np.vstack((cov_ii, cov_ji))
v2 = np.vstack((cov_ij, cov_jj))

cov = np.hstack((v1, v2))

display(cov.shape)
print(cov)

(10, 10)

[[0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1  0.11]
 [0.03 0.05 0.07 0.09 0.11 0.13 0.15 0.17 0.19 0.21]
 [0.04 0.07 0.1  0.13 0.16 0.19 0.22 0.25 0.28 0.31]
 [0.05 0.09 0.13 0.17 0.21 0.25 0.29 0.33 0.37 0.41]
 [0.06 0.11 0.16 0.21 0.26 0.31 0.36 0.41 0.46 0.51]
 [0.07 0.13 0.19 0.25 0.31 0.37 0.43 0.49 0.55 0.61]
 [0.08 0.15 0.22 0.29 0.36 0.43 0.5  0.57 0.64 0.71]
 [0.09 0.17 0.25 0.33 0.41 0.49 0.57 0.65 0.73 0.81]
 [0.1  0.19 0.28 0.37 0.46 0.55 0.64 0.73 0.82 0.91]
 [0.11 0.21 0.31 0.41 0.51 0.61 0.71 0.81 0.91 1.01]]


In [7]:
"""
now sample Y_i, Y_j from this distribution N(0, cov)
"""
mean = [0] * 2 * num_input

# sample once
np.random.seed(26)
sample = np.random.multivariate_normal(mean, cov, size=1, check_valid='warn')[0]
f_xi = sample[:num_input]
f_xj = sample[num_input:]
display(f_xi)
display(f_xj)

array([ 0.14236386,  0.09536292,  0.04836197,  0.00136103, -0.04563992])

array([-0.09264088, -0.13964182, -0.18664276, -0.23364372, -0.28064466])

In [8]:
# gaol: approximate expectation using a finite sum and take the mean
# approach 1: sample and compute outer product first, then take mean
def calc_sigma_2_v1(activation):
    num_samples = 100
    sum_N = np.zeros((num_input, num_input))
    np.random.seed(26)
    for i in range(num_samples):
        sample = np.random.multivariate_normal(mean, cov, size=1)[0]
        f_xi = sample[:num_input]
        f_xj = sample[num_input:]
        sum_N += np.outer(activation(f_xi), activation(f_xj))

    sigma_2 = (sum_N / num_samples) + beta**2
    
    return sigma_2

#     display(sigma_2)

In [9]:
# approach 2: vectorize the sampling and take mean of activated f, then do outer product
def calc_sigma_2_v2(activation):
    num_samples = 100
    np.random.seed(26)
    s = np.random.multivariate_normal(mean, cov, size=num_samples)
    # s: (num_samples, 10)

    f_xi = s[:, :num_input]
    f_xj = s[:, num_input:]

    # we may take the mean of the relu and then do outer product
    sig_f_xi = np.mean(activation(f_xi), axis=0)
    sig_f_xj = np.mean(activation(f_xj), axis=0)

    sigma_2 = np.outer(sig_f_xi, sig_f_xj) + beta**2
    
    return sigma_2

#     display(sigma_2)

In [10]:
"""
activation functions and their derivatives
"""

def relu_kth(x, k):
    return np.maximum(x**k, 0)

def relu(x):
    return np.maximum(x, 0)

def d_relu(x):
    return 1.0 * (x > 0)

def d_relu_kth(x, k):
    return k * x**(k-1) * (x > 0)

display(relu(f_xi))
display(d_relu(f_xi))


array([0.14236386, 0.09536292, 0.04836197, 0.00136103, 0.        ])

array([1., 1., 1., 1., 0.])

In [11]:
calc_sigma_2_v1(relu)

array([[0.04988892, 0.05572713, 0.06157944, 0.06743174, 0.07328404],
       [0.07919627, 0.08973607, 0.10028731, 0.11083854, 0.12138978],
       [0.108747  , 0.12404049, 0.13934277, 0.15464505, 0.16994733],
       [0.13841707, 0.15850368, 0.17859642, 0.19868916, 0.2187819 ],
       [0.16809138, 0.19297306, 0.21785822, 0.24274338, 0.26762853]])

In [12]:
calc_sigma_2_v2(relu)

array([[0.02293469, 0.02497931, 0.02702922, 0.02907913, 0.03112904],
       [0.02946764, 0.03254494, 0.0356302 , 0.03871546, 0.04180072],
       [0.03651265, 0.04070358, 0.04490534, 0.04910711, 0.05330887],
       [0.04391739, 0.0492788 , 0.05465409, 0.06002937, 0.06540465],
       [0.05158906, 0.05816316, 0.06475426, 0.07134536, 0.07793646]])

In [13]:
"""
Compute Theta 1 and 2 using Sigma
"""

THETA_1 = calc_sigma_1(x_i, x_j) # actually make a copy

sigma_2 = calc_sigma_2_v1(relu)
sigma_2_prime = calc_sigma_2_v1(d_relu)

THETA_2 = np.multiply(THETA_1, sigma_2_prime) + sigma_2

In [14]:
display(THETA_1)
display(THETA_2)

array([[0.07, 0.08, 0.09, 0.1 , 0.11],
       [0.13, 0.15, 0.17, 0.19, 0.21],
       [0.19, 0.22, 0.25, 0.28, 0.31],
       [0.25, 0.29, 0.33, 0.37, 0.41],
       [0.31, 0.36, 0.41, 0.46, 0.51]])

array([[0.07578892, 0.08452713, 0.09397944, 0.10343174, 0.11288404],
       [0.12989627, 0.14673607, 0.16488731, 0.18303854, 0.20118978],
       [0.184747  , 0.20984049, 0.23684277, 0.26384505, 0.29084733],
       [0.24341707, 0.27740368, 0.31389642, 0.35038916, 0.3868819 ],
       [0.30139138, 0.34417306, 0.39005822, 0.43594338, 0.48182853]])