## Import Packages, Environment Setting

In [1]:
import tensorflow as tf
import tensorflow_probability as tfp

class ANSI:
    """ANSI escape sequences used to emphasise labels in printed output."""

    # Select Graphic Rendition code 1: start bold text.
    BOLD = '\033[1m'
    # Select Graphic Rendition code 0: reset text attributes.
    END = '\033[0m'

## Tensor Declaration
We often use $\mathbf{X}$ to denote the feature matrix of the samples, where $x_{i,j}$ is the $j$-th feature of the $i$-th sample. Note that it is common to have higher-dimensional feature tensors.
$$
\mathbf{X}=[x_{i,j}]=\begin{bmatrix}
    x_{1,1} & x_{1,2} & \cdots & x_{1,n}\\
    x_{2,1} & x_{2,2} & \cdots & x_{2,n}\\
    \vdots  & \vdots  & \ddots & \vdots \\
    x_{m,1} & x_{m,2} & \cdots & x_{m,n}\\
    \end{bmatrix}
$$

In [2]:
def _show(tag, caption, value):
    """Print a bold ``[tag]`` label and caption, then ``value`` on the following line."""
    print(f'{ANSI.BOLD}[{tag}]{ANSI.END} {caption}\n{value}\n')


# Constant-valued constructors.
_show('tf.eye', 'Identity matrix', tf.eye(4))
_show('tf.ones', 'Tensor full of ones', tf.ones((4, 5)))
_show('tf.zeros', 'Tensor full of zeros', tf.zeros((4, 5)))

# The *_like constructors take an existing tensor as the template.
X = tf.ones((4, 5))
_show('tf.ones_like', 'Tensor full of ones with same dimension of provided matrix', tf.ones_like(X))
_show('tf.zeros_like', 'Tensor full of zeros with same dimension of provided matrix', tf.zeros_like(X))

# The global seed is reset before each random draw so the printed values are repeatable.
tf.random.set_seed(0)
X = tf.random.normal((4, 5), mean=0, stddev=0.5)
_show('tf.random.normal', 'Tensor with random values from normal distribution', X)

tf.random.set_seed(0)
X = tf.random.uniform((4, 5), minval=-1, maxval=1)
_show('tf.random.uniform', 'Tensor with random values', X)

# Round-trip through NumPy; X keeps this value for the cells that follow.
X = tf.convert_to_tensor(X.numpy())
_show('tf.convert_to_tensor', 'Tensor from List or Numpy array', X)


[1m[tf.eye][0m Identity matrix
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]

[1m[tf.ones][0m Tensor full of ones
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]

[1m[tf.zeros][0m Tensor full of zeros
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]

[1m[tf.ones_like][0m Tensor full of ones with same dimension of provided matrix
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]

[1m[tf.zeros_like][0m Tensor full of zeros with same dimension of provided matrix
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]

[1m[tf.random.normal][0m Tensor with random values from normal distribution
[[ 0.7555313   0.21146102 -0.20984747 -0.5180186  -0.6184139 ]
 [ 0.23513651 -0.00698744  0.59442914  0.30126667  0.29985556]
 [-0.35285595 -0.21648772  0.39681226 -0.3487463  -0.4799166 ]
 [-0.45034844 -0.18040527 -0.11188658  0.15191923  0.26076272]]

[1m[tf.random.uniform][0m Tenso

## Matrix Properties
Here are some commonly used, important properties of a Tensor object.

In [3]:
# Inspect the metadata carried by the tensor X: device placement, element type and shape.
print(f'{ANSI.BOLD}[X.device]{ANSI.END}: {X.device}')
# The dtype must be interpolated inside the f-string: passing `{X.dtype}` as a
# separate print argument builds a one-element set and prints stray braces.
# `!r` keeps the compact `tf.float32` spelling.
print(f'{ANSI.BOLD}[X.dtype]{ANSI.END}: {X.dtype!r}')
print(f'{ANSI.BOLD}[X.shape]{ANSI.END}: {X.shape}')

[1m[X.device][0m: /job:localhost/replica:0/task:0/device:CPU:0
[1m[X.dtype][0m: {tf.float32}
[1m[X.shape][0m: (4, 5)


## Variable Declaration
Note that a declared Tensor object is constant (i.e. its values cannot be changed directly). We often use a Tensor object for the feature tensor. We can define a Variable object for mutable tensors. We often use Variable objects for the __trainable__ parameters in neural networks:<br><br>
$$
\mathbf{w}=[w_{j}]=\begin{bmatrix}
    w_{1} \\
    w_{2} \\
    \vdots  \\
    w_{n} \\
    \end{bmatrix}, b=0
$$

In [4]:
# Bias term b, declared as a mutable Variable holding a single float32 zero.
b = tf.Variable([0], dtype=tf.dtypes.float32)
print(f'{ANSI.BOLD}[tf.Variable]{ANSI.END} Declare Variable\n{b.numpy()}\n')

# assign / assign_add / assign_sub always modify the Variable in place;
# read_value only controls whether the updated value is returned to the caller.
print(f'{ANSI.BOLD}[V.assign, V.assign_add, V.assign_sub]{ANSI.END} Replace (add, subtract) values with given tensor (always in place; the new value is returned only if read_value=True)')
tf.random.set_seed(0)
w = tf.Variable(tf.random.uniform((5, 1), minval=-1, maxval=1))
print(f'Weight (before)\n{w.numpy()}')
# A different seed gives a visibly different replacement value.
tf.random.set_seed(1)
tmp = tf.Variable(tf.random.uniform((5, 1), minval=-1, maxval=1))
w.assign(tmp)
print(f'Weight (after)\n{w.numpy()}\n')

print(f'{ANSI.BOLD}[V.scatter_nd_update, V.scatter_nd_add, V.scatter_nd_sub]{ANSI.END} Replace (add, subtract) values in the given indices (change reference and also return new Variable)')
# Recreate w from seed 0 so this demo starts from the same weights as the one above.
tf.random.set_seed(0)
w = tf.Variable(tf.random.uniform((5, 1), minval=-1, maxval=1))
# One index pair [row, col] = [0, 0], and the single value to write there.
indices = tf.constant([[0, 0]])
updates = tf.constant([0], dtype=tf.dtypes.float32)
print(f'Weight (before)\n{w.numpy()}')
# Keep the returned object itself rather than its NumPy copy; otherwise the
# identity check below compares a Variable with an ndarray and is trivially False.
updated = w.scatter_nd_update(indices, updates)
print(f'Weight (after)\n{w.numpy()}')
print(f'Return Tensor\n{updated.numpy()}')
# tf.reduce_all reduces over every element; the builtin all() would iterate rows in Python.
print(f'Do reference and return Tensor have the same values: {bool(tf.reduce_all(tf.equal(w, updated)))}')
print(f'Do reference and return Tensor point to same object: {w is updated}')

[1m[tf.Variable][0m Declare Variable
[0.]

[1m[V.assign, V.assign_add, V.assign_sub][0m Replace (add, subtract) values with given tensor (only change reference if read_value=True)
Weight (before)
[[-0.41604972]
 [-0.5868671 ]
 [ 0.07078147]
 [ 0.12251496]
 [-0.16665101]]
Weight (after)
[[-0.6697383 ]
 [ 0.80296254]
 [ 0.26194835]
 [-0.13090777]
 [-0.41612196]]

[1m[V.scatter_nd_update, V.scatter_nd_add, V.scatter_nd_sub][0m Replace (add, subtract) values in the given indices (change reference and also return new Variable)
Weight (before)
[[-0.41604972]
 [-0.5868671 ]
 [ 0.07078147]
 [ 0.12251496]
 [-0.16665101]]
Weight (after)
[[ 0.        ]
 [-0.5868671 ]
 [ 0.07078147]
 [ 0.12251496]
 [-0.16665101]]
Return Tensor
[[ 0.        ]
 [-0.5868671 ]
 [ 0.07078147]
 [ 0.12251496]
 [-0.16665101]]
Do reference and return Tensor have the same values: True
Do reference and return Tensor point to same object: False


## Basic Operations

In [5]:
# Shape manipulation. X is the (4, 5) tensor built in the tensor-declaration cell.
print(f'{ANSI.BOLD}[tf.reshape]{ANSI.END} Reshape the Tensor\n{tf.reshape(X, (10, 2))}\n')
print(f'{ANSI.BOLD}[tf.transpose]{ANSI.END} Swap dimension of the Tensor\n{tf.transpose(X, [1, 0])}\n')
# Split the 4 rows into two 2-row halves, then join them again.
A, B = tf.split(X, [2, 2], axis=0)
print(f'{ANSI.BOLD}[tf.split]{ANSI.END} Split the Tensor into list of Tensors\nA\n{A}\nB\n{B}\n')
print(f'{ANSI.BOLD}[tf.concat]{ANSI.END} Concat list of Tensors into one\n{tf.concat([A, B], axis=0)}\n')
print(f'{ANSI.BOLD}[tf.stack]{ANSI.END} Concat list of Tensors on a new axis\n{tf.stack([A, B], axis=0)}\n')
# Linear model: w and b are the Variables from the variable-declaration cell.
print(f'{ANSI.BOLD}[tf.matmul, operators]{ANSI.END} Xw + b\n{tf.matmul(X, w) + b}\n')
print(f'{ANSI.BOLD}[tf.sort]{ANSI.END} Sort the Tensor\n{tf.sort(X, axis=1)}\n')
print(f'{ANSI.BOLD}[tf.argsort]{ANSI.END} Get the order of the Tensor\n{tf.argsort(X, axis=1)}\n')
# Only reduce_min is demonstrated; the label lists the sibling reductions by
# their real names (the original had `tf,reduce_max` and `tf_reduce_sum`).
print(f'{ANSI.BOLD}[tf.reduce_min, tf.reduce_max, tf.reduce_mean, tf.reduce_sum]{ANSI.END} Compute min, max, mean and sum of the Tensor \n{tf.reduce_min(X, axis=1)}\n')
print(f'{ANSI.BOLD}[tfp.stats.percentile]{ANSI.END} Compute percentile of the Tensor\n{tfp.stats.percentile(X, 50, axis=1)}\n')
# In TensorFlow 2 the logarithm lives at tf.math.log; there is no top-level tf.log.
print(f'{ANSI.BOLD}[tf.pow, tf.math.log]{ANSI.END} Compute power or log of Tensor\n{tf.pow(X, 2)}\n')
print(f'{ANSI.BOLD}[tf.squeeze]{ANSI.END} Remove dimensions with 1 length\n{tf.squeeze(w)}\n')
print(f'{ANSI.BOLD}[tf.expand_dims]{ANSI.END} Expand dimension \n{tf.expand_dims(tf.squeeze(w), axis=1)}\n')
# Positive entries are replaced by 0; all other entries are kept.
print(f'{ANSI.BOLD}[tf.where]{ANSI.END} Replace values if condition is (not) met\n{tf.where(X > 0, tf.zeros_like(X), X)}')


[1m[tf.reshape][0m Reshape the Tensor
[[-0.41604972 -0.5868671 ]
 [ 0.07078147  0.12251496]
 [-0.16665101  0.6156559 ]
 [-0.0135498   0.9962585 ]
 [ 0.3934703  -0.7492528 ]
 [ 0.4196334   0.32483125]
 [ 0.14451313 -0.27049303]
 [-0.15896344  0.26011395]
 [ 0.827626    0.3232944 ]
 [ 0.6669471  -0.83208394]]

[1m[tf.transpose][0m Swap dimension of the Tensor
[[-0.41604972  0.6156559   0.4196334   0.26011395]
 [-0.5868671  -0.0135498   0.32483125  0.827626  ]
 [ 0.07078147  0.9962585   0.14451313  0.3232944 ]
 [ 0.12251496  0.3934703  -0.27049303  0.6669471 ]
 [-0.16665101 -0.7492528  -0.15896344 -0.83208394]]

[1m[tf.split][0m Split the Tensor into list of Tensors
A
[[-0.41604972 -0.5868671   0.07078147  0.12251496 -0.16665101]
 [ 0.6156559  -0.0135498   0.9962585   0.3934703  -0.7492528 ]]
B
[[ 0.4196334   0.32483125  0.14451313 -0.27049303 -0.15896344]
 [ 0.26011395  0.827626    0.3232944   0.6669471  -0.83208394]]

[1m[tf.concat][0m Concat list of Tensors into one
[[-0.416049

## Partial Derivatives, Jacobian Matrix and Gradient
Suppose $\mathbf{\hat{y}}$ is an $m$ length vector, and $\mathbf{\hat{y}}$ is a function of another vector $\mathbf{w}$ with length $n$ (i.e. $\mathbf{\hat{y}} = \psi(\mathbf{w})$, where $\psi: \mathbb{R}^n \to \mathbb{R}^m$). The __Jacobian matrix__ (matrix with first-order partial derivatives) of $\mathbf{\hat{y}}$ with respect to $\mathbf{w}$ is:

$$
\mathbf{\hat{y}}=[\hat{y}_{i}]=\begin{bmatrix}
    \hat{y}_{1} \\
    \hat{y}_{2} \\
    \vdots  \\
    \hat{y}_{m} \\
\end{bmatrix}, \;\;\;
\mathbf{w}=[w_{j}]=\begin{bmatrix}
    w_{1} \\
    w_{2} \\
    \vdots  \\
    w_{n} \\
\end{bmatrix}, \;\;\;
\mathbf{\frac{\partial \hat{y}}{\partial w}}=\begin{bmatrix}
    \frac{\partial\hat{y}_1}{\partial w_1} & \frac{\partial\hat{y}_1}{\partial w_2} & \cdots & \frac{\partial\hat{y}_1}{\partial w_n} \\
    \frac{\partial\hat{y}_2}{\partial w_1} & \frac{\partial\hat{y}_2}{\partial w_2} & \cdots & \frac{\partial\hat{y}_2}{\partial w_n} \\
    \vdots & \vdots & \ddots & \vdots \\ 
    \frac{\partial\hat{y}_m}{\partial w_1} & \frac{\partial\hat{y}_m}{\partial w_2} & \cdots & \frac{\partial\hat{y}_m}{\partial w_n}
\end{bmatrix}
$$

For instance, let $\mathbf{\hat{y}}=\mathbf{Xw} + b$, where $\mathbf{X}$ is a matrix independent from $\mathbf{w}$:

$$
\mathbf{X}=[x_{i,j}]=\begin{bmatrix}
    x_{1,1} & x_{1,2} & \cdots & x_{1,n}\\
    x_{2,1} & x_{2,2} & \cdots & x_{2,n}\\
    \vdots  & \vdots  & \ddots & \vdots \\
    x_{m,1} & x_{m,2} & \cdots & x_{m,n}\\
    \end{bmatrix}, \;\;\;
\mathbf{\hat{y}}=
\begin{bmatrix}
    x_{1,1} & x_{1,2} & \cdots & x_{1,n}\\
    x_{2,1} & x_{2,2} & \cdots & x_{2,n}\\
    \vdots  & \vdots  & \ddots & \vdots \\
    x_{m,1} & x_{m,2} & \cdots & x_{m,n}\\
\end{bmatrix}\begin{bmatrix}
    w_{1}   \\
    w_{2}   \\
    \vdots  \\
    w_{n}   \\
\end{bmatrix} + b=
\begin{bmatrix}
    w_1x_{1,1} + w_2x_{1,2} + \cdots + w_nx_{1,n} + b \\
    w_1x_{2,1} + w_2x_{2,2} + \cdots + w_nx_{2,n} + b \\
    \vdots \\
    w_1x_{m,1} + w_2x_{m,2} + \cdots + w_nx_{m,n} + b \\
\end{bmatrix}
$$

The Jacobian matrix of $\mathbf{\hat{y}}$ with respect to $\mathbf{w}$ is:
$$
\mathbf{\frac{\partial \hat{y}}{\partial w}}=\begin{bmatrix}
    x_{1,1} & x_{1,2} & \cdots & x_{1,n}\\
    x_{2,1} & x_{2,2} & \cdots & x_{2,n}\\
    \vdots  & \vdots  & \ddots & \vdots \\
    x_{m,1} & x_{m,2} & \cdots & x_{m,n}\\
    \end{bmatrix} = \mathbf{X}
$$

Suppose $f(\mathbf{\hat{y}})=\mathbf{\hat{y}}^T\mathbf{A}\mathbf{\hat{y}}$, where $\mathbf{A} \in \mathbb{R}^{m\times m}$ is a matrix independent of $\mathbf{\hat{y}}$; then $f(\mathbf{\hat{y}})$ is:

$$
\begin{align}
f(\mathbf{\hat{y}})&=\begin{bmatrix} \hat{y}_1 & \hat{y}_2 & \cdots & \hat{y}_m \\ \end{bmatrix}
\begin{bmatrix}
    a_{1,1} & a_{1,2} & \cdots & a_{1,m}\\
    a_{2,1} & a_{2,2} & \cdots & a_{2,m}\\
    \vdots  & \vdots  & \ddots & \vdots \\
    a_{m,1} & a_{m,2} & \cdots & a_{m,m}\\
    \end{bmatrix}
\begin{bmatrix} \hat{y}_1 \\  \hat{y}_2 \\ \vdots \\ \hat{y}_m \\ \end{bmatrix}\\
&=\begin{bmatrix} \sum\limits_{i=1}^{m}a_{i,1}\hat{y}_i & \sum\limits_{i=1}^{m}a_{i,2}\hat{y}_i & \cdots & \sum\limits_{i=1}^{m}a_{i,m}\hat{y}_i \\ \end{bmatrix}\begin{bmatrix} \hat{y}_1 \\  \hat{y}_2 \\ \vdots \\ \hat{y}_m \\ \end{bmatrix} \\
&=\sum\limits_{j=1}^{m}\hat{y}_j\sum\limits_{i=1}^{m}a_{i,j}\hat{y}_i \\
&=\sum\limits_{i=1}^{m}\sum\limits_{j=1}^{m}a_{i,j}\hat{y}_i\hat{y}_j
\end{align}
$$

then the __gradient__ (the partial derivatives of a scalar with respect to a vector) of this function $f$ with respect to $\hat{y}$ is:

$$
\begin{align}
\nabla_{\hat{y}}f(\hat{y})&=\begin{bmatrix} \frac{\partial f}{\partial\hat{y}_1} & \frac{\partial f}{\partial\hat{y}_2} & \cdots & \frac{\partial f}{\partial\hat{y}_m} \end{bmatrix}^T \\
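&=\begin{bmatrix} \sum\limits_{i=1}^{m}(a_{i,1}+a_{1,i})\hat{y}_i & \sum\limits_{i=1}^{m}(a_{i,2}+a_{2,i})\hat{y}_i & \cdots & \sum\limits_{i=1}^{m}(a_{i,m}+a_{m,i})\hat{y}_i \end{bmatrix}^T\\
&=[\hat{y}^T(\mathbf{A}+\mathbf{A}^T)]^T=(\mathbf{A}+\mathbf{A}^T)\hat{y} \;\;\; (=[2\hat{y}^T\mathbf{A}]^T \text{ when } \mathbf{A} \text{ is symmetric})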
\end{align}
$$

Suppose we have another function that maps a vector to a scalar ($L: \mathbb{R}^m \to \mathbb{R}$), e.g. $L(\mathbf{\hat{y}})=(\mathbf{y} - \mathbf{\hat{y}})^T(\mathbf{y} - \mathbf{\hat{y}})$, where $\mathbf{y}$ is an $m$-length vector independent of $\mathbf{\hat{y}}$ ($L$ is actually the sum of squared deviations between $\mathbf{y}$ and $\mathbf{\hat{y}}$)

$$
\mathbf{y}=[y_{i}]=\begin{bmatrix}
    y_{1} \\
    y_{2} \\
    \vdots  \\
    y_{m} \\
\end{bmatrix}, \;\;\;
\mathbf{\hat{y}}=[\hat{y}_{i}]=\begin{bmatrix}
    \hat{y}_{1} \\
    \hat{y}_{2} \\
    \vdots  \\
    \hat{y}_{m} \\
\end{bmatrix}=\mathbf{Xw}+b, \;\;\;
L(\mathbf{\hat{y}})=\sum\limits_{i=1}^{m}(y_i - \hat{y}_i)^2
$$

We can compute the gradient of this function with respect to $\mathbf{\hat{y}}$

$$
\begin{align}
\nabla_{\hat{y}}L(\hat{y})&=\begin{bmatrix} \frac{\partial L}{\partial\hat{y_1}} & \frac{\partial L}{\partial\hat{y_2}} & \cdots & \frac{\partial L}{\partial\hat{y_m}} \end{bmatrix}^T \\
&=\begin{bmatrix} 
    \frac{\partial L}{\partial(y_1 - \hat{y}_1)}\frac{\partial(y_1 - \hat{y}_1)}{\partial\hat{y}_1} & 
    \frac{\partial L}{\partial(y_2 - \hat{y}_2)}\frac{\partial(y_2 - \hat{y}_2)}{\partial\hat{y}_2} & 
    \cdots & 
    \frac{\partial L}{\partial(y_m - \hat{y}_m)}\frac{\partial(y_m - \hat{y}_m)}{\partial\hat{y}_m} \end{bmatrix}^T \\
&=\begin{bmatrix} 
    2(\hat{y}_1 - y_1) & 
    2(\hat{y}_2 - y_2) & 
    \cdots & 
    2(\hat{y}_m - y_m) \end{bmatrix}^T\\
&=-2(\mathbf{y} - \hat{y})
\end{align}
$$

Moreover, we can compute the gradient of $L$ with respect to $\mathbf{w}$ using multivariate chain rule:

$$
\begin{align}
\nabla_{\mathbf{w}}L(\hat{y})&=\begin{bmatrix} \frac{\partial L}{\partial w_1} & \frac{\partial L}{\partial w_2} & \cdots & \frac{\partial L}{\partial w_n} \end{bmatrix}^T\\
&=\begin{bmatrix}
    \frac{\partial L}{\partial\mathbf{\hat{y}}}\frac{\partial\mathbf{\hat{y}}}{\partial w_1} & \frac{\partial L}{\partial\mathbf{\hat{y}}}\frac{\partial\mathbf{\hat{y}}}{\partial w_2} & \cdots &\frac{\partial L}{\partial\mathbf{\hat{y}}}\frac{\partial\mathbf{\hat{y}}}{\partial w_n} 
\end{bmatrix}^T\\
&=\begin{bmatrix}
    \sum\limits_{i=1}^{m}\frac{\partial L}{\partial\hat{y_i}}\frac{\partial\hat{y_i}}{\partial w_1} & \sum\limits_{i=1}^{m}\frac{\partial L}{\partial\hat{y_i}}\frac{\partial\hat{y_i}}{\partial w_2} & \cdots & \sum\limits_{i=1}^{m}\frac{\partial L}{\partial\hat{y_i}}\frac{\partial\hat{y_i}}{\partial w_n} 
\end{bmatrix}^T\\
&=[-2(\mathbf{y} - \hat{y})^T\frac{\partial\mathbf{\hat{y}}}{\partial w}]^T\\
&=[-2(\mathbf{y} - \hat{y})^T\mathbf{X}]^T
\end{align}
$$

In [6]:
def allclose(x, y, rtol=1e-5, atol=1e-8):
    """Check whether ``x`` matches ``y`` element-wise within a tolerance.

    An element passes when ``|x - y| <= |y| * rtol + atol``.

    Args:
        x: Values under test.
        y: Reference values; the relative tolerance scales with ``|y|``.
        rtol: Relative tolerance.
        atol: Absolute tolerance.

    Returns:
        The ``.numpy()`` value of the boolean reduction over all elements.
    """
    deviation = tf.abs(x - y)
    allowed = tf.abs(y) * rtol + atol
    within_tolerance = deviation <= allowed
    return tf.reduce_all(within_tolerance).numpy()

# Reproducible targets y, one per row of X.
tf.random.set_seed(0)
y = tf.Variable(tf.random.uniform((4, 1), minval=-5, maxval=5))

# Record the forward pass so the loss can be differentiated automatically.
with tf.GradientTape() as g:
    y_hat = tf.matmul(X, w) + b
    residual = y - y_hat
    loss = tf.reduce_sum(tf.pow(residual, 2))
tf_autograd = g.gradient(loss, w)

# Hand-derived gradient from the markdown above: [-2 (y - y_hat)^T X]^T.
scaled_row = -2 * tf.matmul(tf.transpose(residual), X)
derived_grad = tf.transpose(scaled_row)

gradients_match = allclose(derived_grad, tf_autograd)
print(f'Are auto-computed gradient and self-derived gradient the same?: {gradients_match}')

Are auto-computed gradient and self-derived gradient the same?: True
