Reference: https://www.tensorflow.org/

```python
import numpy as np
import tensorflow as tf
```

# Tensors

## Basic functions/attributes

### tf.constant(), tf.ragged.constant(), tf.string

A tf.Tensor is immutable. You can't change a tensor once it's created. It has a value, but no state.

A tensor with variable numbers of elements along some axis is called "ragged".

```python
# Four rows holding 4, 2, 3 and 1 elements respectively.
t = tf.ragged.constant([[0,1,2,3],[4,5],[6,7,8],[9]])
t.shape
# TensorShape([4, None]) -- None marks the axis whose length varies from row to row.
```

### tf.Variable()

```python
a = tf.Variable([2.0, 3.0])
a.assign([1, 2])            # assign() does not change the dtype; a now holds [1., 2.]

a.assign_add([2,3])         # in-place addition; a now holds [3., 5.]
a.assign_sub([7,9])         # in-place subtraction; a now holds [-4., -4.]

b = a + 1         # Variable + Tensor yields a Tensor, not a Variable.

b = tf.Variable([2.0, 3.0], trainable=False)     # trainable=False: no gradients are needed for this variable
```

### tf.convert_to_tensor(), tf.cast()

### tf.newaxis, tf.reshape(), tf.broadcast_to()

### tf.add(), tf.multiply(), tf.matmul(), tf.square(), tf.reduce_mean()

### tf.linspace()

### tf.random.normal()

### tf.device()

```python
with tf.device('CPU:0'):
    # ...
with tf.device('GPU:0'):
    # ...
```

### tf.strings

* split(), bytes_split(), unicode_split()
* unicode_decode(), unicode_encode()
* to_number()


### tf.GradientTape()

Use tf.GradientTape() to compute gradients and/or train a model in eager execution mode.

#### persistent

GradientTape.gradient can only be called once on non-persistent tapes.

```python
x = tf.Variable(3.0)
# persistent=True allows gradient() to be called more than once on this tape.
with tf.GradientTape(persistent=True) as tape:
    y = x * x              # y = x^2 = 9.0
    z = y * y              # z = y^2 = x^4 = 81.0
tape.gradient(y, x)    # dy/dx = 2*x = 6.0
tape.gradient(z, y)    # dz/dy = 2*y = 2*x*x = 18.0
tape.gradient(z, x)    # dz/dx = (dz/dy)*(dy/dx) = 18.0*6.0 = 108.0 (chain rule)
```

#### watch()

tf.Variable (with trainable=True) is watched by default, but tf.Tensor is not watched by default.

```python
x0 = tf.Variable(3.0, name='x0')                   # trainable
x1 = tf.Variable(3.0, name='x1', trainable=False)  # not trainable
x2 = x0 + 1.0                                      # not trainable, since it's a tensor
x3 = tf.constant(3.0, name='x3')                   # not trainable

with tf.GradientTape() as tape:
    y = tf.square(x0) + tf.square(x1) + tf.square(x2) + tf.square(x3)
grad = tape.gradient(y, [x0,x1,x2,x3])

grad
[<tf.Tensor: shape=(), dtype=float32, numpy=6.0>, None, None, None]

tape.watched_variables()
(<tf.Variable 'x0:0' shape=() dtype=float32, numpy=3.0>,)
```

Use watch():

```python
with tf.GradientTape() as tape:
    tape.watch([x1,x2,x3])
    y = tf.square(x0) + tf.square(x1) + tf.square(x2) + tf.square(x3)
grad = tape.gradient(y, [x0,x1,x2,x3])

grad
[<tf.Tensor: shape=(), dtype=float32, numpy=6.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=8.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.0>]

tape.watched_variables()
(<tf.Variable 'x0:0' shape=() dtype=float32, numpy=3.0>,
 <tf.Variable 'x1:0' shape=() dtype=float32, numpy=3.0>)
```

#### watch_accessed_variables
To disable the default behavior of automatically watching every trainable tf.Variable accessed inside the tape, set `watch_accessed_variables=False`:

```python
with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(x1)          # Only x1 is watched.
    # ...
```

#### non-scalar targets

A gradient tape will compute 

* $\nabla_{\mathbf{x}} y$, if $y$ is a scalar function of $\mathbf{x}=(x_1,\ldots,x_n)$,

* $\nabla_{\mathbf{x}} \sum_i y_i$, if each $y_i$ is a scalar function of $\mathbf{x}=(x_1,\ldots,x_n)$.

```python
x = tf.Variable([2.0, 3.0])
with tf.GradientTape() as tape:
    y0 = x**2
    y1 = 1 / x
tape.gradient([y0, y1], x)
<tf.Tensor: shape=(2,), dtype=float32, numpy=array([3.75    , 5.888889], dtype=float32)>

# The above is same as:
x = tf.Variable([2.0, 3.0])
with tf.GradientTape(persistent=True) as tape:
    y0 = x**2
    y1 = 1 / x
tape.gradient(y0, x) + tape.gradient(y1, x)
```
        
Similarly, if the target is not scalar, the gradient of the sum is calculated:

```python
x = tf.Variable([[2.0, 3.0]])
with tf.GradientTape() as tape:
    y = tf.transpose(x) @ x       # 2-by-2: [[x1*x1, x1*x2], [x2*x1, x2*x2]]
# The entries of y sum to (x1 + x2)^2, so each partial derivative is 2*(x1 + x2) = 10.
tape.gradient(y, x)              # shape: (1,2), since x.shape is (1,2); value [[10., 10.]]

# The above is the same as passing every entry of y as a separate target:
x = tf.Variable([[2.0, 3.0]])
with tf.GradientTape() as tape:
    y = tf.transpose(x) @ x       # 2-by-2
    # Unpack the four scalar entries y[0][0], y[0][1], y[1][0], y[1][1].
    y1, y2, y3, y4 = (y[i][j] for i in [0,1] for j in [0,1])
tape.gradient([y1,y2,y3,y4], x)  # gradients of the listed targets are summed
```

#### A gradient of None

```python
# Ex1: the target is not connected to the source on this tape.
# NOTE(review): presumably x and y come from earlier code and y was computed
# outside this tape, so no operation linking x to z is recorded -- confirm.
with tf.GradientTape() as tape:
    z = y * y
tape.gradient(z, x)  # None

# Ex2: a Variable is accidentally replaced by a Tensor.
x = tf.Variable(2.0)
for epoch in range(2):
    with tf.GradientTape() as tape:
        y = x+1
    x = x + 1        # After this operation, x is now a Tensor, not a Variable; it should be x.assign_add(1).
    # A plain Tensor is not watched by default, so on the next iteration the
    # tape no longer tracks x.

# Ex3: part of the computation is done outside TensorFlow.
with tf.GradientTape() as tape:
    y = np.mean(x2, axis=0)        # NumPy computes this step, not TensorFlow
    y = tf.reduce_mean(y, axis=0)  # y is a constant tensor

# Ex4: the source has an integer dtype.
x = tf.Variable([[2, 2], [2, 2]])
with tf.GradientTape() as tape:
    y = tf.reduce_sum(x)           # x.dtype should not be int.
print(tape.gradient(y, x))         # prints None; use a float dtype such as [[2., 2.], [2., 2.]]
```

### tf.sparse

* SparseTensor()

### tf.io

* decode_raw()

```python
tf.io.decode_raw("Duck", tf.uint8)
<tf.Tensor: shape=(4,), dtype=uint8, numpy=array([ 68, 117,  99, 107], dtype=uint8)>
```

### tf.nn

* tf.nn.softmax()
* tf.nn.softplus(): log(exp(x) + 1)
* tf.nn.sigmoid(): 1 / (1 + exp(-x))


## Example of training a model

```python
# Data: MNIST training split, scaled by 1/255, shuffled and batched.
shuffle_size = 1000
batch_size = 32
input_shape = (28, 28, 1)

(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data()

dataset = tf.data.Dataset.from_tensor_slices((
    # Add a trailing channel axis and divide the pixel values by 255.
    tf.cast(mnist_images[..., tf.newaxis] / 255, tf.float32),
    tf.cast(mnist_labels, tf.int64),
))
dataset = dataset.shuffle(shuffle_size)
dataset = dataset.batch(batch_size)

# Model: two 3x3 convolutions (16 filters each), global average pooling,
# then a 10-unit dense layer without activation (its outputs are used as logits).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv2D(16, [3, 3], activation='relu', input_shape=input_shape))
model.add(tf.keras.layers.Conv2D(16, [3, 3], activation='relu'))
model.add(tf.keras.layers.GlobalAveragePooling2D())
model.add(tf.keras.layers.Dense(10))

# Optimizer/Loss: from_logits=True because the last layer applies no softmax.
optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
                          
# Train a model
def train_step(batch, labels):
    """Run one optimization step on a single mini-batch.

    The forward pass and loss are recorded on a gradient tape, the loss is
    differentiated with respect to ``model.trainable_variables``, and
    ``optimizer`` applies the resulting gradients.

    Args:
        batch: Inputs passed to ``model`` (called with ``training=True``).
        labels: Targets passed to ``loss_fn`` together with the model output.

    Returns:
        The mean of the loss after converting it to a NumPy value.
    """
    with tf.GradientTape() as recorder:
        predictions = model(batch, training=True)
        step_loss = loss_fn(labels, predictions)

    # Read the variables after the forward pass, as the original code did.
    weights = model.trainable_variables
    gradients = recorder.gradient(step_loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))

    loss_value = step_loss.numpy()
    return loss_value.mean()

# Train for `epochs` passes over the dataset, recording every batch loss.
epochs = 1
loss_history = []              # per-batch losses across all epochs
for epoch in range(epochs):
    epoch_losses = []          # per-batch losses of the current epoch only
    for batch, labels in dataset:
        batch_loss_mean = train_step(batch, labels)
        epoch_losses.append(batch_loss_mean)
        loss_history.append(batch_loss_mean)
    # Report the mean over this epoch's batches rather than the running mean
    # over every epoch so far, so the printed value matches its label.
    print('Epoch {}, Loss {}'.format(epoch + 1, np.mean(epoch_losses)))

# Save the trained weights and load them back.
# NOTE(review): recent Keras releases (Keras 3) appear to require a file name
# ending in '.weights.h5' for save_weights -- confirm against the installed version.
model.save_weights('weights')
status = model.load_weights('weights')
```