### Quick start to Tensorflow 2 - Basic MNIST classifier using CNN
Original [doc](https://www.tensorflow.org/tutorials/quickstart/advanced)

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model
import numpy as np
tf.__version__

'2.2.0'

___
#### Checking available datasets with keras

In [2]:
#Checking available datasets with keras
for dataset in dir(tf.keras.datasets):
    if "_" not in dataset:
        print(dataset)

cifar10
cifar100
imdb
mnist
reuters


---
#### load and prepare MNIST dataset

In [3]:
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train, x_test = x_train/255.0, x_test/255.0
x_train = x_train[..., tf.newaxis].astype("float32")
#OR x_train = x_train[..., np.newaxis].astype("float32")
#OR x_train = x_train.reshape(x_train.shape[0], 28, 28, -1).astype("float32")
x_test = x_test[..., tf.newaxis].astype("float32")

x_train.shape, x_test.shape

((60000, 28, 28, 1), (10000, 28, 28, 1))

**Note:** *...* is called ellipsis and is used for slicing entire array and is equivalent to *:,:,:* in this case

Refer second answer [here](https://stackoverflow.com/questions/118370/how-do-you-use-the-ellipsis-slicing-syntax-in-python) for more details
___

#### Shuffle and batch the dataset
**Note:** Here 10000 is buffer size argument for shuffling.It should be greater than dataset size for perfect shuffle. [refer](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#shuffle)

In [4]:
train_ds = tf.data.Dataset.from_tensor_slices(
    (x_train, y_train)).shuffle(10000).batch(32)

test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)

___
#### Build keras model using functional API

In [5]:
class MyModel(Model):
    def __init__(self):
        super(MyModel, self).__init__()
        #32 : num filters , 3 : filter size
        self.conv1 = Conv2D(32, 3, activation='relu')
        self.flatten = Flatten()
        self.d1 = Dense(128, activation='relu')
        self.d2 = Dense(10)
    def call(self, x):
        x = self.conv1(x)
        x = self.flatten(x)
        x = self.d1(x)
        x = self.d2(x)
        return x
model = MyModel()

---
#### loss and optimizer
**Note:** Sparse categorical cross entropy is exactly same as categorical cross entropy except that Sparse version uses integer input while normal version uses one hot encodings.This saves some memory . [refer](https://stats.stackexchange.com/questions/326065/cross-entropy-vs-sparse-cross-entropy-when-to-use-one-over-the-other)

In [6]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

optimizer = tf.keras.optimizers.Adam()

#### GradientTape explanation
* Record operations for automatic differentiation
* Trainable variables (created by `tf.Variable` or `tf.compat.v1.get_variable`,
  where `trainable=True` is default in both cases) are automatically watched.
  Tensors can be manually watched by invoking the `watch` method on this context
  manager.

In [8]:
x = tf.constant(3.0)
with tf.GradientTape() as g:
    g.watch(x)
    with tf.GradientTape() as gg:
        gg.watch(x)
        y = x * x
    dy_dx = gg.gradient(y, x)     # Will compute to 6.0
    d2y_dx2 = g.gradient(dy_dx, x)  # Will compute to 2.0
print(dy_dx, d2y_dx2)


tf.Tensor(6.0, shape=(), dtype=float32) tf.Tensor(2.0, shape=(), dtype=float32)


#### Train the model

In [7]:
@tf.function
def train_step(images, labels):
    with tf.GradientTape() as tape:
    # training=True is only when there are layers like dropout
        predictions = model(images, training=True)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(labels, predictions)