<a href="https://colab.research.google.com/github/h4ck4l1/datasets/blob/main/custom_training_practice/mnist_tpu_custom_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print("Bismillah Hirrahamaa Nirraheem")

Bismillah Hirrahamaa Nirraheem


In [2]:
import os,sys,warnings,time,re,math
warnings.filterwarnings("ignore")
from IPython.display import clear_output
os.cpu_count()

40

In [3]:
from google.colab import auth
auth.authenticate_user()
!gcloud auth login
clear_output()
!gcloud config set project kaggle-406814

Updated property [core/project].


In [4]:
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
from typing import Literal
from multiprocessing import Process
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor,wait
pio.templates.default = "plotly_dark"
import tensorflow as tf
from tensorflow import keras
from tensorflow.io import VarLenFeature,parse_tensor,parse_single_example
from tensorflow.data import TFRecordDataset
from sklearn.datasets import fetch_openml
tf.get_logger().setLevel("ERROR")
from tqdm import tqdm
from zipfile import ZipFile
%matplotlib inline

## Strategy

In [5]:
try:
    tpu_cluster = tf.distribute.cluster_resolver.TPUClusterResolver()
    is_tpu = True
except ValueError:
    is_tpu = False

if is_tpu:
    tf.config.experimental_connect_to_cluster(tpu_cluster)
    tf.tpu.experimental.initialize_tpu_system(tpu_cluster)
    strategy = tf.distribute.TPUStrategy(tpu_cluster)
    print("TPU running on :",tpu_cluster.cluster_spec().as_dict()["worker"])
else:
    if len(tf.config.list_physical_devices()) > 1:
        strategy = tf.distribute.OneDeviceStrategy("GPU")
    else:
        strategy = tf.distribute.OneDeviceStrategy("CPU")

print(strategy.num_replicas_in_sync)

TPU running on : ['10.22.86.130:8470']
8


# Dataset Loading

In [6]:
PATH = "gs://stanfordrna/mnist/*.tfrecord"
all_files = tf.io.gfile.glob(PATH)
train_files = all_files[:15]
valid_files = all_files[15:18]
test_files = all_files[18:]
train_raw = TFRecordDataset(train_files)
test_raw = TFRecordDataset(test_files)
valid_raw = TFRecordDataset(valid_files)

In [7]:
mnist_feature = dict(
    image=VarLenFeature(tf.string),
    label = VarLenFeature(tf.string)
)

def mnist_example(example):
    example = parse_single_example(example,mnist_feature)
    example["image"] = parse_tensor(tf.sparse.to_dense(example["image"])[0],out_type=tf.float32)[...,tf.newaxis]
    example["image"] /= 255.0
    example["label"] = parse_tensor(tf.sparse.to_dense(example["label"])[0],out_type=tf.float32)
    return example["image"],example["label"]


def shape_set(image,label,batch_size):
    image.set_shape([batch_size,28,28,1])
    label.set_shape([batch_size,1])
    return image,label

def create_ds(ds:tf.data.Dataset,batch_size:int,is_train:bool=False,shuffle_size:int=5000):
    ds = ds.map(mnist_example,num_parallel_calls=tf.data.AUTOTUNE)
    if is_train:
        ds = ds.shuffle(shuffle_size)
        ds = ds.repeat()
    ds = ds.batch(batch_size,drop_remainder=True)
    ds = ds.map(lambda image,label: shape_set(image,label,batch_size),num_parallel_calls=tf.data.AUTOTUNE)
    if not is_train:
        ds = ds.cache()
    return ds.prefetch(tf.data.AUTOTUNE)


BATCH_SIZE = 32
sample_ds = create_ds(train_raw,batch_size=BATCH_SIZE,is_train=True)

In [8]:
X,y = sample_ds.take(1).get_single_element()
print("X shape: ",X.shape)
print("y shape: ",y.shape)

X shape:  (32, 28, 28, 1)
y shape:  (32, 1)


In [9]:
print(y[0].numpy())
px.imshow(X[0].numpy().reshape(28,28))

[6.]


In [10]:
class Mnist(keras.models.Model):

    def __init__(self,
        filters_1:int=12,
        ksize_1:int=3,
        filters_2:int=24,
        ksize_2:int=6,
        filters_3:int=32,
        ksize_3:int=6,
        norm_sacle:bool=False,
        norm_center:bool=True,
        padding:str="same",
        bias:bool=False,
        activation="relu",
        dense_units:int=200,
        drop_rate:float=0.4,
        dense_out:int=10,
        **kwargs):

        super(Mnist,self).__init__(**kwargs)

        self.conv_1 = keras.layers.Conv2D(filters=filters_1,kernel_size=ksize_1,padding=padding,use_bias=bias)
        self.conv_2 = keras.layers.Conv2D(filters=filters_2,kernel_size=ksize_2,padding=padding,use_bias=bias,strides=2)
        self.conv_3 = keras.layers.Conv2D(filters=filters_3,kernel_size=ksize_3,padding=padding,use_bias=bias,strides=2)
        self.batch_norm_1 = keras.layers.BatchNormalization(scale=norm_sacle,center=norm_center)
        self.batch_norm_2 = keras.layers.BatchNormalization(scale=norm_sacle,center=norm_center)
        self.batch_norm_3 = keras.layers.BatchNormalization(scale=norm_sacle,center=norm_center)
        self.batch_norm_4 = keras.layers.BatchNormalization(scale=norm_sacle,center=norm_center)
        self.activation = keras.activations.get(activation)
        self.flat = keras.layers.Flatten()
        self.forward = keras.layers.Dense(dense_units,use_bias=False)
        self.drop_out = keras.layers.Dropout(drop_rate)
        self.out = keras.layers.Dense(dense_out,activation="softmax")



    def call(self,x):

        x = self.conv_1(x)
        x = self.batch_norm_1(x)
        x = self.activation(x)
        x = self.conv_2(x)
        x = self.batch_norm_2(x)
        x = self.activation(x)
        x = self.conv_3(x)
        x = self.batch_norm_3(x)
        x = self.activation(x)
        x = self.flat(x)
        x = self.forward(x)
        x = self.batch_norm_4(x)
        x = self.activation(x)
        x = self.drop_out(x)
        return self.out(x)


In [11]:
model = Mnist()
test_out = model(X)
test_out.shape

TensorShape([32, 10])

In [12]:
BATCH_SIZE = 16 * strategy.num_replicas_in_sync if is_tpu else 32
num_train_files = 15
num_valid_files = 3
num_test_files = 2
total_instances = 210000
num_train_instances = (total_instances * 15)//20
num_test_instances = (total_instances * 2)//20
num_valid_instances = (total_instances * 3)//20
steps_per_epoch = num_train_instances//BATCH_SIZE
validation_steps = num_valid_instances//BATCH_SIZE

In [13]:
with strategy.scope():
    model = Mnist()
    optimizer = keras.optimizers.Adam()
    training_loss = keras.metrics.Mean("Training Loss",dtype=tf.float32)
    training_accuracy = keras.metrics.SparseCategoricalAccuracy("Training Accuracy",dtype=tf.float32)

per_replica_batch_size = BATCH_SIZE // strategy.num_replicas_in_sync

train_ds = strategy.distribute_datasets_from_function(lambda _: create_ds(train_raw,per_replica_batch_size,is_train=True))

@tf.function
def train_step(iterator):
    """The step function for one training step"""

    def step_fn(inputs):
        """The computation to run on each TPU device."""

        images,labels = inputs
        with tf.GradientTape() as tape:
            logits = model(images,training=True)
            per_example_loss = keras.losses.sparse_categorical_crossentropy(labels,logits,from_logits=True)
            loss = tf.nn.compute_average_loss(per_example_loss)
            model_losses = model.losses
            if model_losses:
                loss += tf.nn.scale_regularization_loss(tf.add_n(model_losses))

        grads = tape.gradient(loss,model.trainable_variables)
        optimizer.apply_gradients(list(zip(grads,model.trainable_variables)))
        training_loss.update_state(loss*strategy.num_replicas_in_sync)
        training_accuracy.update_state(labels,logits)

    strategy.run(step_fn,args=(next(iterator),))

In [14]:
train_iterator = iter(train_ds)
for epoch in range(5):
    print(f"Epoch {epoch}")

    for step in tqdm(range(steps_per_epoch)):
        train_step(train_iterator)

    print(f"current step {optimizer.iterations.numpy()}, Training Loss: {round(float(training_loss.result()))}, Training Accuracy: {round(float(training_accuracy.result())*100,2)}")
    training_loss.reset_state()
    training_accuracy.reset_state()

Epoch 0


100%|██████████| 1230/1230 [00:04<00:00, 268.89it/s]


current step 1230, Training Loss: 2, Training Accuracy: 55.87
Epoch 1


100%|██████████| 1230/1230 [00:01<00:00, 720.51it/s]


current step 2460, Training Loss: 1, Training Accuracy: 60.16
Epoch 2


100%|██████████| 1230/1230 [00:01<00:00, 717.82it/s]


current step 3690, Training Loss: 1, Training Accuracy: 61.19
Epoch 3


100%|██████████| 1230/1230 [00:01<00:00, 724.84it/s]


current step 4920, Training Loss: 1, Training Accuracy: 61.81
Epoch 4


100%|██████████| 1230/1230 [00:01<00:00, 711.62it/s]


current step 6150, Training Loss: 1, Training Accuracy: 62.53
