In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import mnist
import time

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(-1, 28 * 28).astype('float32') / 255
x_test = x_test.reshape(-1, 28 * 28).astype('float32') / 255
y_train = tf.keras.utils.to_categorical(y_train, 10)
y_test = tf.keras.utils.to_categorical(y_test, 10)

def build_model(activation):
    model = models.Sequential()
    model.add(layers.Dense(128, input_shape=(28 * 28,), activation=activation))
    model.add(layers.Dense(64, activation=activation))
    model.add(layers.Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

activations = ['sigmoid', 'relu', 'leaky_relu']
results = {}

for act in activations:
    if act == 'leaky_relu':
        model = build_model(tf.nn.leaky_relu)
    else:
        model = build_model(act)
    
    start_time = time.time()
    history = model.fit(x_train, y_train, epochs=10, batch_size=128, validation_split=0.2, verbose=0)
    training_time = time.time() - start_time
    
    test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
    
    results[act] = {
        'training_time': training_time,
        'test_accuracy': test_acc,
        'history': history.history
    }

for act, res in results.items():
    print(f"Activation: {act}")
    print(f"Training Time: {res['training_time']:.2f} seconds")
    print(f"Test Accuracy: {res['test_accuracy']:.4f}")
    print("---")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Activation: sigmoid
Training Time: 8.99 seconds
Test Accuracy: 0.9676
---
Activation: relu
Training Time: 8.45 seconds
Test Accuracy: 0.9761
---
Activation: leaky_relu
Training Time: 7.75 seconds
Test Accuracy: 0.9738
---


## Vanishing/Exploding Gradients: Sigmoid is more prone to vanishing gradients, especially in deeper networks, while ReLU and Leaky ReLU mitigate this issue. Sigmoid has slower training rate and lower accuracy compared to ReLU and Leaky ReLU.