In [None]:
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow.keras.backend as K
from tensorflow.keras.utils import *
import pickle
import tensorflow as tf
import re
# Show which TensorFlow version this kernel imported.
print(tf.__version__)

Download MNIST

In [None]:
# Load the MNIST train/test splits through keras.datasets and echo the four
# array shapes as a quick sanity check.
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
print(*(split.shape for split in (x_train, y_train, x_test, y_test)))

In [None]:
# NOTE: this cell rebinds x_*/y_* under the same names, so it must run exactly
# once after the load cell; running it again would one-hot the labels and
# divide the pixels by 255 a second time.

# Flatten every sample to a 784-vector and divide the pixel values by 255.
x_train = x_train.reshape(-1, 784) / 255.0
x_test = x_test.reshape(-1, 784) / 255.0

# One-hot encode the labels over 10 classes.
y_train = to_categorical(y_train, num_classes=10)
y_test = to_categorical(y_test, num_classes=10)

print(*(arr.shape for arr in (x_train, y_train, x_test, y_test)))

Models

In [None]:
class SignGate(Layer):
    """Hard 0/1 gate computing ``sign(relu(x))`` element-wise.

    Strictly positive inputs map to 1; zero and negative inputs map to 0.
    The layer creates no weights and leaves the input shape unchanged.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # Nothing to create here; simply defer to the base-class build.
        super().build(input_shape)

    def call(self, x):
        return K.sign(K.relu(x))

    def compute_output_shape(self, input_shape):
        # Element-wise op: the shape passes straight through.
        return input_shape

def _getLayersByName(model, name_regex):
    """Return the layers of ``model`` whose name matches ``name_regex``.

    Args:
        model: any object exposing a ``layers`` iterable whose items have a
            ``name`` attribute (e.g. a Keras model).
        name_regex: regular expression applied with ``re.match`` to each
            layer name.

    Returns:
        A new list with the matching layers, in ``model.layers`` order.
    """
    pattern = re.compile(name_regex)
    return [layer for layer in model.layers if pattern.match(layer.name)]


def getNetwork1Layers(model):
    """Return the "R" branch layers: names made of ``R`` followed by digits.

    Suffixed names such as ``R1Activ`` are not matched because the pattern is
    anchored at both ends.
    """
    return _getLayersByName(model, "^R[0-9]*$")


def getNetwork2Layers(model):
    """Return the "G" branch layers: names made of ``G`` followed by digits.

    Suffixed names such as ``G1Activ`` or ``G1A`` are not matched because the
    pattern is anchored at both ends.
    """
    return _getLayersByName(model, "^G[0-9]*$")

def getGalu(depth, width):
    """Build the two-branch GaLU classifier on 784-dimensional inputs.

    The "R" branch is a chain of ReLU Dense layers R1..R{depth-1}. The "G"
    branch is a chain of linear Dense layers G1..G{depth-1}; each G output is
    multiplied element-wise by the SignGate of the R layer at the same level,
    and that product feeds the next G layer. A 10-unit softmax Dense layer
    named G{depth} sits on the last gated output.

    Args:
        depth: index given to the softmax output layer; depth - 1 gated
            levels are built before it (level 1 is always built).
        width: number of units in every hidden Dense layer.

    Returns:
        An uncompiled ``keras.Model`` named 'galu_model'.
    """
    inputs = Input(shape=(784,))

    # Level 1: both branches read the raw input.
    gating = Dense(units=width, activation='relu', name="R1")(inputs)
    value = Dense(units=width, activation='linear', name="G1")(inputs)
    mask = SignGate(name="G1Activ")(gating)
    gated = Multiply(name="G1A")([value, mask])

    # Levels 2..depth-1: R feeds on the previous R, G on the previous gated G.
    for level in range(2, depth):
        tag = "G" + str(level)
        gating = Dense(units=width, activation='relu', name="R" + str(level))(gating)
        value = Dense(units=width, activation='linear', name=tag)(gated)
        mask = SignGate(name=tag + "Activ")(gating)
        gated = Multiply(name=tag + "A")([value, mask])

    outputs = Dense(units=10, activation="softmax", name="G" + str(depth))(gated)
    return keras.Model(inputs=inputs, outputs=outputs, name='galu_model')

def getRelu(depth, width):
    """Build a plain fully-connected ReLU classifier on 784-dimensional inputs.

    Hidden Dense layers R1..R{depth-1} use ReLU; the final 10-unit softmax
    Dense layer is named R{depth}.

    Args:
        depth: index given to the softmax output layer; depth - 1 hidden
            layers are built before it (R1 is always built).
        width: number of units in every hidden Dense layer.

    Returns:
        An uncompiled ``keras.Model`` named 'relu_model'.
    """
    inputs = Input(shape=(784,))

    hidden = Dense(units=width, activation='relu', name="R1")(inputs)
    for level in range(2, depth):
        hidden = Dense(units=width, activation='relu', name="R" + str(level))(hidden)

    outputs = Dense(units=10, activation="softmax", name="R" + str(depth))(hidden)
    return keras.Model(inputs=inputs, outputs=outputs, name='relu_model')

# Globals read by SoftGate.call: gate(x) = (1 + eps) * sigmoid(beta * x).
eps = 0.1
beta = 4

class SoftGate(Layer):
    """Smooth gate computing ``(1 + eps) * sigmoid(beta * x)`` element-wise.

    ``eps`` and ``beta`` are the module-level globals; they are looked up
    when ``call`` runs, not stored on the layer. The layer creates no weights
    and leaves the input shape unchanged.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # Nothing to create here; simply defer to the base-class build.
        super().build(input_shape)

    def call(self, x):
        squashed = K.sigmoid(beta * x)
        return (1 + eps) * squashed

    def compute_output_shape(self, input_shape):
        # Element-wise op: the shape passes straight through.
        return input_shape

def getDecoupledLearning(depth, width):
    """Build the soft-gated two-branch classifier on 784-dimensional inputs.

    The "R" branch is a chain of linear Dense layers R1..R{depth-1}; the
    ReLU of each R pre-activation feeds the next R layer, while a SoftGate of
    the same pre-activation gates the "G" branch. The "G" branch is a chain
    of linear Dense layers G1..G{depth-1}, each multiplied element-wise by
    the SoftGate output of its level before feeding the next G layer. A
    10-unit softmax Dense layer named G{depth} sits on the last gated output.

    Unlike getGalu, the gate here is SoftGate rather than SignGate.

    Args:
        depth: index given to the softmax output layer; depth - 1 gated
            levels are built before it (level 1 is always built).
        width: number of units in every hidden Dense layer.

    Returns:
        An uncompiled ``keras.Model`` named 'decoupled_learning_model'.
    """
    inputs = Input(shape = (784, ))

    # Level 1: both branches read the raw input.
    R1 = Dense(units = width, activation = 'linear', name = "R1")(inputs)
    R1A = Activation('relu')(R1)
    A1 = SoftGate()(R1)

    G1 = Dense(units = width, activation = 'linear',  name = "G1")(inputs)
    G1A = Multiply(name = "G1A")([G1, A1])

    for i in range(depth - 2):
        # R path continues from the ReLU output; the gate reads the
        # pre-activation of the new R layer.
        R1 = Dense(units = width, activation = 'linear',name = "R"+str(i+2))(R1A)
        R1A = Activation('relu')(R1)
        A1 = SoftGate()(R1)

        G1 = Dense(units = width, activation = 'linear', name = "G"+str(i+2))(G1A)
        G1A = Multiply(name = "G"+str(i+2)+"A")([G1, A1])

    outputs = Dense(units = 10, activation = "softmax", name = "G"+str(depth))(G1A)
    # Previously named 'galu_model' (copied from getGalu), which mislabelled
    # this architecture in summaries and logs.
    model = keras.Model(inputs = inputs, outputs = outputs, name = 'decoupled_learning_model')

    return model

In [None]:
# Architecture passed to the model builders.
depth = 6
width = 128

# Optimisation settings shared by every experiment.
lr = 1e-4
loss = keras.losses.categorical_crossentropy
batch_size = 32

# Number of repeated experiments per model and epochs per fit call.
num_exp = 5
num_epochs = 100

# One dict per model family; every key holds a list that receives one
# per-epoch series for each experiment that is run.
history_relu = {metric: [] for metric in ('acc', 'val_acc', 'loss', 'val_loss')}
history_galu_learned = {metric: [] for metric in ('acc', 'val_acc', 'loss', 'val_loss')}
history_frozen_relu = {metric: [] for metric in ('acc', 'val_acc', 'loss', 'val_loss')}
history_galu = {metric: [] for metric in ('acc', 'val_acc', 'loss', 'val_loss')}
history_decoupled_learning = {metric: [] for metric in ('acc', 'val_acc', 'loss', 'val_loss')}

1. **Train ReLU & GaLU Learned**

In [None]:
for exp_i in range(num_exp):
    print("_____________EXP:{}____________".format(exp_i+1))

    # ---- Stage 1: train a plain ReLU network, checkpointing on best val_acc ----
    model_relu = getRelu(depth, width)
    model_relu.compile(loss=loss, optimizer=keras.optimizers.Adam(lr), metrics=['acc'])

    filepath = "weights.best.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0,
                                 save_best_only=True, mode='max')
    callbacks_list = [checkpoint]

    history = model_relu.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        verbose=0,
        batch_size=batch_size,
        epochs=num_epochs,
        callbacks=callbacks_list,
    )
    for metric in ('acc', 'val_acc', 'loss', 'val_loss'):
        history_relu[metric].append(history.history[metric])
    print("ReLU: MAX ACC = {}, MAX VAL ACC = {}".format(np.max(history.history['acc']),
                                                        np.max(history.history['val_acc'])))
    # Restore the checkpoint written at the best val_acc epoch.
    model_relu.load_weights(filepath)

    # ---- Stage 2: GaLU whose frozen R branch reuses the trained ReLU weights ----
    model_galu_learned = getGalu(depth, width)
    # Drop the last R layer of the ReLU net (its softmax output): the GaLU
    # model has no R-named counterpart for it.
    layers_relu = getNetwork1Layers(model_relu)[:-1]
    layers_galu_learned = getNetwork1Layers(model_galu_learned)

    # Freeze before compiling so the R branch is excluded from training.
    for layer in layers_galu_learned:
        layer.trainable = False

    model_galu_learned.compile(loss=loss, optimizer=keras.optimizers.Adam(lr), metrics=['acc'])

    for layer1, layer2 in zip(layers_galu_learned, layers_relu):
        layer1.set_weights(layer2.get_weights())

    history = model_galu_learned.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        verbose=0,
        batch_size=batch_size,
        epochs=num_epochs,
    )
    for metric in ('acc', 'val_acc', 'loss', 'val_loss'):
        history_galu_learned[metric].append(history.history[metric])

    print("GaLU Learned: MAX ACC = {}, MAX VAL ACC = {}".format(np.max(history.history['acc']),
                                                        np.max(history.history['val_acc'])))

In [None]:
# For each experiment take the best epoch (max over axis 1), then report the
# mean across experiments (plus the std for validation accuracy). The first
# field was previously labelled "max_acc" although it is a mean of maxima.
print("ReLU: mean_max_acc = {:.4f}, mean_max_val_acc = {:.4f}, std_max_val_acc = {:.4f}".format(
                                                    np.mean(np.max(history_relu['acc'], axis = 1)),
                                                    np.mean(np.max(history_relu['val_acc'], axis = 1)),
                                                    np.std(np.max(history_relu['val_acc'], axis = 1))))

print("GaLU Learned: mean_max_acc = {:.4f}, mean_max_val_acc = {:.4f}, std_max_val_acc = {:.4f}".format(
                                                    np.mean(np.max(history_galu_learned['acc'], axis = 1)),
                                                    np.mean(np.max(history_galu_learned['val_acc'], axis = 1)),
                                                    np.std(np.max(history_galu_learned['val_acc'], axis = 1))))

In [None]:
# Persist both history dicts. The context managers close each file as soon
# as the dump finishes, so the data is flushed to disk and no handle is left
# open in the kernel.
with open('h_mnist_arch1_relu', 'wb') as file:
    pickle.dump(history_relu, file)

with open('h_mnist_arch1_galu_learned', 'wb') as file:
    pickle.dump(history_galu_learned, file)

2. **Frozen ReLU**

In [None]:
for exp_i in range(num_exp):
    print("_____________EXP:{}____________".format(exp_i+1))

    model_frozen_relu = getGalu(depth, width)
    layers_frozen_relu_n1 = getNetwork1Layers(model_frozen_relu)
    layers_frozen_relu_n2 = getNetwork2Layers(model_frozen_relu)

    # Freeze the R branch before compiling so only the G branch is trained.
    for layer in layers_frozen_relu_n1:
        layer.trainable = False

    model_frozen_relu.compile(loss=loss, optimizer=keras.optimizers.Adam(lr), metrics=['acc'])

    # Copy each G layer's current weights into the R layer paired with it.
    # zip stops at the shorter list, so the extra softmax G layer is unused.
    for layer1, layer2 in zip(layers_frozen_relu_n1, layers_frozen_relu_n2):
        layer1.set_weights(layer2.get_weights())

    history = model_frozen_relu.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        verbose=0,
        batch_size=batch_size,
        epochs=num_epochs,
    )
    for metric in ('acc', 'val_acc', 'loss', 'val_loss'):
        history_frozen_relu[metric].append(history.history[metric])

    print("Frozen ReLU: MAX ACC = {:.4f}, MAX VAL ACC = {:.4f}".format(np.max(history.history['acc']),
                                                        np.max(history.history['val_acc'])))

In [None]:
# For each experiment take the best epoch (max over axis 1), then report the
# mean across experiments (plus the std for validation accuracy). The first
# field was previously labelled "max_acc" although it is a mean of maxima.
print("Frozen ReLU: mean_max_acc = {:.4f}, mean_max_val_acc = {:.4f}, std_max_val_acc = {:.4f}".format(
                                                    np.mean(np.max(history_frozen_relu['acc'], axis = 1)),
                                                    np.mean(np.max(history_frozen_relu['val_acc'], axis = 1)),
                                                    np.std(np.max(history_frozen_relu['val_acc'], axis = 1))))

In [None]:
# Persist the history dict. The context manager closes the file after the
# dump, so the data is flushed to disk and no handle is left open.
with open('h_mnist_arch1_frozen_relu', 'wb') as file:
    pickle.dump(history_frozen_relu, file)

3. **GaLU**

In [None]:
for exp_i in range(num_exp):
    print("_____________EXP:{}____________".format(exp_i+1))

    model_galu = getGalu(depth, width)
    layers_frozen_relu_n1 = getNetwork1Layers(model_galu)

    # Freeze the R branch at its initial weights (nothing is copied into it
    # here) before compiling, so only the G branch is trained.
    for layer in layers_frozen_relu_n1:
        layer.trainable = False

    model_galu.compile(loss=loss, optimizer=keras.optimizers.Adam(lr), metrics=['acc'])

    history = model_galu.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        verbose=0,
        batch_size=batch_size,
        epochs=num_epochs,
    )
    for metric in ('acc', 'val_acc', 'loss', 'val_loss'):
        history_galu[metric].append(history.history[metric])

    print("GaLU: MAX ACC = {:.4f}, MAX VAL ACC = {:.4f}".format(np.max(history.history['acc']),
                                                        np.max(history.history['val_acc'])))

In [None]:
# For each experiment take the best epoch (max over axis 1), then report the
# mean across experiments (plus the std for validation accuracy). The first
# field was previously labelled "max_acc" although it is a mean of maxima.
print("GaLU: mean_max_acc = {:.4f}, mean_max_val_acc = {:.4f}, std_max_val_acc = {:.4f}".format(
                                                    np.mean(np.max(history_galu['acc'], axis = 1)),
                                                    np.mean(np.max(history_galu['val_acc'], axis = 1)),
                                                    np.std(np.max(history_galu['val_acc'], axis = 1))))

In [None]:
# Persist the history dict. The context manager closes the file after the
# dump, so the data is flushed to disk and no handle is left open.
with open('h_mnist_arch1_history_galu', 'wb') as file:
    pickle.dump(history_galu, file)

4. **Decoupled Learning**

In [None]:
for exp_i in range(num_exp):
    print("_____________EXP:{}____________".format(exp_i+1))

    # No layer is frozen here: the whole soft-gated model is trained.
    model_dl = getDecoupledLearning(depth, width)
    model_dl.compile(loss=loss, optimizer=keras.optimizers.Adam(lr), metrics=['acc'])

    history = model_dl.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        verbose=0,
        batch_size=batch_size,
        epochs=num_epochs,
    )

    for metric in ('acc', 'val_acc', 'loss', 'val_loss'):
        history_decoupled_learning[metric].append(history.history[metric])

    print("Decoupled Learning: MAX ACC = {:.4f}, MAX VAL ACC = {:.4f}".format(
        np.max(history.history['acc']),
        np.max(history.history['val_acc'])))

In [None]:
# For each experiment take the best epoch (max over axis 1), then report the
# mean across experiments (plus the std for validation accuracy). The first
# field was previously labelled "max_acc" although it is a mean of maxima.
print("Decoupled Learning: mean_max_acc = {:.4f}, mean_max_val_acc = {:.4f}, std_max_val_acc = {:.4f}".format(
                                            np.mean(np.max(history_decoupled_learning['acc'], axis = 1)),
                                            np.mean(np.max(history_decoupled_learning['val_acc'], axis = 1)),
                                            np.std(np.max(history_decoupled_learning['val_acc'], axis = 1))))

In [None]:
# Persist the history dict. The context manager closes the file after the
# dump, so the data is flushed to disk and no handle is left open.
with open('h_mnist_decoupled_learning', 'wb') as file:
    pickle.dump(history_decoupled_learning, file)