# Import Libraries & Dataset

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier

In [None]:
# Load the comma-separated dataset into a NumPy array
df = np.loadtxt("pima-indians-diabetes.csv", delimiter=",")

# Features are the first eight columns; the target is the last column
X, Y = df[:, :8], df[:, -1]

# Fix TensorFlow's random seed for reproducibility
seed = 7
tf.random.set_seed(seed)

In [None]:
# Display the dimensions of the loaded array
df.shape

## Keras Models in scikit-learn

Keras models must be wrapped in either the **KerasClassifier** or the **KerasRegressor** class from the **SciKeras** module. To utilize these wrappers, we need to define a function that creates and returns the Keras sequential model, then pass this function to the `model` argument when constructing the **KerasClassifier** class.

In [None]:
# Model-building function handed to the KerasClassifier wrapper
def create_model():
    """Build and compile the single-hidden-layer binary classifier.

    The network takes 8 input features, feeds them through a 12-unit ReLU
    hidden layer and ends in a single sigmoid output unit. It is compiled
    with binary cross-entropy loss, the Adam optimizer and accuracy as metric.
    """
    layers = [
        Dense(12, input_shape=(8,), activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Grid Search

The constructor of the KerasClassifier class can take additional arguments that are passed on to your custom `create_model()` function. These arguments must also be defined in the signature of your `create_model()` function, with default values.

## Tuning Batch Size and # of Epochs

In [None]:
# Wrap the model-building function so scikit-learn can use it as an estimator
model = KerasClassifier(model=create_model, verbose=0)

# Candidate values for each hyperparameter being searched
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = {"batch_size": batch_size, "epochs": epochs}

# Try every combination with 3-fold cross-validation
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
grid_result = grid.fit(X, Y)

In [None]:
# Report the best score together with the parameters that produced it
print("Best: {} using {}".format(grid_result.best_score_, grid_result.best_params_))

# Then list mean and standard deviation of the test score per combination
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for avg, spread, combo in zip(means, stds, params):
    print("{} {} with {}".format(round(avg, 6), round(spread, 6), combo))

## Tuning Learning Rate and Momentum

Using the above results, we will now search for the best learning rate and momentum for the SGD optimizer. Momentum controls how much the previous update is allowed to influence the current weight update. For this step we will assume that Stochastic Gradient Descent is the best optimizer to use, which in most cases will be correct.

In [None]:
# Redefine create_model() without the .compile() call
def create_model():
    """Return the uncompiled network: 8 inputs, 12 ReLU units, 1 sigmoid output."""
    return Sequential([
        Dense(12, input_shape=(8,), activation='relu'),
        Dense(1, activation='sigmoid'),
    ])

In the above `create_model()` we don't include the `.compile()` call, unlike earlier, as it's better to leave the choice of optimizer for a Keras model to the KerasClassifier.

In [None]:
# The wrapper receives the loss and optimizer name, since create_model()
# returns an uncompiled network
model = KerasClassifier(
    model=create_model,
    loss="binary_crossentropy",
    optimizer="SGD",
    epochs=100,
    batch_size=10,
    verbose=0,
)

# Candidate learning rates and momentum values
lr = [0.001, 0.01, 0.1, 0.2, 0.3]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]

# The optimizer__ prefix tells the SciKeras wrapper to route
# these parameters to the optimizer
param_grid = {
    "optimizer__learning_rate": lr,
    "optimizer__momentum": momentum,
}
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
grid_result = grid.fit(X, Y)

In [None]:
# Report the best score and parameters, then the mean and standard
# deviation of the test score for every combination tried
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for avg, spread, combo in zip(means, stds, params):
    print(f"{avg:f} ({spread:f}) with: {combo!r}")

## Tuning Network Weight Initialization

Weight initialization refers to the process of randomly initializing the network's
weights and biases. There are many methods for doing this, e.g. HeNormal, GlorotUniform, RandomUniform, etc.

In [None]:
# create_model() variant that takes the weight-initialization scheme as a parameter
def create_model(init_mode='uniform'):
    """Build and compile the classifier with a selectable kernel initializer.

    Args:
        init_mode: Kernel initializer identifier applied to both Dense layers.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    hidden = Dense(12, input_shape=(8,), kernel_initializer=init_mode, activation='relu')
    output = Dense(1, kernel_initializer=init_mode, activation='sigmoid')
    network = Sequential([hidden, output])
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Estimator with fixed training settings; only the initializer is searched
model = KerasClassifier(
    model=create_model,
    epochs=100,
    batch_size=10,
    verbose=0,
)

# Candidate weight-initialization schemes
init_mode = [
    'uniform', 'lecun_uniform', 'normal', 'zero',
    'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform',
]
# The model__ prefix routes the value to create_model(init_mode=...)
param_grid = {"model__init_mode": init_mode}
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
grid_result = grid.fit(X, Y)

In [None]:
# Best initializer first, then mean and standard deviation of the test
# score for each candidate
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for avg, spread, combo in zip(means, stds, params):
    print(f"{avg:f} ({spread:f}) with: {combo!r}")

## Tuning the Neuron Activation Function

The activation function controls the non-linearity of individual neurons and when they fire. The `rectifier` (ReLU) and `sigmoid` functions are generally the most popular. The grid search will only test the activation function of the hidden layer, as the output layer needs an activation function suited to binary classification.

Note that each activation function will generally perform differently depending on the range of the data fed into the neuron. Some activation functions need the input to be standardized and others don't.

In [None]:
def create_model(activation='relu'):
    """Build and compile the classifier with a selectable hidden activation.

    Args:
        activation: Activation identifier for the 12-unit hidden layer; the
            output layer always uses a sigmoid.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    hidden = Dense(12, input_shape=(8,), kernel_initializer='uniform', activation=activation)
    output = Dense(1, kernel_initializer='uniform', activation='sigmoid')
    network = Sequential([hidden, output])
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Estimator with fixed training settings; only the hidden activation is searched
model = KerasClassifier(
    model=create_model,
    epochs=100,
    batch_size=10,
    verbose=0,
)

# Candidate activation functions for the hidden layer
activation = [
    'softmax', 'softplus', 'softsign', 'relu',
    'tanh', 'sigmoid', 'hard_sigmoid', 'linear',
]
# The model__ prefix routes the value to create_model(activation=...)
param_grid = {"model__activation": activation}
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
grid_result = grid.fit(X, Y)

In [None]:
# Best activation first, then mean and standard deviation of the test
# score for each candidate
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for avg, spread, combo in zip(means, stds, params):
    print(f"{avg:f} ({spread:f}) with: {combo!r}")

## Tuning Dropout Regularization


Dropout is a regularization technique where randomly selected neurons are ignored during training. They are "dropped-out" randomly. This means that their contribution to the activation of downstream neurons is temporarily removed on the forward pass and any weight updates are not applied to the neuron on the backward pass.

As the neural network learns, neuron weights settle into their context within the network. Weights of neurons are tuned for specific features, providing some specialization. Neighboring neurons come to rely on this specialization, which, if taken too far, can result in a fragile model too specialized to the training data. This reliance on context for a specific neuron during training is referred to as complex *co-adaptations*.

If neurons are randomly dropped out of the network during training, then other neurons will have to step in and handle the representation required to make predictions for the missing neurons. By doing this, no neuron becomes too dependent on its neighboring neurons. This is believed to result in multiple independent internal representations being learned by the network.

The effect is that the network becomes less sensitive to the specific weights of neurons. This in turn results in a network that is capable of better generalization and is less likely to overfit the training data.
[Credit](https://machinelearningmastery.com/dropout-regularization-deep-learning-models-keras/)

In [None]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras.constraints import MaxNorm

def create_model(dropout_rate: float = 0.2, weight_constraint: float = 4.0):
    """Build and compile the classifier with dropout and a max-norm constraint.

    Both parameters have defaults so the function can also be called without
    arguments, e.g. when the wrapper is fitted outside of a grid search.

    Args:
        dropout_rate: Rate passed to the Dropout layer that follows the
            hidden layer.
        weight_constraint: Value passed to MaxNorm, used as the kernel
            constraint of the hidden layer.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    # create model
    model = Sequential()
    model.add(
        Dense(
            12,
            input_shape=(8,),
            kernel_initializer='uniform',
            activation='linear',
            kernel_constraint=MaxNorm(weight_constraint),
        )
    )
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Estimator with fixed training settings; dropout and max-norm are searched
model = KerasClassifier(
    model=create_model,
    epochs=100,
    batch_size=10,
    verbose=0,
)

# Candidate max-norm values and dropout rates
weight_constraint = [1.0, 2.0, 3.0, 4.0, 5.0]
dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# The model__ prefix routes each value to the matching create_model() argument
param_grid = {
    "model__dropout_rate": dropout_rate,
    "model__weight_constraint": weight_constraint,
}

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
grid_result = grid.fit(X, Y)

In [None]:
# Best combination first, then mean and standard deviation of the test
# score for every dropout / weight-constraint pair
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for avg, spread, combo in zip(means, stds, params):
    print(f"{avg:f} ({spread:f}) with: {combo!r}")

## Tuning the Number of Neurons in the Hidden Layer

The number of neurons in the hidden layer controls the representational capacity of the network, at least at that point in the topology. A large enough single layer can (in theory) approximate any other neural network. For this case we will try the values 1, 5, 10, 15, 20, 25 and 30. The larger the network, the more training will be needed; ideally the batch size and number of epochs should be optimized together with the number of neurons.

In [None]:
def create_model(neurons: int = 12):
    """Build and compile the classifier with a configurable hidden-layer width.

    The parameter has a default so the function can also be called without
    arguments, e.g. when the wrapper is fitted outside of a grid search.

    Args:
        neurons: Number of units in the hidden Dense layer.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    # create model
    model = Sequential()
    model.add(
        Dense(
            neurons,
            input_shape=(8,),
            kernel_initializer='uniform',
            activation='linear',
            kernel_constraint=MaxNorm(4),
        )
    )
    model.add(Dropout(0.2))
    model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Estimator with fixed training settings; only the hidden-layer width is searched
model = KerasClassifier(
    model=create_model,
    epochs=100,
    batch_size=10,
    verbose=0,
)

# Candidate hidden-layer sizes
neurons = [1, 5, 10, 15, 20, 25, 30]
# The model__ prefix routes the value to create_model(neurons=...)
param_grid = {"model__neurons": neurons}
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
grid_result = grid.fit(X, Y)

In [None]:
# Best hidden-layer size first, then mean and standard deviation of the
# test score for each candidate
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for avg, spread, combo in zip(means, stds, params):
    print(f"{avg:f} ({spread:f}) with: {combo!r}")

[Credit](https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/)