# **Assignment Hyperparameter Optimization**

In [None]:
# !pip uninstall tf-keras
# !pip install keras-tuner
# !pip install tensorflow==2.16.1

In [20]:
import keras
import tensorflow as tf
print("Keras Current Version:", keras.__version__, "Tensorflow Current Version:", tf.__version__)

Keras Current Version: 3.4.1 Tensorflow Current Version: 2.16.1


# **Imports**

In [3]:
import time
import numpy as np
import pandas as pd
from joblib import dump, load
import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import SGD, RMSprop, Adam
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.initializers import RandomNormal, RandomUniform, GlorotUniform, GlorotNormal, HeNormal
from keras.optimizers.schedules import ExponentialDecay
from keras_tuner import RandomSearch, GridSearch, BayesianOptimization
from keras_tuner.engine.hyperparameters import HyperParameters

random.seed(46)
np.random.seed(46)
tf.random.set_seed(46)


# **Functions**

In [4]:
def preprocess_data(filepath):
    data = pd.read_csv(filepath)
    scaler = StandardScaler()
    X = scaler.fit_transform(data.drop('Outcome', axis=1))
    y = data['Outcome'].values
    dump(scaler, 'scaler.joblib')
    return X, y

def prepare_datasets(X_train, X_val, y_train, y_val, batch_size=None):
    if batch_size is None:
        batch_size = len(X_train)
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=len(X_train)).batch(batch_size)
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
    val_dataset = val_dataset.batch(batch_size)
    return train_dataset, val_dataset

def plot_training_history(history, train_loss='loss', train_metric='accuracy', val_loss='val_loss', val_metric='val_accuracy'):

    #Loss
    plt.figure(figsize=(10, 5))
    plt.plot(history.history[train_loss], label='Training Loss')
    plt.plot(history.history[val_loss], label='Validation Loss')
    plt.title('Training and Validation Loss Over Epochs')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    # Metrics
    plt.figure(figsize=(10, 5))
    plt.plot(history.history[train_metric], label=f"Training: {train_metric}")
    plt.plot(history.history[val_metric], label=f"Validation: {val_metric}")
    plt.title(f'Training and Validation {train_metric} Over Epochs')
    plt.xlabel('Epochs')
    plt.ylabel(f'train_metric')
    plt.legend()
    plt.show()

def get_best_epoch_details(history):
    val_losses = history.history['val_loss']
    min_val_loss_index = val_losses.index(min(val_losses))
    best_epoch = min_val_loss_index + 1

    epoch_details = {}
    for key in history.history.keys():
        epoch_details[key] = history.history[key][min_val_loss_index]

    epoch_details['best_epoch'] = best_epoch
    print(f"Best epoch details: {epoch_details}")

# **Data Preparation**

In [5]:
X, y = preprocess_data('diabetes.csv')

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=5)

train_ds, val_ds = prepare_datasets(X_train, X_val, y_train, y_val, batch_size=32)

# **Task 1: Create the Hyperparameter Search Space According to the Following Values:**

**Layer count**: 1-10

**Unit count**: Between 32-512, increasing by 16.

**Activation functions**: relu, tanh, sigmoid

**l2**: 0.0001-0.01

**dropout**: Between 0.1-0.5, increasing by 0.05.

**initial learning rate**: 0.0001-0.01 (1e-4 - 1e-2)

**learning rate scheduler**: decay steps: 20

**optimizers**: 'sgd', 'adam', 'rmsprop' (can remain as is)

**Random search:** epoch count must be at least 200

You can make other settings as you wish.

# **Task 1 Solution**

In [26]:
def build_model(hp):
    model = Sequential()
    model.add(Input(shape=(train_ds.element_spec[0].shape[1],)))

    # Hidden layers, activation functions, l2, Dropout
    for i in range(hp.Int('num_layers', 1, 10)):

        model.add(Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=512, step=16),
                        activation=hp.Choice('activation_' + str(i), values=['relu', 'tanh', 'sigmoid']),
                        kernel_regularizer=l2(hp.Float('l2_' + str(i), min_value=0.0001, max_value=0.01, sampling='log'))))

        model.add(BatchNormalization())
        model.add(Dropout(hp.Float('dropout_' + str(i), min_value=0.1, max_value=0.5, step=0.05)))

    model.add(Dense(1, activation='sigmoid'))

    # Learning rate schedule
    initial_learning_rate = hp.Float('initial_learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')

    lr_schedule = ExponentialDecay(
        initial_learning_rate=initial_learning_rate,
        decay_steps=20,
        decay_rate=0.96,
        staircase=True
    )

    # optimizers
    optimizer_choice = hp.Choice('optimizer', values=['sgd', 'adam', "rmsprop"])
    if optimizer_choice == 'sgd':
        optimizer = SGD(
            learning_rate=lr_schedule,
            momentum=hp.Float('momentum', min_value=0.0, max_value=0.9, step=0.1)
        )
    elif optimizer_choice == 'adam':
        optimizer = Adam(
            learning_rate=lr_schedule,
            beta_1=hp.Float('beta1', min_value=0.85, max_value=0.99, step=0.01),
            beta_2=hp.Float('beta2', min_value=0.999, max_value=0.9999, step=0.0001),
            epsilon=hp.Float('epsilon', min_value=1e-8, max_value=1e-7, step=1e-8)
        )

    elif optimizer_choice == 'rmsprop':
        optimizer = RMSprop(
            learning_rate=lr_schedule,
            rho=hp.Float('rho', min_value=0.8, max_value=0.99, step=0.01),
            epsilon=hp.Float('epsilon', min_value=1e-10, max_value=1e-8, step=1e-10),
            momentum=hp.Float('momentum', min_value=0.0, max_value=0.9, step=0.1)
        )

    model.compile(optimizer=optimizer,
                  loss="binary_crossentropy",
                  metrics=["accuracy"])

    return model

# **Task 2: Start the Search With the Epoch Count Being 200. Other settings Can Remain the Same.**

# **Task 2 Solution**

In [27]:
random_search_tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=20,
    executions_per_trial=1,
    overwrite=True)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=20,
    verbose=1,
    restore_best_weights=True)


In [28]:
random_search_tuner.search(train_ds,
                           epochs=200,
                           validation_data=val_ds,
                           callbacks=[early_stopping])

Trial 20 Complete [00h 01m 14s]
val_loss: 3.0624544620513916

Best val_loss So Far: 0.5032702088356018
Total elapsed time: 00h 19m 46s


In [29]:
random_search_tuner.search_space_summary()

Search space summary
Default search space size: 48
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 10, 'step': 1, 'sampling': 'linear'}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 16, 'sampling': 'linear'}
activation_0 (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'tanh', 'sigmoid'], 'ordered': False}
l2_0 (Float)
{'default': 0.0001, 'conditions': [], 'min_value': 0.0001, 'max_value': 0.01, 'step': None, 'sampling': 'log'}
dropout_0 (Float)
{'default': 0.1, 'conditions': [], 'min_value': 0.1, 'max_value': 0.5, 'step': 0.05, 'sampling': 'linear'}
initial_learning_rate (Float)
{'default': 0.0001, 'conditions': [], 'min_value': 0.0001, 'max_value': 0.01, 'step': None, 'sampling': 'log'}
optimizer (Choice)
{'default': 'sgd', 'conditions': [], 'values': ['sgd', 'adam', 'rmsprop'], 'ordered': False}
momentum (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.9, 'ste

In [30]:
random_search_tuner.results_summary()

Results summary
Results in ./untitled_project
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 02 summary
Hyperparameters:
num_layers: 1
units_0: 496
activation_0: relu
l2_0: 0.0008678811964271953
dropout_0: 0.35
initial_learning_rate: 0.0007443190353728441
optimizer: sgd
momentum: 0.0
units_1: 192
activation_1: relu
l2_1: 0.0008083461993086267
dropout_1: 0.30000000000000004
units_2: 432
activation_2: relu
l2_2: 0.0008639199562623075
dropout_2: 0.15000000000000002
units_3: 320
activation_3: tanh
l2_3: 0.0016212150201753844
dropout_3: 0.30000000000000004
units_4: 48
activation_4: relu
l2_4: 0.0002522332696647429
dropout_4: 0.4
beta1: 0.89
beta2: 0.9994999999999999
epsilon: 6e-08
Score: 0.5032702088356018

Trial 06 summary
Hyperparameters:
num_layers: 1
units_0: 272
activation_0: relu
l2_0: 0.007321002854350309
dropout_0: 0.4
initial_learning_rate: 0.0025284101838969467
optimizer: adam
momentum: 0.4
units_1: 288
activation_1: sigmoid
l2_1: 0.0018539278556818578
d

# **Task 3: Bring the Best 3 Hyperparameter Sets, Save Them Separately, Examine Their Values, and Comment on Some Hyperparameter Values.**

# **Task 3 Solution**

In [44]:
best_hps = random_search_tuner.get_best_hyperparameters(num_trials=3)

In [45]:
best_hps_1 = best_hps[0]
best_hps_2 = best_hps[1]
best_hps_3 = best_hps[2]

In [48]:
print(f"First Best Hyperparameters: {best_hps_1.values}")
print(f"Second Best Hyperparameters: {best_hps_2.values}")
print(f"Third Best Hyperparameters: {best_hps_3.values}")

First Best Hyperparameters: {'num_layers': 1, 'units_0': 496, 'activation_0': 'relu', 'l2_0': 0.0008678811964271953, 'dropout_0': 0.35, 'initial_learning_rate': 0.0007443190353728441, 'optimizer': 'sgd', 'momentum': 0.0, 'units_1': 192, 'activation_1': 'relu', 'l2_1': 0.0008083461993086267, 'dropout_1': 0.30000000000000004, 'units_2': 432, 'activation_2': 'relu', 'l2_2': 0.0008639199562623075, 'dropout_2': 0.15000000000000002, 'units_3': 320, 'activation_3': 'tanh', 'l2_3': 0.0016212150201753844, 'dropout_3': 0.30000000000000004, 'units_4': 48, 'activation_4': 'relu', 'l2_4': 0.0002522332696647429, 'dropout_4': 0.4, 'beta1': 0.89, 'beta2': 0.9994999999999999, 'epsilon': 6e-08}
Second Best Hyperparameters: {'num_layers': 1, 'units_0': 272, 'activation_0': 'relu', 'l2_0': 0.007321002854350309, 'dropout_0': 0.4, 'initial_learning_rate': 0.0025284101838969467, 'optimizer': 'adam', 'momentum': 0.4, 'units_1': 288, 'activation_1': 'sigmoid', 'l2_1': 0.0018539278556818578, 'dropout_1': 0.1500

As seen in the results of our searches, the layer numbers came as 1, 1 and 5 respectively. The layer number of our first two best models was determined as 1. **However, another common feature was determined as relu as the activation function.** In our third model, unlike these, the sigmoid function was determined.

**Although we sense a degree of illogicality in determining sigmoid as the activation function in our third model, we do not make a definitive comment. The reason for this is that the sigmoid function compresses the outputs between 0, 0 and 1, 1. This can cause information loss by narrowing the output range too much, especially in a layer that receives a large amount of information, such as the first layer. This means that the model cannot sufficiently enrich the information transferred to the deep layers. The sigmoid function increases the problem of shrinking gradients, especially in large deep networks. If the model has a large enough data set and a very deep structure is used, the gradients can shrink, causing updates to be ineffective.**

If we continue to examine the parameter combinations, the use of the sigmoid function as the activation function in the intermediate layers of the second best combination creates doubt.

# **Task 4: Select the Best 3 Models.**

# **Task 4 Solution**

In [49]:
best_models = random_search_tuner.get_best_models(num_models=3)

  saveable.load_own_variables(weights_store.get(inner_path))
  saveable.load_own_variables(weights_store.get(inner_path))


In [50]:
best_models_1 = best_models[0]
best_models_2 = best_models[1]
best_models_3 = best_models[2]

In [51]:
best_models_1.summary()

In [52]:
best_models_2.summary()

In [53]:
best_models_3.summary()

# **Task 5: Calculate the Model Success of the Top 3 Models Through a Loop**

# **Task 5 Solution**

In [55]:
for i, model in enumerate(best_models):
    loss, acc = model.evaluate(val_ds, verbose=0)
    print(f"Best Model {i+1}, Validation loss: {loss}, Validation Accuracy: {acc}")

Best Model 1, Validation loss: 0.5032702088356018, Validation Accuracy: 0.7727272510528564
Best Model 2, Validation loss: 0.5036947131156921, Validation Accuracy: 0.7662337422370911
Best Model 3, Validation loss: 0.5059388279914856, Validation Accuracy: 0.7727272510528564


# **Task 6: Why is There a Difference Between the Accuracy Values ​​of the Models? We Expect the Top Model to be the Best, If Not, Why Doesn't Whe Top One Have the Highest Accuracy Value?**

# **Task 6 Solution**

Here, the highest accuracy value belongs to the top model. However, we determined the combinations by monitoring the **'val_loss'** metric. Therefore, even if the accuracy value of the top model was lower than the other two models, it would still be the best model.