In [None]:
import numpy as np
from utils import load_model, Visualizer
import numpy as np
from utils import DataHandler, Visualizer
from model import Model
from layers import FullyConnectedLayer, ActivationReLU, ActivationSoftmax
from training_core import SGDOptimizer, CELoss, Trainer
from evaluate import Evaluator
from metrics import Accuracy

# Seed NumPy's global RNG so repeated runs draw the same random numbers.
np.random.seed(0)
# Forward slash instead of '.\data': "\d" is an invalid escape sequence
# (warns on recent Python versions) and a backslash is only a path
# separator on Windows.
data_handler = DataHandler('./data')
X_train, y_train, X_test, y_test = data_handler.load_mnist()
# Carve a validation split out of the training data (val_ratio=0.25).
X, y, X_val, y_val = data_handler.split_validation(
    X_train, y_train, val_ratio=0.25)
X, X_test = data_handler.scale(X, X_test)
# NOTE(review): X_val is scaled in a separate call from X / X_test --
# confirm that scale() applies the same statistics in both calls.
X_val = data_handler.scale(X_val)

## Hyperparameter Search

In [15]:
def create_model(n_neuron_layer=128, learning_rate=1,
                 decay=0.1, moment=0.8, batch_size=128,
                 l2_reg_weight=0, n_epoch=5):
    """Build, train and return a classifier for one hyperparameter setting.

    The network stacks three FullyConnectedLayer instances
    (X.shape[1] -> n_neuron_layer -> n_neuron_layer -> 10), with ReLU after
    the first two and softmax after the last. Training reads the
    notebook-level arrays X, y and validates on X_val, y_val.

    Args:
        n_neuron_layer: Width of both hidden layers.
        learning_rate: First positional argument of SGDOptimizer.
        decay: Second positional argument of SGDOptimizer.
        moment: Third positional argument of SGDOptimizer.
        batch_size: Forwarded to Trainer.train.
        l2_reg_weight: Third positional argument of every
            FullyConnectedLayer (presumably the L2 penalty strength --
            confirm in layers.py).
        n_epoch: Forwarded to Trainer.train as ``epochs``.

    Returns:
        The trained Model after ``model.best_weights`` has been handed to
        ``set_parameters`` (presumably the best weights seen during
        training -- confirm in Trainer).
    """
    # Consecutive entries give the (inputs, outputs) pair of each dense layer.
    widths = (X.shape[1], n_neuron_layer, n_neuron_layer, 10)
    activations = (ActivationReLU, ActivationReLU, ActivationSoftmax)

    network = Model()
    for fan_in, fan_out, activation_cls in zip(widths[:-1], widths[1:], activations):
        # Each dense layer is created and registered before its activation,
        # in the same order as a hand-written sequence of add() calls.
        network.add(FullyConnectedLayer(fan_in, fan_out, l2_reg_weight))
        network.add(activation_cls())

    network.set_items(
        loss=CELoss(),
        optimizer=SGDOptimizer(learning_rate, decay, moment),
        accuracy=Accuracy(),
    )
    network.finalize()

    Trainer(network).train(
        X, y,
        epochs=n_epoch,
        batch_size=batch_size,
        print_every=None,
        val_data=(X_val, y_val),
    )
    # Load the weights stored in best_weights back into the model before returning it.
    network.set_parameters(network.best_weights)

    return network

In [12]:
# Number of neurons in the hidden layer
# Train one model per candidate width; every other hyperparameter keeps the
# default declared in create_model's signature.
n_neuron_layer = [32, 64, 128, 256]
for n in n_neuron_layer:
    print(f"\n----n_neuron_layer={n}----")
    # `model` is rebound on every pass, so only the last trained model
    # is still referenced once the loop ends.
    model = create_model(n_neuron_layer=n)


----n_neuron_layer=32----
Training--acc: 0.8433, loss: 0.3620 (data_loss: 0.3620, reg_loss: 0.0000)
Validation--acc: 0.8359, loss: 0.4574
### Training finished!

----n_neuron_layer=64----
Training--acc: 0.8638, loss: 0.2639 (data_loss: 0.2639, reg_loss: 0.0000)
Validation--acc: 0.8531, loss: 0.3947
### Training finished!

----n_neuron_layer=128----
Training--acc: 0.8613, loss: 0.2914 (data_loss: 0.2914, reg_loss: 0.0000)
Validation--acc: 0.8543, loss: 0.3983
### Training finished!

----n_neuron_layer=256----
Training--acc: 0.8711, loss: 0.2524 (data_loss: 0.2524, reg_loss: 0.0000)
Validation--acc: 0.8633, loss: 0.3763
### Training finished!


The results are very close, although performance improves slightly as the number of hidden-layer neurons increases. To save computing time, we select `n_neuron_layer=128` as the default.

In [16]:
# Mini-batch size sweep: train one model per candidate value; every other
# hyperparameter keeps the default declared in create_model's signature.
batch_size = [32, 64, 128, 256]
for bs in batch_size:
    print(f"\n----batch_size={bs}----")
    # `model` is rebound on every pass, so only the last trained model
    # is still referenced once the loop ends.
    model = create_model(batch_size=bs)


----batch_size=32----
Training--acc: 0.8392, loss: 0.1392 (data_loss: 0.1392, reg_loss: 0.0000)
Validation--acc: 0.8298, loss: 0.4757
### Training finished!

----batch_size=64----
Training--acc: 0.8594, loss: 0.1481 (data_loss: 0.1481, reg_loss: 0.0000)
Validation--acc: 0.8476, loss: 0.4185
### Training finished!

----batch_size=128----
Training--acc: 0.8728, loss: 0.3470 (data_loss: 0.3470, reg_loss: 0.0000)
Validation--acc: 0.8605, loss: 0.3945
### Training finished!

----batch_size=256----
Training--acc: 0.8604, loss: 0.4013 (data_loss: 0.4013, reg_loss: 0.0000)
Validation--acc: 0.8511, loss: 0.4206
### Training finished!


We observe that the performance of the models peaks at `batch_size=128`, and as `batch_size` increases, the training time gets shorter. We therefore select 128 as the default `batch_size` parameter.

In [17]:
# Initial learning-rate sweep: train one model per candidate value; every
# other hyperparameter keeps the default declared in create_model's signature.
learning_rate = [1, 0.1, 0.01]
for lr in learning_rate:
    print(f"\n----learning_rate={lr}----")
    # `model` is rebound on every pass, so only the last trained model
    # is still referenced once the loop ends.
    model = create_model(learning_rate=lr)


----learning_rate=1----
Training--acc: 0.8669, loss: 0.3720 (data_loss: 0.3720, reg_loss: 0.0000)
Validation--acc: 0.8550, loss: 0.4037
### Training finished!

----learning_rate=0.1----
Training--acc: 0.7382, loss: 0.7104 (data_loss: 0.7104, reg_loss: 0.0000)
Validation--acc: 0.7297, loss: 0.7365
### Training finished!

----learning_rate=0.01----
Training--acc: 0.3973, loss: 2.2986 (data_loss: 2.2986, reg_loss: 0.0000)
Validation--acc: 0.3947, loss: 2.2982
### Training finished!


As shown above, a lower initial `learning_rate` leads to poorer performance. This is probably because the learning rate is too low for the model to fit the data within the 5 training epochs used here. We therefore select 1 as the default `learning_rate` parameter.

In [18]:
# Learning rate decay factor
decay = [1, 0.1, 0.01, 0.001]
for d in decay:
    print(f"\n----decay={d}----")
    model = create_model(decay=d)


----decay=1----
Training--acc: 0.7659, loss: 0.6317 (data_loss: 0.6317, reg_loss: 0.0000)
Validation--acc: 0.7580, loss: 0.6637
### Training finished!

----decay=0.1----
Training--acc: 0.8682, loss: 0.3474 (data_loss: 0.3474, reg_loss: 0.0000)
Validation--acc: 0.8561, loss: 0.4074
### Training finished!

----decay=0.01----
Training--acc: 0.7846, loss: 0.6432 (data_loss: 0.6432, reg_loss: 0.0000)
Validation--acc: 0.7640, loss: 0.6848
### Training finished!

----decay=0.001----
Training--acc: 0.1105, loss: 2.3131 (data_loss: 2.3131, reg_loss: 0.0000)
Validation--acc: 0.0987, loss: 2.3042
### Training finished!


We observe that the performance of the models peaks at `decay=0.1`: it is worse at `decay=1`, and as `decay` gets smaller than 0.1 the metrics deteriorate quickly. Thus, we cannot let the learning rate decay too slowly. We therefore select 0.1 as the default `decay` parameter.

In [21]:
# Momentum sweep (0.0 is included as a candidate): train one model per
# value; every other hyperparameter keeps the default declared in
# create_model's signature.
moment = [0.0, 0.7, 0.8, 0.9]
for m in moment:
    print(f"\n----moment={m}----")
    # `model` is rebound on every pass, so only the last trained model
    # is still referenced once the loop ends.
    model = create_model(moment=m)


----moment=0.0----
Training--acc: 0.8159, loss: 0.4779 (data_loss: 0.4779, reg_loss: 0.0000)
Validation--acc: 0.8039, loss: 0.5426
### Training finished!

----moment=0.7----
Training--acc: 0.8652, loss: 0.3521 (data_loss: 0.3521, reg_loss: 0.0000)
Validation--acc: 0.8569, loss: 0.4027
### Training finished!

----moment=0.8----
Training--acc: 0.8709, loss: 0.3646 (data_loss: 0.3646, reg_loss: 0.0000)
Validation--acc: 0.8609, loss: 0.3933
### Training finished!

----moment=0.9----
Training--acc: 0.8449, loss: 0.4187 (data_loss: 0.4187, reg_loss: 0.0000)
Validation--acc: 0.8381, loss: 0.4620
### Training finished!


Momentum indeed improves performance compared with `moment=0.0`. We do not find a clear difference between 0.7 and 0.8, whereas too high a momentum (0.9) hurts performance, possibly because of overshooting. We select 0.8 as the default `moment` parameter.

In [20]:
# l2_reg_weight sweep (0 is included as a candidate): train one model per
# value; every other hyperparameter keeps the default declared in
# create_model's signature.
l2_reg_weight = [0, 0.1, 0.01, 0.001, 0.0001]
for l2 in l2_reg_weight:
    print(f"\n----l2_reg_weight={l2}----")
    # `model` is rebound on every pass, so only the last trained model
    # is still referenced once the loop ends.
    model = create_model(l2_reg_weight=l2)


----l2_reg_weight=0----
Training--acc: 0.8689, loss: 0.3388 (data_loss: 0.3388, reg_loss: 0.0000)
Validation--acc: 0.8539, loss: 0.4009
### Training finished!

----l2_reg_weight=0.1----
Training--acc: 0.0982, loss: 2.3028 (data_loss: 2.3028, reg_loss: 0.0000)
Validation--acc: 0.0969, loss: 2.3028
### Training finished!

----l2_reg_weight=0.01----
Training--acc: 0.8364, loss: 0.7465 (data_loss: 0.4995, reg_loss: 0.2470)
Validation--acc: 0.8305, loss: 0.4957
### Training finished!

----l2_reg_weight=0.001----
Training--acc: 0.8703, loss: 0.4287 (data_loss: 0.3301, reg_loss: 0.0987)
Validation--acc: 0.8579, loss: 0.3966
### Training finished!

----l2_reg_weight=0.0001----
Training--acc: 0.8654, loss: 0.3941 (data_loss: 0.3737, reg_loss: 0.0204)
Validation--acc: 0.8551, loss: 0.4080
### Training finished!


From the results above and in the previous sections, the training and validation losses and accuracies are not far from each other, which indicates that the model is not overfitting. We therefore select `l2_reg_weight=0` as the default parameter.

## Weights Visualization

In [None]:
# NOTE(review): the .pkl extension suggests load_model unpickles this file --
# only load model files from a trusted source.
model = load_model('./result/fashion_mnist_model.pkl')

# Entry 0 of best_weights is indexed as a pair: element 0 goes to the weight
# plot, element 1 (reshaped into a single column) goes to the bias plot.
first_layer_params = model.best_weights[0]
Visualizer.visualize_weights(first_layer_params[0])
Visualizer.visualize_biases(first_layer_params[1].reshape(-1, 1))