In [1]:
import pickle
import numpy as np
import optuna
import optuna.visualization as ov
import pickle

from sgd_L1 import NeuralNetwork

In [2]:
# Load the pickled training set and hold out a fixed-size validation split.
SEED = 42               # fixes the shuffle so the split is identical on every re-run
VALIDATION_SIZE = 5000  # number of shuffled samples reserved for validation

# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load files that you created yourself or otherwise trust.
with open("../data/train_data.pkl", "rb") as train_file:
    train_data = pickle.load(train_file)

# Shuffle in place with a *seeded* generator: an unseeded default_rng() draws
# fresh OS entropy on every kernel restart, so the validation samples (and
# therefore the reported accuracies) would not be comparable between runs.
rng = np.random.default_rng(SEED)
rng.shuffle(train_data)
validation_data = train_data[:VALIDATION_SIZE]
train_data = train_data[VALIDATION_SIZE:]

In [3]:
def objective(trial):
    """Optuna objective: train one network and return its last accuracy value.

    Draws a learning rate and an L1 regularisation strength from the trial
    (both sampled on a log scale), builds a fresh ``NeuralNetwork`` with a
    fixed architecture, trains it on the module-level ``train_data`` while
    passing ``validation_data`` to ``train``, and returns the last entry of
    the accuracy sequence that ``train`` hands back.

    Args:
        trial: the Optuna trial object used to suggest hyperparameters.

    Returns:
        ``accuracy[-1]`` from ``nn.train`` -- presumably the validation
        accuracy after the final epoch (confirm against ``sgd_L1``).
    """
    # Fixed settings shared by every trial.
    layer_sizes = [28 * 28, 1024, 512, 128, 10]
    mini_batch_size = 64
    n_epochs = 5

    # Searched settings; the suggestion order is kept as learning rate first,
    # then L1 lambda, so the sampler sees the parameters in the same sequence.
    lr = trial.suggest_float("learning rate", 1e-5, 1e-3, log=True)
    l1_strength = trial.suggest_float("L1 lambda", 1e-2, 10, log=True)

    network = NeuralNetwork(layer_sizes, lr, l1_strength, mini_batch_size, n_epochs)
    accuracy_history, _ = network.train(train_data, validation_data)
    return accuracy_history[-1]

In [4]:
# Seed the sampler so the sequence of suggested hyperparameters can be
# reproduced; without a seed every run explores different points.
# NOTE(review): full repeatability also requires the objective itself to be
# deterministic (e.g. weight initialisation inside NeuralNetwork) -- confirm.
SAMPLER_SEED = 42
N_TRIALS = 50  # each trial in the recorded run took roughly 40 minutes

study = optuna.create_study(
    direction="maximize",  # the objective returns an accuracy, so higher is better
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

[I 2024-06-24 18:42:14,007] A new study created in memory with name: no-name-c84d6ffe-ff8f-47f0-847b-d6b070263e0d


Pre-train stats
==> Accuracy: 11.0%, Avg loss: 7.717746

Epoch 0
--------------------
loss: 8.018305 [mini-batch 0 / 859]
loss: 1.703927 [mini-batch 100 / 859]
loss: 1.583194 [mini-batch 200 / 859]
loss: 1.212510 [mini-batch 300 / 859]
loss: 0.963275 [mini-batch 400 / 859]
loss: 1.018145 [mini-batch 500 / 859]
loss: 1.222212 [mini-batch 600 / 859]
loss: 0.859638 [mini-batch 700 / 859]
loss: 1.161640 [mini-batch 800 / 859]
==> Accuracy: 69.3%, Avg loss: 0.864247

Epoch 1
--------------------
loss: 0.811941 [mini-batch 0 / 859]
loss: 0.985837 [mini-batch 100 / 859]
loss: 0.732697 [mini-batch 200 / 859]
loss: 0.897589 [mini-batch 300 / 859]
loss: 0.718318 [mini-batch 400 / 859]
loss: 0.689201 [mini-batch 500 / 859]
loss: 0.584526 [mini-batch 600 / 859]
loss: 0.642675 [mini-batch 700 / 859]
loss: 0.763323 [mini-batch 800 / 859]
==> Accuracy: 74.6%, Avg loss: 0.722654

Epoch 2
--------------------
loss: 0.599678 [mini-batch 0 / 859]
loss: 0.918653 [mini-batch 100 / 859]
loss: 0.647312 [mini

[I 2024-06-24 19:21:52,649] Trial 0 finished with value: 0.7972 and parameters: {'learning rate': 2.169936253375492e-05, 'L1 lambda': 0.8323553239145367}. Best is trial 0 with value: 0.7972.


==> Accuracy: 79.7%, Avg loss: 0.587074

Pre-train stats
==> Accuracy: 3.4%, Avg loss: 6.011395

Epoch 0
--------------------
loss: 5.944395 [mini-batch 0 / 859]
loss: 1.444607 [mini-batch 100 / 859]
loss: 1.144224 [mini-batch 200 / 859]
loss: 0.972792 [mini-batch 300 / 859]
loss: 1.035123 [mini-batch 400 / 859]
loss: 0.798382 [mini-batch 500 / 859]
loss: 0.633287 [mini-batch 600 / 859]
loss: 0.621093 [mini-batch 700 / 859]
loss: 0.672375 [mini-batch 800 / 859]
==> Accuracy: 74.7%, Avg loss: 0.731129

Epoch 1
--------------------
loss: 0.793525 [mini-batch 0 / 859]
loss: 0.730231 [mini-batch 100 / 859]
loss: 0.702761 [mini-batch 200 / 859]
loss: 0.598608 [mini-batch 300 / 859]
loss: 0.964593 [mini-batch 400 / 859]
loss: 0.618002 [mini-batch 500 / 859]
loss: 0.743557 [mini-batch 600 / 859]
loss: 0.394572 [mini-batch 700 / 859]
loss: 0.714718 [mini-batch 800 / 859]
==> Accuracy: 78.5%, Avg loss: 0.619709

Epoch 2
--------------------
loss: 0.727972 [mini-batch 0 / 859]
loss: 0.805240 [mi

[I 2024-06-24 20:03:38,839] Trial 1 finished with value: 0.8228 and parameters: {'learning rate': 4.398328301496164e-05, 'L1 lambda': 0.028870474974720013}. Best is trial 1 with value: 0.8228.


==> Accuracy: 82.3%, Avg loss: 0.522688

Pre-train stats
==> Accuracy: 14.6%, Avg loss: 7.501366

Epoch 0
--------------------
loss: 7.870569 [mini-batch 0 / 859]
loss: 1.043361 [mini-batch 100 / 859]
loss: 0.839007 [mini-batch 200 / 859]
loss: 0.807767 [mini-batch 300 / 859]
loss: 0.623967 [mini-batch 400 / 859]
loss: 0.860438 [mini-batch 500 / 859]
loss: 0.850830 [mini-batch 600 / 859]
loss: 0.644695 [mini-batch 700 / 859]
loss: 0.786097 [mini-batch 800 / 859]
==> Accuracy: 81.9%, Avg loss: 0.519546

Epoch 1
--------------------
loss: 0.536973 [mini-batch 0 / 859]
loss: 0.501551 [mini-batch 100 / 859]
loss: 0.497461 [mini-batch 200 / 859]
loss: 0.673879 [mini-batch 300 / 859]
loss: 0.517692 [mini-batch 400 / 859]
loss: 0.480821 [mini-batch 500 / 859]
loss: 0.589503 [mini-batch 600 / 859]
loss: 0.497396 [mini-batch 700 / 859]
loss: 0.688643 [mini-batch 800 / 859]
==> Accuracy: 82.7%, Avg loss: 0.499637

Epoch 2
--------------------
loss: 0.585830 [mini-batch 0 / 859]
loss: 0.820406 [m

[I 2024-06-24 20:46:02,712] Trial 2 finished with value: 0.8392 and parameters: {'learning rate': 0.0002472578287788824, 'L1 lambda': 0.6665327794552703}. Best is trial 2 with value: 0.8392.


==> Accuracy: 83.9%, Avg loss: 0.505482

Pre-train stats
==> Accuracy: 12.9%, Avg loss: 8.358166

Epoch 0
--------------------
loss: 8.655595 [mini-batch 0 / 859]
loss: 2.004128 [mini-batch 100 / 859]
loss: 1.463768 [mini-batch 200 / 859]
loss: 1.348706 [mini-batch 300 / 859]
loss: 1.259148 [mini-batch 400 / 859]
loss: 1.181559 [mini-batch 500 / 859]
loss: 0.945596 [mini-batch 600 / 859]
loss: 0.939226 [mini-batch 700 / 859]
loss: 0.873238 [mini-batch 800 / 859]
==> Accuracy: 67.6%, Avg loss: 0.947034

Epoch 1
--------------------
loss: 1.099473 [mini-batch 0 / 859]
loss: 0.888866 [mini-batch 100 / 859]
loss: 0.922066 [mini-batch 200 / 859]
loss: 0.968547 [mini-batch 300 / 859]
loss: 0.906857 [mini-batch 400 / 859]
loss: 0.848454 [mini-batch 500 / 859]
loss: 0.783146 [mini-batch 600 / 859]
loss: 1.151051 [mini-batch 700 / 859]
loss: 0.650549 [mini-batch 800 / 859]
==> Accuracy: 73.5%, Avg loss: 0.774678

Epoch 2
--------------------
loss: 0.735514 [mini-batch 0 / 859]
loss: 1.017541 [m

[I 2024-06-24 21:28:22,339] Trial 3 finished with value: 0.7876 and parameters: {'learning rate': 1.4753369965434447e-05, 'L1 lambda': 0.10303108665538029}. Best is trial 2 with value: 0.8392.


==> Accuracy: 78.8%, Avg loss: 0.622142

Pre-train stats
==> Accuracy: 13.6%, Avg loss: 4.651895

Epoch 0
--------------------
loss: 4.349344 [mini-batch 0 / 859]
loss: 0.982543 [mini-batch 100 / 859]
loss: 0.712921 [mini-batch 200 / 859]
loss: 0.650872 [mini-batch 300 / 859]
loss: 0.484414 [mini-batch 400 / 859]
loss: 0.683044 [mini-batch 500 / 859]
loss: 0.636379 [mini-batch 600 / 859]
loss: 0.752425 [mini-batch 700 / 859]
loss: 0.658382 [mini-batch 800 / 859]
==> Accuracy: 79.5%, Avg loss: 0.570887

Epoch 1
--------------------
loss: 0.433576 [mini-batch 0 / 859]
loss: 0.699359 [mini-batch 100 / 859]
loss: 0.737899 [mini-batch 200 / 859]
loss: 0.578256 [mini-batch 300 / 859]
loss: 0.506058 [mini-batch 400 / 859]
loss: 0.532165 [mini-batch 500 / 859]
loss: 0.846103 [mini-batch 600 / 859]
loss: 0.569907 [mini-batch 700 / 859]
loss: 0.501317 [mini-batch 800 / 859]
==> Accuracy: 81.4%, Avg loss: 0.517622

Epoch 2
--------------------
loss: 0.508893 [mini-batch 0 / 859]
loss: 0.422552 [m

[I 2024-06-24 22:10:25,869] Trial 4 finished with value: 0.8426 and parameters: {'learning rate': 0.00013973478878593913, 'L1 lambda': 0.25492135284062234}. Best is trial 4 with value: 0.8426.


==> Accuracy: 84.3%, Avg loss: 0.449771

Pre-train stats
==> Accuracy: 12.4%, Avg loss: 6.541521

Epoch 0
--------------------
loss: 7.423812 [mini-batch 0 / 859]
loss: 0.798792 [mini-batch 100 / 859]
loss: 0.720886 [mini-batch 200 / 859]
loss: 0.724900 [mini-batch 300 / 859]
loss: 0.672673 [mini-batch 400 / 859]
loss: 0.456234 [mini-batch 500 / 859]
loss: 0.721108 [mini-batch 600 / 859]
loss: 0.877620 [mini-batch 700 / 859]
loss: 0.477498 [mini-batch 800 / 859]
==> Accuracy: 80.7%, Avg loss: 0.553881

Epoch 1
--------------------
loss: 0.589836 [mini-batch 0 / 859]
loss: 0.634770 [mini-batch 100 / 859]
loss: 0.433098 [mini-batch 200 / 859]
loss: 0.407730 [mini-batch 300 / 859]
loss: 0.489887 [mini-batch 400 / 859]
loss: 0.585025 [mini-batch 500 / 859]
loss: 0.506029 [mini-batch 600 / 859]
loss: 0.691576 [mini-batch 700 / 859]
loss: 0.398282 [mini-batch 800 / 859]
==> Accuracy: 82.7%, Avg loss: 0.527495

Epoch 2
--------------------
loss: 0.534372 [mini-batch 0 / 859]
loss: 0.559183 [m

[I 2024-06-24 22:53:35,395] Trial 5 finished with value: 0.8182 and parameters: {'learning rate': 0.00019915716003017655, 'L1 lambda': 1.2377114322052611}. Best is trial 4 with value: 0.8426.


==> Accuracy: 81.8%, Avg loss: 0.671031

Pre-train stats
==> Accuracy: 12.1%, Avg loss: 6.448344

Epoch 0
--------------------
loss: 7.142630 [mini-batch 0 / 859]
loss: 0.778788 [mini-batch 100 / 859]
loss: 0.836244 [mini-batch 200 / 859]
loss: 0.783984 [mini-batch 300 / 859]
loss: 0.711410 [mini-batch 400 / 859]
loss: 0.699617 [mini-batch 500 / 859]
loss: 0.562819 [mini-batch 600 / 859]
loss: 0.480427 [mini-batch 700 / 859]
loss: 0.713441 [mini-batch 800 / 859]
==> Accuracy: 81.1%, Avg loss: 0.583243

Epoch 1
--------------------
loss: 0.446075 [mini-batch 0 / 859]
loss: 0.513760 [mini-batch 100 / 859]
loss: 0.520680 [mini-batch 200 / 859]
loss: 0.635940 [mini-batch 300 / 859]
loss: 0.634454 [mini-batch 400 / 859]
loss: 0.579907 [mini-batch 500 / 859]
loss: 0.612484 [mini-batch 600 / 859]
loss: 0.690436 [mini-batch 700 / 859]
loss: 0.701674 [mini-batch 800 / 859]
==> Accuracy: 79.8%, Avg loss: 0.695181

Epoch 2
--------------------
loss: 0.566945 [mini-batch 0 / 859]
loss: 0.763217 [m

[I 2024-06-24 23:36:11,944] Trial 6 finished with value: 0.6088 and parameters: {'learning rate': 0.00037746716853772185, 'L1 lambda': 1.5133102667977998}. Best is trial 4 with value: 0.8426.


==> Accuracy: 60.9%, Avg loss: 1.102869

Pre-train stats
==> Accuracy: 6.8%, Avg loss: 5.802244

Epoch 0
--------------------
loss: 5.388658 [mini-batch 0 / 859]
loss: 1.876371 [mini-batch 100 / 859]
loss: 1.244717 [mini-batch 200 / 859]
loss: 1.397596 [mini-batch 300 / 859]
loss: 0.906431 [mini-batch 400 / 859]
loss: 0.920483 [mini-batch 500 / 859]
loss: 1.290118 [mini-batch 600 / 859]
loss: 1.053688 [mini-batch 700 / 859]
loss: 0.828831 [mini-batch 800 / 859]
==> Accuracy: 69.4%, Avg loss: 0.879846

Epoch 1
--------------------
loss: 0.846465 [mini-batch 0 / 859]
loss: 0.564548 [mini-batch 100 / 859]
loss: 0.961762 [mini-batch 200 / 859]
loss: 0.696438 [mini-batch 300 / 859]
loss: 0.913816 [mini-batch 400 / 859]
loss: 0.607060 [mini-batch 500 / 859]
loss: 0.864323 [mini-batch 600 / 859]
loss: 0.720743 [mini-batch 700 / 859]
loss: 0.839491 [mini-batch 800 / 859]
==> Accuracy: 74.6%, Avg loss: 0.728954

Epoch 2
--------------------
loss: 1.039810 [mini-batch 0 / 859]
loss: 0.912325 [mi

[I 2024-06-25 00:18:13,359] Trial 7 finished with value: 0.7934 and parameters: {'learning rate': 1.9099947977411154e-05, 'L1 lambda': 0.3812019940386868}. Best is trial 4 with value: 0.8426.


==> Accuracy: 79.3%, Avg loss: 0.597323

Pre-train stats
==> Accuracy: 9.7%, Avg loss: 5.822123

Epoch 0
--------------------
loss: 5.606212 [mini-batch 0 / 859]
loss: 0.723467 [mini-batch 100 / 859]
loss: 0.844879 [mini-batch 200 / 859]
loss: 0.954166 [mini-batch 300 / 859]
loss: 0.783204 [mini-batch 400 / 859]
loss: 0.674075 [mini-batch 500 / 859]
loss: 0.963722 [mini-batch 600 / 859]
loss: 0.832579 [mini-batch 700 / 859]
loss: 0.655845 [mini-batch 800 / 859]
==> Accuracy: 78.1%, Avg loss: 0.804867

Epoch 1
--------------------
loss: 0.894351 [mini-batch 0 / 859]
loss: 0.732812 [mini-batch 100 / 859]
loss: 0.912099 [mini-batch 200 / 859]
loss: 1.064511 [mini-batch 300 / 859]
loss: 1.045432 [mini-batch 400 / 859]
loss: 1.066827 [mini-batch 500 / 859]
loss: 1.216109 [mini-batch 600 / 859]
loss: 1.272562 [mini-batch 700 / 859]
loss: 1.416163 [mini-batch 800 / 859]
==> Accuracy: 71.5%, Avg loss: 1.434998

Epoch 2
--------------------
loss: 1.441353 [mini-batch 0 / 859]
loss: 1.542424 [mi

[I 2024-06-25 01:01:09,447] Trial 8 finished with value: 0.0984 and parameters: {'learning rate': 0.00012529007057834556, 'L1 lambda': 5.9260409097524125}. Best is trial 4 with value: 0.8426.


==> Accuracy: 9.8%, Avg loss: 2.303238

Pre-train stats
==> Accuracy: 12.1%, Avg loss: 7.023937

Epoch 0
--------------------
loss: 7.080931 [mini-batch 0 / 859]
loss: 0.723481 [mini-batch 100 / 859]
loss: 0.597779 [mini-batch 200 / 859]
loss: 0.687825 [mini-batch 300 / 859]
loss: 0.492005 [mini-batch 400 / 859]
loss: 0.556992 [mini-batch 500 / 859]
loss: 0.811914 [mini-batch 600 / 859]
loss: 0.815937 [mini-batch 700 / 859]
loss: 0.969852 [mini-batch 800 / 859]
==> Accuracy: 75.9%, Avg loss: 0.964940

Epoch 1
--------------------
loss: 1.023638 [mini-batch 0 / 859]
loss: 1.086373 [mini-batch 100 / 859]
loss: 1.261052 [mini-batch 200 / 859]
loss: 1.382146 [mini-batch 300 / 859]
loss: 1.461103 [mini-batch 400 / 859]
loss: 1.439759 [mini-batch 500 / 859]
loss: 1.274657 [mini-batch 600 / 859]
loss: 1.300609 [mini-batch 700 / 859]
loss: 1.266839 [mini-batch 800 / 859]
==> Accuracy: 49.9%, Avg loss: 1.283804

Epoch 2
--------------------
loss: 1.248368 [mini-batch 0 / 859]
loss: 1.258861 [mi

[I 2024-06-25 01:43:21,637] Trial 9 finished with value: 0.5958 and parameters: {'learning rate': 0.0006054129618896587, 'L1 lambda': 2.4017490037780522}. Best is trial 4 with value: 0.8426.


==> Accuracy: 59.6%, Avg loss: 1.010545

Pre-train stats
==> Accuracy: 7.6%, Avg loss: 6.063529

Epoch 0
--------------------
loss: 7.181239 [mini-batch 0 / 859]
loss: 1.164417 [mini-batch 100 / 859]
loss: 0.977667 [mini-batch 200 / 859]
loss: 0.875998 [mini-batch 300 / 859]
loss: 0.919110 [mini-batch 400 / 859]
loss: 0.960565 [mini-batch 500 / 859]
loss: 0.897800 [mini-batch 600 / 859]
loss: 0.949067 [mini-batch 700 / 859]
loss: 0.362451 [mini-batch 800 / 859]
==> Accuracy: 74.7%, Avg loss: 0.717363

Epoch 1
--------------------
loss: 0.915156 [mini-batch 0 / 859]
loss: 0.890102 [mini-batch 100 / 859]
loss: 1.009066 [mini-batch 200 / 859]
loss: 0.701255 [mini-batch 300 / 859]
loss: 0.647112 [mini-batch 400 / 859]
loss: 0.871211 [mini-batch 500 / 859]
loss: 0.567237 [mini-batch 600 / 859]
loss: 0.552654 [mini-batch 700 / 859]
loss: 0.667232 [mini-batch 800 / 859]
==> Accuracy: 77.8%, Avg loss: 0.640425

Epoch 2
--------------------
loss: 0.483261 [mini-batch 0 / 859]
loss: 0.933667 [mi

[I 2024-06-25 02:24:41,041] Trial 10 finished with value: 0.8194 and parameters: {'learning rate': 6.091255820572607e-05, 'L1 lambda': 0.102729634118518}. Best is trial 4 with value: 0.8426.


==> Accuracy: 81.9%, Avg loss: 0.516534

Pre-train stats
==> Accuracy: 4.6%, Avg loss: 10.180588

Epoch 0
--------------------
loss: 9.139818 [mini-batch 0 / 859]
loss: 0.654197 [mini-batch 100 / 859]
loss: 0.977556 [mini-batch 200 / 859]
loss: 0.724801 [mini-batch 300 / 859]
loss: 0.582684 [mini-batch 400 / 859]
loss: 0.952978 [mini-batch 500 / 859]
loss: 0.508315 [mini-batch 600 / 859]
loss: 0.854136 [mini-batch 700 / 859]
loss: 0.478740 [mini-batch 800 / 859]
==> Accuracy: 81.9%, Avg loss: 0.516894

Epoch 1
--------------------
loss: 0.575393 [mini-batch 0 / 859]
loss: 0.531817 [mini-batch 100 / 859]
loss: 0.543170 [mini-batch 200 / 859]
loss: 0.507036 [mini-batch 300 / 859]
loss: 0.625673 [mini-batch 400 / 859]
loss: 0.612005 [mini-batch 500 / 859]
loss: 0.592355 [mini-batch 600 / 859]
loss: 0.501030 [mini-batch 700 / 859]
loss: 0.391927 [mini-batch 800 / 859]
==> Accuracy: 82.6%, Avg loss: 0.478085

Epoch 2
--------------------
loss: 0.446279 [mini-batch 0 / 859]
loss: 0.635444 [m

[I 2024-06-25 03:07:05,332] Trial 11 finished with value: 0.8554 and parameters: {'learning rate': 0.0003215574222167551, 'L1 lambda': 0.18276577643156702}. Best is trial 11 with value: 0.8554.


==> Accuracy: 85.5%, Avg loss: 0.424196

Pre-train stats
==> Accuracy: 13.1%, Avg loss: 5.659675

Epoch 0
--------------------
loss: 4.909663 [mini-batch 0 / 859]
loss: 0.711710 [mini-batch 100 / 859]
loss: 0.632482 [mini-batch 200 / 859]
loss: 0.584025 [mini-batch 300 / 859]
loss: 0.426729 [mini-batch 400 / 859]
loss: 0.459653 [mini-batch 500 / 859]
loss: 0.484287 [mini-batch 600 / 859]
loss: 0.551681 [mini-batch 700 / 859]
loss: 0.530193 [mini-batch 800 / 859]
==> Accuracy: 80.6%, Avg loss: 0.517391

Epoch 1
--------------------
loss: 0.427132 [mini-batch 0 / 859]
loss: 0.228891 [mini-batch 100 / 859]
loss: 0.590509 [mini-batch 200 / 859]
loss: 0.389385 [mini-batch 300 / 859]
loss: 0.592394 [mini-batch 400 / 859]
loss: 0.347245 [mini-batch 500 / 859]
loss: 0.666618 [mini-batch 600 / 859]
loss: 0.551596 [mini-batch 700 / 859]
loss: 0.457193 [mini-batch 800 / 859]
==> Accuracy: 85.0%, Avg loss: 0.425587

Epoch 2
--------------------
loss: 0.586343 [mini-batch 0 / 859]
loss: 0.264893 [m

[I 2024-06-25 03:49:06,849] Trial 12 finished with value: 0.854 and parameters: {'learning rate': 0.0009864991070211169, 'L1 lambda': 0.106323615247091}. Best is trial 11 with value: 0.8554.


==> Accuracy: 85.4%, Avg loss: 0.400962

Pre-train stats
==> Accuracy: 10.8%, Avg loss: 5.884810

Epoch 0
--------------------
loss: 5.561765 [mini-batch 0 / 859]
loss: 0.607724 [mini-batch 100 / 859]
loss: 0.461415 [mini-batch 200 / 859]
loss: 0.794887 [mini-batch 300 / 859]
loss: 0.638861 [mini-batch 400 / 859]
loss: 0.649578 [mini-batch 500 / 859]
loss: 0.571799 [mini-batch 600 / 859]
loss: 0.461437 [mini-batch 700 / 859]
loss: 0.430848 [mini-batch 800 / 859]
==> Accuracy: 83.2%, Avg loss: 0.476609

Epoch 1
--------------------
loss: 0.566768 [mini-batch 0 / 859]
loss: 0.365246 [mini-batch 100 / 859]
loss: 0.593184 [mini-batch 200 / 859]
loss: 0.347125 [mini-batch 300 / 859]
loss: 0.457390 [mini-batch 400 / 859]
loss: 0.387866 [mini-batch 500 / 859]
loss: 0.227826 [mini-batch 600 / 859]
loss: 0.430371 [mini-batch 700 / 859]
loss: 0.458141 [mini-batch 800 / 859]
==> Accuracy: 84.9%, Avg loss: 0.429597

Epoch 2
--------------------
loss: 0.422655 [mini-batch 0 / 859]
loss: 0.504103 [m

[I 2024-06-25 04:30:58,045] Trial 13 finished with value: 0.8614 and parameters: {'learning rate': 0.0007955636901567097, 'L1 lambda': 0.010021898677856474}. Best is trial 13 with value: 0.8614.


==> Accuracy: 86.1%, Avg loss: 0.396283

Pre-train stats
==> Accuracy: 5.7%, Avg loss: 7.837224

Epoch 0
--------------------
loss: 9.246407 [mini-batch 0 / 859]
loss: 1.031909 [mini-batch 100 / 859]
loss: 0.876982 [mini-batch 200 / 859]
loss: 0.828088 [mini-batch 300 / 859]
loss: 0.596052 [mini-batch 400 / 859]
loss: 0.693654 [mini-batch 500 / 859]
loss: 0.637185 [mini-batch 600 / 859]
loss: 0.451709 [mini-batch 700 / 859]
loss: 0.577303 [mini-batch 800 / 859]
==> Accuracy: 83.5%, Avg loss: 0.480027

Epoch 1
--------------------
loss: 0.436736 [mini-batch 0 / 859]
loss: 0.549081 [mini-batch 100 / 859]
loss: 0.540951 [mini-batch 200 / 859]
loss: 0.367983 [mini-batch 300 / 859]
loss: 0.485926 [mini-batch 400 / 859]
loss: 0.368304 [mini-batch 500 / 859]
loss: 0.401474 [mini-batch 600 / 859]
loss: 0.436179 [mini-batch 700 / 859]
loss: 0.529072 [mini-batch 800 / 859]
==> Accuracy: 84.8%, Avg loss: 0.436859

Epoch 2
--------------------
loss: 0.289070 [mini-batch 0 / 859]
loss: 0.488339 [mi

[I 2024-06-25 05:13:36,925] Trial 14 finished with value: 0.8568 and parameters: {'learning rate': 0.0004883338500567561, 'L1 lambda': 0.010560939489270891}. Best is trial 13 with value: 0.8614.


==> Accuracy: 85.7%, Avg loss: 0.403761

Pre-train stats
==> Accuracy: 5.7%, Avg loss: 7.135423

Epoch 0
--------------------
loss: 7.560624 [mini-batch 0 / 859]
loss: 0.809528 [mini-batch 100 / 859]
loss: 0.576846 [mini-batch 200 / 859]
loss: 0.603767 [mini-batch 300 / 859]
loss: 0.378011 [mini-batch 400 / 859]
loss: 0.544955 [mini-batch 500 / 859]
loss: 0.475658 [mini-batch 600 / 859]
loss: 0.304378 [mini-batch 700 / 859]
loss: 0.449879 [mini-batch 800 / 859]
==> Accuracy: 83.1%, Avg loss: 0.472605

Epoch 1
--------------------
loss: 0.425025 [mini-batch 0 / 859]
loss: 0.542495 [mini-batch 100 / 859]
loss: 0.344247 [mini-batch 200 / 859]
loss: 0.313192 [mini-batch 300 / 859]
loss: 0.356228 [mini-batch 400 / 859]
loss: 0.335334 [mini-batch 500 / 859]
loss: 0.370790 [mini-batch 600 / 859]
loss: 0.559415 [mini-batch 700 / 859]
loss: 0.564212 [mini-batch 800 / 859]
==> Accuracy: 84.3%, Avg loss: 0.432540

Epoch 2
--------------------
loss: 0.295751 [mini-batch 0 / 859]
loss: 0.389867 [mi

[I 2024-06-25 05:55:07,254] Trial 15 finished with value: 0.8372 and parameters: {'learning rate': 0.0009577358755908258, 'L1 lambda': 0.011445838974255135}. Best is trial 13 with value: 0.8614.


==> Accuracy: 83.7%, Avg loss: 0.438749

Pre-train stats
==> Accuracy: 11.4%, Avg loss: 7.785334

Epoch 0
--------------------
loss: 7.844865 [mini-batch 0 / 859]
loss: 0.597304 [mini-batch 100 / 859]
loss: 0.463335 [mini-batch 200 / 859]
loss: 0.610277 [mini-batch 300 / 859]
loss: 0.497698 [mini-batch 400 / 859]
loss: 0.468068 [mini-batch 500 / 859]
loss: 0.540269 [mini-batch 600 / 859]
loss: 0.414265 [mini-batch 700 / 859]
loss: 0.363691 [mini-batch 800 / 859]
==> Accuracy: 82.4%, Avg loss: 0.506668

Epoch 1
--------------------
loss: 0.337850 [mini-batch 0 / 859]
loss: 0.382137 [mini-batch 100 / 859]
loss: 0.353299 [mini-batch 200 / 859]
loss: 0.302129 [mini-batch 300 / 859]
loss: 0.321095 [mini-batch 400 / 859]
loss: 0.350428 [mini-batch 500 / 859]
loss: 0.316650 [mini-batch 600 / 859]
loss: 0.409036 [mini-batch 700 / 859]
loss: 0.767042 [mini-batch 800 / 859]
==> Accuracy: 79.3%, Avg loss: 0.576772

Epoch 2
--------------------
loss: 0.917564 [mini-batch 0 / 859]
loss: 0.435996 [m

[I 2024-06-25 06:36:18,935] Trial 16 finished with value: 0.8506 and parameters: {'learning rate': 0.0005112913433202883, 'L1 lambda': 0.011243497283883158}. Best is trial 13 with value: 0.8614.


==> Accuracy: 85.1%, Avg loss: 0.412777

Pre-train stats
==> Accuracy: 10.2%, Avg loss: 8.245626

Epoch 0
--------------------
loss: 8.899866 [mini-batch 0 / 859]
loss: 0.732414 [mini-batch 100 / 859]
loss: 0.433427 [mini-batch 200 / 859]
loss: 0.595703 [mini-batch 300 / 859]
loss: 0.539365 [mini-batch 400 / 859]
loss: 0.619575 [mini-batch 500 / 859]
loss: 0.522763 [mini-batch 600 / 859]
loss: 0.800606 [mini-batch 700 / 859]
loss: 0.372603 [mini-batch 800 / 859]
==> Accuracy: 82.7%, Avg loss: 0.488949

Epoch 1
--------------------
loss: 0.514041 [mini-batch 0 / 859]
loss: 0.485837 [mini-batch 100 / 859]
loss: 0.403733 [mini-batch 200 / 859]
loss: 0.416243 [mini-batch 300 / 859]
loss: 0.630200 [mini-batch 400 / 859]
loss: 0.410541 [mini-batch 500 / 859]
loss: 0.410976 [mini-batch 600 / 859]
loss: 0.493268 [mini-batch 700 / 859]
loss: 0.438542 [mini-batch 800 / 859]
==> Accuracy: 84.2%, Avg loss: 0.444766

Epoch 2
--------------------
loss: 0.434723 [mini-batch 0 / 859]
loss: 0.256217 [m

[I 2024-06-25 07:17:22,289] Trial 17 finished with value: 0.8614 and parameters: {'learning rate': 0.0005682110131058529, 'L1 lambda': 0.03280029715390454}. Best is trial 13 with value: 0.8614.


==> Accuracy: 86.1%, Avg loss: 0.395244

Pre-train stats
==> Accuracy: 12.3%, Avg loss: 6.059423

Epoch 0
--------------------
loss: 5.997901 [mini-batch 0 / 859]
loss: 0.746360 [mini-batch 100 / 859]
loss: 0.547201 [mini-batch 200 / 859]
loss: 0.724649 [mini-batch 300 / 859]
loss: 0.578724 [mini-batch 400 / 859]
loss: 0.520932 [mini-batch 500 / 859]
loss: 0.459739 [mini-batch 600 / 859]
loss: 0.601075 [mini-batch 700 / 859]
loss: 0.493387 [mini-batch 800 / 859]
==> Accuracy: 79.6%, Avg loss: 0.549453

Epoch 1
--------------------
loss: 0.410730 [mini-batch 0 / 859]
loss: 0.492022 [mini-batch 100 / 859]
loss: 0.411374 [mini-batch 200 / 859]
loss: 0.368989 [mini-batch 300 / 859]
loss: 0.420517 [mini-batch 400 / 859]
loss: 0.489375 [mini-batch 500 / 859]
loss: 0.438578 [mini-batch 600 / 859]
loss: 0.401202 [mini-batch 700 / 859]
loss: 0.439688 [mini-batch 800 / 859]
==> Accuracy: 83.8%, Avg loss: 0.462382

Epoch 2
--------------------
loss: 0.574921 [mini-batch 0 / 859]
loss: 0.407441 [m

[I 2024-06-25 07:58:50,437] Trial 18 finished with value: 0.851 and parameters: {'learning rate': 0.0006647928087792768, 'L1 lambda': 0.02989893171437926}. Best is trial 13 with value: 0.8614.


==> Accuracy: 85.1%, Avg loss: 0.400170

Pre-train stats
==> Accuracy: 8.8%, Avg loss: 10.737887

Epoch 0
--------------------
loss: 11.012549 [mini-batch 0 / 859]
loss: 1.323095 [mini-batch 100 / 859]
loss: 0.756378 [mini-batch 200 / 859]
loss: 0.767382 [mini-batch 300 / 859]
loss: 0.654166 [mini-batch 400 / 859]
loss: 0.762742 [mini-batch 500 / 859]
loss: 0.911947 [mini-batch 600 / 859]
loss: 0.444421 [mini-batch 700 / 859]
loss: 0.648062 [mini-batch 800 / 859]
==> Accuracy: 77.8%, Avg loss: 0.643568

Epoch 1
--------------------
loss: 0.665605 [mini-batch 0 / 859]
loss: 0.618925 [mini-batch 100 / 859]
loss: 0.464664 [mini-batch 200 / 859]
loss: 0.430840 [mini-batch 300 / 859]
loss: 0.548159 [mini-batch 400 / 859]
loss: 0.427944 [mini-batch 500 / 859]
loss: 0.558204 [mini-batch 600 / 859]
loss: 0.605002 [mini-batch 700 / 859]
loss: 0.635926 [mini-batch 800 / 859]
==> Accuracy: 79.1%, Avg loss: 0.579705

Epoch 2
--------------------
loss: 0.538439 [mini-batch 0 / 859]
loss: 0.688029 [

[I 2024-06-25 08:40:58,056] Trial 19 finished with value: 0.8288 and parameters: {'learning rate': 7.31826339269652e-05, 'L1 lambda': 0.033523375323614185}. Best is trial 13 with value: 0.8614.


==> Accuracy: 82.9%, Avg loss: 0.499429

Pre-train stats
==> Accuracy: 13.7%, Avg loss: 5.158815

Epoch 0
--------------------
loss: 4.827010 [mini-batch 0 / 859]
loss: 0.702271 [mini-batch 100 / 859]
loss: 1.042210 [mini-batch 200 / 859]
loss: 0.870425 [mini-batch 300 / 859]
loss: 0.617414 [mini-batch 400 / 859]
loss: 0.663946 [mini-batch 500 / 859]
loss: 0.474360 [mini-batch 600 / 859]
loss: 0.851314 [mini-batch 700 / 859]
loss: 0.623632 [mini-batch 800 / 859]
==> Accuracy: 81.0%, Avg loss: 0.549537

Epoch 1
--------------------
loss: 0.400558 [mini-batch 0 / 859]
loss: 0.369093 [mini-batch 100 / 859]
loss: 0.547290 [mini-batch 200 / 859]
loss: 0.684507 [mini-batch 300 / 859]
loss: 0.501623 [mini-batch 400 / 859]
loss: 0.509881 [mini-batch 500 / 859]
loss: 0.402682 [mini-batch 600 / 859]
loss: 0.562854 [mini-batch 700 / 859]
loss: 0.491692 [mini-batch 800 / 859]
==> Accuracy: 82.4%, Avg loss: 0.504460

Epoch 2
--------------------
loss: 0.540837 [mini-batch 0 / 859]
loss: 0.621843 [m

[I 2024-06-25 09:22:37,494] Trial 20 finished with value: 0.8488 and parameters: {'learning rate': 0.00019319183446876565, 'L1 lambda': 0.04814086670653351}. Best is trial 13 with value: 0.8614.


==> Accuracy: 84.9%, Avg loss: 0.427604

Pre-train stats
==> Accuracy: 10.7%, Avg loss: 7.236620

Epoch 0
--------------------
loss: 8.266877 [mini-batch 0 / 859]
loss: 0.729457 [mini-batch 100 / 859]
loss: 0.826829 [mini-batch 200 / 859]
loss: 0.786932 [mini-batch 300 / 859]
loss: 0.527818 [mini-batch 400 / 859]
loss: 0.640558 [mini-batch 500 / 859]
loss: 0.488798 [mini-batch 600 / 859]
loss: 0.681294 [mini-batch 700 / 859]
loss: 0.424786 [mini-batch 800 / 859]
==> Accuracy: 82.6%, Avg loss: 0.497885

Epoch 1
--------------------
loss: 0.476494 [mini-batch 0 / 859]
loss: 0.333769 [mini-batch 100 / 859]
loss: 0.637873 [mini-batch 200 / 859]
loss: 0.480767 [mini-batch 300 / 859]
loss: 0.478855 [mini-batch 400 / 859]
loss: 0.777539 [mini-batch 500 / 859]
loss: 0.549290 [mini-batch 600 / 859]
loss: 0.425081 [mini-batch 700 / 859]
loss: 0.515532 [mini-batch 800 / 859]
==> Accuracy: 81.3%, Avg loss: 0.511576

Epoch 2
--------------------
loss: 0.419346 [mini-batch 0 / 859]
loss: 0.434606 [m

[I 2024-06-25 10:05:01,736] Trial 21 finished with value: 0.8612 and parameters: {'learning rate': 0.00042635087728591886, 'L1 lambda': 0.015633580148084223}. Best is trial 13 with value: 0.8614.


==> Accuracy: 86.1%, Avg loss: 0.390881

Pre-train stats
==> Accuracy: 15.3%, Avg loss: 6.588694

Epoch 0
--------------------
loss: 5.977013 [mini-batch 0 / 859]
loss: 0.649204 [mini-batch 100 / 859]
loss: 0.545501 [mini-batch 200 / 859]
loss: 0.902558 [mini-batch 300 / 859]
loss: 0.786658 [mini-batch 400 / 859]
loss: 0.576461 [mini-batch 500 / 859]
loss: 0.394667 [mini-batch 600 / 859]
loss: 0.578750 [mini-batch 700 / 859]
loss: 0.526283 [mini-batch 800 / 859]
==> Accuracy: 80.7%, Avg loss: 0.537085

Epoch 1
--------------------
loss: 0.360105 [mini-batch 0 / 859]
loss: 0.444878 [mini-batch 100 / 859]
loss: 0.477410 [mini-batch 200 / 859]
loss: 0.711136 [mini-batch 300 / 859]
loss: 0.536984 [mini-batch 400 / 859]
loss: 0.365677 [mini-batch 500 / 859]
loss: 0.509276 [mini-batch 600 / 859]
loss: 0.567063 [mini-batch 700 / 859]
loss: 0.378564 [mini-batch 800 / 859]
==> Accuracy: 81.6%, Avg loss: 0.507181

Epoch 2
--------------------
loss: 0.419232 [mini-batch 0 / 859]
loss: 0.353495 [m

[I 2024-06-25 10:46:04,633] Trial 22 finished with value: 0.848 and parameters: {'learning rate': 0.0003326412298284213, 'L1 lambda': 0.019208842204589717}. Best is trial 13 with value: 0.8614.


==> Accuracy: 84.8%, Avg loss: 0.422792

Pre-train stats
==> Accuracy: 2.3%, Avg loss: 7.991079

Epoch 0
--------------------
loss: 7.948682 [mini-batch 0 / 859]
loss: 0.791534 [mini-batch 100 / 859]
loss: 0.774691 [mini-batch 200 / 859]
loss: 0.549950 [mini-batch 300 / 859]
loss: 0.745120 [mini-batch 400 / 859]
loss: 0.645032 [mini-batch 500 / 859]
loss: 0.482954 [mini-batch 600 / 859]
loss: 0.390980 [mini-batch 700 / 859]
loss: 0.555356 [mini-batch 800 / 859]
==> Accuracy: 82.1%, Avg loss: 0.492824

Epoch 1
--------------------
loss: 0.570364 [mini-batch 0 / 859]
loss: 0.688774 [mini-batch 100 / 859]
loss: 0.589898 [mini-batch 200 / 859]
loss: 0.506521 [mini-batch 300 / 859]
loss: 0.384402 [mini-batch 400 / 859]
loss: 0.635170 [mini-batch 500 / 859]
loss: 0.440262 [mini-batch 600 / 859]
loss: 0.384721 [mini-batch 700 / 859]
loss: 0.533046 [mini-batch 800 / 859]
==> Accuracy: 84.7%, Avg loss: 0.431717

Epoch 2
--------------------
loss: 0.456339 [mini-batch 0 / 859]
loss: 0.433658 [mi

[W 2024-06-25 11:14:36,818] Trial 23 failed with parameters: {'learning rate': 0.0007261573207076086, 'L1 lambda': 0.060698207315206865} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/guille/venv/torch/lib/python3.11/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_63639/4099157515.py", line 11, in objective
    accuracy, _ = nn.train(train_data, validation_data)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/guille/Documents/fashion_mnist_nn/regularization/sgd_L1.py", line 125, in train
    sum_loss += self.calc_gradients(x, y)
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/guille/Documents/fashion_mnist_nn/regularization/sgd_L1.py", line 83, in calc_gradients
    self.forward(x)
  File "/home/guille/Documents/fashion_mnist_nn/regularization/sgd_L1.py", line 62, in forward
    self.a[l] = np.m

KeyboardInterrupt: 

In [None]:
# Show the hyperparameter values of the best trial recorded by the study.
study.best_params

In [None]:
# Plot the objective value of each trial to see how the search progressed.
ov.plot_optimization_history(study)

In [None]:
# Contour plot of the objective value over the searched hyperparameters.
ov.plot_contour(study)