In [1]:
!git clone https://github.com/cs21s006/cs6910_assignment1

Cloning into 'cs6910_assignment1'...
remote: Enumerating objects: 96, done.[K
remote: Counting objects: 100% (96/96), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 96 (delta 35), reused 42 (delta 14), pack-reused 0[K
Unpacking objects: 100% (96/96), done.


In [2]:
cd cs6910_assignment1

/content/cs6910_assignment1


In [3]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.12.10-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 28.8 MB/s 
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.5-py2.py3-none-any.whl (144 kB)
[K     |████████████████████████████████| 144 kB 49.3 MB/s 
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting yaspin>=1.0.0
  Downloading yaspin-2.1.0-py3-none-any.whl (18 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 39.6 MB/s 
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[?25hCollecting smmap<6,>=3.0.1
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Building wheels for c

In [4]:
import wandb
import argparse
import numpy as np
from tqdm import tqdm
from keras.datasets import fashion_mnist
from sklearn.model_selection import train_test_split 

from neural_networks import NeuralNetwork, Optimizer, clip_gradients, losses
from utils import preprocess_data, make_batches, accuracy, gradient_sum

In [5]:
def mse_softmax_grad(y_true, y_pred):
    b, k = y_pred.shape
    y_minus_t = (y_pred - y_true)
    minus_y = np.repeat(y_pred, k, axis=1).reshape(b, k, k)
    batch_ones = np.tile(np.identity(k), b).T.reshape(b, k, k)
    prod = np.array([y_minus_t[i, :] @ (batch_ones-minus_y)[i, :] for i in range(b)])
    return np.multiply(y_pred, prod)


def train_and_evaluate(args):
    # Load Data
    (X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
    X_train,X_val,y_train,y_val=train_test_split(X_train, y_train,test_size=0.1, random_state=1, stratify=y_train)
    (X_train, y_train) = preprocess_data(X_train, y_train)
    (X_test, y_test) = preprocess_data(X_test, y_test)
    (X_val,y_val) = preprocess_data(X_val,y_val)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape,X_val.shape,y_val.shape)

    wandb.run.name = 'ep-'+str(args.epochs)+'-bs-'+str(args.batch_size)+'-lf-'+str(args.loss)+'-op-'+str(args.optimizer)+'-lr-'+ str(args.learning_rate) \
                      + '-nhl-'+str(args.num_layers)+'-shl-'+str(args.hidden_size)+ '-act-'+str(args.activation) \
                      +'-w_i-'+str(args.weight_init_method)+'-w_d-'+str(args.weight_decay)


    # Instantiate model
    architecture = [{'num_neurons': args.hidden_size, 'activation': args.activation, 'init_method': args.weight_init_method}
                    for _ in range(args.num_layers)]
    architecture.append({'num_neurons': 10, 'activation': 'softmax'})  # add output layer

    model = NeuralNetwork(input_dim=784, architecture=architecture)
    optimizer = Optimizer(model, algorithm=args.optimizer,
                          momentum=0.9, beta=0.9,
                          beta1=0.9, beta2=0.99, epsilon=1e-8,weight_decay=args.weight_decay)
    lr = args.learning_rate
    loss_fn, loss_grad_fn = losses[args.loss]

    if args.loss == 'mean_squared_error':
        loss_grad_fn = mse_softmax_grad

    # Train

    for epoch in range(args.epochs):
        running_loss, running_grad = .0, .0
        num_steps = (X_train.shape[0] // args.batch_size)
        progress_bar = tqdm(make_batches(X_train, y_train, args.batch_size),
                            total=num_steps)
        for (X_batch, y_batch) in progress_bar:
            # Forward
            y_pred = model.forward(X_batch)

            # Optimize
            gradients = optimizer.backpropagate(y_batch, y_pred, (loss_fn, loss_grad_fn))
            gradients = clip_gradients(gradients, clip_ratio=5.0, norm_type=2)
            optimizer.optimize(gradients, learning_rate=lr)

            # Track acc, loss and gradients
            loss = loss_fn(y_batch, y_pred)
            grad_sum = gradient_sum(gradients)
            acc = accuracy(y_batch, y_pred)
            progress_bar.set_description(
                f"epoch: {epoch}, lr: {lr:.5f} | loss: {loss:.4f}, acc(batch): {acc:.4f}, grad:{grad_sum:.4f}"
            )
            
            running_loss += loss
            running_grad += grad_sum
        
        # Evaluate train and test splits
        train_acc = accuracy(model.forward(X_train), y_train)
        y_val_pred = model.forward(X_val)
        val_acc = accuracy(y_val_pred,y_val)
        val_loss = loss_fn(y_val,y_val_pred)
        test_acc = accuracy(model.forward(X_test), y_test)
        print(f"acc(train): {train_acc:.4f}, acc(val): {val_acc:.4f}, acc(test): {test_acc:.4f}")
        print('_' * 99)
        
        # Log metrics to wandb.ai
        wandb.log({
            'train_acc': train_acc, 
            'val_acc': val_acc,
            'train_loss': running_loss/num_steps,
            'val_loss' : val_loss,
            'test_acc': test_acc,
            'epoch':epoch            
        })
    
    y_true = np.argmax(y_test, axis=1)
    y_pred = np.argmax(model.forward(X_test), axis=1)
    class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 
              'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankleboot']
    wandb.log({
      "confusion_matrix" : wandb.plot.confusion_matrix(probs=None,
                              y_true=y_true, preds=y_pred,
                              class_names=class_names)
    })

In [6]:
wandb.login()

sweep_config = {
    "name" : "cs6910_assignment1_fashion-mnist_mse_vs_xe_sweep",
    "description": "sweep comparing mean squared error and cross entropy loss functions",
    "method" : "grid",
    'metric': {
        'name': 'val_acc',
        'goal': 'maximize'
    },
    "parameters" : {
        "epochs" : {
            "values" : [20]
        },
        "batch_size": {
            "values": [128]
        },
        "loss": {
            "values": ['mean_squared_error', 'cross_entropy']
        },
        "optimizer": {
            "values": ["adam"]
        },
        "learning_rate" :{
            "values": [1e-4]
        },
        "num_layers": {
            "values": [4]
        },
        "hidden_size": {
            "values": [256]
        },
        "activation": {
            "values": ['tanh']
        },
        "weight_init_method":{
            "values":['He_normal']
        },
        "weight_decay":{
            "values":[0]
        }
    }
}
sweep_id = wandb.sweep(sweep_config, project="cs6910_assignment1", entity="cs21s006_cs21s043")
print('sweep_id: ', sweep_id)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: cp83i9rw
Sweep URL: https://wandb.ai/cs21s006_cs21s043/cs6910_assignment1/sweeps/cp83i9rw
sweep_id:  cp83i9rw


In [7]:
def spawn_fn():
      with wandb.init(project="cs6910_assignment1", entity="cs21s006_cs21s043") as run:
        config = wandb.config
        train_and_evaluate(config)

In [8]:
wandb.agent(sweep_id, function=spawn_fn)

[34m[1mwandb[0m: Agent Starting Run: nlskhadb with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: mean_squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init_method: He_normal
[34m[1mwandb[0m: Currently logged in as: [33mcs21s006_cs21s043[0m (use `wandb login --relogin` to force relogin)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00010 | loss: 0.0189, acc(batch): 0.8839, grad:104041.2075: : 422it [00:44,  9.56it/s]                       


acc(train): 0.8486, acc(val): 0.8467, acc(test): 0.8372
___________________________________________________________________________________________________


epoch: 1, lr: 0.00010 | loss: 0.0181, acc(batch): 0.9018, grad:103704.1240: : 422it [00:43,  9.60it/s]


acc(train): 0.8599, acc(val): 0.8583, acc(test): 0.8454
___________________________________________________________________________________________________


epoch: 2, lr: 0.00010 | loss: 0.0173, acc(batch): 0.9018, grad:103409.3403: : 422it [00:42,  9.82it/s]


acc(train): 0.8695, acc(val): 0.8652, acc(test): 0.8519
___________________________________________________________________________________________________


epoch: 3, lr: 0.00010 | loss: 0.0167, acc(batch): 0.9018, grad:103416.1608: : 422it [00:42,  9.83it/s]


acc(train): 0.8760, acc(val): 0.8693, acc(test): 0.8574
___________________________________________________________________________________________________


epoch: 4, lr: 0.00010 | loss: 0.0163, acc(batch): 0.8929, grad:103339.4309: : 422it [00:42,  9.91it/s]


acc(train): 0.8815, acc(val): 0.8742, acc(test): 0.8621
___________________________________________________________________________________________________


epoch: 5, lr: 0.00010 | loss: 0.0159, acc(batch): 0.8929, grad:103272.1416: : 422it [00:42,  9.93it/s]


acc(train): 0.8855, acc(val): 0.8767, acc(test): 0.8640
___________________________________________________________________________________________________


epoch: 6, lr: 0.00010 | loss: 0.0157, acc(batch): 0.8929, grad:103218.8246: : 422it [00:42,  9.81it/s]


acc(train): 0.8887, acc(val): 0.8798, acc(test): 0.8654
___________________________________________________________________________________________________


epoch: 7, lr: 0.00010 | loss: 0.0154, acc(batch): 0.8929, grad:103334.9375: : 422it [00:42,  9.87it/s]


acc(train): 0.8922, acc(val): 0.8823, acc(test): 0.8662
___________________________________________________________________________________________________


epoch: 8, lr: 0.00010 | loss: 0.0153, acc(batch): 0.8929, grad:103716.7638: : 422it [00:42,  9.86it/s]                       


acc(train): 0.8953, acc(val): 0.8840, acc(test): 0.8677
___________________________________________________________________________________________________


epoch: 9, lr: 0.00010 | loss: 0.0151, acc(batch): 0.8929, grad:104087.9020: : 422it [00:43,  9.77it/s]                       


acc(train): 0.8983, acc(val): 0.8868, acc(test): 0.8712
___________________________________________________________________________________________________


epoch: 10, lr: 0.00010 | loss: 0.0150, acc(batch): 0.8929, grad:104276.1251: : 422it [00:43,  9.77it/s]                       


acc(train): 0.9013, acc(val): 0.8882, acc(test): 0.8719
___________________________________________________________________________________________________


epoch: 11, lr: 0.00010 | loss: 0.0149, acc(batch): 0.8929, grad:104260.9403: : 422it [00:42,  9.82it/s]                       


acc(train): 0.9044, acc(val): 0.8890, acc(test): 0.8732
___________________________________________________________________________________________________


epoch: 12, lr: 0.00010 | loss: 0.0148, acc(batch): 0.8929, grad:104183.7272: : 422it [00:42,  9.82it/s]


acc(train): 0.9070, acc(val): 0.8888, acc(test): 0.8732
___________________________________________________________________________________________________


epoch: 13, lr: 0.00010 | loss: 0.0147, acc(batch): 0.9018, grad:104170.2150: : 422it [00:43,  9.78it/s]


acc(train): 0.9090, acc(val): 0.8887, acc(test): 0.8738
___________________________________________________________________________________________________


epoch: 14, lr: 0.00010 | loss: 0.0146, acc(batch): 0.9018, grad:104176.4852: : 422it [00:42,  9.82it/s]


acc(train): 0.9108, acc(val): 0.8885, acc(test): 0.8744
___________________________________________________________________________________________________


epoch: 15, lr: 0.00010 | loss: 0.0145, acc(batch): 0.8929, grad:104169.9891: : 422it [00:43,  9.76it/s]


acc(train): 0.9124, acc(val): 0.8888, acc(test): 0.8742
___________________________________________________________________________________________________


epoch: 16, lr: 0.00010 | loss: 0.0145, acc(batch): 0.8929, grad:104006.1408: : 422it [00:42,  9.85it/s]


acc(train): 0.9130, acc(val): 0.8882, acc(test): 0.8742
___________________________________________________________________________________________________


epoch: 17, lr: 0.00010 | loss: 0.0144, acc(batch): 0.8929, grad:103784.3882: : 422it [00:42,  9.85it/s]


acc(train): 0.9133, acc(val): 0.8878, acc(test): 0.8746
___________________________________________________________________________________________________


epoch: 18, lr: 0.00010 | loss: 0.0143, acc(batch): 0.8839, grad:103642.1613: : 422it [00:42,  9.90it/s]


acc(train): 0.9144, acc(val): 0.8878, acc(test): 0.8743
___________________________________________________________________________________________________


epoch: 19, lr: 0.00010 | loss: 0.0141, acc(batch): 0.8929, grad:103616.1779: : 422it [00:42,  9.84it/s]                       


acc(train): 0.9156, acc(val): 0.8873, acc(test): 0.8748
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.59110050533…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▄▅▆▆▆▆▇▇▇█████████
train_acc,▁▂▃▄▄▅▅▆▆▆▇▇▇▇██████
train_loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
val_acc,▁▃▄▅▆▆▆▇▇███████████
val_loss,█▆▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8748
train_acc,0.91556
train_loss,0.01149
val_acc,0.88733
val_loss,0.01668


[34m[1mwandb[0m: Agent Starting Run: yyp3b0s7 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init_method: He_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00010 | loss: 0.0382, acc(batch): 0.8661, grad:106712.1916: : 422it [00:41, 10.24it/s]


acc(train): 0.8404, acc(val): 0.8407, acc(test): 0.8266
___________________________________________________________________________________________________


epoch: 1, lr: 0.00010 | loss: 0.0345, acc(batch): 0.8929, grad:106011.9598: : 422it [00:40, 10.35it/s]                       


acc(train): 0.8580, acc(val): 0.8532, acc(test): 0.8404
___________________________________________________________________________________________________


epoch: 2, lr: 0.00010 | loss: 0.0326, acc(batch): 0.9018, grad:106148.3728: : 422it [00:40, 10.33it/s]                       


acc(train): 0.8679, acc(val): 0.8630, acc(test): 0.8483
___________________________________________________________________________________________________


epoch: 3, lr: 0.00010 | loss: 0.0314, acc(batch): 0.9018, grad:106653.6856: : 422it [00:40, 10.34it/s]                       


acc(train): 0.8755, acc(val): 0.8680, acc(test): 0.8529
___________________________________________________________________________________________________


epoch: 4, lr: 0.00010 | loss: 0.0304, acc(batch): 0.9018, grad:107081.5203: : 422it [00:40, 10.37it/s]


acc(train): 0.8817, acc(val): 0.8725, acc(test): 0.8583
___________________________________________________________________________________________________


epoch: 5, lr: 0.00010 | loss: 0.0296, acc(batch): 0.9018, grad:107353.4747: : 422it [00:40, 10.33it/s]


acc(train): 0.8867, acc(val): 0.8767, acc(test): 0.8619
___________________________________________________________________________________________________


epoch: 6, lr: 0.00010 | loss: 0.0288, acc(batch): 0.9018, grad:107508.6799: : 422it [00:40, 10.36it/s]


acc(train): 0.8912, acc(val): 0.8807, acc(test): 0.8653
___________________________________________________________________________________________________


epoch: 7, lr: 0.00010 | loss: 0.0281, acc(batch): 0.9196, grad:107600.5376: : 422it [00:40, 10.36it/s]


acc(train): 0.8949, acc(val): 0.8822, acc(test): 0.8678
___________________________________________________________________________________________________


epoch: 8, lr: 0.00010 | loss: 0.0275, acc(batch): 0.9286, grad:107632.7015: : 422it [00:40, 10.36it/s]                       


acc(train): 0.8979, acc(val): 0.8852, acc(test): 0.8714
___________________________________________________________________________________________________


epoch: 9, lr: 0.00010 | loss: 0.0269, acc(batch): 0.9286, grad:107618.1136: : 422it [00:40, 10.39it/s]


acc(train): 0.9013, acc(val): 0.8868, acc(test): 0.8730
___________________________________________________________________________________________________


epoch: 10, lr: 0.00010 | loss: 0.0264, acc(batch): 0.9286, grad:107709.5679: : 422it [00:40, 10.35it/s]


acc(train): 0.9039, acc(val): 0.8887, acc(test): 0.8746
___________________________________________________________________________________________________


epoch: 11, lr: 0.00010 | loss: 0.0260, acc(batch): 0.9286, grad:107845.8275: : 422it [00:40, 10.32it/s]


acc(train): 0.9068, acc(val): 0.8905, acc(test): 0.8760
___________________________________________________________________________________________________


epoch: 12, lr: 0.00010 | loss: 0.0255, acc(batch): 0.9286, grad:107972.7133: : 422it [00:40, 10.33it/s]


acc(train): 0.9090, acc(val): 0.8922, acc(test): 0.8762
___________________________________________________________________________________________________


epoch: 13, lr: 0.00010 | loss: 0.0251, acc(batch): 0.9286, grad:108087.7409: : 422it [00:40, 10.32it/s]                       


acc(train): 0.9117, acc(val): 0.8933, acc(test): 0.8771
___________________________________________________________________________________________________


epoch: 14, lr: 0.00010 | loss: 0.0247, acc(batch): 0.9286, grad:108191.7360: : 422it [00:40, 10.34it/s]


acc(train): 0.9136, acc(val): 0.8937, acc(test): 0.8783
___________________________________________________________________________________________________


epoch: 15, lr: 0.00010 | loss: 0.0243, acc(batch): 0.9375, grad:108266.4053: : 422it [00:40, 10.33it/s]                       


acc(train): 0.9156, acc(val): 0.8945, acc(test): 0.8779
___________________________________________________________________________________________________


epoch: 16, lr: 0.00010 | loss: 0.0239, acc(batch): 0.9464, grad:108310.7899: : 422it [00:41, 10.26it/s]


acc(train): 0.9175, acc(val): 0.8953, acc(test): 0.8784
___________________________________________________________________________________________________


epoch: 17, lr: 0.00010 | loss: 0.0235, acc(batch): 0.9464, grad:108306.9855: : 422it [00:40, 10.32it/s]                       


acc(train): 0.9197, acc(val): 0.8958, acc(test): 0.8790
___________________________________________________________________________________________________


epoch: 18, lr: 0.00010 | loss: 0.0231, acc(batch): 0.9464, grad:108241.8408: : 422it [00:41, 10.28it/s]


acc(train): 0.9216, acc(val): 0.8958, acc(test): 0.8796
___________________________________________________________________________________________________


epoch: 19, lr: 0.00010 | loss: 0.0227, acc(batch): 0.9464, grad:108164.5237: : 422it [00:40, 10.33it/s]                       


acc(train): 0.9230, acc(val): 0.8952, acc(test): 0.8803
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.59115168539…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▄▄▅▆▆▆▇▇▇▇▇███████
train_acc,▁▂▃▄▅▅▅▆▆▆▆▇▇▇▇▇████
train_loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
val_acc,▁▃▄▄▅▆▆▆▇▇▇▇████████
val_loss,█▆▅▅▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8803
train_acc,0.92302
train_loss,0.02062
val_acc,0.89517
val_loss,0.03071


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [None]:
# wandb.agent("1463mzwx", function=spawn_fn, count=10)

[34m[1mwandb[0m: Agent Starting Run: 7uitvlls with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00010 | loss: 0.0817, acc(batch): 0.8958, grad:13974.4253: : 844it [00:10, 77.94it/s]                       


acc(train): 0.7300, acc(val): 0.7305, acc(test): 0.7233
___________________________________________________________________________________________________


epoch: 1, lr: 0.00010 | loss: 0.0503, acc(batch): 0.9375, grad:15169.5247: : 844it [00:10, 77.85it/s]


acc(train): 0.7712, acc(val): 0.7748, acc(test): 0.7628
___________________________________________________________________________________________________


epoch: 2, lr: 0.00010 | loss: 0.0365, acc(batch): 0.9375, grad:14811.3835: : 844it [00:10, 80.41it/s]                       


acc(train): 0.7987, acc(val): 0.8037, acc(test): 0.7910
___________________________________________________________________________________________________


epoch: 3, lr: 0.00010 | loss: 0.0293, acc(batch): 0.9375, grad:14890.5081: : 844it [00:10, 79.54it/s]                       


acc(train): 0.8168, acc(val): 0.8218, acc(test): 0.8060
___________________________________________________________________________________________________


epoch: 4, lr: 0.00010 | loss: 0.0252, acc(batch): 0.9375, grad:14954.6467: : 844it [00:10, 79.74it/s]                       


acc(train): 0.8280, acc(val): 0.8305, acc(test): 0.8151
___________________________________________________________________________________________________


epoch: 5, lr: 0.00010 | loss: 0.0227, acc(batch): 0.9375, grad:15204.2862: : 844it [00:10, 81.38it/s]                       


acc(train): 0.8361, acc(val): 0.8380, acc(test): 0.8214
___________________________________________________________________________________________________


epoch: 6, lr: 0.00010 | loss: 0.0211, acc(batch): 0.9375, grad:15388.4464: : 844it [00:10, 81.26it/s]                       


acc(train): 0.8417, acc(val): 0.8435, acc(test): 0.8281
___________________________________________________________________________________________________


epoch: 7, lr: 0.00010 | loss: 0.0201, acc(batch): 0.9375, grad:15347.2773: : 844it [00:10, 82.71it/s]                       


acc(train): 0.8466, acc(val): 0.8475, acc(test): 0.8323
___________________________________________________________________________________________________


epoch: 8, lr: 0.00010 | loss: 0.0195, acc(batch): 0.9375, grad:15344.9171: : 844it [00:10, 79.38it/s]


acc(train): 0.8504, acc(val): 0.8502, acc(test): 0.8349
___________________________________________________________________________________________________


epoch: 9, lr: 0.00010 | loss: 0.0189, acc(batch): 0.9375, grad:15406.2563: : 844it [00:10, 80.85it/s]                       


acc(train): 0.8537, acc(val): 0.8525, acc(test): 0.8380
___________________________________________________________________________________________________


epoch: 10, lr: 0.00010 | loss: 0.0184, acc(batch): 0.9375, grad:15516.3534: : 844it [00:10, 79.80it/s]                       


acc(train): 0.8568, acc(val): 0.8543, acc(test): 0.8394
___________________________________________________________________________________________________


epoch: 11, lr: 0.00010 | loss: 0.0179, acc(batch): 0.9375, grad:15645.1069: : 844it [00:10, 78.27it/s]                       


acc(train): 0.8595, acc(val): 0.8565, acc(test): 0.8412
___________________________________________________________________________________________________


epoch: 12, lr: 0.00010 | loss: 0.0173, acc(batch): 0.9375, grad:15775.8329: : 844it [00:10, 81.14it/s]                       


acc(train): 0.8622, acc(val): 0.8588, acc(test): 0.8426
___________________________________________________________________________________________________


epoch: 13, lr: 0.00010 | loss: 0.0168, acc(batch): 0.9375, grad:15893.6686: : 844it [00:10, 81.68it/s]                       


acc(train): 0.8641, acc(val): 0.8617, acc(test): 0.8451
___________________________________________________________________________________________________


epoch: 14, lr: 0.00010 | loss: 0.0163, acc(batch): 0.9375, grad:15988.6780: : 844it [00:10, 79.58it/s]


acc(train): 0.8666, acc(val): 0.8618, acc(test): 0.8465
___________________________________________________________________________________________________


epoch: 15, lr: 0.00010 | loss: 0.0158, acc(batch): 0.9375, grad:16063.9349: : 844it [00:10, 79.20it/s]                       


acc(train): 0.8682, acc(val): 0.8623, acc(test): 0.8486
___________________________________________________________________________________________________


epoch: 16, lr: 0.00010 | loss: 0.0154, acc(batch): 0.9375, grad:16124.9142: : 844it [00:10, 80.47it/s]                       


acc(train): 0.8697, acc(val): 0.8620, acc(test): 0.8507
___________________________________________________________________________________________________


epoch: 17, lr: 0.00010 | loss: 0.0151, acc(batch): 0.9375, grad:16172.7439: : 844it [00:10, 81.14it/s]                       


acc(train): 0.8711, acc(val): 0.8632, acc(test): 0.8511
___________________________________________________________________________________________________


epoch: 18, lr: 0.00010 | loss: 0.0147, acc(batch): 0.9375, grad:16207.9634: : 844it [00:10, 82.11it/s]                       


acc(train): 0.8723, acc(val): 0.8637, acc(test): 0.8533
___________________________________________________________________________________________________


epoch: 19, lr: 0.00010 | loss: 0.0144, acc(batch): 0.9375, grad:16230.5059: : 844it [00:10, 81.63it/s]                       


acc(train): 0.8737, acc(val): 0.8637, acc(test): 0.8539
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▅▅▆▆▇▇▇▇▇▇▇███████
train_acc,▁▃▄▅▆▆▆▇▇▇▇▇▇███████
train_loss,█▅▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▃▅▆▆▇▇▇▇▇██████████
val_loss,█▅▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8539
train_acc,0.87372
train_loss,0.03658
val_acc,0.86367
val_loss,0.03886


[34m[1mwandb[0m: Agent Starting Run: a8dlcc0z with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00100 | loss: 0.0395, acc(batch): 0.8929, grad:16026.6877: : 422it [00:05, 77.89it/s]                       


acc(train): 0.8392, acc(val): 0.8430, acc(test): 0.8261
___________________________________________________________________________________________________


epoch: 1, lr: 0.00100 | loss: 0.0375, acc(batch): 0.8929, grad:17106.7960: : 422it [00:05, 78.41it/s]


acc(train): 0.8515, acc(val): 0.8498, acc(test): 0.8355
___________________________________________________________________________________________________


epoch: 2, lr: 0.00100 | loss: 0.0362, acc(batch): 0.8750, grad:17369.5281: : 422it [00:05, 76.59it/s]


acc(train): 0.8588, acc(val): 0.8555, acc(test): 0.8415
___________________________________________________________________________________________________


epoch: 3, lr: 0.00100 | loss: 0.0351, acc(batch): 0.8750, grad:17581.3264: : 422it [00:05, 78.73it/s]                       


acc(train): 0.8650, acc(val): 0.8590, acc(test): 0.8461
___________________________________________________________________________________________________


epoch: 4, lr: 0.00100 | loss: 0.0341, acc(batch): 0.8750, grad:17844.1719: : 422it [00:05, 79.16it/s]                       


acc(train): 0.8684, acc(val): 0.8605, acc(test): 0.8511
___________________________________________________________________________________________________


epoch: 5, lr: 0.00100 | loss: 0.0335, acc(batch): 0.8839, grad:17955.1467: : 422it [00:05, 77.23it/s]                       


acc(train): 0.8710, acc(val): 0.8618, acc(test): 0.8520
___________________________________________________________________________________________________


epoch: 6, lr: 0.00100 | loss: 0.0329, acc(batch): 0.8839, grad:18005.3514: : 422it [00:05, 75.54it/s]                       


acc(train): 0.8734, acc(val): 0.8648, acc(test): 0.8546
___________________________________________________________________________________________________


epoch: 7, lr: 0.00100 | loss: 0.0325, acc(batch): 0.8839, grad:18018.2259: : 422it [00:05, 76.51it/s]                       


acc(train): 0.8759, acc(val): 0.8665, acc(test): 0.8558
___________________________________________________________________________________________________


epoch: 8, lr: 0.00100 | loss: 0.0321, acc(batch): 0.8839, grad:18030.0101: : 422it [00:05, 74.93it/s]                       


acc(train): 0.8781, acc(val): 0.8672, acc(test): 0.8567
___________________________________________________________________________________________________


epoch: 9, lr: 0.00100 | loss: 0.0319, acc(batch): 0.8839, grad:18109.3231: : 422it [00:05, 74.86it/s]                       


acc(train): 0.8799, acc(val): 0.8688, acc(test): 0.8570
___________________________________________________________________________________________________


epoch: 10, lr: 0.00100 | loss: 0.0318, acc(batch): 0.8839, grad:18223.8328: : 422it [00:05, 71.68it/s]                       


acc(train): 0.8809, acc(val): 0.8693, acc(test): 0.8581
___________________________________________________________________________________________________


epoch: 11, lr: 0.00100 | loss: 0.0318, acc(batch): 0.8929, grad:18323.7855: : 422it [00:05, 73.33it/s]                       


acc(train): 0.8819, acc(val): 0.8710, acc(test): 0.8590
___________________________________________________________________________________________________


epoch: 12, lr: 0.00100 | loss: 0.0318, acc(batch): 0.8839, grad:18388.7537: : 422it [00:05, 74.19it/s]                       


acc(train): 0.8827, acc(val): 0.8710, acc(test): 0.8598
___________________________________________________________________________________________________


epoch: 13, lr: 0.00100 | loss: 0.0318, acc(batch): 0.8929, grad:18422.2501: : 422it [00:05, 73.69it/s]                       


acc(train): 0.8834, acc(val): 0.8715, acc(test): 0.8600
___________________________________________________________________________________________________


epoch: 14, lr: 0.00100 | loss: 0.0319, acc(batch): 0.8929, grad:18431.3552: : 422it [00:05, 75.03it/s]                       


acc(train): 0.8844, acc(val): 0.8718, acc(test): 0.8602
___________________________________________________________________________________________________


epoch: 15, lr: 0.00100 | loss: 0.0319, acc(batch): 0.8929, grad:18421.1632: : 422it [00:05, 76.01it/s]                       


acc(train): 0.8849, acc(val): 0.8737, acc(test): 0.8604
___________________________________________________________________________________________________


epoch: 16, lr: 0.00100 | loss: 0.0318, acc(batch): 0.8929, grad:18418.7548: : 422it [00:05, 77.52it/s]                       


acc(train): 0.8855, acc(val): 0.8748, acc(test): 0.8607
___________________________________________________________________________________________________


epoch: 17, lr: 0.00100 | loss: 0.0318, acc(batch): 0.8929, grad:18402.0034: : 422it [00:05, 76.14it/s]


acc(train): 0.8860, acc(val): 0.8748, acc(test): 0.8609
___________________________________________________________________________________________________


epoch: 18, lr: 0.00100 | loss: 0.0318, acc(batch): 0.9018, grad:18367.1699: : 422it [00:05, 74.88it/s]                       


acc(train): 0.8864, acc(val): 0.8757, acc(test): 0.8610
___________________________________________________________________________________________________


epoch: 19, lr: 0.00100 | loss: 0.0317, acc(batch): 0.9018, grad:18316.2449: : 422it [00:06, 68.40it/s]                       


acc(train): 0.8868, acc(val): 0.8762, acc(test): 0.8607
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57941091539…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▄▅▆▆▇▇▇▇▇█████████
train_acc,▁▃▄▅▅▆▆▆▇▇▇▇▇███████
train_loss,█▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▂▄▄▅▅▆▆▆▆▇▇▇▇▇▇████
val_loss,█▆▅▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8607
train_acc,0.88676
train_loss,0.02982
val_acc,0.87617
val_loss,0.03665


[34m[1mwandb[0m: Agent Starting Run: zmp8yloz with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: He_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00010 | loss: 0.0221, acc(batch): 0.9375, grad:64399.3219: : 844it [00:37, 22.62it/s]                       


acc(train): 0.8439, acc(val): 0.8458, acc(test): 0.8351
___________________________________________________________________________________________________


epoch: 1, lr: 0.00010 | loss: 0.0204, acc(batch): 0.9167, grad:63689.2142: : 844it [00:37, 22.67it/s]                       


acc(train): 0.8619, acc(val): 0.8555, acc(test): 0.8491
___________________________________________________________________________________________________


epoch: 2, lr: 0.00010 | loss: 0.0196, acc(batch): 0.9167, grad:63024.9530: : 844it [00:36, 22.83it/s]                       


acc(train): 0.8706, acc(val): 0.8608, acc(test): 0.8566
___________________________________________________________________________________________________


epoch: 3, lr: 0.00010 | loss: 0.0187, acc(batch): 0.9167, grad:62917.7614: : 844it [00:37, 22.74it/s]                       


acc(train): 0.8766, acc(val): 0.8647, acc(test): 0.8620
___________________________________________________________________________________________________


epoch: 4, lr: 0.00010 | loss: 0.0177, acc(batch): 0.9167, grad:62918.3835: : 844it [00:37, 22.71it/s]                       


acc(train): 0.8812, acc(val): 0.8687, acc(test): 0.8649
___________________________________________________________________________________________________


epoch: 5, lr: 0.00010 | loss: 0.0168, acc(batch): 0.9167, grad:63092.8218: : 844it [00:37, 22.69it/s]                       


acc(train): 0.8852, acc(val): 0.8723, acc(test): 0.8676
___________________________________________________________________________________________________


epoch: 6, lr: 0.00010 | loss: 0.0158, acc(batch): 0.9167, grad:63189.6915: : 844it [00:37, 22.45it/s]                       


acc(train): 0.8883, acc(val): 0.8752, acc(test): 0.8684
___________________________________________________________________________________________________


epoch: 7, lr: 0.00010 | loss: 0.0149, acc(batch): 0.9375, grad:63216.4673: : 844it [00:37, 22.53it/s]                       


acc(train): 0.8914, acc(val): 0.8770, acc(test): 0.8695
___________________________________________________________________________________________________


epoch: 8, lr: 0.00010 | loss: 0.0140, acc(batch): 0.9375, grad:63199.2412: : 844it [00:37, 22.33it/s]                       


acc(train): 0.8941, acc(val): 0.8790, acc(test): 0.8708
___________________________________________________________________________________________________


epoch: 9, lr: 0.00010 | loss: 0.0132, acc(batch): 0.9375, grad:63169.5793: : 844it [00:37, 22.51it/s]                       


acc(train): 0.8968, acc(val): 0.8832, acc(test): 0.8723
___________________________________________________________________________________________________


epoch: 10, lr: 0.00010 | loss: 0.0124, acc(batch): 0.9375, grad:63148.5305: : 844it [00:37, 22.44it/s]                       


acc(train): 0.8989, acc(val): 0.8828, acc(test): 0.8730
___________________________________________________________________________________________________


epoch: 11, lr: 0.00010 | loss: 0.0116, acc(batch): 0.9583, grad:63032.5241: : 844it [00:37, 22.49it/s]                       


acc(train): 0.9006, acc(val): 0.8837, acc(test): 0.8740
___________________________________________________________________________________________________


epoch: 12, lr: 0.00010 | loss: 0.0109, acc(batch): 0.9583, grad:62892.1694: : 844it [00:37, 22.41it/s]                       


acc(train): 0.9027, acc(val): 0.8843, acc(test): 0.8753
___________________________________________________________________________________________________


epoch: 13, lr: 0.00010 | loss: 0.0103, acc(batch): 0.9792, grad:62854.4642: : 844it [00:37, 22.54it/s]                       


acc(train): 0.9046, acc(val): 0.8855, acc(test): 0.8758
___________________________________________________________________________________________________


epoch: 14, lr: 0.00010 | loss: 0.0097, acc(batch): 0.9792, grad:62820.3329: : 844it [00:39, 21.16it/s]                       


acc(train): 0.9059, acc(val): 0.8865, acc(test): 0.8772
___________________________________________________________________________________________________


epoch: 15, lr: 0.00010 | loss: 0.0091, acc(batch): 0.9792, grad:62812.2001: : 844it [00:37, 22.66it/s]                       


acc(train): 0.9074, acc(val): 0.8883, acc(test): 0.8783
___________________________________________________________________________________________________


epoch: 16, lr: 0.00010 | loss: 0.0085, acc(batch): 0.9792, grad:62812.1138: : 844it [00:37, 22.68it/s]                       


acc(train): 0.9089, acc(val): 0.8890, acc(test): 0.8791
___________________________________________________________________________________________________


epoch: 17, lr: 0.00010 | loss: 0.0080, acc(batch): 0.9792, grad:62786.4879: : 844it [00:37, 22.57it/s]                       


acc(train): 0.9106, acc(val): 0.8897, acc(test): 0.8804
___________________________________________________________________________________________________


epoch: 18, lr: 0.00010 | loss: 0.0076, acc(batch): 0.9792, grad:62806.7517: : 844it [00:37, 22.43it/s]                       


acc(train): 0.9126, acc(val): 0.8892, acc(test): 0.8804
___________________________________________________________________________________________________


epoch: 19, lr: 0.00010 | loss: 0.0072, acc(batch): 0.9792, grad:62807.3841: : 844it [00:37, 22.49it/s]                       


acc(train): 0.9141, acc(val): 0.8893, acc(test): 0.8813
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▄▅▆▆▆▆▆▇▇▇▇▇▇█████
train_acc,▁▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇███
train_loss,█▄▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁
val_acc,▁▃▃▄▅▅▆▆▆▇▇▇▇▇▇█████
val_loss,█▆▅▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8813
train_acc,0.91409
train_loss,0.02495
val_acc,0.88933
val_loss,0.03076


[34m[1mwandb[0m: Agent Starting Run: cq4pu5wy with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00030 | loss: 0.0373, acc(batch): 0.8839, grad:100932.2099: : 422it [00:50,  8.41it/s]


acc(train): 0.8585, acc(val): 0.8600, acc(test): 0.8477
___________________________________________________________________________________________________


epoch: 1, lr: 0.00030 | loss: 0.0328, acc(batch): 0.9107, grad:99950.8492: : 422it [00:50,  8.39it/s]


acc(train): 0.8723, acc(val): 0.8657, acc(test): 0.8559
___________________________________________________________________________________________________


epoch: 2, lr: 0.00030 | loss: 0.0323, acc(batch): 0.9018, grad:99286.2738: : 422it [00:50,  8.42it/s]


acc(train): 0.8822, acc(val): 0.8685, acc(test): 0.8606
___________________________________________________________________________________________________


epoch: 3, lr: 0.00030 | loss: 0.0314, acc(batch): 0.9107, grad:98320.6648: : 422it [00:50,  8.41it/s]


acc(train): 0.8887, acc(val): 0.8735, acc(test): 0.8633
___________________________________________________________________________________________________


epoch: 4, lr: 0.00030 | loss: 0.0302, acc(batch): 0.9107, grad:97422.6394: : 422it [00:50,  8.43it/s]


acc(train): 0.8940, acc(val): 0.8757, acc(test): 0.8683
___________________________________________________________________________________________________


epoch: 5, lr: 0.00030 | loss: 0.0294, acc(batch): 0.9196, grad:95302.8585: : 422it [00:49,  8.48it/s]


acc(train): 0.8966, acc(val): 0.8773, acc(test): 0.8697
___________________________________________________________________________________________________


epoch: 6, lr: 0.00030 | loss: 0.0287, acc(batch): 0.9286, grad:95747.2129: : 422it [00:49,  8.53it/s]


acc(train): 0.8988, acc(val): 0.8795, acc(test): 0.8703
___________________________________________________________________________________________________


epoch: 7, lr: 0.00030 | loss: 0.0278, acc(batch): 0.9286, grad:94330.2569: : 422it [00:49,  8.49it/s]


acc(train): 0.9022, acc(val): 0.8793, acc(test): 0.8710
___________________________________________________________________________________________________


epoch: 8, lr: 0.00030 | loss: 0.0270, acc(batch): 0.9375, grad:93477.8353: : 422it [00:49,  8.53it/s]


acc(train): 0.9046, acc(val): 0.8808, acc(test): 0.8728
___________________________________________________________________________________________________


epoch: 9, lr: 0.00030 | loss: 0.0268, acc(batch): 0.9375, grad:92662.7078: : 422it [00:49,  8.51it/s]


acc(train): 0.9081, acc(val): 0.8815, acc(test): 0.8728
___________________________________________________________________________________________________


epoch: 10, lr: 0.00030 | loss: 0.0264, acc(batch): 0.9375, grad:92680.4022: : 422it [00:49,  8.52it/s]


acc(train): 0.9104, acc(val): 0.8802, acc(test): 0.8727
___________________________________________________________________________________________________


epoch: 11, lr: 0.00030 | loss: 0.0261, acc(batch): 0.9375, grad:93017.1797: : 422it [00:49,  8.57it/s]


acc(train): 0.9123, acc(val): 0.8792, acc(test): 0.8738
___________________________________________________________________________________________________


epoch: 12, lr: 0.00030 | loss: 0.0256, acc(batch): 0.9375, grad:92889.4153: : 422it [00:49,  8.50it/s]


acc(train): 0.9149, acc(val): 0.8797, acc(test): 0.8742
___________________________________________________________________________________________________


epoch: 13, lr: 0.00030 | loss: 0.0248, acc(batch): 0.9375, grad:92792.5052: : 422it [00:49,  8.48it/s]


acc(train): 0.9155, acc(val): 0.8815, acc(test): 0.8741
___________________________________________________________________________________________________


epoch: 14, lr: 0.00030 | loss: 0.0258, acc(batch): 0.9286, grad:93039.9633: : 422it [00:50,  8.43it/s]


acc(train): 0.9143, acc(val): 0.8770, acc(test): 0.8721
___________________________________________________________________________________________________


epoch: 15, lr: 0.00030 | loss: 0.0262, acc(batch): 0.9464, grad:94656.9018: : 422it [00:50,  8.44it/s]


acc(train): 0.9179, acc(val): 0.8798, acc(test): 0.8738
___________________________________________________________________________________________________


epoch: 16, lr: 0.00030 | loss: 0.0248, acc(batch): 0.9464, grad:94473.2426: : 422it [00:49,  8.52it/s]


acc(train): 0.9176, acc(val): 0.8782, acc(test): 0.8727
___________________________________________________________________________________________________


epoch: 17, lr: 0.00030 | loss: 0.0243, acc(batch): 0.9464, grad:92445.5427: : 422it [00:50,  8.41it/s]


acc(train): 0.9220, acc(val): 0.8805, acc(test): 0.8737
___________________________________________________________________________________________________


epoch: 18, lr: 0.00030 | loss: 0.0251, acc(batch): 0.9196, grad:91908.2061: : 422it [00:49,  8.55it/s]


acc(train): 0.9215, acc(val): 0.8757, acc(test): 0.8706
___________________________________________________________________________________________________


epoch: 19, lr: 0.00030 | loss: 0.0263, acc(batch): 0.9196, grad:91309.4906: : 422it [00:49,  8.45it/s]


acc(train): 0.9196, acc(val): 0.8755, acc(test): 0.8686
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▄▅▆▇▇▇██████▇███▇▇
train_acc,▁▃▄▄▅▅▅▆▆▆▇▇▇▇▇█████
train_loss,█▅▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁
val_acc,▁▃▄▅▆▇▇▇███▇▇█▇▇▇█▆▆
val_loss,█▅▃▂▂▁▁▁▁▁▁▂▂▂▃▃▄▄▅▇

0,1
epoch,19.0
test_acc,0.8686
train_acc,0.91956
train_loss,0.01652
val_acc,0.8755
val_loss,0.03945


[34m[1mwandb[0m: Agent Starting Run: dkxo52c5 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init_method: Xavier_uniform


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00010 | loss: 0.0500, acc(batch): 0.8839, grad:33584.7646: : 422it [00:11, 37.25it/s]


acc(train): 0.8000, acc(val): 0.8002, acc(test): 0.7905
___________________________________________________________________________________________________


epoch: 1, lr: 0.00010 | loss: 0.0370, acc(batch): 0.8839, grad:33569.7829: : 422it [00:11, 36.94it/s]


acc(train): 0.8311, acc(val): 0.8265, acc(test): 0.8202
___________________________________________________________________________________________________


epoch: 2, lr: 0.00010 | loss: 0.0337, acc(batch): 0.9107, grad:33453.4632: : 422it [00:11, 36.63it/s]                       


acc(train): 0.8446, acc(val): 0.8423, acc(test): 0.8316
___________________________________________________________________________________________________


epoch: 3, lr: 0.00010 | loss: 0.0327, acc(batch): 0.8929, grad:33353.3586: : 422it [00:11, 37.06it/s]                       


acc(train): 0.8529, acc(val): 0.8497, acc(test): 0.8383
___________________________________________________________________________________________________


epoch: 4, lr: 0.00010 | loss: 0.0323, acc(batch): 0.8929, grad:33350.4648: : 422it [00:11, 37.02it/s]


acc(train): 0.8584, acc(val): 0.8557, acc(test): 0.8429
___________________________________________________________________________________________________


epoch: 5, lr: 0.00010 | loss: 0.0318, acc(batch): 0.8929, grad:33401.2496: : 422it [00:11, 37.54it/s]                       


acc(train): 0.8635, acc(val): 0.8592, acc(test): 0.8462
___________________________________________________________________________________________________


epoch: 6, lr: 0.00010 | loss: 0.0314, acc(batch): 0.8929, grad:33359.7711: : 422it [00:11, 36.95it/s]                       


acc(train): 0.8674, acc(val): 0.8612, acc(test): 0.8509
___________________________________________________________________________________________________


epoch: 7, lr: 0.00010 | loss: 0.0310, acc(batch): 0.8929, grad:33232.7049: : 422it [00:11, 37.52it/s]                       


acc(train): 0.8714, acc(val): 0.8642, acc(test): 0.8533
___________________________________________________________________________________________________


epoch: 8, lr: 0.00010 | loss: 0.0306, acc(batch): 0.8929, grad:33102.8670: : 422it [00:11, 37.70it/s]


acc(train): 0.8739, acc(val): 0.8667, acc(test): 0.8556
___________________________________________________________________________________________________


epoch: 9, lr: 0.00010 | loss: 0.0302, acc(batch): 0.8839, grad:32939.8454: : 422it [00:11, 37.40it/s]                       


acc(train): 0.8764, acc(val): 0.8687, acc(test): 0.8576
___________________________________________________________________________________________________


epoch: 10, lr: 0.00010 | loss: 0.0298, acc(batch): 0.8839, grad:32797.0708: : 422it [00:11, 36.88it/s]


acc(train): 0.8786, acc(val): 0.8703, acc(test): 0.8603
___________________________________________________________________________________________________


epoch: 11, lr: 0.00010 | loss: 0.0294, acc(batch): 0.8839, grad:32712.9691: : 422it [00:11, 36.88it/s]                       


acc(train): 0.8807, acc(val): 0.8717, acc(test): 0.8614
___________________________________________________________________________________________________


epoch: 12, lr: 0.00010 | loss: 0.0290, acc(batch): 0.8929, grad:32648.4800: : 422it [00:11, 37.54it/s]


acc(train): 0.8827, acc(val): 0.8718, acc(test): 0.8619
___________________________________________________________________________________________________


epoch: 13, lr: 0.00010 | loss: 0.0287, acc(batch): 0.8929, grad:32629.8726: : 422it [00:11, 37.34it/s]                       


acc(train): 0.8845, acc(val): 0.8728, acc(test): 0.8632
___________________________________________________________________________________________________


epoch: 14, lr: 0.00010 | loss: 0.0283, acc(batch): 0.8929, grad:32660.7244: : 422it [00:11, 37.17it/s]                       


acc(train): 0.8860, acc(val): 0.8733, acc(test): 0.8643
___________________________________________________________________________________________________


epoch: 15, lr: 0.00010 | loss: 0.0280, acc(batch): 0.8929, grad:32709.8503: : 422it [00:11, 36.73it/s]                       


acc(train): 0.8876, acc(val): 0.8737, acc(test): 0.8655
___________________________________________________________________________________________________


epoch: 16, lr: 0.00010 | loss: 0.0278, acc(batch): 0.8929, grad:32768.7120: : 422it [00:11, 36.51it/s]                       


acc(train): 0.8890, acc(val): 0.8745, acc(test): 0.8668
___________________________________________________________________________________________________


epoch: 17, lr: 0.00010 | loss: 0.0275, acc(batch): 0.9018, grad:32812.6700: : 422it [00:11, 37.15it/s]                       


acc(train): 0.8905, acc(val): 0.8753, acc(test): 0.8677
___________________________________________________________________________________________________


epoch: 18, lr: 0.00010 | loss: 0.0273, acc(batch): 0.9018, grad:32855.9469: : 422it [00:11, 36.61it/s]                       


acc(train): 0.8920, acc(val): 0.8755, acc(test): 0.8680
___________________________________________________________________________________________________


epoch: 19, lr: 0.00010 | loss: 0.0271, acc(batch): 0.9018, grad:32893.8976: : 422it [00:11, 37.34it/s]


acc(train): 0.8932, acc(val): 0.8768, acc(test): 0.8683
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▄▅▅▆▆▆▇▇▇▇▇▇███████
train_acc,▁▃▄▅▅▆▆▆▇▇▇▇▇▇▇█████
train_loss,█▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▃▅▆▆▆▇▇▇▇▇█████████
val_loss,█▅▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8683
train_acc,0.89324
train_loss,0.03023
val_acc,0.87683
val_loss,0.0334


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: xv7ymibt with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: He_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00050 | loss: 0.0491, acc(batch): 0.8571, grad:36631.3625: : 422it [00:12, 34.78it/s]                       


acc(train): 0.8406, acc(val): 0.8422, acc(test): 0.8302
___________________________________________________________________________________________________


epoch: 1, lr: 0.00050 | loss: 0.0434, acc(batch): 0.8661, grad:36830.3102: : 422it [00:12, 34.66it/s]


acc(train): 0.8547, acc(val): 0.8570, acc(test): 0.8431
___________________________________________________________________________________________________


epoch: 2, lr: 0.00050 | loss: 0.0400, acc(batch): 0.8839, grad:36782.1833: : 422it [00:12, 34.40it/s]                       


acc(train): 0.8637, acc(val): 0.8623, acc(test): 0.8513
___________________________________________________________________________________________________


epoch: 3, lr: 0.00050 | loss: 0.0378, acc(batch): 0.9018, grad:36787.1750: : 422it [00:12, 34.54it/s]                       


acc(train): 0.8696, acc(val): 0.8667, acc(test): 0.8537
___________________________________________________________________________________________________


epoch: 4, lr: 0.00050 | loss: 0.0362, acc(batch): 0.9107, grad:36761.0226: : 422it [00:12, 34.53it/s]                       


acc(train): 0.8736, acc(val): 0.8667, acc(test): 0.8545
___________________________________________________________________________________________________


epoch: 5, lr: 0.00050 | loss: 0.0347, acc(batch): 0.9196, grad:36362.3463: : 422it [00:12, 34.10it/s]                       


acc(train): 0.8768, acc(val): 0.8687, acc(test): 0.8586
___________________________________________________________________________________________________


epoch: 6, lr: 0.00050 | loss: 0.0336, acc(batch): 0.9196, grad:35995.0995: : 422it [00:12, 34.67it/s]


acc(train): 0.8782, acc(val): 0.8683, acc(test): 0.8581
___________________________________________________________________________________________________


epoch: 7, lr: 0.00050 | loss: 0.0328, acc(batch): 0.9196, grad:35995.1145: : 422it [00:12, 34.97it/s]                       


acc(train): 0.8805, acc(val): 0.8693, acc(test): 0.8613
___________________________________________________________________________________________________


epoch: 8, lr: 0.00050 | loss: 0.0323, acc(batch): 0.9196, grad:36097.4167: : 422it [00:12, 34.32it/s]                       


acc(train): 0.8822, acc(val): 0.8697, acc(test): 0.8624
___________________________________________________________________________________________________


epoch: 9, lr: 0.00050 | loss: 0.0315, acc(batch): 0.9196, grad:35733.2205: : 422it [00:12, 34.87it/s]


acc(train): 0.8839, acc(val): 0.8715, acc(test): 0.8648
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_acc,▁▄▅▆▆▇▇▇██
train_acc,▁▃▅▆▆▇▇▇██
train_loss,█▄▃▃▂▂▂▁▁▁
val_acc,▁▅▆▇▇▇▇▇██
val_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,9.0
test_acc,0.8648
train_acc,0.88387
train_loss,0.03272
val_acc,0.8715
val_loss,0.03582


[34m[1mwandb[0m: Agent Starting Run: pf6v970q with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init_method: He_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00030 | loss: 0.0445, acc(batch): 0.8571, grad:173785.9407: : 422it [02:21,  2.98it/s]


acc(train): 0.8551, acc(val): 0.8510, acc(test): 0.8426
___________________________________________________________________________________________________


epoch: 1, lr: 0.00030 | loss: 0.0406, acc(batch): 0.8482, grad:174813.6768: : 422it [02:20,  3.00it/s]


acc(train): 0.8742, acc(val): 0.8648, acc(test): 0.8586
___________________________________________________________________________________________________


epoch: 2, lr: 0.00030 | loss: 0.0372, acc(batch): 0.8839, grad:175121.9531: : 422it [02:23,  2.94it/s]


acc(train): 0.8857, acc(val): 0.8725, acc(test): 0.8660
___________________________________________________________________________________________________


epoch: 3, lr: 0.00030 | loss: 0.0344, acc(batch): 0.8839, grad:174435.4188: : 422it [02:23,  2.94it/s]


acc(train): 0.8946, acc(val): 0.8762, acc(test): 0.8715
___________________________________________________________________________________________________


epoch: 4, lr: 0.00030 | loss: 0.0321, acc(batch): 0.9107, grad:174126.2045: : 422it [02:22,  2.97it/s]


acc(train): 0.9017, acc(val): 0.8793, acc(test): 0.8730
___________________________________________________________________________________________________


epoch: 5, lr: 0.00030 | loss: 0.0301, acc(batch): 0.9196, grad:173929.1696: : 422it [02:21,  2.98it/s]


acc(train): 0.9061, acc(val): 0.8807, acc(test): 0.8759
___________________________________________________________________________________________________


epoch: 6, lr: 0.00030 | loss: 0.0282, acc(batch): 0.9196, grad:173962.4115: : 422it [02:21,  2.99it/s]


acc(train): 0.9099, acc(val): 0.8833, acc(test): 0.8764
___________________________________________________________________________________________________


epoch: 7, lr: 0.00030 | loss: 0.0261, acc(batch): 0.9375, grad:174261.7242: : 422it [02:21,  2.98it/s]


acc(train): 0.9137, acc(val): 0.8843, acc(test): 0.8788
___________________________________________________________________________________________________


epoch: 8, lr: 0.00030 | loss: 0.0239, acc(batch): 0.9464, grad:174709.6601: : 422it [02:21,  2.97it/s]


acc(train): 0.9166, acc(val): 0.8852, acc(test): 0.8803
___________________________________________________________________________________________________


epoch: 9, lr: 0.00030 | loss: 0.0215, acc(batch): 0.9643, grad:174098.1276: : 422it [02:22,  2.96it/s]


acc(train): 0.9179, acc(val): 0.8862, acc(test): 0.8802
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_acc,▁▄▅▆▇▇▇███
train_acc,▁▃▄▅▆▇▇███
train_loss,█▅▄▃▃▂▂▂▁▁
val_acc,▁▄▅▆▇▇▇███
val_loss,█▅▃▂▂▁▁▁▁▁

0,1
epoch,9.0
test_acc,0.8802
train_acc,0.91785
train_loss,0.01967
val_acc,0.88617
val_loss,0.03189


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: g3ntr2cp with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00100 | loss: 0.0706, acc(batch): 0.7679, grad:31912.3937: : 422it [00:11, 36.80it/s]                       


acc(train): 0.7349, acc(val): 0.7357, acc(test): 0.7278
___________________________________________________________________________________________________


epoch: 1, lr: 0.00100 | loss: 0.0528, acc(batch): 0.8482, grad:31482.9904: : 422it [00:11, 36.60it/s]                       


acc(train): 0.7897, acc(val): 0.7880, acc(test): 0.7816
___________________________________________________________________________________________________


epoch: 2, lr: 0.00100 | loss: 0.0473, acc(batch): 0.8482, grad:30700.2445: : 422it [00:11, 35.88it/s]                       


acc(train): 0.8208, acc(val): 0.8155, acc(test): 0.8093
___________________________________________________________________________________________________


epoch: 3, lr: 0.00100 | loss: 0.0433, acc(batch): 0.8571, grad:30182.6538: : 422it [00:11, 36.47it/s]                       


acc(train): 0.8402, acc(val): 0.8347, acc(test): 0.8285
___________________________________________________________________________________________________


epoch: 4, lr: 0.00100 | loss: 0.0406, acc(batch): 0.8750, grad:30181.3648: : 422it [00:11, 36.59it/s]                       


acc(train): 0.8505, acc(val): 0.8433, acc(test): 0.8383
___________________________________________________________________________________________________


epoch: 5, lr: 0.00100 | loss: 0.0393, acc(batch): 0.8929, grad:30223.8044: : 422it [00:11, 36.46it/s]                       


acc(train): 0.8557, acc(val): 0.8478, acc(test): 0.8423
___________________________________________________________________________________________________


epoch: 6, lr: 0.00100 | loss: 0.0386, acc(batch): 0.9018, grad:30265.7466: : 422it [00:11, 36.63it/s]                       


acc(train): 0.8596, acc(val): 0.8510, acc(test): 0.8459
___________________________________________________________________________________________________


epoch: 7, lr: 0.00100 | loss: 0.0380, acc(batch): 0.9018, grad:30277.6865: : 422it [00:11, 36.78it/s]                       


acc(train): 0.8625, acc(val): 0.8542, acc(test): 0.8471
___________________________________________________________________________________________________


epoch: 8, lr: 0.00100 | loss: 0.0377, acc(batch): 0.9018, grad:30269.4015: : 422it [00:11, 37.27it/s]                       


acc(train): 0.8644, acc(val): 0.8575, acc(test): 0.8488
___________________________________________________________________________________________________


epoch: 9, lr: 0.00100 | loss: 0.0374, acc(batch): 0.9018, grad:30204.7168: : 422it [00:11, 37.24it/s]                       


acc(train): 0.8669, acc(val): 0.8592, acc(test): 0.8506
___________________________________________________________________________________________________


epoch: 10, lr: 0.00100 | loss: 0.0372, acc(batch): 0.9018, grad:30111.8622: : 422it [00:11, 35.79it/s]


acc(train): 0.8693, acc(val): 0.8617, acc(test): 0.8518
___________________________________________________________________________________________________


epoch: 11, lr: 0.00100 | loss: 0.0370, acc(batch): 0.9018, grad:30010.6992: : 422it [00:11, 36.59it/s]


acc(train): 0.8713, acc(val): 0.8617, acc(test): 0.8536
___________________________________________________________________________________________________


epoch: 12, lr: 0.00100 | loss: 0.0369, acc(batch): 0.9018, grad:29911.6027: : 422it [00:11, 36.29it/s]


acc(train): 0.8727, acc(val): 0.8623, acc(test): 0.8550
___________________________________________________________________________________________________


epoch: 13, lr: 0.00100 | loss: 0.0368, acc(batch): 0.9018, grad:29817.3005: : 422it [00:11, 36.85it/s]                       


acc(train): 0.8739, acc(val): 0.8633, acc(test): 0.8549
___________________________________________________________________________________________________


epoch: 14, lr: 0.00100 | loss: 0.0367, acc(batch): 0.9018, grad:29727.9019: : 422it [00:11, 36.68it/s]                       


acc(train): 0.8747, acc(val): 0.8640, acc(test): 0.8550
___________________________________________________________________________________________________


epoch: 15, lr: 0.00100 | loss: 0.0367, acc(batch): 0.8929, grad:29645.6071: : 422it [00:11, 35.92it/s]                       


acc(train): 0.8756, acc(val): 0.8638, acc(test): 0.8557
___________________________________________________________________________________________________


epoch: 16, lr: 0.00100 | loss: 0.0366, acc(batch): 0.8929, grad:29574.0485: : 422it [00:11, 36.26it/s]                       


acc(train): 0.8765, acc(val): 0.8655, acc(test): 0.8566
___________________________________________________________________________________________________


epoch: 17, lr: 0.00100 | loss: 0.0366, acc(batch): 0.8839, grad:29510.5106: : 422it [00:11, 35.87it/s]


acc(train): 0.8771, acc(val): 0.8652, acc(test): 0.8575
___________________________________________________________________________________________________


epoch: 18, lr: 0.00100 | loss: 0.0367, acc(batch): 0.8839, grad:29452.3214: : 422it [00:11, 36.26it/s]


acc(train): 0.8778, acc(val): 0.8662, acc(test): 0.8578
___________________________________________________________________________________________________


epoch: 19, lr: 0.00100 | loss: 0.0367, acc(batch): 0.8929, grad:29401.0394: : 422it [00:11, 35.43it/s]                       


acc(train): 0.8783, acc(val): 0.8658, acc(test): 0.8587
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57941091539…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▄▅▆▇▇▇▇▇███████████
train_acc,▁▄▅▆▇▇▇▇▇▇██████████
train_loss,█▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▄▅▆▇▇▇▇████████████
val_loss,█▅▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8587
train_acc,0.87828
train_loss,0.03443
val_acc,0.86583
val_loss,0.03768


[34m[1mwandb[0m: Agent Starting Run: mrsawphd with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00050 | loss: 0.0383, acc(batch): 0.9018, grad:84510.3052: : 422it [00:44,  9.49it/s]


acc(train): 0.8624, acc(val): 0.8553, acc(test): 0.8507
___________________________________________________________________________________________________


epoch: 1, lr: 0.00050 | loss: 0.0339, acc(batch): 0.9018, grad:82911.1339: : 422it [00:44,  9.44it/s]


acc(train): 0.8745, acc(val): 0.8627, acc(test): 0.8562
___________________________________________________________________________________________________


epoch: 2, lr: 0.00050 | loss: 0.0321, acc(batch): 0.9107, grad:82299.1022: : 422it [00:44,  9.47it/s]                       


acc(train): 0.8842, acc(val): 0.8678, acc(test): 0.8642
___________________________________________________________________________________________________


epoch: 3, lr: 0.00050 | loss: 0.0306, acc(batch): 0.9107, grad:82730.9595: : 422it [00:44,  9.45it/s]                       


acc(train): 0.8876, acc(val): 0.8707, acc(test): 0.8651
___________________________________________________________________________________________________


epoch: 4, lr: 0.00050 | loss: 0.0314, acc(batch): 0.9107, grad:82695.8833: : 422it [00:45,  9.34it/s]


acc(train): 0.8922, acc(val): 0.8735, acc(test): 0.8670
___________________________________________________________________________________________________


epoch: 5, lr: 0.00050 | loss: 0.0300, acc(batch): 0.9286, grad:81280.7322: : 422it [00:44,  9.42it/s]


acc(train): 0.8960, acc(val): 0.8758, acc(test): 0.8693
___________________________________________________________________________________________________


epoch: 6, lr: 0.00050 | loss: 0.0292, acc(batch): 0.9107, grad:79421.4574: : 422it [00:44,  9.42it/s]


acc(train): 0.8999, acc(val): 0.8772, acc(test): 0.8690
___________________________________________________________________________________________________


epoch: 7, lr: 0.00050 | loss: 0.0288, acc(batch): 0.9107, grad:81245.3982: : 422it [00:44,  9.38it/s]


acc(train): 0.9034, acc(val): 0.8797, acc(test): 0.8723
___________________________________________________________________________________________________


epoch: 8, lr: 0.00050 | loss: 0.0288, acc(batch): 0.9107, grad:79396.4565: : 422it [00:44,  9.57it/s]


acc(train): 0.9054, acc(val): 0.8820, acc(test): 0.8717
___________________________________________________________________________________________________


epoch: 9, lr: 0.00050 | loss: 0.0275, acc(batch): 0.9286, grad:80314.0182: : 422it [00:44,  9.47it/s]


acc(train): 0.9077, acc(val): 0.8822, acc(test): 0.8727
___________________________________________________________________________________________________


epoch: 10, lr: 0.00050 | loss: 0.0263, acc(batch): 0.9375, grad:79100.8522: : 422it [00:44,  9.55it/s]                       


acc(train): 0.9109, acc(val): 0.8860, acc(test): 0.8749
___________________________________________________________________________________________________


epoch: 11, lr: 0.00050 | loss: 0.0246, acc(batch): 0.9464, grad:77878.5610: : 422it [00:44,  9.53it/s]                       


acc(train): 0.9113, acc(val): 0.8848, acc(test): 0.8740
___________________________________________________________________________________________________


epoch: 12, lr: 0.00050 | loss: 0.0241, acc(batch): 0.9375, grad:77220.4674: : 422it [00:44,  9.44it/s]


acc(train): 0.9134, acc(val): 0.8857, acc(test): 0.8762
___________________________________________________________________________________________________


epoch: 13, lr: 0.00050 | loss: 0.0238, acc(batch): 0.9554, grad:77828.9646: : 422it [00:45,  9.27it/s]


acc(train): 0.9174, acc(val): 0.8862, acc(test): 0.8787
___________________________________________________________________________________________________


epoch: 14, lr: 0.00050 | loss: 0.0248, acc(batch): 0.9464, grad:75986.5784: : 422it [00:45,  9.35it/s]


acc(train): 0.9175, acc(val): 0.8872, acc(test): 0.8773
___________________________________________________________________________________________________


epoch: 15, lr: 0.00050 | loss: 0.0233, acc(batch): 0.9464, grad:77027.1146: : 422it [00:44,  9.51it/s]                       


acc(train): 0.9193, acc(val): 0.8878, acc(test): 0.8782
___________________________________________________________________________________________________


epoch: 16, lr: 0.00050 | loss: 0.0218, acc(batch): 0.9554, grad:76915.1394: : 422it [00:47,  8.94it/s]


acc(train): 0.9218, acc(val): 0.8890, acc(test): 0.8790
___________________________________________________________________________________________________


epoch: 17, lr: 0.00050 | loss: 0.0224, acc(batch): 0.9375, grad:75114.9376: : 422it [00:51,  8.21it/s]


acc(train): 0.9238, acc(val): 0.8910, acc(test): 0.8809
___________________________________________________________________________________________________


epoch: 18, lr: 0.00050 | loss: 0.0224, acc(batch): 0.9464, grad:75826.0208: : 422it [00:48,  8.77it/s]


acc(train): 0.9234, acc(val): 0.8880, acc(test): 0.8780
___________________________________________________________________________________________________


epoch: 19, lr: 0.00050 | loss: 0.0206, acc(batch): 0.9554, grad:76208.1416: : 422it [00:47,  8.90it/s]


acc(train): 0.9263, acc(val): 0.8910, acc(test): 0.8807
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57943385326…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▂▄▄▅▅▅▆▆▆▇▆▇▇▇▇██▇█
train_acc,▁▂▃▄▄▅▅▅▆▆▆▆▇▇▇▇████
train_loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
val_acc,▁▂▃▄▅▅▅▆▆▆▇▇▇▇▇▇██▇█
val_loss,█▅▃▃▂▂▂▁▁▁▁▂▂▁▂▂▁▁▂▁

0,1
epoch,19.0
test_acc,0.8807
train_acc,0.92631
train_loss,0.0172
val_acc,0.891
val_loss,0.03284


[34m[1mwandb[0m: Agent Starting Run: zxhwhi0m with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00050 | loss: 0.0752, acc(batch): 0.7500, grad:19615.4083: : 1688it [00:25, 65.19it/s]                        


acc(train): 0.7499, acc(val): 0.7467, acc(test): 0.7427
___________________________________________________________________________________________________


epoch: 1, lr: 0.00050 | loss: 0.0515, acc(batch): 0.8750, grad:19987.6963: : 1688it [00:26, 63.00it/s]                        


acc(train): 0.7876, acc(val): 0.7845, acc(test): 0.7767
___________________________________________________________________________________________________


epoch: 2, lr: 0.00050 | loss: 0.0431, acc(batch): 0.8750, grad:19817.0564: : 1688it [00:25, 65.14it/s]                        


acc(train): 0.8078, acc(val): 0.8083, acc(test): 0.7950
___________________________________________________________________________________________________


epoch: 3, lr: 0.00050 | loss: 0.0402, acc(batch): 0.8750, grad:19635.1823: : 1688it [00:26, 64.29it/s]                        


acc(train): 0.8181, acc(val): 0.8198, acc(test): 0.8073
___________________________________________________________________________________________________


epoch: 4, lr: 0.00050 | loss: 0.0393, acc(batch): 0.8750, grad:19474.7674: : 1688it [00:28, 59.36it/s]


acc(train): 0.8260, acc(val): 0.8250, acc(test): 0.8134
___________________________________________________________________________________________________


epoch: 5, lr: 0.00050 | loss: 0.0389, acc(batch): 0.8750, grad:19296.0841: : 1688it [00:26, 64.02it/s]                        


acc(train): 0.8313, acc(val): 0.8297, acc(test): 0.8174
___________________________________________________________________________________________________


epoch: 6, lr: 0.00050 | loss: 0.0387, acc(batch): 0.8750, grad:19116.1983: : 1688it [00:24, 69.73it/s]                        


acc(train): 0.8354, acc(val): 0.8338, acc(test): 0.8214
___________________________________________________________________________________________________


epoch: 7, lr: 0.00050 | loss: 0.0385, acc(batch): 0.8750, grad:19000.3456: : 1688it [00:24, 68.44it/s]                        


acc(train): 0.8395, acc(val): 0.8382, acc(test): 0.8253
___________________________________________________________________________________________________


epoch: 8, lr: 0.00050 | loss: 0.0384, acc(batch): 0.8750, grad:18968.1091: : 1688it [00:24, 69.88it/s]                        


acc(train): 0.8427, acc(val): 0.8402, acc(test): 0.8273
___________________________________________________________________________________________________


epoch: 9, lr: 0.00050 | loss: 0.0384, acc(batch): 0.8750, grad:18927.1525: : 1688it [00:24, 68.34it/s]                        


acc(train): 0.8450, acc(val): 0.8430, acc(test): 0.8294
___________________________________________________________________________________________________


epoch: 10, lr: 0.00050 | loss: 0.0384, acc(batch): 0.8125, grad:18881.5846: : 1688it [00:24, 68.01it/s]                        


acc(train): 0.8471, acc(val): 0.8453, acc(test): 0.8313
___________________________________________________________________________________________________


epoch: 11, lr: 0.00050 | loss: 0.0384, acc(batch): 0.8125, grad:18850.6842: : 1688it [00:24, 68.67it/s]                        


acc(train): 0.8491, acc(val): 0.8478, acc(test): 0.8329
___________________________________________________________________________________________________


epoch: 12, lr: 0.00050 | loss: 0.0384, acc(batch): 0.8125, grad:18841.7971: : 1688it [00:24, 67.58it/s]                        


acc(train): 0.8509, acc(val): 0.8483, acc(test): 0.8348
___________________________________________________________________________________________________


epoch: 13, lr: 0.00050 | loss: 0.0385, acc(batch): 0.8125, grad:18848.7130: : 1688it [00:25, 67.07it/s]                        


acc(train): 0.8523, acc(val): 0.8488, acc(test): 0.8364
___________________________________________________________________________________________________


epoch: 14, lr: 0.00050 | loss: 0.0386, acc(batch): 0.8125, grad:18839.1270: : 1688it [00:24, 67.56it/s]                        


acc(train): 0.8541, acc(val): 0.8510, acc(test): 0.8377
___________________________________________________________________________________________________


epoch: 15, lr: 0.00050 | loss: 0.0387, acc(batch): 0.8125, grad:18816.7240: : 1688it [00:25, 67.25it/s]                        


acc(train): 0.8556, acc(val): 0.8525, acc(test): 0.8393
___________________________________________________________________________________________________


epoch: 16, lr: 0.00050 | loss: 0.0387, acc(batch): 0.8125, grad:18785.4361: : 1688it [00:26, 64.70it/s]                        


acc(train): 0.8569, acc(val): 0.8542, acc(test): 0.8402
___________________________________________________________________________________________________


epoch: 17, lr: 0.00050 | loss: 0.0388, acc(batch): 0.8125, grad:18755.4512: : 1688it [00:24, 68.07it/s]                        


acc(train): 0.8580, acc(val): 0.8543, acc(test): 0.8415
___________________________________________________________________________________________________


epoch: 18, lr: 0.00050 | loss: 0.0388, acc(batch): 0.8125, grad:18722.3485: : 1688it [00:25, 66.89it/s]


acc(train): 0.8592, acc(val): 0.8552, acc(test): 0.8421
___________________________________________________________________________________________________


epoch: 19, lr: 0.00050 | loss: 0.0389, acc(batch): 0.8125, grad:18683.6124: : 1688it [00:24, 67.53it/s]                        


acc(train): 0.8603, acc(val): 0.8558, acc(test): 0.8423
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57929642445…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▅▆▆▆▇▇▇▇▇▇▇███████
train_acc,▁▃▅▅▆▆▆▇▇▇▇▇▇▇██████
train_loss,█▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▃▅▆▆▆▇▇▇▇▇▇████████
val_loss,█▅▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8423
train_acc,0.86026
train_loss,0.03931
val_acc,0.85583
val_loss,0.04049
