In [1]:
!unzip cs6910_assignment1-main.zip

Archive:  cs6910_assignment1-main.zip
0c67ac4944c825c8fef50343967db2a97554ee5b
   creating: cs6910_assignment1-main/
  inflating: cs6910_assignment1-main/.gitignore  
   creating: cs6910_assignment1-main/neural_networks/
  inflating: cs6910_assignment1-main/neural_networks/__init__.py  
  inflating: cs6910_assignment1-main/neural_networks/activations.py  
  inflating: cs6910_assignment1-main/neural_networks/layers.py  
  inflating: cs6910_assignment1-main/neural_networks/losses.py  
  inflating: cs6910_assignment1-main/neural_networks/neural_network.py  
  inflating: cs6910_assignment1-main/neural_networks/optimizer.py  
  inflating: cs6910_assignment1-main/train.py  
   creating: cs6910_assignment1-main/utils/
  inflating: cs6910_assignment1-main/utils/__init__.py  
  inflating: cs6910_assignment1-main/utils/data.py  
  inflating: cs6910_assignment1-main/utils/evaluate.py  


In [2]:
cd cs6910_assignment1-main/

/content/cs6910_assignment1-main


In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# cd /content/drive/MyDrive/cs6910_assignment1-main

In [5]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.12.10-py2.py3-none-any.whl (1.7 MB)
[?25l[K     |▏                               | 10 kB 8.6 MB/s eta 0:00:01[K     |▍                               | 20 kB 12.6 MB/s eta 0:00:01[K     |▋                               | 30 kB 11.5 MB/s eta 0:00:01[K     |▊                               | 40 kB 9.7 MB/s eta 0:00:01[K     |█                               | 51 kB 4.3 MB/s eta 0:00:01[K     |█▏                              | 61 kB 5.0 MB/s eta 0:00:01[K     |█▍                              | 71 kB 5.6 MB/s eta 0:00:01[K     |█▌                              | 81 kB 6.0 MB/s eta 0:00:01[K     |█▊                              | 92 kB 6.7 MB/s eta 0:00:01[K     |██                              | 102 kB 5.3 MB/s eta 0:00:01[K     |██                              | 112 kB 5.3 MB/s eta 0:00:01[K     |██▎                             | 122 kB 5.3 MB/s eta 0:00:01[K     |██▌                             | 133 kB 5.3 MB/s eta 0:00:01[

In [6]:
import wandb
import argparse
import numpy as np
from tqdm import tqdm
from keras.datasets import fashion_mnist
from sklearn.model_selection import train_test_split 

from neural_networks import NeuralNetwork, Optimizer, clip_gradients, losses
from utils import preprocess_data, make_batches, accuracy, gradient_sum

In [11]:
def train_and_evaluate(args):
    # Load Data
    (X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
    X_train,X_val,y_train,y_val=train_test_split(X_train, y_train,test_size=0.1, random_state=1, stratify=y_train)
    (X_train, y_train) = preprocess_data(X_train, y_train)
    (X_test, y_test) = preprocess_data(X_test, y_test)
    (X_val,y_val) = preprocess_data(X_val,y_val)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape,X_val.shape,y_val.shape)

    wandb.run.name = 'ep-'+str(args.epochs)+'-bs-'+str(args.batch_size)+'-lf-'+str(args.loss)+'-op-'+str(args.optimizer)+'-lr-'+ str(args.learning_rate) \
                      + '-nhl-'+str(args.num_layers)+'-shl-'+str(args.hidden_size)+ '-act-'+str(args.activation) \
                      +'-w_i-'+str(args.weight_init_method)+'-w_d-'+str(args.weight_decay)


    # Instantiate model
    architecture = [{'num_neurons': args.hidden_size, 'activation': args.activation, 'init_method': args.weight_init_method}
                    for _ in range(args.num_layers)]
    architecture.append({'num_neurons': 10, 'activation': 'softmax'})  # add output layer

    model = NeuralNetwork(input_dim=784, architecture=architecture)
    optimizer = Optimizer(model, algorithm=args.optimizer,
                          momentum=0.9, beta=0.9,
                          beta1=0.9, beta2=0.99, epsilon=1e-8,weight_decay=args.weight_decay)
    lr = args.learning_rate
    loss_fn, _ = losses[args.loss]

    # Train

    for epoch in range(args.epochs):
        running_loss, running_grad = .0, .0
        num_steps = (X_train.shape[0] // args.batch_size)
        progress_bar = tqdm(make_batches(X_train, y_train, args.batch_size),
                            total=num_steps)
        for (X_batch, y_batch) in progress_bar:
            # Forward
            y_pred = model.forward(X_batch)

            # Optimize
            gradients = optimizer.backpropagate(y_batch, y_pred, losses[args.loss])
            gradients = clip_gradients(gradients, clip_ratio=5.0, norm_type=2)
            optimizer.optimize(gradients, learning_rate=lr)

            # Track acc, loss and gradients
            loss = loss_fn(y_batch, y_pred)
            grad_sum = gradient_sum(gradients)
            acc = accuracy(y_batch, y_pred)
            progress_bar.set_description(
                f"epoch: {epoch}, lr: {lr:.5f} | loss: {loss:.4f}, acc(batch): {acc:.4f}, grad:{grad_sum:.4f}"
            )
            
            running_loss += loss
            running_grad += grad_sum
        
        # Evaluate train and test splits
        train_acc = accuracy(model.forward(X_train), y_train)
        y_val_pred = model.forward(X_val)
        val_acc = accuracy(y_val_pred,y_val)
        val_loss = loss_fn(y_val,y_val_pred)
        test_acc = accuracy(model.forward(X_test), y_test)
        print(f"acc(train): {train_acc:.4f}, acc(val): {val_acc:.4f}, acc(test): {test_acc:.4f}")
        print('_' * 99)
        
        # Log metrics to wandb.ai
        wandb.log({
            'train_acc': train_acc, 
            'val_acc': val_acc,
            'train_loss': running_loss/num_steps,
            'val_loss' : val_loss,
            'test_acc': test_acc,
            'epoch':epoch            
        })
    
    y_true = np.argmax(y_test, axis=1)
    y_pred = np.argmax(model.forward(X_test), axis=1)
    class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 
              'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankleboot']
    wandb.log({
      "confusion_matrix" : wandb.plot.confusion_matrix(probs=None,
                              y_true=y_true, preds=y_pred,
                              class_names=class_names)
    })

In [12]:
wandb.login()

sweep_config = {
    "name" : "cs6910_assignment1_fashion-mnist_sweep",
    "method" : "bayes",
    'metric': {
        'name': 'val_acc',
        'goal': 'maximize'
    },
    "parameters" : {
        "epochs" : {
            "values" : [5, 10, 20]
        },
        "batch_size": {
            "values": [16, 32, 64, 128]
        },
        "loss": {
            "values": ['cross_entropy']
        },
        "optimizer": {
            "values": ["nag", "rmsprop", "adam", "nadam", "sgd", "momentum"]
        },
        "learning_rate" :{
            "values": [1e-4, 3e-4, 5e-4, 1e-3, 3e-3, 5e-3, 0.01]
        },
        "num_layers": {
            "values": [1, 2, 3, 4, 5]
        },
        "hidden_size": {
            "values": [16, 32, 64, 128, 256, 512]
        },
        "activation": {
            "values": ['sigmoid', 'tanh', 'ReLU']
        },
        "weight_init_method":{
            "values":['Xavier_normal','Xavier_uniform','He_normal','He_uniform']
        },
        "weight_decay":{
            "values":[0, 0.0005, 0.5]
        }
    }
}
sweep_id = wandb.sweep(sweep_config, project="cs6910_assignment1", entity="cs21s006_cs21s043")
print('sweep_id: ', sweep_id)

Create sweep with ID: r9k3ve7r
Sweep URL: https://wandb.ai/cs21s006_cs21s043/cs6910_assignment1/sweeps/r9k3ve7r
sweep_id:  r9k3ve7r


In [13]:
def spawn_fn():
      with wandb.init(project="cs6910_assignment1", entity="cs21s006_cs21s043") as run:
        config = wandb.config
        train_and_evaluate(config)


In [14]:
wandb.agent("1463mzwx", function=spawn_fn, count=10)

[34m[1mwandb[0m: Agent Starting Run: st0m94hk with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_uniform


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.01000 | loss: 0.1017, acc(batch): 0.7500, grad:45936.5646: : 1688it [00:51, 32.92it/s]


acc(train): 0.6804, acc(val): 0.6757, acc(test): 0.6735
___________________________________________________________________________________________________


epoch: 1, lr: 0.01000 | loss: 0.0923, acc(batch): 0.6875, grad:45781.4346: : 1688it [00:49, 34.22it/s]                        


acc(train): 0.7625, acc(val): 0.7653, acc(test): 0.7525
___________________________________________________________________________________________________


epoch: 2, lr: 0.01000 | loss: 0.0958, acc(batch): 0.7500, grad:40936.1150: : 1688it [00:49, 34.32it/s]                        


acc(train): 0.7468, acc(val): 0.7412, acc(test): 0.7373
___________________________________________________________________________________________________


epoch: 3, lr: 0.01000 | loss: 0.1164, acc(batch): 0.6875, grad:38731.0391: : 1688it [00:49, 34.32it/s]


acc(train): 0.6372, acc(val): 0.6307, acc(test): 0.6269
___________________________________________________________________________________________________


epoch: 4, lr: 0.01000 | loss: 0.1531, acc(batch): 0.4375, grad:42166.0974: : 1688it [00:48, 34.59it/s]                        


acc(train): 0.6145, acc(val): 0.6132, acc(test): 0.6101
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57945680439…

0,1
epoch,▁▃▅▆█
test_acc,▄█▇▂▁
train_acc,▄█▇▂▁
train_loss,█▂▁▁▁
val_acc,▄█▇▂▁
val_loss,▃▁▁▇█

0,1
epoch,4.0
test_acc,0.6101
train_acc,0.61448
train_loss,0.0778
val_acc,0.61317
val_loss,0.12687


[34m[1mwandb[0m: Agent Starting Run: 0zmqhxoh with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init_method: Xavier_uniform


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00500 | loss: 0.2306, acc(batch): 0.0804, grad:5112.6823: : 422it [00:05, 71.91it/s]                       


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 1, lr: 0.00500 | loss: 0.2306, acc(batch): 0.0804, grad:5112.6826: : 422it [00:05, 72.28it/s]                       


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 2, lr: 0.00500 | loss: 0.2306, acc(batch): 0.0804, grad:5112.6828: : 422it [00:05, 71.96it/s]                       


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 3, lr: 0.00500 | loss: 0.2306, acc(batch): 0.0804, grad:5112.6829: : 422it [00:05, 72.28it/s]                       


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 4, lr: 0.00500 | loss: 0.2306, acc(batch): 0.0804, grad:5112.6829: : 422it [00:05, 73.16it/s]


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▃▅▆█
test_acc,▁▁▁▁▁
train_acc,▁▁▁▁▁
train_loss,▁████
val_acc,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
epoch,4.0
test_acc,0.1
train_acc,0.1
train_loss,0.23087
val_acc,0.1
val_loss,0.2303


[34m[1mwandb[0m: Agent Starting Run: az01t31h with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00030 | loss: 0.1486, acc(batch): 0.5893, grad:14409.0939: : 422it [00:04, 89.70it/s]                       


acc(train): 0.5843, acc(val): 0.5787, acc(test): 0.5823
___________________________________________________________________________________________________


epoch: 1, lr: 0.00030 | loss: 0.1087, acc(batch): 0.7232, grad:11597.8337: : 422it [00:04, 89.72it/s]                       


acc(train): 0.6651, acc(val): 0.6622, acc(test): 0.6523
___________________________________________________________________________________________________


epoch: 2, lr: 0.00030 | loss: 0.0907, acc(batch): 0.7411, grad:12945.1637: : 422it [00:04, 91.20it/s]                       


acc(train): 0.7003, acc(val): 0.7007, acc(test): 0.6912
___________________________________________________________________________________________________


epoch: 3, lr: 0.00030 | loss: 0.0804, acc(batch): 0.7679, grad:14022.7402: : 422it [00:04, 90.06it/s]                       


acc(train): 0.7261, acc(val): 0.7262, acc(test): 0.7131
___________________________________________________________________________________________________


epoch: 4, lr: 0.00030 | loss: 0.0733, acc(batch): 0.7679, grad:14517.2623: : 422it [00:04, 89.41it/s]                       


acc(train): 0.7444, acc(val): 0.7440, acc(test): 0.7308
___________________________________________________________________________________________________


epoch: 5, lr: 0.00030 | loss: 0.0680, acc(batch): 0.7946, grad:14630.9909: : 422it [00:04, 89.53it/s]                       


acc(train): 0.7576, acc(val): 0.7552, acc(test): 0.7457
___________________________________________________________________________________________________


epoch: 6, lr: 0.00030 | loss: 0.0639, acc(batch): 0.8214, grad:14485.8671: : 422it [00:04, 89.01it/s]


acc(train): 0.7689, acc(val): 0.7650, acc(test): 0.7552
___________________________________________________________________________________________________


epoch: 7, lr: 0.00030 | loss: 0.0605, acc(batch): 0.8482, grad:14140.1721: : 422it [00:04, 90.84it/s]                       


acc(train): 0.7783, acc(val): 0.7760, acc(test): 0.7633
___________________________________________________________________________________________________


epoch: 8, lr: 0.00030 | loss: 0.0576, acc(batch): 0.8750, grad:13669.2652: : 422it [00:04, 88.10it/s]                       


acc(train): 0.7858, acc(val): 0.7825, acc(test): 0.7712
___________________________________________________________________________________________________


epoch: 9, lr: 0.00030 | loss: 0.0552, acc(batch): 0.8750, grad:13613.1158: : 422it [00:04, 88.47it/s]                       


acc(train): 0.7923, acc(val): 0.7890, acc(test): 0.7795
___________________________________________________________________________________________________


epoch: 10, lr: 0.00030 | loss: 0.0531, acc(batch): 0.8839, grad:13592.0804: : 422it [00:04, 89.12it/s]                       


acc(train): 0.7974, acc(val): 0.7970, acc(test): 0.7841
___________________________________________________________________________________________________


epoch: 11, lr: 0.00030 | loss: 0.0513, acc(batch): 0.8929, grad:13542.0988: : 422it [00:04, 88.70it/s]                       


acc(train): 0.8027, acc(val): 0.8023, acc(test): 0.7898
___________________________________________________________________________________________________


epoch: 12, lr: 0.00030 | loss: 0.0497, acc(batch): 0.9018, grad:13477.8691: : 422it [00:04, 88.01it/s]                       


acc(train): 0.8079, acc(val): 0.8067, acc(test): 0.7935
___________________________________________________________________________________________________


epoch: 13, lr: 0.00030 | loss: 0.0484, acc(batch): 0.9018, grad:13409.0399: : 422it [00:04, 88.97it/s]                       


acc(train): 0.8116, acc(val): 0.8092, acc(test): 0.7979
___________________________________________________________________________________________________


epoch: 14, lr: 0.00030 | loss: 0.0473, acc(batch): 0.8929, grad:13374.9273: : 422it [00:04, 87.53it/s]                       


acc(train): 0.8150, acc(val): 0.8140, acc(test): 0.8018
___________________________________________________________________________________________________


epoch: 15, lr: 0.00030 | loss: 0.0463, acc(batch): 0.8929, grad:13342.8255: : 422it [00:04, 88.57it/s]                       


acc(train): 0.8186, acc(val): 0.8172, acc(test): 0.8052
___________________________________________________________________________________________________


epoch: 16, lr: 0.00030 | loss: 0.0455, acc(batch): 0.9018, grad:13309.5134: : 422it [00:04, 88.80it/s]                       


acc(train): 0.8216, acc(val): 0.8210, acc(test): 0.8092
___________________________________________________________________________________________________


epoch: 17, lr: 0.00030 | loss: 0.0448, acc(batch): 0.8929, grad:13280.2964: : 422it [00:05, 84.29it/s]                       


acc(train): 0.8244, acc(val): 0.8227, acc(test): 0.8121
___________________________________________________________________________________________________


epoch: 18, lr: 0.00030 | loss: 0.0442, acc(batch): 0.9018, grad:13277.0471: : 422it [00:04, 86.44it/s]                       


acc(train): 0.8261, acc(val): 0.8243, acc(test): 0.8143
___________________________________________________________________________________________________


epoch: 19, lr: 0.00030 | loss: 0.0436, acc(batch): 0.8929, grad:13273.1555: : 422it [00:04, 87.51it/s]                       


acc(train): 0.8283, acc(val): 0.8260, acc(test): 0.8162
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▄▅▅▆▆▆▇▇▇▇▇▇██████
train_acc,▁▃▄▅▆▆▆▇▇▇▇▇▇███████
train_loss,█▅▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
val_acc,▁▃▄▅▆▆▆▇▇▇▇▇▇███████
val_loss,█▅▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8162
train_acc,0.82826
train_loss,0.04924
val_acc,0.826
val_loss,0.04877


[34m[1mwandb[0m: Agent Starting Run: wyxtw9c8 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init_method: He_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00500 | loss: 0.2307, acc(batch): 0.0625, grad:173723.7503: : 844it [04:33,  3.08it/s]


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 1, lr: 0.00500 | loss: 0.2307, acc(batch): 0.0625, grad:199479.9291: : 844it [04:32,  3.10it/s]


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 2, lr: 0.00500 | loss: 0.2308, acc(batch): 0.0625, grad:200463.7744: : 844it [04:32,  3.10it/s]


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 3, lr: 0.00500 | loss: 0.2306, acc(batch): 0.0625, grad:209212.8599: : 844it [04:33,  3.09it/s]


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 4, lr: 0.00500 | loss: 0.2306, acc(batch): 0.0625, grad:191314.5649: : 844it [04:33,  3.09it/s]


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 5, lr: 0.00500 | loss: 0.2308, acc(batch): 0.0625, grad:196455.1024: : 844it [04:31,  3.11it/s]


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 6, lr: 0.00500 | loss: 0.2306, acc(batch): 0.0625, grad:203621.5355: : 844it [04:43,  2.98it/s]


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 7, lr: 0.00500 | loss: 0.2307, acc(batch): 0.0625, grad:198082.8128: : 844it [04:46,  2.95it/s]


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 8, lr: 0.00500 | loss: 0.2307, acc(batch): 0.0625, grad:179513.8712: : 844it [04:34,  3.07it/s]


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


epoch: 9, lr: 0.00500 | loss: 0.2307, acc(batch): 0.0625, grad:173584.3184: : 844it [04:43,  2.98it/s]


acc(train): 0.1000, acc(val): 0.1000, acc(test): 0.1000
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_acc,▁▁▁▁▁▁▁▁▁▁
train_acc,▁▁▁▁▁▁▁▁▁▁
train_loss,▁█████████
val_acc,▁▁▁▁▁▁▁▁▁▁
val_loss,▅█▂▇█▁▃▆▁▅

0,1
epoch,9.0
test_acc,0.1
train_acc,0.1
train_loss,0.23062
val_acc,0.1
val_loss,0.23031


[34m[1mwandb[0m: Agent Starting Run: 1is7w0mi with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.003
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00300 | loss: 0.0433, acc(batch): 0.8482, grad:43211.3998: : 422it [00:17, 24.21it/s]


acc(train): 0.7924, acc(val): 0.7913, acc(test): 0.7869
___________________________________________________________________________________________________


epoch: 1, lr: 0.00300 | loss: 0.0446, acc(batch): 0.8750, grad:40544.1494: : 422it [00:17, 24.45it/s]                       


acc(train): 0.8440, acc(val): 0.8422, acc(test): 0.8338
___________________________________________________________________________________________________


epoch: 2, lr: 0.00300 | loss: 0.0424, acc(batch): 0.8750, grad:37138.2267: : 422it [00:18, 22.88it/s]                       


acc(train): 0.8483, acc(val): 0.8435, acc(test): 0.8341
___________________________________________________________________________________________________


epoch: 3, lr: 0.00300 | loss: 0.0399, acc(batch): 0.9018, grad:36864.6523: : 422it [00:23, 18.07it/s]                       


acc(train): 0.8620, acc(val): 0.8537, acc(test): 0.8460
___________________________________________________________________________________________________


epoch: 4, lr: 0.00300 | loss: 0.0376, acc(batch): 0.8661, grad:32753.7296: : 422it [00:24, 17.32it/s]


acc(train): 0.8612, acc(val): 0.8535, acc(test): 0.8460
___________________________________________________________________________________________________


epoch: 5, lr: 0.00300 | loss: 0.0414, acc(batch): 0.8750, grad:31343.4045: : 422it [00:25, 16.33it/s]


acc(train): 0.8678, acc(val): 0.8577, acc(test): 0.8517
___________________________________________________________________________________________________


epoch: 6, lr: 0.00300 | loss: 0.0348, acc(batch): 0.8750, grad:35110.6101: : 422it [00:31, 13.32it/s]


acc(train): 0.8661, acc(val): 0.8583, acc(test): 0.8496
___________________________________________________________________________________________________


epoch: 7, lr: 0.00300 | loss: 0.0404, acc(batch): 0.8571, grad:35362.6933: : 422it [00:30, 13.77it/s]


acc(train): 0.8713, acc(val): 0.8577, acc(test): 0.8535
___________________________________________________________________________________________________


epoch: 8, lr: 0.00300 | loss: 0.0341, acc(batch): 0.9018, grad:34309.9433: : 422it [00:30, 13.88it/s]


acc(train): 0.8643, acc(val): 0.8558, acc(test): 0.8467
___________________________________________________________________________________________________


epoch: 9, lr: 0.00300 | loss: 0.0435, acc(batch): 0.8571, grad:31625.9870: : 422it [00:30, 13.62it/s]


acc(train): 0.8597, acc(val): 0.8530, acc(test): 0.8420
___________________________________________________________________________________________________


epoch: 10, lr: 0.00300 | loss: 0.0403, acc(batch): 0.8571, grad:30014.3347: : 422it [00:32, 13.14it/s]


acc(train): 0.8368, acc(val): 0.8243, acc(test): 0.8189
___________________________________________________________________________________________________


epoch: 11, lr: 0.00300 | loss: 0.0348, acc(batch): 0.9018, grad:31581.0618: : 422it [01:00,  6.99it/s]


acc(train): 0.8663, acc(val): 0.8548, acc(test): 0.8474
___________________________________________________________________________________________________


epoch: 12, lr: 0.00300 | loss: 0.0323, acc(batch): 0.8750, grad:30534.0809: : 422it [01:01,  6.84it/s]


acc(train): 0.8454, acc(val): 0.8415, acc(test): 0.8253
___________________________________________________________________________________________________


epoch: 13, lr: 0.00300 | loss: 0.0380, acc(batch): 0.8571, grad:29101.4318: : 422it [01:04,  6.55it/s]


acc(train): 0.8723, acc(val): 0.8572, acc(test): 0.8523
___________________________________________________________________________________________________


epoch: 14, lr: 0.00300 | loss: 0.0351, acc(batch): 0.8929, grad:30219.5908: : 422it [01:08,  6.20it/s]


acc(train): 0.8545, acc(val): 0.8435, acc(test): 0.8357
___________________________________________________________________________________________________


epoch: 15, lr: 0.00300 | loss: 0.0376, acc(batch): 0.8571, grad:29124.6422: : 422it [01:08,  6.16it/s]


acc(train): 0.8650, acc(val): 0.8513, acc(test): 0.8473
___________________________________________________________________________________________________


epoch: 16, lr: 0.00300 | loss: 0.0327, acc(batch): 0.8839, grad:30893.6675: : 422it [01:10,  5.98it/s]


acc(train): 0.8494, acc(val): 0.8363, acc(test): 0.8349
___________________________________________________________________________________________________


epoch: 17, lr: 0.00300 | loss: 0.0415, acc(batch): 0.8571, grad:29207.7372: : 422it [01:12,  5.83it/s]


acc(train): 0.8754, acc(val): 0.8560, acc(test): 0.8535
___________________________________________________________________________________________________


epoch: 18, lr: 0.00300 | loss: 0.0383, acc(batch): 0.8571, grad:27724.5295: : 422it [01:16,  5.53it/s]


acc(train): 0.8686, acc(val): 0.8525, acc(test): 0.8486
___________________________________________________________________________________________________


epoch: 19, lr: 0.00300 | loss: 0.0363, acc(batch): 0.8839, grad:28365.7080: : 422it [01:17,  5.48it/s]


acc(train): 0.8708, acc(val): 0.8605, acc(test): 0.8490
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▆▆▇▇███▇▇▄▇▅█▆▇▆█▇█
train_acc,▁▅▆▇▇▇▇█▇▇▅▇▅█▆▇▆█▇█
train_loss,█▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
val_acc,▁▆▆▇▇████▇▄▇▆█▆▇▆█▇█
val_loss,█▃▃▂▂▁▁▁▂▂▅▂▄▂▃▂▄▁▂▂

0,1
epoch,19.0
test_acc,0.849
train_acc,0.87078
train_loss,0.03282
val_acc,0.8605
val_loss,0.04081


[34m[1mwandb[0m: Agent Starting Run: or57e8y9 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.003
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_uniform


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00300 | loss: 0.0396, acc(batch): 0.8839, grad:31784.7837: : 422it [00:13, 32.21it/s]                       


acc(train): 0.8515, acc(val): 0.8497, acc(test): 0.8359
___________________________________________________________________________________________________


epoch: 1, lr: 0.00300 | loss: 0.0390, acc(batch): 0.8839, grad:31589.5958: : 422it [00:12, 33.25it/s]                       


acc(train): 0.8592, acc(val): 0.8508, acc(test): 0.8417
___________________________________________________________________________________________________


epoch: 2, lr: 0.00300 | loss: 0.0371, acc(batch): 0.8929, grad:30876.3644: : 422it [00:12, 32.91it/s]                       


acc(train): 0.8637, acc(val): 0.8523, acc(test): 0.8432
___________________________________________________________________________________________________


epoch: 3, lr: 0.00300 | loss: 0.0336, acc(batch): 0.9018, grad:29189.4154: : 422it [00:12, 33.18it/s]                       


acc(train): 0.8672, acc(val): 0.8582, acc(test): 0.8468
___________________________________________________________________________________________________


epoch: 4, lr: 0.00300 | loss: 0.0315, acc(batch): 0.9018, grad:29900.0850: : 422it [00:12, 33.84it/s]                       


acc(train): 0.8667, acc(val): 0.8555, acc(test): 0.8415
___________________________________________________________________________________________________


epoch: 5, lr: 0.00300 | loss: 0.0323, acc(batch): 0.9107, grad:26480.2946: : 422it [00:12, 33.21it/s]                       


acc(train): 0.8625, acc(val): 0.8545, acc(test): 0.8418
___________________________________________________________________________________________________


epoch: 6, lr: 0.00300 | loss: 0.0301, acc(batch): 0.9196, grad:26245.0682: : 422it [00:12, 33.14it/s]


acc(train): 0.8701, acc(val): 0.8580, acc(test): 0.8473
___________________________________________________________________________________________________


epoch: 7, lr: 0.00300 | loss: 0.0313, acc(batch): 0.9196, grad:25516.3673: : 422it [00:12, 32.78it/s]


acc(train): 0.8720, acc(val): 0.8630, acc(test): 0.8473
___________________________________________________________________________________________________


epoch: 8, lr: 0.00300 | loss: 0.0283, acc(batch): 0.9196, grad:23760.3316: : 422it [00:12, 32.76it/s]                       


acc(train): 0.8784, acc(val): 0.8645, acc(test): 0.8544
___________________________________________________________________________________________________


epoch: 9, lr: 0.00300 | loss: 0.0302, acc(batch): 0.9107, grad:23350.1726: : 422it [00:13, 32.41it/s]                       


acc(train): 0.8745, acc(val): 0.8612, acc(test): 0.8502
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_acc,▁▃▄▅▃▃▅▅█▆
train_acc,▁▃▄▅▅▄▆▆█▇
train_loss,█▄▃▂▂▂▁▁▁▁
val_acc,▁▂▂▅▄▃▅▇█▆
val_loss,█▆▅▃▄▄▃▂▁▁

0,1
epoch,9.0
test_acc,0.8502
train_acc,0.87454
train_loss,0.03246
val_acc,0.86117
val_loss,0.03706


[34m[1mwandb[0m: Agent Starting Run: 3lzh91co with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init_method: He_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00050 | loss: 0.0480, acc(batch): 0.8214, grad:54420.2805: : 422it [00:17, 23.65it/s]                       


acc(train): 0.8497, acc(val): 0.8457, acc(test): 0.8364
___________________________________________________________________________________________________


epoch: 1, lr: 0.00050 | loss: 0.0428, acc(batch): 0.8482, grad:55406.7308: : 422it [00:17, 23.72it/s]                       


acc(train): 0.8636, acc(val): 0.8543, acc(test): 0.8506
___________________________________________________________________________________________________


epoch: 2, lr: 0.00050 | loss: 0.0407, acc(batch): 0.8482, grad:55985.0064: : 422it [00:17, 23.53it/s]                       


acc(train): 0.8764, acc(val): 0.8667, acc(test): 0.8585
___________________________________________________________________________________________________


epoch: 3, lr: 0.00050 | loss: 0.0381, acc(batch): 0.8571, grad:54489.0460: : 422it [00:18, 23.36it/s]                       


acc(train): 0.8842, acc(val): 0.8732, acc(test): 0.8644
___________________________________________________________________________________________________


epoch: 4, lr: 0.00050 | loss: 0.0345, acc(batch): 0.8839, grad:55279.6546: : 422it [00:17, 23.74it/s]                       


acc(train): 0.8884, acc(val): 0.8732, acc(test): 0.8672
___________________________________________________________________________________________________


epoch: 5, lr: 0.00050 | loss: 0.0336, acc(batch): 0.8839, grad:56132.1473: : 422it [00:17, 23.96it/s]                       


acc(train): 0.8904, acc(val): 0.8737, acc(test): 0.8666
___________________________________________________________________________________________________


epoch: 6, lr: 0.00050 | loss: 0.0319, acc(batch): 0.8839, grad:54811.2114: : 422it [00:17, 23.94it/s]                       


acc(train): 0.8911, acc(val): 0.8722, acc(test): 0.8669
___________________________________________________________________________________________________


epoch: 7, lr: 0.00050 | loss: 0.0277, acc(batch): 0.9018, grad:55081.6464: : 422it [00:17, 23.46it/s]                       


acc(train): 0.8952, acc(val): 0.8745, acc(test): 0.8667
___________________________________________________________________________________________________


epoch: 8, lr: 0.00050 | loss: 0.0274, acc(batch): 0.9196, grad:55088.1017: : 422it [00:17, 23.59it/s]                       


acc(train): 0.9014, acc(val): 0.8755, acc(test): 0.8691
___________________________________________________________________________________________________


epoch: 9, lr: 0.00050 | loss: 0.0267, acc(batch): 0.9107, grad:55185.1489: : 422it [00:17, 23.87it/s]                       


acc(train): 0.8997, acc(val): 0.8745, acc(test): 0.8681
___________________________________________________________________________________________________


epoch: 10, lr: 0.00050 | loss: 0.0232, acc(batch): 0.9196, grad:54880.5058: : 422it [00:17, 23.97it/s]                       


acc(train): 0.9069, acc(val): 0.8783, acc(test): 0.8716
___________________________________________________________________________________________________


epoch: 11, lr: 0.00050 | loss: 0.0223, acc(batch): 0.9286, grad:53546.4236: : 422it [00:17, 23.91it/s]                       


acc(train): 0.9095, acc(val): 0.8802, acc(test): 0.8745
___________________________________________________________________________________________________


epoch: 12, lr: 0.00050 | loss: 0.0220, acc(batch): 0.9107, grad:53892.9341: : 422it [00:17, 23.61it/s]                       


acc(train): 0.9107, acc(val): 0.8785, acc(test): 0.8746
___________________________________________________________________________________________________


epoch: 13, lr: 0.00050 | loss: 0.0208, acc(batch): 0.9375, grad:53117.8285: : 422it [00:17, 23.79it/s]


acc(train): 0.9162, acc(val): 0.8832, acc(test): 0.8753
___________________________________________________________________________________________________


epoch: 14, lr: 0.00050 | loss: 0.0185, acc(batch): 0.9464, grad:53937.6392: : 422it [00:17, 23.95it/s]                       


acc(train): 0.9138, acc(val): 0.8787, acc(test): 0.8736
___________________________________________________________________________________________________


epoch: 15, lr: 0.00050 | loss: 0.0168, acc(batch): 0.9554, grad:53332.2895: : 422it [00:17, 23.72it/s]


acc(train): 0.9129, acc(val): 0.8795, acc(test): 0.8686
___________________________________________________________________________________________________


epoch: 16, lr: 0.00050 | loss: 0.0198, acc(batch): 0.9643, grad:53010.8735: : 422it [00:17, 23.94it/s]                       


acc(train): 0.9177, acc(val): 0.8832, acc(test): 0.8749
___________________________________________________________________________________________________


epoch: 17, lr: 0.00050 | loss: 0.0204, acc(batch): 0.9196, grad:50843.2880: : 422it [00:17, 24.20it/s]


acc(train): 0.9218, acc(val): 0.8835, acc(test): 0.8752
___________________________________________________________________________________________________


epoch: 18, lr: 0.00050 | loss: 0.0177, acc(batch): 0.9464, grad:53778.8906: : 422it [00:17, 23.69it/s]                       


acc(train): 0.9248, acc(val): 0.8860, acc(test): 0.8749
___________________________________________________________________________________________________


epoch: 19, lr: 0.00050 | loss: 0.0213, acc(batch): 0.9286, grad:52613.7280: : 422it [00:18, 23.04it/s]                       


acc(train): 0.9241, acc(val): 0.8858, acc(test): 0.8770
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57947976878…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▅▆▆▆▆▆▇▆▇███▇▇████
train_acc,▁▂▃▄▅▅▅▅▆▆▆▇▇▇▇▇▇███
train_loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
val_acc,▁▃▅▆▆▆▆▆▆▆▇▇▇█▇▇████
val_loss,█▅▃▂▁▂▂▃▁▃▂▃▂▃▅▆▅▄▆▇

0,1
epoch,19.0
test_acc,0.877
train_acc,0.92407
train_loss,0.01627
val_acc,0.88583
val_loss,0.04169


[34m[1mwandb[0m: Agent Starting Run: i6i2i8zu with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00100 | loss: 0.0416, acc(batch): 0.9196, grad:17986.4461: : 422it [00:06, 62.30it/s]                       


acc(train): 0.8071, acc(val): 0.8053, acc(test): 0.7948
___________________________________________________________________________________________________


epoch: 1, lr: 0.00100 | loss: 0.0346, acc(batch): 0.9107, grad:19708.1976: : 422it [00:06, 64.05it/s]


acc(train): 0.8188, acc(val): 0.8175, acc(test): 0.8061
___________________________________________________________________________________________________


epoch: 2, lr: 0.00100 | loss: 0.0344, acc(batch): 0.8750, grad:18698.1074: : 422it [00:06, 62.80it/s]                       


acc(train): 0.8390, acc(val): 0.8370, acc(test): 0.8234
___________________________________________________________________________________________________


epoch: 3, lr: 0.00100 | loss: 0.0339, acc(batch): 0.8750, grad:16946.7169: : 422it [00:06, 63.70it/s]                       


acc(train): 0.8614, acc(val): 0.8563, acc(test): 0.8445
___________________________________________________________________________________________________


epoch: 4, lr: 0.00100 | loss: 0.0344, acc(batch): 0.9018, grad:17966.5809: : 422it [00:06, 66.35it/s]                       


acc(train): 0.8491, acc(val): 0.8477, acc(test): 0.8319
___________________________________________________________________________________________________


epoch: 5, lr: 0.00100 | loss: 0.0299, acc(batch): 0.9196, grad:18748.8383: : 422it [00:06, 67.53it/s]                       


acc(train): 0.8565, acc(val): 0.8505, acc(test): 0.8369
___________________________________________________________________________________________________


epoch: 6, lr: 0.00100 | loss: 0.0317, acc(batch): 0.9018, grad:19448.8774: : 422it [00:06, 66.36it/s]                       


acc(train): 0.8640, acc(val): 0.8565, acc(test): 0.8466
___________________________________________________________________________________________________


epoch: 7, lr: 0.00100 | loss: 0.0317, acc(batch): 0.8929, grad:17245.5305: : 422it [00:06, 63.68it/s]                       


acc(train): 0.8645, acc(val): 0.8602, acc(test): 0.8467
___________________________________________________________________________________________________


epoch: 8, lr: 0.00100 | loss: 0.0320, acc(batch): 0.8839, grad:16610.8028: : 422it [00:06, 63.24it/s]                       


acc(train): 0.8705, acc(val): 0.8645, acc(test): 0.8529
___________________________________________________________________________________________________


epoch: 9, lr: 0.00100 | loss: 0.0290, acc(batch): 0.9018, grad:18086.7982: : 422it [00:06, 63.10it/s]                       


acc(train): 0.8748, acc(val): 0.8658, acc(test): 0.8562
___________________________________________________________________________________________________


epoch: 10, lr: 0.00100 | loss: 0.0284, acc(batch): 0.9107, grad:16916.2684: : 422it [00:06, 65.71it/s]                       


acc(train): 0.8821, acc(val): 0.8738, acc(test): 0.8624
___________________________________________________________________________________________________


epoch: 11, lr: 0.00100 | loss: 0.0300, acc(batch): 0.9018, grad:17379.4108: : 422it [00:06, 64.46it/s]                       


acc(train): 0.8800, acc(val): 0.8688, acc(test): 0.8598
___________________________________________________________________________________________________


epoch: 12, lr: 0.00100 | loss: 0.0277, acc(batch): 0.9107, grad:19714.1001: : 422it [00:06, 62.12it/s]                       


acc(train): 0.8704, acc(val): 0.8623, acc(test): 0.8509
___________________________________________________________________________________________________


epoch: 13, lr: 0.00100 | loss: 0.0270, acc(batch): 0.9107, grad:18789.5813: : 422it [00:06, 64.35it/s]                       


acc(train): 0.8739, acc(val): 0.8670, acc(test): 0.8537
___________________________________________________________________________________________________


epoch: 14, lr: 0.00100 | loss: 0.0250, acc(batch): 0.9286, grad:18953.5220: : 422it [00:06, 65.83it/s]                       


acc(train): 0.8762, acc(val): 0.8617, acc(test): 0.8559
___________________________________________________________________________________________________


epoch: 15, lr: 0.00100 | loss: 0.0270, acc(batch): 0.9196, grad:19247.3787: : 422it [00:06, 65.62it/s]                       


acc(train): 0.8648, acc(val): 0.8513, acc(test): 0.8403
___________________________________________________________________________________________________


epoch: 16, lr: 0.00100 | loss: 0.0270, acc(batch): 0.9286, grad:19660.5891: : 422it [00:06, 64.21it/s]                       


acc(train): 0.8670, acc(val): 0.8543, acc(test): 0.8432
___________________________________________________________________________________________________


epoch: 17, lr: 0.00100 | loss: 0.0250, acc(batch): 0.9196, grad:17992.7021: : 422it [00:06, 62.96it/s]                       


acc(train): 0.8818, acc(val): 0.8680, acc(test): 0.8584
___________________________________________________________________________________________________


epoch: 18, lr: 0.00100 | loss: 0.0275, acc(batch): 0.9196, grad:19473.8228: : 422it [00:06, 65.18it/s]                       


acc(train): 0.8623, acc(val): 0.8497, acc(test): 0.8371
___________________________________________________________________________________________________


epoch: 19, lr: 0.00100 | loss: 0.0251, acc(batch): 0.9286, grad:18534.4443: : 422it [00:06, 64.38it/s]                       


acc(train): 0.8911, acc(val): 0.8730, acc(test): 0.8676
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57941091539…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▂▄▆▅▅▆▆▇▇▇▇▆▇▇▅▆▇▅█
train_acc,▁▂▄▆▄▅▆▆▆▇▇▇▆▇▇▆▆▇▆█
train_loss,█▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▂▄▆▅▆▆▇▇▇█▇▇▇▇▆▆▇▆█
val_loss,█▇▅▃▄▃▃▄▃▂▁▂▃▂▂▄▄▂▅▁

0,1
epoch,19.0
test_acc,0.8676
train_acc,0.89115
train_loss,0.03009
val_acc,0.873
val_loss,0.03465


[34m[1mwandb[0m: Agent Starting Run: tg56vk0v with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init_method: He_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00010 | loss: 0.0356, acc(batch): 0.8839, grad:106696.7236: : 422it [00:54,  7.77it/s]


acc(train): 0.8487, acc(val): 0.8492, acc(test): 0.8340
___________________________________________________________________________________________________


epoch: 1, lr: 0.00010 | loss: 0.0329, acc(batch): 0.8929, grad:105741.7532: : 422it [00:54,  7.74it/s]


acc(train): 0.8630, acc(val): 0.8582, acc(test): 0.8468
___________________________________________________________________________________________________


epoch: 2, lr: 0.00010 | loss: 0.0318, acc(batch): 0.9018, grad:105749.8923: : 422it [00:55,  7.63it/s]


acc(train): 0.8711, acc(val): 0.8625, acc(test): 0.8535
___________________________________________________________________________________________________


epoch: 3, lr: 0.00010 | loss: 0.0310, acc(batch): 0.9018, grad:106299.0906: : 422it [00:54,  7.74it/s]


acc(train): 0.8776, acc(val): 0.8675, acc(test): 0.8551
___________________________________________________________________________________________________


epoch: 4, lr: 0.00010 | loss: 0.0305, acc(batch): 0.9018, grad:106751.6958: : 422it [00:55,  7.67it/s]


acc(train): 0.8826, acc(val): 0.8703, acc(test): 0.8597
___________________________________________________________________________________________________


epoch: 5, lr: 0.00010 | loss: 0.0299, acc(batch): 0.9018, grad:107100.0584: : 422it [00:54,  7.70it/s]


acc(train): 0.8872, acc(val): 0.8735, acc(test): 0.8612
___________________________________________________________________________________________________


epoch: 6, lr: 0.00010 | loss: 0.0294, acc(batch): 0.9018, grad:107414.1697: : 422it [00:54,  7.70it/s]


acc(train): 0.8917, acc(val): 0.8763, acc(test): 0.8640
___________________________________________________________________________________________________


epoch: 7, lr: 0.00010 | loss: 0.0288, acc(batch): 0.9018, grad:107649.2686: : 422it [00:54,  7.68it/s]


acc(train): 0.8962, acc(val): 0.8793, acc(test): 0.8679
___________________________________________________________________________________________________


epoch: 8, lr: 0.00010 | loss: 0.0283, acc(batch): 0.9018, grad:107865.9686: : 422it [00:53,  7.88it/s]


acc(train): 0.8999, acc(val): 0.8820, acc(test): 0.8700
___________________________________________________________________________________________________


epoch: 9, lr: 0.00010 | loss: 0.0278, acc(batch): 0.9018, grad:107992.6348: : 422it [00:52,  8.00it/s]


acc(train): 0.9035, acc(val): 0.8850, acc(test): 0.8711
___________________________________________________________________________________________________


epoch: 10, lr: 0.00010 | loss: 0.0273, acc(batch): 0.9018, grad:108007.9735: : 422it [00:52,  8.08it/s]


acc(train): 0.9066, acc(val): 0.8865, acc(test): 0.8742
___________________________________________________________________________________________________


epoch: 11, lr: 0.00010 | loss: 0.0269, acc(batch): 0.9018, grad:108004.6788: : 422it [00:52,  8.02it/s]


acc(train): 0.9094, acc(val): 0.8885, acc(test): 0.8768
___________________________________________________________________________________________________


epoch: 12, lr: 0.00010 | loss: 0.0265, acc(batch): 0.9107, grad:107957.9956: : 422it [00:53,  7.89it/s]


acc(train): 0.9119, acc(val): 0.8885, acc(test): 0.8784
___________________________________________________________________________________________________


epoch: 13, lr: 0.00010 | loss: 0.0262, acc(batch): 0.9107, grad:107935.2179: : 422it [00:53,  7.94it/s]


acc(train): 0.9146, acc(val): 0.8888, acc(test): 0.8791
___________________________________________________________________________________________________


epoch: 14, lr: 0.00010 | loss: 0.0258, acc(batch): 0.9018, grad:107958.9899: : 422it [00:52,  7.99it/s]


acc(train): 0.9168, acc(val): 0.8907, acc(test): 0.8794
___________________________________________________________________________________________________


epoch: 15, lr: 0.00010 | loss: 0.0255, acc(batch): 0.9286, grad:107977.7716: : 422it [00:52,  7.99it/s]


acc(train): 0.9187, acc(val): 0.8912, acc(test): 0.8798
___________________________________________________________________________________________________


epoch: 16, lr: 0.00010 | loss: 0.0251, acc(batch): 0.9375, grad:107984.2676: : 422it [00:52,  8.00it/s]


acc(train): 0.9201, acc(val): 0.8925, acc(test): 0.8806
___________________________________________________________________________________________________


epoch: 17, lr: 0.00010 | loss: 0.0248, acc(batch): 0.9375, grad:107972.4479: : 422it [00:53,  7.89it/s]


acc(train): 0.9224, acc(val): 0.8943, acc(test): 0.8813
___________________________________________________________________________________________________


epoch: 18, lr: 0.00010 | loss: 0.0245, acc(batch): 0.9375, grad:107946.9313: : 422it [00:53,  7.96it/s]


acc(train): 0.9240, acc(val): 0.8940, acc(test): 0.8816
___________________________________________________________________________________________________


epoch: 19, lr: 0.00010 | loss: 0.0243, acc(batch): 0.9375, grad:107910.2732: : 422it [00:52,  8.03it/s]


acc(train): 0.9258, acc(val): 0.8943, acc(test): 0.8824
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57938799076…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▄▄▅▅▅▆▆▆▇▇▇███████
train_acc,▁▂▃▄▄▄▅▅▆▆▆▇▇▇▇▇▇███
train_loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
val_acc,▁▂▃▄▄▅▅▆▆▇▇▇▇▇▇█████
val_loss,█▆▅▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8824
train_acc,0.92581
train_loss,0.02054
val_acc,0.89433
val_loss,0.03105


[34m[1mwandb[0m: Agent Starting Run: 7gbhwwpi with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00050 | loss: 0.0277, acc(batch): 0.8958, grad:18901.8806: : 844it [00:13, 60.71it/s]                       


acc(train): 0.8296, acc(val): 0.8310, acc(test): 0.8209
___________________________________________________________________________________________________


epoch: 1, lr: 0.00050 | loss: 0.0240, acc(batch): 0.9583, grad:19526.4617: : 844it [00:14, 59.88it/s]                       


acc(train): 0.8446, acc(val): 0.8438, acc(test): 0.8315
___________________________________________________________________________________________________


epoch: 2, lr: 0.00050 | loss: 0.0216, acc(batch): 0.9583, grad:20034.9119: : 844it [00:15, 55.32it/s]                       


acc(train): 0.8524, acc(val): 0.8485, acc(test): 0.8377
___________________________________________________________________________________________________


epoch: 3, lr: 0.00050 | loss: 0.0208, acc(batch): 0.9583, grad:20373.2158: : 844it [00:15, 54.51it/s]                       


acc(train): 0.8604, acc(val): 0.8552, acc(test): 0.8445
___________________________________________________________________________________________________


epoch: 4, lr: 0.00050 | loss: 0.0199, acc(batch): 0.9583, grad:20446.8577: : 844it [00:15, 54.73it/s]                       


acc(train): 0.8634, acc(val): 0.8558, acc(test): 0.8465
___________________________________________________________________________________________________


epoch: 5, lr: 0.00050 | loss: 0.0194, acc(batch): 0.9583, grad:20494.5490: : 844it [00:15, 55.22it/s]                       


acc(train): 0.8671, acc(val): 0.8578, acc(test): 0.8480
___________________________________________________________________________________________________


epoch: 6, lr: 0.00050 | loss: 0.0191, acc(batch): 0.9583, grad:20605.5242: : 844it [00:15, 55.65it/s]                       


acc(train): 0.8703, acc(val): 0.8592, acc(test): 0.8491
___________________________________________________________________________________________________


epoch: 7, lr: 0.00050 | loss: 0.0196, acc(batch): 0.9583, grad:20931.2395: : 844it [00:15, 54.02it/s]


acc(train): 0.8721, acc(val): 0.8593, acc(test): 0.8515
___________________________________________________________________________________________________


epoch: 8, lr: 0.00050 | loss: 0.0184, acc(batch): 0.9583, grad:20950.3528: : 844it [00:15, 52.83it/s]                       


acc(train): 0.8745, acc(val): 0.8618, acc(test): 0.8537
___________________________________________________________________________________________________


epoch: 9, lr: 0.00050 | loss: 0.0180, acc(batch): 0.9583, grad:20678.5071: : 844it [00:15, 54.89it/s]                       


acc(train): 0.8759, acc(val): 0.8635, acc(test): 0.8518
___________________________________________________________________________________________________


epoch: 10, lr: 0.00050 | loss: 0.0181, acc(batch): 0.9583, grad:20949.8316: : 844it [00:15, 54.52it/s]                       


acc(train): 0.8789, acc(val): 0.8667, acc(test): 0.8539
___________________________________________________________________________________________________


epoch: 11, lr: 0.00050 | loss: 0.0179, acc(batch): 0.9375, grad:20483.3069: : 844it [00:15, 55.92it/s]                       


acc(train): 0.8826, acc(val): 0.8693, acc(test): 0.8564
___________________________________________________________________________________________________


epoch: 12, lr: 0.00050 | loss: 0.0179, acc(batch): 0.9375, grad:20558.8360: : 844it [00:15, 54.62it/s]                       


acc(train): 0.8831, acc(val): 0.8693, acc(test): 0.8574
___________________________________________________________________________________________________


epoch: 13, lr: 0.00050 | loss: 0.0172, acc(batch): 0.9583, grad:20663.5350: : 844it [00:15, 56.20it/s]                       


acc(train): 0.8856, acc(val): 0.8713, acc(test): 0.8596
___________________________________________________________________________________________________


epoch: 14, lr: 0.00050 | loss: 0.0177, acc(batch): 0.9375, grad:20554.4036: : 844it [00:15, 56.15it/s]                       


acc(train): 0.8868, acc(val): 0.8735, acc(test): 0.8605
___________________________________________________________________________________________________


epoch: 15, lr: 0.00050 | loss: 0.0172, acc(batch): 0.9375, grad:20376.2660: : 844it [00:15, 56.18it/s]                       


acc(train): 0.8873, acc(val): 0.8725, acc(test): 0.8599
___________________________________________________________________________________________________


epoch: 16, lr: 0.00050 | loss: 0.0170, acc(batch): 0.9167, grad:20357.7011: : 844it [00:14, 57.18it/s]


acc(train): 0.8888, acc(val): 0.8737, acc(test): 0.8619
___________________________________________________________________________________________________


epoch: 17, lr: 0.00050 | loss: 0.0165, acc(batch): 0.9167, grad:20699.7549: : 844it [00:14, 57.57it/s]


acc(train): 0.8909, acc(val): 0.8735, acc(test): 0.8619
___________________________________________________________________________________________________


epoch: 18, lr: 0.00050 | loss: 0.0164, acc(batch): 0.9167, grad:20300.2279: : 844it [00:15, 56.03it/s]                       


acc(train): 0.8914, acc(val): 0.8725, acc(test): 0.8617
___________________________________________________________________________________________________


epoch: 19, lr: 0.00050 | loss: 0.0154, acc(batch): 0.9375, grad:20553.8472: : 844it [00:15, 55.52it/s]                       


acc(train): 0.8934, acc(val): 0.8735, acc(test): 0.8646
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57957827845…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇████
train_acc,▁▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇███
train_loss,█▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▃▄▅▅▅▆▆▆▆▇▇▇███████
val_loss,█▆▅▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8646
train_acc,0.89343
train_loss,0.029
val_acc,0.8735
val_loss,0.03613


In [16]:
wandb.agent("1463mzwx", function=spawn_fn, count=10)

[34m[1mwandb[0m: Agent Starting Run: 7uitvlls with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00010 | loss: 0.0817, acc(batch): 0.8958, grad:13974.4253: : 844it [00:10, 77.94it/s]                       


acc(train): 0.7300, acc(val): 0.7305, acc(test): 0.7233
___________________________________________________________________________________________________


epoch: 1, lr: 0.00010 | loss: 0.0503, acc(batch): 0.9375, grad:15169.5247: : 844it [00:10, 77.85it/s]


acc(train): 0.7712, acc(val): 0.7748, acc(test): 0.7628
___________________________________________________________________________________________________


epoch: 2, lr: 0.00010 | loss: 0.0365, acc(batch): 0.9375, grad:14811.3835: : 844it [00:10, 80.41it/s]                       


acc(train): 0.7987, acc(val): 0.8037, acc(test): 0.7910
___________________________________________________________________________________________________


epoch: 3, lr: 0.00010 | loss: 0.0293, acc(batch): 0.9375, grad:14890.5081: : 844it [00:10, 79.54it/s]                       


acc(train): 0.8168, acc(val): 0.8218, acc(test): 0.8060
___________________________________________________________________________________________________


epoch: 4, lr: 0.00010 | loss: 0.0252, acc(batch): 0.9375, grad:14954.6467: : 844it [00:10, 79.74it/s]                       


acc(train): 0.8280, acc(val): 0.8305, acc(test): 0.8151
___________________________________________________________________________________________________


epoch: 5, lr: 0.00010 | loss: 0.0227, acc(batch): 0.9375, grad:15204.2862: : 844it [00:10, 81.38it/s]                       


acc(train): 0.8361, acc(val): 0.8380, acc(test): 0.8214
___________________________________________________________________________________________________


epoch: 6, lr: 0.00010 | loss: 0.0211, acc(batch): 0.9375, grad:15388.4464: : 844it [00:10, 81.26it/s]                       


acc(train): 0.8417, acc(val): 0.8435, acc(test): 0.8281
___________________________________________________________________________________________________


epoch: 7, lr: 0.00010 | loss: 0.0201, acc(batch): 0.9375, grad:15347.2773: : 844it [00:10, 82.71it/s]                       


acc(train): 0.8466, acc(val): 0.8475, acc(test): 0.8323
___________________________________________________________________________________________________


epoch: 8, lr: 0.00010 | loss: 0.0195, acc(batch): 0.9375, grad:15344.9171: : 844it [00:10, 79.38it/s]


acc(train): 0.8504, acc(val): 0.8502, acc(test): 0.8349
___________________________________________________________________________________________________


epoch: 9, lr: 0.00010 | loss: 0.0189, acc(batch): 0.9375, grad:15406.2563: : 844it [00:10, 80.85it/s]                       


acc(train): 0.8537, acc(val): 0.8525, acc(test): 0.8380
___________________________________________________________________________________________________


epoch: 10, lr: 0.00010 | loss: 0.0184, acc(batch): 0.9375, grad:15516.3534: : 844it [00:10, 79.80it/s]                       


acc(train): 0.8568, acc(val): 0.8543, acc(test): 0.8394
___________________________________________________________________________________________________


epoch: 11, lr: 0.00010 | loss: 0.0179, acc(batch): 0.9375, grad:15645.1069: : 844it [00:10, 78.27it/s]                       


acc(train): 0.8595, acc(val): 0.8565, acc(test): 0.8412
___________________________________________________________________________________________________


epoch: 12, lr: 0.00010 | loss: 0.0173, acc(batch): 0.9375, grad:15775.8329: : 844it [00:10, 81.14it/s]                       


acc(train): 0.8622, acc(val): 0.8588, acc(test): 0.8426
___________________________________________________________________________________________________


epoch: 13, lr: 0.00010 | loss: 0.0168, acc(batch): 0.9375, grad:15893.6686: : 844it [00:10, 81.68it/s]                       


acc(train): 0.8641, acc(val): 0.8617, acc(test): 0.8451
___________________________________________________________________________________________________


epoch: 14, lr: 0.00010 | loss: 0.0163, acc(batch): 0.9375, grad:15988.6780: : 844it [00:10, 79.58it/s]


acc(train): 0.8666, acc(val): 0.8618, acc(test): 0.8465
___________________________________________________________________________________________________


epoch: 15, lr: 0.00010 | loss: 0.0158, acc(batch): 0.9375, grad:16063.9349: : 844it [00:10, 79.20it/s]                       


acc(train): 0.8682, acc(val): 0.8623, acc(test): 0.8486
___________________________________________________________________________________________________


epoch: 16, lr: 0.00010 | loss: 0.0154, acc(batch): 0.9375, grad:16124.9142: : 844it [00:10, 80.47it/s]                       


acc(train): 0.8697, acc(val): 0.8620, acc(test): 0.8507
___________________________________________________________________________________________________


epoch: 17, lr: 0.00010 | loss: 0.0151, acc(batch): 0.9375, grad:16172.7439: : 844it [00:10, 81.14it/s]                       


acc(train): 0.8711, acc(val): 0.8632, acc(test): 0.8511
___________________________________________________________________________________________________


epoch: 18, lr: 0.00010 | loss: 0.0147, acc(batch): 0.9375, grad:16207.9634: : 844it [00:10, 82.11it/s]                       


acc(train): 0.8723, acc(val): 0.8637, acc(test): 0.8533
___________________________________________________________________________________________________


epoch: 19, lr: 0.00010 | loss: 0.0144, acc(batch): 0.9375, grad:16230.5059: : 844it [00:10, 81.63it/s]                       


acc(train): 0.8737, acc(val): 0.8637, acc(test): 0.8539
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▅▅▆▆▇▇▇▇▇▇▇███████
train_acc,▁▃▄▅▆▆▆▇▇▇▇▇▇███████
train_loss,█▅▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▃▅▆▆▇▇▇▇▇██████████
val_loss,█▅▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8539
train_acc,0.87372
train_loss,0.03658
val_acc,0.86367
val_loss,0.03886


[34m[1mwandb[0m: Agent Starting Run: a8dlcc0z with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00100 | loss: 0.0395, acc(batch): 0.8929, grad:16026.6877: : 422it [00:05, 77.89it/s]                       


acc(train): 0.8392, acc(val): 0.8430, acc(test): 0.8261
___________________________________________________________________________________________________


epoch: 1, lr: 0.00100 | loss: 0.0375, acc(batch): 0.8929, grad:17106.7960: : 422it [00:05, 78.41it/s]


acc(train): 0.8515, acc(val): 0.8498, acc(test): 0.8355
___________________________________________________________________________________________________


epoch: 2, lr: 0.00100 | loss: 0.0362, acc(batch): 0.8750, grad:17369.5281: : 422it [00:05, 76.59it/s]


acc(train): 0.8588, acc(val): 0.8555, acc(test): 0.8415
___________________________________________________________________________________________________


epoch: 3, lr: 0.00100 | loss: 0.0351, acc(batch): 0.8750, grad:17581.3264: : 422it [00:05, 78.73it/s]                       


acc(train): 0.8650, acc(val): 0.8590, acc(test): 0.8461
___________________________________________________________________________________________________


epoch: 4, lr: 0.00100 | loss: 0.0341, acc(batch): 0.8750, grad:17844.1719: : 422it [00:05, 79.16it/s]                       


acc(train): 0.8684, acc(val): 0.8605, acc(test): 0.8511
___________________________________________________________________________________________________


epoch: 5, lr: 0.00100 | loss: 0.0335, acc(batch): 0.8839, grad:17955.1467: : 422it [00:05, 77.23it/s]                       


acc(train): 0.8710, acc(val): 0.8618, acc(test): 0.8520
___________________________________________________________________________________________________


epoch: 6, lr: 0.00100 | loss: 0.0329, acc(batch): 0.8839, grad:18005.3514: : 422it [00:05, 75.54it/s]                       


acc(train): 0.8734, acc(val): 0.8648, acc(test): 0.8546
___________________________________________________________________________________________________


epoch: 7, lr: 0.00100 | loss: 0.0325, acc(batch): 0.8839, grad:18018.2259: : 422it [00:05, 76.51it/s]                       


acc(train): 0.8759, acc(val): 0.8665, acc(test): 0.8558
___________________________________________________________________________________________________


epoch: 8, lr: 0.00100 | loss: 0.0321, acc(batch): 0.8839, grad:18030.0101: : 422it [00:05, 74.93it/s]                       


acc(train): 0.8781, acc(val): 0.8672, acc(test): 0.8567
___________________________________________________________________________________________________


epoch: 9, lr: 0.00100 | loss: 0.0319, acc(batch): 0.8839, grad:18109.3231: : 422it [00:05, 74.86it/s]                       


acc(train): 0.8799, acc(val): 0.8688, acc(test): 0.8570
___________________________________________________________________________________________________


epoch: 10, lr: 0.00100 | loss: 0.0318, acc(batch): 0.8839, grad:18223.8328: : 422it [00:05, 71.68it/s]                       


acc(train): 0.8809, acc(val): 0.8693, acc(test): 0.8581
___________________________________________________________________________________________________


epoch: 11, lr: 0.00100 | loss: 0.0318, acc(batch): 0.8929, grad:18323.7855: : 422it [00:05, 73.33it/s]                       


acc(train): 0.8819, acc(val): 0.8710, acc(test): 0.8590
___________________________________________________________________________________________________


epoch: 12, lr: 0.00100 | loss: 0.0318, acc(batch): 0.8839, grad:18388.7537: : 422it [00:05, 74.19it/s]                       


acc(train): 0.8827, acc(val): 0.8710, acc(test): 0.8598
___________________________________________________________________________________________________


epoch: 13, lr: 0.00100 | loss: 0.0318, acc(batch): 0.8929, grad:18422.2501: : 422it [00:05, 73.69it/s]                       


acc(train): 0.8834, acc(val): 0.8715, acc(test): 0.8600
___________________________________________________________________________________________________


epoch: 14, lr: 0.00100 | loss: 0.0319, acc(batch): 0.8929, grad:18431.3552: : 422it [00:05, 75.03it/s]                       


acc(train): 0.8844, acc(val): 0.8718, acc(test): 0.8602
___________________________________________________________________________________________________


epoch: 15, lr: 0.00100 | loss: 0.0319, acc(batch): 0.8929, grad:18421.1632: : 422it [00:05, 76.01it/s]                       


acc(train): 0.8849, acc(val): 0.8737, acc(test): 0.8604
___________________________________________________________________________________________________


epoch: 16, lr: 0.00100 | loss: 0.0318, acc(batch): 0.8929, grad:18418.7548: : 422it [00:05, 77.52it/s]                       


acc(train): 0.8855, acc(val): 0.8748, acc(test): 0.8607
___________________________________________________________________________________________________


epoch: 17, lr: 0.00100 | loss: 0.0318, acc(batch): 0.8929, grad:18402.0034: : 422it [00:05, 76.14it/s]


acc(train): 0.8860, acc(val): 0.8748, acc(test): 0.8609
___________________________________________________________________________________________________


epoch: 18, lr: 0.00100 | loss: 0.0318, acc(batch): 0.9018, grad:18367.1699: : 422it [00:05, 74.88it/s]                       


acc(train): 0.8864, acc(val): 0.8757, acc(test): 0.8610
___________________________________________________________________________________________________


epoch: 19, lr: 0.00100 | loss: 0.0317, acc(batch): 0.9018, grad:18316.2449: : 422it [00:06, 68.40it/s]                       


acc(train): 0.8868, acc(val): 0.8762, acc(test): 0.8607
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57941091539…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▄▅▆▆▇▇▇▇▇█████████
train_acc,▁▃▄▅▅▆▆▆▇▇▇▇▇███████
train_loss,█▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▂▄▄▅▅▆▆▆▆▇▇▇▇▇▇████
val_loss,█▆▅▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8607
train_acc,0.88676
train_loss,0.02982
val_acc,0.87617
val_loss,0.03665


[34m[1mwandb[0m: Agent Starting Run: zmp8yloz with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: He_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00010 | loss: 0.0221, acc(batch): 0.9375, grad:64399.3219: : 844it [00:37, 22.62it/s]                       


acc(train): 0.8439, acc(val): 0.8458, acc(test): 0.8351
___________________________________________________________________________________________________


epoch: 1, lr: 0.00010 | loss: 0.0204, acc(batch): 0.9167, grad:63689.2142: : 844it [00:37, 22.67it/s]                       


acc(train): 0.8619, acc(val): 0.8555, acc(test): 0.8491
___________________________________________________________________________________________________


epoch: 2, lr: 0.00010 | loss: 0.0196, acc(batch): 0.9167, grad:63024.9530: : 844it [00:36, 22.83it/s]                       


acc(train): 0.8706, acc(val): 0.8608, acc(test): 0.8566
___________________________________________________________________________________________________


epoch: 3, lr: 0.00010 | loss: 0.0187, acc(batch): 0.9167, grad:62917.7614: : 844it [00:37, 22.74it/s]                       


acc(train): 0.8766, acc(val): 0.8647, acc(test): 0.8620
___________________________________________________________________________________________________


epoch: 4, lr: 0.00010 | loss: 0.0177, acc(batch): 0.9167, grad:62918.3835: : 844it [00:37, 22.71it/s]                       


acc(train): 0.8812, acc(val): 0.8687, acc(test): 0.8649
___________________________________________________________________________________________________


epoch: 5, lr: 0.00010 | loss: 0.0168, acc(batch): 0.9167, grad:63092.8218: : 844it [00:37, 22.69it/s]                       


acc(train): 0.8852, acc(val): 0.8723, acc(test): 0.8676
___________________________________________________________________________________________________


epoch: 6, lr: 0.00010 | loss: 0.0158, acc(batch): 0.9167, grad:63189.6915: : 844it [00:37, 22.45it/s]                       


acc(train): 0.8883, acc(val): 0.8752, acc(test): 0.8684
___________________________________________________________________________________________________


epoch: 7, lr: 0.00010 | loss: 0.0149, acc(batch): 0.9375, grad:63216.4673: : 844it [00:37, 22.53it/s]                       


acc(train): 0.8914, acc(val): 0.8770, acc(test): 0.8695
___________________________________________________________________________________________________


epoch: 8, lr: 0.00010 | loss: 0.0140, acc(batch): 0.9375, grad:63199.2412: : 844it [00:37, 22.33it/s]                       


acc(train): 0.8941, acc(val): 0.8790, acc(test): 0.8708
___________________________________________________________________________________________________


epoch: 9, lr: 0.00010 | loss: 0.0132, acc(batch): 0.9375, grad:63169.5793: : 844it [00:37, 22.51it/s]                       


acc(train): 0.8968, acc(val): 0.8832, acc(test): 0.8723
___________________________________________________________________________________________________


epoch: 10, lr: 0.00010 | loss: 0.0124, acc(batch): 0.9375, grad:63148.5305: : 844it [00:37, 22.44it/s]                       


acc(train): 0.8989, acc(val): 0.8828, acc(test): 0.8730
___________________________________________________________________________________________________


epoch: 11, lr: 0.00010 | loss: 0.0116, acc(batch): 0.9583, grad:63032.5241: : 844it [00:37, 22.49it/s]                       


acc(train): 0.9006, acc(val): 0.8837, acc(test): 0.8740
___________________________________________________________________________________________________


epoch: 12, lr: 0.00010 | loss: 0.0109, acc(batch): 0.9583, grad:62892.1694: : 844it [00:37, 22.41it/s]                       


acc(train): 0.9027, acc(val): 0.8843, acc(test): 0.8753
___________________________________________________________________________________________________


epoch: 13, lr: 0.00010 | loss: 0.0103, acc(batch): 0.9792, grad:62854.4642: : 844it [00:37, 22.54it/s]                       


acc(train): 0.9046, acc(val): 0.8855, acc(test): 0.8758
___________________________________________________________________________________________________


epoch: 14, lr: 0.00010 | loss: 0.0097, acc(batch): 0.9792, grad:62820.3329: : 844it [00:39, 21.16it/s]                       


acc(train): 0.9059, acc(val): 0.8865, acc(test): 0.8772
___________________________________________________________________________________________________


epoch: 15, lr: 0.00010 | loss: 0.0091, acc(batch): 0.9792, grad:62812.2001: : 844it [00:37, 22.66it/s]                       


acc(train): 0.9074, acc(val): 0.8883, acc(test): 0.8783
___________________________________________________________________________________________________


epoch: 16, lr: 0.00010 | loss: 0.0085, acc(batch): 0.9792, grad:62812.1138: : 844it [00:37, 22.68it/s]                       


acc(train): 0.9089, acc(val): 0.8890, acc(test): 0.8791
___________________________________________________________________________________________________


epoch: 17, lr: 0.00010 | loss: 0.0080, acc(batch): 0.9792, grad:62786.4879: : 844it [00:37, 22.57it/s]                       


acc(train): 0.9106, acc(val): 0.8897, acc(test): 0.8804
___________________________________________________________________________________________________


epoch: 18, lr: 0.00010 | loss: 0.0076, acc(batch): 0.9792, grad:62806.7517: : 844it [00:37, 22.43it/s]                       


acc(train): 0.9126, acc(val): 0.8892, acc(test): 0.8804
___________________________________________________________________________________________________


epoch: 19, lr: 0.00010 | loss: 0.0072, acc(batch): 0.9792, grad:62807.3841: : 844it [00:37, 22.49it/s]                       


acc(train): 0.9141, acc(val): 0.8893, acc(test): 0.8813
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▄▅▆▆▆▆▆▇▇▇▇▇▇█████
train_acc,▁▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇███
train_loss,█▄▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁
val_acc,▁▃▃▄▅▅▆▆▆▇▇▇▇▇▇█████
val_loss,█▆▅▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8813
train_acc,0.91409
train_loss,0.02495
val_acc,0.88933
val_loss,0.03076


[34m[1mwandb[0m: Agent Starting Run: cq4pu5wy with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00030 | loss: 0.0373, acc(batch): 0.8839, grad:100932.2099: : 422it [00:50,  8.41it/s]


acc(train): 0.8585, acc(val): 0.8600, acc(test): 0.8477
___________________________________________________________________________________________________


epoch: 1, lr: 0.00030 | loss: 0.0328, acc(batch): 0.9107, grad:99950.8492: : 422it [00:50,  8.39it/s]


acc(train): 0.8723, acc(val): 0.8657, acc(test): 0.8559
___________________________________________________________________________________________________


epoch: 2, lr: 0.00030 | loss: 0.0323, acc(batch): 0.9018, grad:99286.2738: : 422it [00:50,  8.42it/s]


acc(train): 0.8822, acc(val): 0.8685, acc(test): 0.8606
___________________________________________________________________________________________________


epoch: 3, lr: 0.00030 | loss: 0.0314, acc(batch): 0.9107, grad:98320.6648: : 422it [00:50,  8.41it/s]


acc(train): 0.8887, acc(val): 0.8735, acc(test): 0.8633
___________________________________________________________________________________________________


epoch: 4, lr: 0.00030 | loss: 0.0302, acc(batch): 0.9107, grad:97422.6394: : 422it [00:50,  8.43it/s]


acc(train): 0.8940, acc(val): 0.8757, acc(test): 0.8683
___________________________________________________________________________________________________


epoch: 5, lr: 0.00030 | loss: 0.0294, acc(batch): 0.9196, grad:95302.8585: : 422it [00:49,  8.48it/s]


acc(train): 0.8966, acc(val): 0.8773, acc(test): 0.8697
___________________________________________________________________________________________________


epoch: 6, lr: 0.00030 | loss: 0.0287, acc(batch): 0.9286, grad:95747.2129: : 422it [00:49,  8.53it/s]


acc(train): 0.8988, acc(val): 0.8795, acc(test): 0.8703
___________________________________________________________________________________________________


epoch: 7, lr: 0.00030 | loss: 0.0278, acc(batch): 0.9286, grad:94330.2569: : 422it [00:49,  8.49it/s]


acc(train): 0.9022, acc(val): 0.8793, acc(test): 0.8710
___________________________________________________________________________________________________


epoch: 8, lr: 0.00030 | loss: 0.0270, acc(batch): 0.9375, grad:93477.8353: : 422it [00:49,  8.53it/s]


acc(train): 0.9046, acc(val): 0.8808, acc(test): 0.8728
___________________________________________________________________________________________________


epoch: 9, lr: 0.00030 | loss: 0.0268, acc(batch): 0.9375, grad:92662.7078: : 422it [00:49,  8.51it/s]


acc(train): 0.9081, acc(val): 0.8815, acc(test): 0.8728
___________________________________________________________________________________________________


epoch: 10, lr: 0.00030 | loss: 0.0264, acc(batch): 0.9375, grad:92680.4022: : 422it [00:49,  8.52it/s]


acc(train): 0.9104, acc(val): 0.8802, acc(test): 0.8727
___________________________________________________________________________________________________


epoch: 11, lr: 0.00030 | loss: 0.0261, acc(batch): 0.9375, grad:93017.1797: : 422it [00:49,  8.57it/s]


acc(train): 0.9123, acc(val): 0.8792, acc(test): 0.8738
___________________________________________________________________________________________________


epoch: 12, lr: 0.00030 | loss: 0.0256, acc(batch): 0.9375, grad:92889.4153: : 422it [00:49,  8.50it/s]


acc(train): 0.9149, acc(val): 0.8797, acc(test): 0.8742
___________________________________________________________________________________________________


epoch: 13, lr: 0.00030 | loss: 0.0248, acc(batch): 0.9375, grad:92792.5052: : 422it [00:49,  8.48it/s]


acc(train): 0.9155, acc(val): 0.8815, acc(test): 0.8741
___________________________________________________________________________________________________


epoch: 14, lr: 0.00030 | loss: 0.0258, acc(batch): 0.9286, grad:93039.9633: : 422it [00:50,  8.43it/s]


acc(train): 0.9143, acc(val): 0.8770, acc(test): 0.8721
___________________________________________________________________________________________________


epoch: 15, lr: 0.00030 | loss: 0.0262, acc(batch): 0.9464, grad:94656.9018: : 422it [00:50,  8.44it/s]


acc(train): 0.9179, acc(val): 0.8798, acc(test): 0.8738
___________________________________________________________________________________________________


epoch: 16, lr: 0.00030 | loss: 0.0248, acc(batch): 0.9464, grad:94473.2426: : 422it [00:49,  8.52it/s]


acc(train): 0.9176, acc(val): 0.8782, acc(test): 0.8727
___________________________________________________________________________________________________


epoch: 17, lr: 0.00030 | loss: 0.0243, acc(batch): 0.9464, grad:92445.5427: : 422it [00:50,  8.41it/s]


acc(train): 0.9220, acc(val): 0.8805, acc(test): 0.8737
___________________________________________________________________________________________________


epoch: 18, lr: 0.00030 | loss: 0.0251, acc(batch): 0.9196, grad:91908.2061: : 422it [00:49,  8.55it/s]


acc(train): 0.9215, acc(val): 0.8757, acc(test): 0.8706
___________________________________________________________________________________________________


epoch: 19, lr: 0.00030 | loss: 0.0263, acc(batch): 0.9196, grad:91309.4906: : 422it [00:49,  8.45it/s]


acc(train): 0.9196, acc(val): 0.8755, acc(test): 0.8686
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▄▅▆▇▇▇██████▇███▇▇
train_acc,▁▃▄▄▅▅▅▆▆▆▇▇▇▇▇█████
train_loss,█▅▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁
val_acc,▁▃▄▅▆▇▇▇███▇▇█▇▇▇█▆▆
val_loss,█▅▃▂▂▁▁▁▁▁▁▂▂▂▃▃▄▄▅▇

0,1
epoch,19.0
test_acc,0.8686
train_acc,0.91956
train_loss,0.01652
val_acc,0.8755
val_loss,0.03945


[34m[1mwandb[0m: Agent Starting Run: dkxo52c5 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init_method: Xavier_uniform


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00010 | loss: 0.0500, acc(batch): 0.8839, grad:33584.7646: : 422it [00:11, 37.25it/s]


acc(train): 0.8000, acc(val): 0.8002, acc(test): 0.7905
___________________________________________________________________________________________________


epoch: 1, lr: 0.00010 | loss: 0.0370, acc(batch): 0.8839, grad:33569.7829: : 422it [00:11, 36.94it/s]


acc(train): 0.8311, acc(val): 0.8265, acc(test): 0.8202
___________________________________________________________________________________________________


epoch: 2, lr: 0.00010 | loss: 0.0337, acc(batch): 0.9107, grad:33453.4632: : 422it [00:11, 36.63it/s]                       


acc(train): 0.8446, acc(val): 0.8423, acc(test): 0.8316
___________________________________________________________________________________________________


epoch: 3, lr: 0.00010 | loss: 0.0327, acc(batch): 0.8929, grad:33353.3586: : 422it [00:11, 37.06it/s]                       


acc(train): 0.8529, acc(val): 0.8497, acc(test): 0.8383
___________________________________________________________________________________________________


epoch: 4, lr: 0.00010 | loss: 0.0323, acc(batch): 0.8929, grad:33350.4648: : 422it [00:11, 37.02it/s]


acc(train): 0.8584, acc(val): 0.8557, acc(test): 0.8429
___________________________________________________________________________________________________


epoch: 5, lr: 0.00010 | loss: 0.0318, acc(batch): 0.8929, grad:33401.2496: : 422it [00:11, 37.54it/s]                       


acc(train): 0.8635, acc(val): 0.8592, acc(test): 0.8462
___________________________________________________________________________________________________


epoch: 6, lr: 0.00010 | loss: 0.0314, acc(batch): 0.8929, grad:33359.7711: : 422it [00:11, 36.95it/s]                       


acc(train): 0.8674, acc(val): 0.8612, acc(test): 0.8509
___________________________________________________________________________________________________


epoch: 7, lr: 0.00010 | loss: 0.0310, acc(batch): 0.8929, grad:33232.7049: : 422it [00:11, 37.52it/s]                       


acc(train): 0.8714, acc(val): 0.8642, acc(test): 0.8533
___________________________________________________________________________________________________


epoch: 8, lr: 0.00010 | loss: 0.0306, acc(batch): 0.8929, grad:33102.8670: : 422it [00:11, 37.70it/s]


acc(train): 0.8739, acc(val): 0.8667, acc(test): 0.8556
___________________________________________________________________________________________________


epoch: 9, lr: 0.00010 | loss: 0.0302, acc(batch): 0.8839, grad:32939.8454: : 422it [00:11, 37.40it/s]                       


acc(train): 0.8764, acc(val): 0.8687, acc(test): 0.8576
___________________________________________________________________________________________________


epoch: 10, lr: 0.00010 | loss: 0.0298, acc(batch): 0.8839, grad:32797.0708: : 422it [00:11, 36.88it/s]


acc(train): 0.8786, acc(val): 0.8703, acc(test): 0.8603
___________________________________________________________________________________________________


epoch: 11, lr: 0.00010 | loss: 0.0294, acc(batch): 0.8839, grad:32712.9691: : 422it [00:11, 36.88it/s]                       


acc(train): 0.8807, acc(val): 0.8717, acc(test): 0.8614
___________________________________________________________________________________________________


epoch: 12, lr: 0.00010 | loss: 0.0290, acc(batch): 0.8929, grad:32648.4800: : 422it [00:11, 37.54it/s]


acc(train): 0.8827, acc(val): 0.8718, acc(test): 0.8619
___________________________________________________________________________________________________


epoch: 13, lr: 0.00010 | loss: 0.0287, acc(batch): 0.8929, grad:32629.8726: : 422it [00:11, 37.34it/s]                       


acc(train): 0.8845, acc(val): 0.8728, acc(test): 0.8632
___________________________________________________________________________________________________


epoch: 14, lr: 0.00010 | loss: 0.0283, acc(batch): 0.8929, grad:32660.7244: : 422it [00:11, 37.17it/s]                       


acc(train): 0.8860, acc(val): 0.8733, acc(test): 0.8643
___________________________________________________________________________________________________


epoch: 15, lr: 0.00010 | loss: 0.0280, acc(batch): 0.8929, grad:32709.8503: : 422it [00:11, 36.73it/s]                       


acc(train): 0.8876, acc(val): 0.8737, acc(test): 0.8655
___________________________________________________________________________________________________


epoch: 16, lr: 0.00010 | loss: 0.0278, acc(batch): 0.8929, grad:32768.7120: : 422it [00:11, 36.51it/s]                       


acc(train): 0.8890, acc(val): 0.8745, acc(test): 0.8668
___________________________________________________________________________________________________


epoch: 17, lr: 0.00010 | loss: 0.0275, acc(batch): 0.9018, grad:32812.6700: : 422it [00:11, 37.15it/s]                       


acc(train): 0.8905, acc(val): 0.8753, acc(test): 0.8677
___________________________________________________________________________________________________


epoch: 18, lr: 0.00010 | loss: 0.0273, acc(batch): 0.9018, grad:32855.9469: : 422it [00:11, 36.61it/s]                       


acc(train): 0.8920, acc(val): 0.8755, acc(test): 0.8680
___________________________________________________________________________________________________


epoch: 19, lr: 0.00010 | loss: 0.0271, acc(batch): 0.9018, grad:32893.8976: : 422it [00:11, 37.34it/s]


acc(train): 0.8932, acc(val): 0.8768, acc(test): 0.8683
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▄▅▅▆▆▆▇▇▇▇▇▇███████
train_acc,▁▃▄▅▅▆▆▆▇▇▇▇▇▇▇█████
train_loss,█▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▃▅▆▆▆▇▇▇▇▇█████████
val_loss,█▅▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8683
train_acc,0.89324
train_loss,0.03023
val_acc,0.87683
val_loss,0.0334


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: xv7ymibt with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: He_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00050 | loss: 0.0491, acc(batch): 0.8571, grad:36631.3625: : 422it [00:12, 34.78it/s]                       


acc(train): 0.8406, acc(val): 0.8422, acc(test): 0.8302
___________________________________________________________________________________________________


epoch: 1, lr: 0.00050 | loss: 0.0434, acc(batch): 0.8661, grad:36830.3102: : 422it [00:12, 34.66it/s]


acc(train): 0.8547, acc(val): 0.8570, acc(test): 0.8431
___________________________________________________________________________________________________


epoch: 2, lr: 0.00050 | loss: 0.0400, acc(batch): 0.8839, grad:36782.1833: : 422it [00:12, 34.40it/s]                       


acc(train): 0.8637, acc(val): 0.8623, acc(test): 0.8513
___________________________________________________________________________________________________


epoch: 3, lr: 0.00050 | loss: 0.0378, acc(batch): 0.9018, grad:36787.1750: : 422it [00:12, 34.54it/s]                       


acc(train): 0.8696, acc(val): 0.8667, acc(test): 0.8537
___________________________________________________________________________________________________


epoch: 4, lr: 0.00050 | loss: 0.0362, acc(batch): 0.9107, grad:36761.0226: : 422it [00:12, 34.53it/s]                       


acc(train): 0.8736, acc(val): 0.8667, acc(test): 0.8545
___________________________________________________________________________________________________


epoch: 5, lr: 0.00050 | loss: 0.0347, acc(batch): 0.9196, grad:36362.3463: : 422it [00:12, 34.10it/s]                       


acc(train): 0.8768, acc(val): 0.8687, acc(test): 0.8586
___________________________________________________________________________________________________


epoch: 6, lr: 0.00050 | loss: 0.0336, acc(batch): 0.9196, grad:35995.0995: : 422it [00:12, 34.67it/s]


acc(train): 0.8782, acc(val): 0.8683, acc(test): 0.8581
___________________________________________________________________________________________________


epoch: 7, lr: 0.00050 | loss: 0.0328, acc(batch): 0.9196, grad:35995.1145: : 422it [00:12, 34.97it/s]                       


acc(train): 0.8805, acc(val): 0.8693, acc(test): 0.8613
___________________________________________________________________________________________________


epoch: 8, lr: 0.00050 | loss: 0.0323, acc(batch): 0.9196, grad:36097.4167: : 422it [00:12, 34.32it/s]                       


acc(train): 0.8822, acc(val): 0.8697, acc(test): 0.8624
___________________________________________________________________________________________________


epoch: 9, lr: 0.00050 | loss: 0.0315, acc(batch): 0.9196, grad:35733.2205: : 422it [00:12, 34.87it/s]


acc(train): 0.8839, acc(val): 0.8715, acc(test): 0.8648
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_acc,▁▄▅▆▆▇▇▇██
train_acc,▁▃▅▆▆▇▇▇██
train_loss,█▄▃▃▂▂▂▁▁▁
val_acc,▁▅▆▇▇▇▇▇██
val_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,9.0
test_acc,0.8648
train_acc,0.88387
train_loss,0.03272
val_acc,0.8715
val_loss,0.03582


[34m[1mwandb[0m: Agent Starting Run: pf6v970q with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init_method: He_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00030 | loss: 0.0445, acc(batch): 0.8571, grad:173785.9407: : 422it [02:21,  2.98it/s]


acc(train): 0.8551, acc(val): 0.8510, acc(test): 0.8426
___________________________________________________________________________________________________


epoch: 1, lr: 0.00030 | loss: 0.0406, acc(batch): 0.8482, grad:174813.6768: : 422it [02:20,  3.00it/s]


acc(train): 0.8742, acc(val): 0.8648, acc(test): 0.8586
___________________________________________________________________________________________________


epoch: 2, lr: 0.00030 | loss: 0.0372, acc(batch): 0.8839, grad:175121.9531: : 422it [02:23,  2.94it/s]


acc(train): 0.8857, acc(val): 0.8725, acc(test): 0.8660
___________________________________________________________________________________________________


epoch: 3, lr: 0.00030 | loss: 0.0344, acc(batch): 0.8839, grad:174435.4188: : 422it [02:23,  2.94it/s]


acc(train): 0.8946, acc(val): 0.8762, acc(test): 0.8715
___________________________________________________________________________________________________


epoch: 4, lr: 0.00030 | loss: 0.0321, acc(batch): 0.9107, grad:174126.2045: : 422it [02:22,  2.97it/s]


acc(train): 0.9017, acc(val): 0.8793, acc(test): 0.8730
___________________________________________________________________________________________________


epoch: 5, lr: 0.00030 | loss: 0.0301, acc(batch): 0.9196, grad:173929.1696: : 422it [02:21,  2.98it/s]


acc(train): 0.9061, acc(val): 0.8807, acc(test): 0.8759
___________________________________________________________________________________________________


epoch: 6, lr: 0.00030 | loss: 0.0282, acc(batch): 0.9196, grad:173962.4115: : 422it [02:21,  2.99it/s]


acc(train): 0.9099, acc(val): 0.8833, acc(test): 0.8764
___________________________________________________________________________________________________


epoch: 7, lr: 0.00030 | loss: 0.0261, acc(batch): 0.9375, grad:174261.7242: : 422it [02:21,  2.98it/s]


acc(train): 0.9137, acc(val): 0.8843, acc(test): 0.8788
___________________________________________________________________________________________________


epoch: 8, lr: 0.00030 | loss: 0.0239, acc(batch): 0.9464, grad:174709.6601: : 422it [02:21,  2.97it/s]


acc(train): 0.9166, acc(val): 0.8852, acc(test): 0.8803
___________________________________________________________________________________________________


epoch: 9, lr: 0.00030 | loss: 0.0215, acc(batch): 0.9643, grad:174098.1276: : 422it [02:22,  2.96it/s]


acc(train): 0.9179, acc(val): 0.8862, acc(test): 0.8802
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_acc,▁▄▅▆▇▇▇███
train_acc,▁▃▄▅▆▇▇███
train_loss,█▅▄▃▃▂▂▂▁▁
val_acc,▁▄▅▆▇▇▇███
val_loss,█▅▃▂▂▁▁▁▁▁

0,1
epoch,9.0
test_acc,0.8802
train_acc,0.91785
train_loss,0.01967
val_acc,0.88617
val_loss,0.03189


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: g3ntr2cp with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00100 | loss: 0.0706, acc(batch): 0.7679, grad:31912.3937: : 422it [00:11, 36.80it/s]                       


acc(train): 0.7349, acc(val): 0.7357, acc(test): 0.7278
___________________________________________________________________________________________________


epoch: 1, lr: 0.00100 | loss: 0.0528, acc(batch): 0.8482, grad:31482.9904: : 422it [00:11, 36.60it/s]                       


acc(train): 0.7897, acc(val): 0.7880, acc(test): 0.7816
___________________________________________________________________________________________________


epoch: 2, lr: 0.00100 | loss: 0.0473, acc(batch): 0.8482, grad:30700.2445: : 422it [00:11, 35.88it/s]                       


acc(train): 0.8208, acc(val): 0.8155, acc(test): 0.8093
___________________________________________________________________________________________________


epoch: 3, lr: 0.00100 | loss: 0.0433, acc(batch): 0.8571, grad:30182.6538: : 422it [00:11, 36.47it/s]                       


acc(train): 0.8402, acc(val): 0.8347, acc(test): 0.8285
___________________________________________________________________________________________________


epoch: 4, lr: 0.00100 | loss: 0.0406, acc(batch): 0.8750, grad:30181.3648: : 422it [00:11, 36.59it/s]                       


acc(train): 0.8505, acc(val): 0.8433, acc(test): 0.8383
___________________________________________________________________________________________________


epoch: 5, lr: 0.00100 | loss: 0.0393, acc(batch): 0.8929, grad:30223.8044: : 422it [00:11, 36.46it/s]                       


acc(train): 0.8557, acc(val): 0.8478, acc(test): 0.8423
___________________________________________________________________________________________________


epoch: 6, lr: 0.00100 | loss: 0.0386, acc(batch): 0.9018, grad:30265.7466: : 422it [00:11, 36.63it/s]                       


acc(train): 0.8596, acc(val): 0.8510, acc(test): 0.8459
___________________________________________________________________________________________________


epoch: 7, lr: 0.00100 | loss: 0.0380, acc(batch): 0.9018, grad:30277.6865: : 422it [00:11, 36.78it/s]                       


acc(train): 0.8625, acc(val): 0.8542, acc(test): 0.8471
___________________________________________________________________________________________________


epoch: 8, lr: 0.00100 | loss: 0.0377, acc(batch): 0.9018, grad:30269.4015: : 422it [00:11, 37.27it/s]                       


acc(train): 0.8644, acc(val): 0.8575, acc(test): 0.8488
___________________________________________________________________________________________________


epoch: 9, lr: 0.00100 | loss: 0.0374, acc(batch): 0.9018, grad:30204.7168: : 422it [00:11, 37.24it/s]                       


acc(train): 0.8669, acc(val): 0.8592, acc(test): 0.8506
___________________________________________________________________________________________________


epoch: 10, lr: 0.00100 | loss: 0.0372, acc(batch): 0.9018, grad:30111.8622: : 422it [00:11, 35.79it/s]


acc(train): 0.8693, acc(val): 0.8617, acc(test): 0.8518
___________________________________________________________________________________________________


epoch: 11, lr: 0.00100 | loss: 0.0370, acc(batch): 0.9018, grad:30010.6992: : 422it [00:11, 36.59it/s]


acc(train): 0.8713, acc(val): 0.8617, acc(test): 0.8536
___________________________________________________________________________________________________


epoch: 12, lr: 0.00100 | loss: 0.0369, acc(batch): 0.9018, grad:29911.6027: : 422it [00:11, 36.29it/s]


acc(train): 0.8727, acc(val): 0.8623, acc(test): 0.8550
___________________________________________________________________________________________________


epoch: 13, lr: 0.00100 | loss: 0.0368, acc(batch): 0.9018, grad:29817.3005: : 422it [00:11, 36.85it/s]                       


acc(train): 0.8739, acc(val): 0.8633, acc(test): 0.8549
___________________________________________________________________________________________________


epoch: 14, lr: 0.00100 | loss: 0.0367, acc(batch): 0.9018, grad:29727.9019: : 422it [00:11, 36.68it/s]                       


acc(train): 0.8747, acc(val): 0.8640, acc(test): 0.8550
___________________________________________________________________________________________________


epoch: 15, lr: 0.00100 | loss: 0.0367, acc(batch): 0.8929, grad:29645.6071: : 422it [00:11, 35.92it/s]                       


acc(train): 0.8756, acc(val): 0.8638, acc(test): 0.8557
___________________________________________________________________________________________________


epoch: 16, lr: 0.00100 | loss: 0.0366, acc(batch): 0.8929, grad:29574.0485: : 422it [00:11, 36.26it/s]                       


acc(train): 0.8765, acc(val): 0.8655, acc(test): 0.8566
___________________________________________________________________________________________________


epoch: 17, lr: 0.00100 | loss: 0.0366, acc(batch): 0.8839, grad:29510.5106: : 422it [00:11, 35.87it/s]


acc(train): 0.8771, acc(val): 0.8652, acc(test): 0.8575
___________________________________________________________________________________________________


epoch: 18, lr: 0.00100 | loss: 0.0367, acc(batch): 0.8839, grad:29452.3214: : 422it [00:11, 36.26it/s]


acc(train): 0.8778, acc(val): 0.8662, acc(test): 0.8578
___________________________________________________________________________________________________


epoch: 19, lr: 0.00100 | loss: 0.0367, acc(batch): 0.8929, grad:29401.0394: : 422it [00:11, 35.43it/s]                       


acc(train): 0.8783, acc(val): 0.8658, acc(test): 0.8587
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57941091539…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▄▅▆▇▇▇▇▇███████████
train_acc,▁▄▅▆▇▇▇▇▇▇██████████
train_loss,█▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▄▅▆▇▇▇▇████████████
val_loss,█▅▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8587
train_acc,0.87828
train_loss,0.03443
val_acc,0.86583
val_loss,0.03768


[34m[1mwandb[0m: Agent Starting Run: mrsawphd with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00050 | loss: 0.0383, acc(batch): 0.9018, grad:84510.3052: : 422it [00:44,  9.49it/s]


acc(train): 0.8624, acc(val): 0.8553, acc(test): 0.8507
___________________________________________________________________________________________________


epoch: 1, lr: 0.00050 | loss: 0.0339, acc(batch): 0.9018, grad:82911.1339: : 422it [00:44,  9.44it/s]


acc(train): 0.8745, acc(val): 0.8627, acc(test): 0.8562
___________________________________________________________________________________________________


epoch: 2, lr: 0.00050 | loss: 0.0321, acc(batch): 0.9107, grad:82299.1022: : 422it [00:44,  9.47it/s]                       


acc(train): 0.8842, acc(val): 0.8678, acc(test): 0.8642
___________________________________________________________________________________________________


epoch: 3, lr: 0.00050 | loss: 0.0306, acc(batch): 0.9107, grad:82730.9595: : 422it [00:44,  9.45it/s]                       


acc(train): 0.8876, acc(val): 0.8707, acc(test): 0.8651
___________________________________________________________________________________________________


epoch: 4, lr: 0.00050 | loss: 0.0314, acc(batch): 0.9107, grad:82695.8833: : 422it [00:45,  9.34it/s]


acc(train): 0.8922, acc(val): 0.8735, acc(test): 0.8670
___________________________________________________________________________________________________


epoch: 5, lr: 0.00050 | loss: 0.0300, acc(batch): 0.9286, grad:81280.7322: : 422it [00:44,  9.42it/s]


acc(train): 0.8960, acc(val): 0.8758, acc(test): 0.8693
___________________________________________________________________________________________________


epoch: 6, lr: 0.00050 | loss: 0.0292, acc(batch): 0.9107, grad:79421.4574: : 422it [00:44,  9.42it/s]


acc(train): 0.8999, acc(val): 0.8772, acc(test): 0.8690
___________________________________________________________________________________________________


epoch: 7, lr: 0.00050 | loss: 0.0288, acc(batch): 0.9107, grad:81245.3982: : 422it [00:44,  9.38it/s]


acc(train): 0.9034, acc(val): 0.8797, acc(test): 0.8723
___________________________________________________________________________________________________


epoch: 8, lr: 0.00050 | loss: 0.0288, acc(batch): 0.9107, grad:79396.4565: : 422it [00:44,  9.57it/s]


acc(train): 0.9054, acc(val): 0.8820, acc(test): 0.8717
___________________________________________________________________________________________________


epoch: 9, lr: 0.00050 | loss: 0.0275, acc(batch): 0.9286, grad:80314.0182: : 422it [00:44,  9.47it/s]


acc(train): 0.9077, acc(val): 0.8822, acc(test): 0.8727
___________________________________________________________________________________________________


epoch: 10, lr: 0.00050 | loss: 0.0263, acc(batch): 0.9375, grad:79100.8522: : 422it [00:44,  9.55it/s]                       


acc(train): 0.9109, acc(val): 0.8860, acc(test): 0.8749
___________________________________________________________________________________________________


epoch: 11, lr: 0.00050 | loss: 0.0246, acc(batch): 0.9464, grad:77878.5610: : 422it [00:44,  9.53it/s]                       


acc(train): 0.9113, acc(val): 0.8848, acc(test): 0.8740
___________________________________________________________________________________________________


epoch: 12, lr: 0.00050 | loss: 0.0241, acc(batch): 0.9375, grad:77220.4674: : 422it [00:44,  9.44it/s]


acc(train): 0.9134, acc(val): 0.8857, acc(test): 0.8762
___________________________________________________________________________________________________


epoch: 13, lr: 0.00050 | loss: 0.0238, acc(batch): 0.9554, grad:77828.9646: : 422it [00:45,  9.27it/s]


acc(train): 0.9174, acc(val): 0.8862, acc(test): 0.8787
___________________________________________________________________________________________________


epoch: 14, lr: 0.00050 | loss: 0.0248, acc(batch): 0.9464, grad:75986.5784: : 422it [00:45,  9.35it/s]


acc(train): 0.9175, acc(val): 0.8872, acc(test): 0.8773
___________________________________________________________________________________________________


epoch: 15, lr: 0.00050 | loss: 0.0233, acc(batch): 0.9464, grad:77027.1146: : 422it [00:44,  9.51it/s]                       


acc(train): 0.9193, acc(val): 0.8878, acc(test): 0.8782
___________________________________________________________________________________________________


epoch: 16, lr: 0.00050 | loss: 0.0218, acc(batch): 0.9554, grad:76915.1394: : 422it [00:47,  8.94it/s]


acc(train): 0.9218, acc(val): 0.8890, acc(test): 0.8790
___________________________________________________________________________________________________


epoch: 17, lr: 0.00050 | loss: 0.0224, acc(batch): 0.9375, grad:75114.9376: : 422it [00:51,  8.21it/s]


acc(train): 0.9238, acc(val): 0.8910, acc(test): 0.8809
___________________________________________________________________________________________________


epoch: 18, lr: 0.00050 | loss: 0.0224, acc(batch): 0.9464, grad:75826.0208: : 422it [00:48,  8.77it/s]


acc(train): 0.9234, acc(val): 0.8880, acc(test): 0.8780
___________________________________________________________________________________________________


epoch: 19, lr: 0.00050 | loss: 0.0206, acc(batch): 0.9554, grad:76208.1416: : 422it [00:47,  8.90it/s]


acc(train): 0.9263, acc(val): 0.8910, acc(test): 0.8807
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57943385326…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▂▄▄▅▅▅▆▆▆▇▆▇▇▇▇██▇█
train_acc,▁▂▃▄▄▅▅▅▆▆▆▆▇▇▇▇████
train_loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
val_acc,▁▂▃▄▅▅▅▆▆▆▇▇▇▇▇▇██▇█
val_loss,█▅▃▃▂▂▂▁▁▁▁▂▂▁▂▂▁▁▂▁

0,1
epoch,19.0
test_acc,0.8807
train_acc,0.92631
train_loss,0.0172
val_acc,0.891
val_loss,0.03284


[34m[1mwandb[0m: Agent Starting Run: zxhwhi0m with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init_method: Xavier_normal


(54000, 784) (54000, 10) (10000, 784) (10000, 10) (6000, 784) (6000, 10)


epoch: 0, lr: 0.00050 | loss: 0.0752, acc(batch): 0.7500, grad:19615.4083: : 1688it [00:25, 65.19it/s]                        


acc(train): 0.7499, acc(val): 0.7467, acc(test): 0.7427
___________________________________________________________________________________________________


epoch: 1, lr: 0.00050 | loss: 0.0515, acc(batch): 0.8750, grad:19987.6963: : 1688it [00:26, 63.00it/s]                        


acc(train): 0.7876, acc(val): 0.7845, acc(test): 0.7767
___________________________________________________________________________________________________


epoch: 2, lr: 0.00050 | loss: 0.0431, acc(batch): 0.8750, grad:19817.0564: : 1688it [00:25, 65.14it/s]                        


acc(train): 0.8078, acc(val): 0.8083, acc(test): 0.7950
___________________________________________________________________________________________________


epoch: 3, lr: 0.00050 | loss: 0.0402, acc(batch): 0.8750, grad:19635.1823: : 1688it [00:26, 64.29it/s]                        


acc(train): 0.8181, acc(val): 0.8198, acc(test): 0.8073
___________________________________________________________________________________________________


epoch: 4, lr: 0.00050 | loss: 0.0393, acc(batch): 0.8750, grad:19474.7674: : 1688it [00:28, 59.36it/s]


acc(train): 0.8260, acc(val): 0.8250, acc(test): 0.8134
___________________________________________________________________________________________________


epoch: 5, lr: 0.00050 | loss: 0.0389, acc(batch): 0.8750, grad:19296.0841: : 1688it [00:26, 64.02it/s]                        


acc(train): 0.8313, acc(val): 0.8297, acc(test): 0.8174
___________________________________________________________________________________________________


epoch: 6, lr: 0.00050 | loss: 0.0387, acc(batch): 0.8750, grad:19116.1983: : 1688it [00:24, 69.73it/s]                        


acc(train): 0.8354, acc(val): 0.8338, acc(test): 0.8214
___________________________________________________________________________________________________


epoch: 7, lr: 0.00050 | loss: 0.0385, acc(batch): 0.8750, grad:19000.3456: : 1688it [00:24, 68.44it/s]                        


acc(train): 0.8395, acc(val): 0.8382, acc(test): 0.8253
___________________________________________________________________________________________________


epoch: 8, lr: 0.00050 | loss: 0.0384, acc(batch): 0.8750, grad:18968.1091: : 1688it [00:24, 69.88it/s]                        


acc(train): 0.8427, acc(val): 0.8402, acc(test): 0.8273
___________________________________________________________________________________________________


epoch: 9, lr: 0.00050 | loss: 0.0384, acc(batch): 0.8750, grad:18927.1525: : 1688it [00:24, 68.34it/s]                        


acc(train): 0.8450, acc(val): 0.8430, acc(test): 0.8294
___________________________________________________________________________________________________


epoch: 10, lr: 0.00050 | loss: 0.0384, acc(batch): 0.8125, grad:18881.5846: : 1688it [00:24, 68.01it/s]                        


acc(train): 0.8471, acc(val): 0.8453, acc(test): 0.8313
___________________________________________________________________________________________________


epoch: 11, lr: 0.00050 | loss: 0.0384, acc(batch): 0.8125, grad:18850.6842: : 1688it [00:24, 68.67it/s]                        


acc(train): 0.8491, acc(val): 0.8478, acc(test): 0.8329
___________________________________________________________________________________________________


epoch: 12, lr: 0.00050 | loss: 0.0384, acc(batch): 0.8125, grad:18841.7971: : 1688it [00:24, 67.58it/s]                        


acc(train): 0.8509, acc(val): 0.8483, acc(test): 0.8348
___________________________________________________________________________________________________


epoch: 13, lr: 0.00050 | loss: 0.0385, acc(batch): 0.8125, grad:18848.7130: : 1688it [00:25, 67.07it/s]                        


acc(train): 0.8523, acc(val): 0.8488, acc(test): 0.8364
___________________________________________________________________________________________________


epoch: 14, lr: 0.00050 | loss: 0.0386, acc(batch): 0.8125, grad:18839.1270: : 1688it [00:24, 67.56it/s]                        


acc(train): 0.8541, acc(val): 0.8510, acc(test): 0.8377
___________________________________________________________________________________________________


epoch: 15, lr: 0.00050 | loss: 0.0387, acc(batch): 0.8125, grad:18816.7240: : 1688it [00:25, 67.25it/s]                        


acc(train): 0.8556, acc(val): 0.8525, acc(test): 0.8393
___________________________________________________________________________________________________


epoch: 16, lr: 0.00050 | loss: 0.0387, acc(batch): 0.8125, grad:18785.4361: : 1688it [00:26, 64.70it/s]                        


acc(train): 0.8569, acc(val): 0.8542, acc(test): 0.8402
___________________________________________________________________________________________________


epoch: 17, lr: 0.00050 | loss: 0.0388, acc(batch): 0.8125, grad:18755.4512: : 1688it [00:24, 68.07it/s]                        


acc(train): 0.8580, acc(val): 0.8543, acc(test): 0.8415
___________________________________________________________________________________________________


epoch: 18, lr: 0.00050 | loss: 0.0388, acc(batch): 0.8125, grad:18722.3485: : 1688it [00:25, 66.89it/s]


acc(train): 0.8592, acc(val): 0.8552, acc(test): 0.8421
___________________________________________________________________________________________________


epoch: 19, lr: 0.00050 | loss: 0.0389, acc(batch): 0.8125, grad:18683.6124: : 1688it [00:24, 67.53it/s]                        


acc(train): 0.8603, acc(val): 0.8558, acc(test): 0.8423
___________________________________________________________________________________________________


VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.57929642445…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc,▁▃▅▆▆▆▇▇▇▇▇▇▇███████
train_acc,▁▃▅▅▆▆▆▇▇▇▇▇▇▇██████
train_loss,█▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▃▅▆▆▆▇▇▇▇▇▇████████
val_loss,█▅▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,0.8423
train_acc,0.86026
train_loss,0.03931
val_acc,0.85583
val_loss,0.04049
