In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import torch.optim as optim


from zenml.pipelines import pipeline
from zenml.steps import step, Output

import optuna
from optuna.trial import TrialState

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# This module-level name is read by the training and evaluation functions.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
@pipeline
def pytorch_hpo_pipeline(
    load_data,
    run_hpo,
    train_test,
):
    """A `pipeline` chaining data loading, hyperparameter search and final training.

    The two dataloaders returned by `load_data` are passed to `run_hpo`; the
    hyperparameters it returns are then handed, together with the same two
    dataloaders, to `train_test`.
    """
    train_loader, test_loader = load_data()
    tuned_hparams = run_hpo(train_loader, test_loader)
    train_test(tuned_hparams, train_loader, test_loader)


In [3]:
def get_mnist(batch_size: int = 64, root: str = "data"):
    """Build train and test dataloaders for the Fashion MNIST dataset.

    Despite the function name, the dataset used is `datasets.FashionMNIST`.
    This is a plain helper (not a ZenML step); it is wrapped by `load_data`.
    Both splits are requested with `download=True` and converted to tensors
    with `ToTensor()`.

    Args:
        batch_size: Number of samples per batch, used for both dataloaders.
        root: Directory passed to `datasets.FashionMNIST` as its `root`.

    Returns:
        A `(train_dataloader, test_dataloader)` tuple of `DataLoader` objects.
    """

    def _load_split(train: bool):
        # The two splits differ only in the `train` flag.
        return datasets.FashionMNIST(
            root=root,
            train=train,
            download=True,
            transform=ToTensor(),
        )

    training_data = _load_split(train=True)
    test_data = _load_split(train=False)

    # NOTE(review): no `shuffle` argument is passed, so the training loader
    # uses the DataLoader default ordering — confirm this is intended.
    train_dataloader = DataLoader(training_data, batch_size=batch_size)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    return train_dataloader, test_dataloader

@step
def load_data() -> Output(
    train_dataloader=DataLoader, test_dataloader=DataLoader
):
    """A `step` exposing the dataloaders from `get_mnist` as two named outputs."""
    loaders = get_mnist()
    train_loader, test_loader = loaders

    return train_loader, test_loader

In [4]:
def define_hpo_model(trial):
    """Assemble a candidate network whose shape is sampled from `trial`.

    The number of hidden layers ("n_layers", 1 to 3) and, for each hidden
    layer `i`, its width ("n_units_l{i}", 4 to 128) and dropout probability
    ("dropout_l{i}", 0.2 to 0.5) are requested from the trial.

    Args:
        trial: Object providing `suggest_int` and `suggest_float`.

    Returns:
        An `nn.Sequential` of Flatten, the sampled Linear/ReLU/Dropout groups,
        a final Linear layer with 10 outputs and `LogSoftmax(dim=1)`.
    """
    num_classes = 10
    depth = trial.suggest_int("n_layers", 1, 3)
    # Input width of the next Linear layer; the first one is fixed at 28 * 28.
    width = 28 * 28
    modules = [nn.Flatten()]

    for idx in range(depth):
        hidden = trial.suggest_int(f"n_units_l{idx}", 4, 128)
        linear = nn.Linear(width, hidden)
        drop_prob = trial.suggest_float(f"dropout_l{idx}", 0.2, 0.5)
        modules.extend([linear, nn.ReLU(), nn.Dropout(drop_prob)])
        width = hidden

    modules.extend([nn.Linear(width, num_classes), nn.LogSoftmax(dim=1)])

    return nn.Sequential(*modules)

def objective(trial, train_dataloader, test_dataloader):
    """Optuna objective: train a sampled model for one epoch and score it.

    The network comes from `define_hpo_model(trial)`. The optimizer class
    ("Adam", "RMSprop" or "SGD") and the learning rate (between 1e-5 and 1e-1,
    sampled with `log=True`) are also requested from `trial`. The model makes
    a single pass over `train_dataloader` and is then evaluated on
    `test_dataloader`.

    NOTE(review): the returned score is measured on `test_dataloader`, and
    `train_test` later reports accuracy on that same loader, so the final
    number is optimistically biased. Consider tuning on a separate
    validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        train_dataloader: Batches of `(X, y)` used for the training pass.
        test_dataloader: Batches of `(X, y)` used to compute the score.

    Returns:
        Accuracy on `test_dataloader` as a percentage.
    """
    # Generate model
    model = define_hpo_model(trial).to(device)

    # Sample the optimizer configuration
    loss_fn = nn.CrossEntropyLoss()
    optimizer_name = trial.suggest_categorical("optimizer_name", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    # Train for a single epoch
    model.train()
    for X, y in train_dataloader:
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate
    size = len(test_dataloader.dataset)
    num_batches = len(test_dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in test_dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches  # mean of the per-batch losses
    correct /= size  # fraction of correctly classified samples
    test_accuracy = 100*correct
    print(f"Test Error: \n Accuracy: {test_accuracy:>0.1f}%, Avg loss: {test_loss:>8f} \n")

    return test_accuracy


@step
def run_hpo(
    train_dataloader: DataLoader,
    test_dataloader: DataLoader
) -> dict:
    """A `step` that runs an Optuna study and returns the best parameters.

    The study maximizes the value returned by `objective`, and is given a
    budget of `n_trials=30` and `timeout=600`. Summary statistics and the
    best trial are printed before its parameter dict is returned.
    """

    def _score(trial):
        # Bind the dataloaders so Optuna only has to supply the trial.
        return objective(trial, train_dataloader, test_dataloader)

    study = optuna.create_study(direction="maximize")
    study.optimize(_score, n_trials=30, timeout=600)

    n_pruned = len(study.get_trials(deepcopy=False, states=[TrialState.PRUNED]))
    n_complete = len(study.get_trials(deepcopy=False, states=[TrialState.COMPLETE]))

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", n_pruned)
    print("  Number of complete trials: ", n_complete)

    print("Best trial:")
    winner = study.best_trial

    print("  Value: ", winner.value)

    print("  Params: ")
    for name, setting in winner.params.items():
        print("    {}: {}".format(name, setting))

    best_params = winner.params
    print("Found best hparam dict")
    print(best_params)

    return best_params

In [5]:
def define_model(hparam:dict):
    """Rebuild the network described by a hyperparameter dict.

    Args:
        hparam: Mapping with "n_layers" and, for every hidden layer `i`,
            "n_units_l{i}" and "dropout_l{i}". Other keys are ignored here.

    Returns:
        An `nn.Sequential` of Flatten, one Linear/ReLU/Dropout group per
        hidden layer, a final Linear layer with 10 outputs and
        `LogSoftmax(dim=1)`.
    """
    num_classes = 10
    # Input width of the next Linear layer; the first one is fixed at 28 * 28.
    width = 28 * 28
    modules = [nn.Flatten()]

    print(hparam)

    for idx in range(hparam['n_layers']):
        hidden = hparam[f"n_units_l{idx}"]
        linear = nn.Linear(width, hidden)
        drop_prob = hparam[f"dropout_l{idx}"]
        modules.extend([linear, nn.ReLU(), nn.Dropout(drop_prob)])
        width = hidden

    modules.extend([nn.Linear(width, num_classes), nn.LogSoftmax(dim=1)])

    return nn.Sequential(*modules)

In [6]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over `dataloader`.

    The model is put in training mode, and every batch is moved to the global
    `device` before a forward/backward pass and an optimizer step. The
    current loss is printed on every 100th batch.
    """
    size = len(dataloader.dataset)
    model.train()
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass
        loss = loss_fn(model(inputs), targets)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 100 != 0:
            continue

        # Progress report: scalar loss and batch index times current batch length.
        loss, current = loss.item(), batch_idx * len(inputs)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    """Evaluate `model` on `dataloader` and return its accuracy in percent.

    Runs in eval mode under `torch.no_grad()`, prints the accuracy and the
    loss averaged over batches, and returns the accuracy.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            test_loss += loss_fn(outputs, targets).item()
            # 1.0 where the highest-scoring class matches the label, else 0.0
            hits = (outputs.argmax(1) == targets).type(torch.float)
            correct += hits.sum().item()
    test_loss /= num_batches
    correct /= size
    test_accuracy = 100*correct
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

    return test_accuracy

In [7]:
@step
def train_test(
    best_hparams: dict,
    train_dataloader: DataLoader,
    test_dataloader: DataLoader
) -> Output(trained_model=nn.Module, test_acc=float):
    """A `step` that trains the tuned model for 5 epochs and evaluates it.

    The architecture is rebuilt by `define_model`, and the optimizer class
    and learning rate are read from `best_hparams`. After every epoch the
    model is evaluated on `test_dataloader`; the accuracy from the last epoch
    is returned alongside the trained model.
    """
    epochs = 5

    model = define_model(best_hparams)

    print("Training optimized model:")
    print(model)

    model = model.to(device)
    loss_fn = nn.CrossEntropyLoss()

    learning_rate = best_hparams['lr']
    optimizer_cls = getattr(optim, best_hparams['optimizer_name'])
    optimizer = optimizer_cls(model.parameters(), lr=learning_rate)

    test_acc = 0
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, loss_fn, optimizer)
        test_acc = test(test_dataloader, model, loss_fn)
    print("Done!")

    return model, test_acc

In [8]:
# Instantiate each step, assemble the pipeline from them, and execute it
# with `unlisted=True`.
pytorch_hpo_pipeline(
    load_data=load_data(),
    run_hpo=run_hpo(),
    train_test=train_test(),
).run(unlisted=True)

[33mUnable to find ZenML repository in your current working directory (/home/dnth/Desktop/zenml-optuna) or any parent directories. If you want to use an existing repository which is in a different location, set the environment variable 'ZENML_REPOSITORY_PATH'. If you want to create a new repository, run [0m[33mzenml init[33m.[0m
[1;35mRunning without an active repository root.[0m
[1;35mRunning unlisted pipeline on stack [0m[33mdefault[1;35m (caching enabled)[0m
[1;35mStep [0m[33mload_data[1;35m has started.[0m
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/26421880 [00:00<?, ?it/s]

Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/29515 [00:00<?, ?it/s]

Extracting data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/4422102 [00:00<?, ?it/s]

Extracting data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/5148 [00:00<?, ?it/s]

Extracting data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw

[1;35mStep [0m[33mload_data[1;35m has finished in 21.214s.[0m
[1;35mStep [0m[33mrun_hpo[1;35m has started.[0m


[32m[I 2022-11-28 15:17:59,109][0m A new study created in memory with name: no-name-df324dac-2557-4144-aa15-b88bcb4b1076[0m
[32m[I 2022-11-28 15:18:05,849][0m Trial 0 finished with value: 26.200000000000003 and parameters: {'n_layers': 2, 'n_units_l0': 9, 'dropout_l0': 0.3961601697991479, 'n_units_l1': 82, 'dropout_l1': 0.3035510660490758, 'optimizer_name': 'Adam', 'lr': 0.019021419954910607}. Best is trial 0 with value: 26.200000000000003.[0m


Test Error: 
 Accuracy: 26.2%, Avg loss: 1.768600 



[32m[I 2022-11-28 15:18:10,616][0m Trial 1 finished with value: 75.08 and parameters: {'n_layers': 1, 'n_units_l0': 74, 'dropout_l0': 0.36826553637785686, 'optimizer_name': 'Adam', 'lr': 0.00010390044730062207}. Best is trial 1 with value: 75.08.[0m


Test Error: 
 Accuracy: 75.1%, Avg loss: 0.728888 



[32m[I 2022-11-28 15:18:14,994][0m Trial 2 finished with value: 60.8 and parameters: {'n_layers': 1, 'n_units_l0': 71, 'dropout_l0': 0.347159602323059, 'optimizer_name': 'SGD', 'lr': 0.0018918920206082803}. Best is trial 1 with value: 75.08.[0m


Test Error: 
 Accuracy: 60.8%, Avg loss: 1.626096 



[32m[I 2022-11-28 15:18:19,867][0m Trial 3 finished with value: 41.88 and parameters: {'n_layers': 3, 'n_units_l0': 9, 'dropout_l0': 0.26815935032289706, 'n_units_l1': 116, 'dropout_l1': 0.41863132025606486, 'n_units_l2': 43, 'dropout_l2': 0.49916675447958436, 'optimizer_name': 'RMSprop', 'lr': 0.001626086763211247}. Best is trial 1 with value: 75.08.[0m


Test Error: 
 Accuracy: 41.9%, Avg loss: 1.368762 



[32m[I 2022-11-28 15:18:24,293][0m Trial 4 finished with value: 24.87 and parameters: {'n_layers': 2, 'n_units_l0': 88, 'dropout_l0': 0.34485062668012256, 'n_units_l1': 65, 'dropout_l1': 0.37613460964794454, 'optimizer_name': 'SGD', 'lr': 0.0007920479418469426}. Best is trial 1 with value: 75.08.[0m


Test Error: 
 Accuracy: 24.9%, Avg loss: 2.265801 



[32m[I 2022-11-28 15:18:28,671][0m Trial 5 finished with value: 82.59 and parameters: {'n_layers': 1, 'n_units_l0': 42, 'dropout_l0': 0.2903400160309909, 'optimizer_name': 'RMSprop', 'lr': 0.002654689681801413}. Best is trial 5 with value: 82.59.[0m


Test Error: 
 Accuracy: 82.6%, Avg loss: 0.479191 



[32m[I 2022-11-28 15:18:33,659][0m Trial 6 finished with value: 67.36 and parameters: {'n_layers': 3, 'n_units_l0': 60, 'dropout_l0': 0.4571999254928992, 'n_units_l1': 86, 'dropout_l1': 0.3457918763958716, 'n_units_l2': 14, 'dropout_l2': 0.2834799937822633, 'optimizer_name': 'Adam', 'lr': 0.014222548381593557}. Best is trial 5 with value: 82.59.[0m


Test Error: 
 Accuracy: 67.4%, Avg loss: 0.900442 



[32m[I 2022-11-28 15:18:38,674][0m Trial 7 finished with value: 26.07 and parameters: {'n_layers': 2, 'n_units_l0': 59, 'dropout_l0': 0.3461377791047407, 'n_units_l1': 86, 'dropout_l1': 0.2374503532201533, 'optimizer_name': 'SGD', 'lr': 0.000837764650650975}. Best is trial 5 with value: 82.59.[0m


Test Error: 
 Accuracy: 26.1%, Avg loss: 2.234208 



[32m[I 2022-11-28 15:18:42,962][0m Trial 8 finished with value: 6.49 and parameters: {'n_layers': 2, 'n_units_l0': 63, 'dropout_l0': 0.4327630048816772, 'n_units_l1': 30, 'dropout_l1': 0.20451898829192297, 'optimizer_name': 'SGD', 'lr': 1.39321814723787e-05}. Best is trial 5 with value: 82.59.[0m


Test Error: 
 Accuracy: 6.5%, Avg loss: 2.310410 



[32m[I 2022-11-28 15:18:47,389][0m Trial 9 finished with value: 69.39 and parameters: {'n_layers': 2, 'n_units_l0': 120, 'dropout_l0': 0.28312308345066056, 'n_units_l1': 49, 'dropout_l1': 0.4105980650589946, 'optimizer_name': 'SGD', 'lr': 0.020613039336336233}. Best is trial 5 with value: 82.59.[0m


Test Error: 
 Accuracy: 69.4%, Avg loss: 0.766851 



[32m[I 2022-11-28 15:18:51,690][0m Trial 10 finished with value: 75.84 and parameters: {'n_layers': 1, 'n_units_l0': 38, 'dropout_l0': 0.2088113357158836, 'optimizer_name': 'RMSprop', 'lr': 0.0001369201393087794}. Best is trial 5 with value: 82.59.[0m


Test Error: 
 Accuracy: 75.8%, Avg loss: 0.703749 



[32m[I 2022-11-28 15:18:55,947][0m Trial 11 finished with value: 71.72 and parameters: {'n_layers': 1, 'n_units_l0': 37, 'dropout_l0': 0.20214569077929342, 'optimizer_name': 'RMSprop', 'lr': 9.099161798301082e-05}. Best is trial 5 with value: 82.59.[0m


Test Error: 
 Accuracy: 71.7%, Avg loss: 0.791519 



[32m[I 2022-11-28 15:19:00,330][0m Trial 12 finished with value: 75.48 and parameters: {'n_layers': 1, 'n_units_l0': 33, 'dropout_l0': 0.20681887059754236, 'optimizer_name': 'RMSprop', 'lr': 0.0001503753718716785}. Best is trial 5 with value: 82.59.[0m


Test Error: 
 Accuracy: 75.5%, Avg loss: 0.700117 



[32m[I 2022-11-28 15:19:04,887][0m Trial 13 finished with value: 10.100000000000001 and parameters: {'n_layers': 1, 'n_units_l0': 34, 'dropout_l0': 0.2651358800347818, 'optimizer_name': 'RMSprop', 'lr': 0.08751351672194238}. Best is trial 5 with value: 82.59.[0m


Test Error: 
 Accuracy: 10.1%, Avg loss: 2.307105 



[32m[I 2022-11-28 15:19:09,566][0m Trial 14 finished with value: 51.71 and parameters: {'n_layers': 1, 'n_units_l0': 44, 'dropout_l0': 0.29387175020678136, 'optimizer_name': 'RMSprop', 'lr': 1.0019391469664987e-05}. Best is trial 5 with value: 82.59.[0m


Test Error: 
 Accuracy: 51.7%, Avg loss: 1.773739 



[32m[I 2022-11-28 15:19:14,215][0m Trial 15 finished with value: 82.91 and parameters: {'n_layers': 1, 'n_units_l0': 96, 'dropout_l0': 0.227952943193382, 'optimizer_name': 'RMSprop', 'lr': 0.004079093928767621}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 82.9%, Avg loss: 0.460972 



[32m[I 2022-11-28 15:19:19,001][0m Trial 16 finished with value: 82.67 and parameters: {'n_layers': 1, 'n_units_l0': 106, 'dropout_l0': 0.24061471681651747, 'optimizer_name': 'RMSprop', 'lr': 0.004847977927040362}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 82.7%, Avg loss: 0.489378 



[32m[I 2022-11-28 15:19:24,018][0m Trial 17 finished with value: 35.65 and parameters: {'n_layers': 3, 'n_units_l0': 113, 'dropout_l0': 0.2417402966314778, 'n_units_l1': 5, 'dropout_l1': 0.4889414127177789, 'n_units_l2': 128, 'dropout_l2': 0.2091755833983707, 'optimizer_name': 'RMSprop', 'lr': 0.00570967468744407}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 35.6%, Avg loss: 1.474371 



[32m[I 2022-11-28 15:19:28,661][0m Trial 18 finished with value: 25.480000000000004 and parameters: {'n_layers': 1, 'n_units_l0': 98, 'dropout_l0': 0.24388482039276696, 'optimizer_name': 'RMSprop', 'lr': 0.07837101634337358}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 25.5%, Avg loss: 1.906742 



[32m[I 2022-11-28 15:19:33,564][0m Trial 19 finished with value: 71.17 and parameters: {'n_layers': 2, 'n_units_l0': 104, 'dropout_l0': 0.24857175790153668, 'n_units_l1': 122, 'dropout_l1': 0.4893164852314044, 'optimizer_name': 'RMSprop', 'lr': 0.006522000083956164}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 71.2%, Avg loss: 0.919177 



[32m[I 2022-11-28 15:19:38,271][0m Trial 20 finished with value: 82.31 and parameters: {'n_layers': 1, 'n_units_l0': 128, 'dropout_l0': 0.30752102026984907, 'optimizer_name': 'Adam', 'lr': 0.0003849796069931933}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 82.3%, Avg loss: 0.503292 



[32m[I 2022-11-28 15:19:42,745][0m Trial 21 finished with value: 82.74000000000001 and parameters: {'n_layers': 1, 'n_units_l0': 89, 'dropout_l0': 0.2994710525274578, 'optimizer_name': 'RMSprop', 'lr': 0.004749284214492657}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 82.7%, Avg loss: 0.473017 



[32m[I 2022-11-28 15:19:47,349][0m Trial 22 finished with value: 79.46 and parameters: {'n_layers': 1, 'n_units_l0': 88, 'dropout_l0': 0.3206119353692389, 'optimizer_name': 'RMSprop', 'lr': 0.005711688947205896}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 79.5%, Avg loss: 0.516663 



[32m[I 2022-11-28 15:19:52,001][0m Trial 23 finished with value: 81.67999999999999 and parameters: {'n_layers': 1, 'n_units_l0': 86, 'dropout_l0': 0.22536349584921261, 'optimizer_name': 'RMSprop', 'lr': 0.0033627163836343994}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 81.7%, Avg loss: 0.483227 



[32m[I 2022-11-28 15:19:56,663][0m Trial 24 finished with value: 81.66 and parameters: {'n_layers': 1, 'n_units_l0': 103, 'dropout_l0': 0.25244944016443394, 'optimizer_name': 'RMSprop', 'lr': 0.009054660012823212}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 81.7%, Avg loss: 0.491318 



[32m[I 2022-11-28 15:20:01,524][0m Trial 25 finished with value: 10.0 and parameters: {'n_layers': 2, 'n_units_l0': 80, 'dropout_l0': 0.23012950420907588, 'n_units_l1': 8, 'dropout_l1': 0.2687997299398122, 'optimizer_name': 'RMSprop', 'lr': 0.03776395997768375}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 10.0%, Avg loss: 2.305852 



[32m[I 2022-11-28 15:20:06,249][0m Trial 26 finished with value: 81.67999999999999 and parameters: {'n_layers': 1, 'n_units_l0': 97, 'dropout_l0': 0.4911411140953038, 'optimizer_name': 'RMSprop', 'lr': 0.00040309169758078004}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 81.7%, Avg loss: 0.528528 



[32m[I 2022-11-28 15:20:10,906][0m Trial 27 finished with value: 82.44 and parameters: {'n_layers': 1, 'n_units_l0': 118, 'dropout_l0': 0.31029329505520586, 'optimizer_name': 'RMSprop', 'lr': 0.0033680340438012176}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 82.4%, Avg loss: 0.474469 



[32m[I 2022-11-28 15:20:15,627][0m Trial 28 finished with value: 35.449999999999996 and parameters: {'n_layers': 2, 'n_units_l0': 108, 'dropout_l0': 0.2696647060182065, 'n_units_l1': 35, 'dropout_l1': 0.3125263569372484, 'optimizer_name': 'RMSprop', 'lr': 0.03442046512320943}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 35.4%, Avg loss: 1.686688 



[32m[I 2022-11-28 15:20:20,477][0m Trial 29 finished with value: 73.3 and parameters: {'n_layers': 2, 'n_units_l0': 91, 'dropout_l0': 0.40794996837653785, 'n_units_l1': 104, 'dropout_l1': 0.43539262406314616, 'optimizer_name': 'Adam', 'lr': 0.01316552053542253}. Best is trial 15 with value: 82.91.[0m


Test Error: 
 Accuracy: 73.3%, Avg loss: 0.705535 

Study statistics: 
  Number of finished trials:  30
  Number of pruned trials:  0
  Number of complete trials:  30
Best trial:
  Value:  82.91
  Params: 
    n_layers: 1
    n_units_l0: 96
    dropout_l0: 0.227952943193382
    optimizer_name: RMSprop
    lr: 0.004079093928767621
Found best hparam dict
{'n_layers': 1, 'n_units_l0': 96, 'dropout_l0': 0.227952943193382, 'optimizer_name': 'RMSprop', 'lr': 0.004079093928767621}
[1;35mStep [0m[33mrun_hpo[1;35m has finished in 2m21s.[0m
[1;35mStep [0m[33mtrain_test[1;35m has started.[0m
{'n_layers': 1, 'n_units_l0': 96, 'dropout_l0': 0.227952943193382, 'optimizer_name': 'RMSprop', 'lr': 0.004079093928767621}
Training optimized model:
Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=96, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.227952943193382, inplace=False)
  (4): Linear(in_features=96, out_features=10, bias=True)
  (5): LogSoftmax(