In [None]:
#!pip install optuna
import warnings
warnings.filterwarnings("ignore")
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import optuna
from sklearn.externals import joblib
import numpy as np
from torch.utils.tensorboard import SummaryWriter

**Loading Data and subsampling:**

For training, we keep only 5% of the available sample so that we can easily overfit and thus observe the impact of regularization techniques later.


In [None]:
batch_size = 300

transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))])
trainset = datasets.MNIST('../data', train=True, download=True, transform=transform)
subsample = np.random.choice(range(len(trainset)), size=int(len(trainset)*0.05))
rest = list(set(range(len(trainset))).difference(set(subsample)))
trainset_ = torch.utils.data.Subset(trainset, subsample)
restset = torch.utils.data.Subset(trainset, rest)
train_loader = torch.utils.data.DataLoader(trainset_, batch_size)
valset = datasets.MNIST('../data', train=False, transform=transform)
val_loader = torch.utils.data.DataLoader(valset, batch_size)
rest_loader = torch.utils.data.DataLoader(restset, batch_size)

In [None]:
print("examples kept for later fine-tuning of hyperparameters: (95%)", len(restset))
print("training examples: (5%) ", len(trainset_))
print("validation examples: ", len(valset))

examples kept for later fine-tuning of hyperparameters: (95%) 57070
training examples: (5%)  3000
validation examples:  10000


**Definition of the core network:** 

An MLP with 3 hidden **linear** layers (100 units each, relu activation), and an **output** layer of dim 10 (softmaxed) for multilaclass classifciation purpose. 

By setting

- useDropout = float: we activate dropout over 2nd and 3rd linear layers. (the dorpout proba is given by the float value useDropout)
- useBN = float: we activate batch normalization over 1st, 2nd and 3rd linear layers. (epsilon is stored in useBN)
- useL1/L2 = float: we activate L1/L2 regularizations. (reg parameters are specified in the useL1/useL2 args)

In [None]:
class Net(nn.Module):

    def __init__(self, useDropout=None, useBN=None, useL1=None, useL2=None):
        
        super(Net, self).__init__()
        
        self.fc1 = nn.Linear(28*28, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 100)
        self.out = nn.Linear(100, 10)
        
        self.useDropout = useDropout
        self.useBN = useBN
        self.useL1 = useL1
        self.useL2 = useL2

    def forward(self, x):
        
        x = x.view(-1, 28*28)
        #layer 1, [batchnorm]
        x = self.fc1(x)
        if self.useBN is not None:
            x = nn.BatchNorm1d(num_features=100, eps=self.useBN)(x)
        x = F.relu(x)

        #layer 2, [batchnorm, dropout]
        x = self.fc2(x)
        if self.useBN is not None:
            x = nn.BatchNorm1d(num_features=100, eps=self.useBN)(x)
        x = F.relu(x)
        if self.useDropout is not None:
            x = F.dropout(x, p=float(self.useDropout))

        #layer 3, [batchnorm, dropout]
        x = self.fc3(x)
        if self.useBN is not None:
            x = nn.BatchNorm1d(num_features=100, eps=self.useBN)(x)
        x = F.relu(x)
        if self.useDropout is not None:
            x = F.dropout(x, p=float(self.useDropout))

        #out, softmax
        x = self.out(x)
        output = F.softmax(x, dim=0)
        
        return output

At this level, we should define the training/validation loops. For that, we use `model.train()` or `model.eval()` (which allows turning on/off the units when using dropout for example). When training, we use torch.utils.tensorboard to save:

    The costs (train, validation).
    The weights of each linear layer (histogram).
    The gradient at the entrance of each linear layer

We avoid recording histograms at each iteration; we save twenty at most during the training in order to save storage space and computing time.

We distinguish two modes of functionning for the model "train mode" set via net.train() and "evaluation mode" set via net.eval() (this allows for instance taking into account the dropout, batchnormalization when training but restore things back when infering.

In [None]:
def train(log_interval, model, train_loader, optimizer, criterion, epoch):
    model.train()
    cnt_loss_train = 0
    for batch_idx, (img_batch, label_batch) in enumerate(train_loader):
        optimizer.zero_grad()
        output_batch = model(img_batch)
        loss = criterion(input=output_batch, target=label_batch)
        loss.backward()
        writer.add_scalars('Loss',{'train':loss}, cnt_loss_train)
        cnt_loss_train += 1
        optimizer.step()
        
        if batch_idx % log_interval == 0:
          print('Train Epoch: {} Loss: {:.6f}'.format(
                epoch , loss.item()))
    if epoch % 50 == 0:
        for name, param in model.named_parameters() :
            if name == "fc1.weight" or name == "fc1.bias":
                writer.add_histogram(name, param.data.view(-1), epoch//50)
            if name == "fc2.weight" or name == "fc2.bias":
                writer.add_histogram(name, param.data.view(-1), epoch//50)
            if name == "fc3.weight" or name == 'fc3.bias':
                writer.add_histogram(name, param.data.view(-1), epoch//50)
            writer.add_histogram(name + '_grad', param.grad, epoch//50)  


def test(model, test_loader, criterion):
    model.eval()
    cnt_loss_val = 0
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            loss = criterion(output, target).item()
            test_loss += loss  # sum up batch loss
            writer.add_scalars('Loss',{'eval':loss}, cnt_loss_val)
            cnt_loss_val += 1
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
    return accuracy

**Basic training loop**

All together in one function that iterates through epochs and applies the previous learning/validation functions to the model respectively on training/validation sets. Here, we do not use any regularization technique, all params being set to None in the definition of the model. We want to see that the model overfits easily. And next, we'll refer to some regularization techniques, and optimise the set of hyperparameters they would suggest.

In [None]:
def train_mnist(trial):

  cfg = { 'n_epochs' : 60,
          'seed' : 0,
          'log_interval' : 50,
          'lr' : 1e-3,          
          'useL1': None, 
          'useL2': None, 
          'useBN': None, 
          'useDropout': None, 
          }

  torch.manual_seed(cfg['seed'])
  model = Net()
  optimizer = optimizer = torch.optim.Adam(model.parameters(),lr=cfg['lr'])
  criterion = nn.CrossEntropyLoss()
  
  for epoch in range(1, cfg['n_epochs'] + 1):
      train(cfg['log_interval'], model, train_loader, optimizer, criterion, epoch)
      test_accuracy = test(model, val_loader, criterion)

  return test_accuracy

In [None]:
writer = SummaryWriter("runs/")
sampler = optuna.samplers.TPESampler()
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(func=train_mnist, n_trials=1)

[32m[I 2021-02-23 21:06:52,209][0m A new study created in memory with name: no-name-8d41ca10-5ff5-45e7-a43b-cf930c67554d[0m


Train Epoch: 1 Loss: 2.302570

Test set: Average loss: 0.0078, Accuracy: 6843/10000 (68%)
Train Epoch: 2 Loss: 2.297976

Test set: Average loss: 0.0078, Accuracy: 6420/10000 (64%)
Train Epoch: 3 Loss: 2.283094

Test set: Average loss: 0.0077, Accuracy: 5865/10000 (59%)
Train Epoch: 4 Loss: 2.276287

Test set: Average loss: 0.0077, Accuracy: 7194/10000 (72%)
Train Epoch: 5 Loss: 2.274063

Test set: Average loss: 0.0077, Accuracy: 7508/10000 (75%)
Train Epoch: 6 Loss: 2.273400

Test set: Average loss: 0.0077, Accuracy: 7505/10000 (75%)
Train Epoch: 7 Loss: 2.273252

Test set: Average loss: 0.0077, Accuracy: 7535/10000 (75%)
Train Epoch: 8 Loss: 2.273107

Test set: Average loss: 0.0077, Accuracy: 7576/10000 (76%)
Train Epoch: 9 Loss: 2.273041

Test set: Average loss: 0.0077, Accuracy: 7635/10000 (76%)
Train Epoch: 10 Loss: 2.273009

Test set: Average loss: 0.0077, Accuracy: 7679/10000 (77%)
Train Epoch: 11 Loss: 2.273002

Test set: Average loss: 0.0077, Accuracy: 7727/10000 (77%)
Train Ep

[32m[I 2021-02-23 21:08:58,638][0m Trial 0 finished with value: 82.8 and parameters: {}. Best is trial 0 with value: 82.8.[0m



Test set: Average loss: 0.0077, Accuracy: 8280/10000 (83%)


In [None]:
df = study.trials_dataframe()
df.head(5)
optuna.visualization.plot_optimization_history(study)

In [None]:
study.best_trial

FrozenTrial(number=0, values=[82.8], datetime_start=datetime.datetime(2021, 2, 23, 21, 6, 52, 210076), datetime_complete=datetime.datetime(2021, 2, 23, 21, 8, 58, 638396), params={}, distributions={}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=0, state=TrialState.COMPLETE, value=None)

Now, we can check learning curves in tensorboard ..


In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir 'runs'

Reusing TensorBoard on port 6006 (pid 748), started 0:11:03 ago. (Use '!kill 748' to kill it.)

<IPython.core.display.Javascript object>

In [None]:
from tensorboard import notebook
notebook.display(height=1000) 

Selecting TensorBoard with logdir runs (started 0:11:03 ago; port 6006, pid 748).


<IPython.core.display.Javascript object>

The best accuracy is show in ` value` column above.

**Using regularization, and fine tuning hyperparameters**

We will try to search for the best hyperparameters using the 95%
of the remaining train set as validation. You can try to combine different types of regularizations.

- L1 regularization (Lasso) : will end up with sparse weights with (many zeros)

- L2 regularization (Ridge) : will end up with small values of weights

- Dropout regularization: During training‚ for each training example randomly turn-off the neurons of hidden units (with p, a hyperparam)‚ this also removes the connections‚ for different training examples, turn-off different units

- BatchNorm: subtracting a measure of location and dividing by a measure of scale

Those techniques involve new hyperameters in the final algorithm. Thos are :

- The proba $p$ for dropout
- reg params for L1/L2
- $\epsilon$ for batchNorm ..

In the following cells, we will add those into consideration in the definition of the model, and we will use optuna to search for the best values for them, so that the accuracy obtained finally is better, and we avoid overfitting.

In [None]:
def train_mnist_optuned(trial):

  cfg = { 'n_epochs' : 60,
          'seed' : 0,
          'log_interval' : 50,
          'save_model' : False,
          'lr' : 1e-3,          
          'useL1': trial.suggest_uniform('useL1', 1e-3, 1.5),
          'useL2': trial.suggest_uniform('useL2', 1e-3, 1.5),
          'useBN': trial.suggest_uniform('useBN', 1e-6, 1e-4),
          'useDropout': trial.suggest_uniform('useDropout', 0.01, 0.55),
          }

  torch.manual_seed(cfg['seed'])
  model = Net(cfg['useDropout'], cfg['useBN'], cfg['useL1'], cfg['useL2'])
  optimizer = optimizer = torch.optim.Adam(model.parameters(),lr=cfg['lr'])
  criterion = nn.CrossEntropyLoss()
  
  for epoch in range(1, cfg['n_epochs'] + 1):
      train(cfg['log_interval'], model, train_loader, optimizer, criterion, epoch)
      test_accuracy = test(model, val_loader, criterion)
      print(test_accuracy)
  if cfg['save_model']:
      torch.save(model.state_dict(), "mnist_cnn.pt")

  return test_accuracy

In [14]:
writer = SummaryWriter("runs_optuna/")
sampler = optuna.samplers.TPESampler()
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(func=train_mnist_optuned, n_trials=10)

[32m[I 2021-02-23 21:08:59,288][0m A new study created in memory with name: no-name-1569ea58-af79-472a-bfd9-c2671285af2e[0m


Train Epoch: 1 Loss: 2.302338

Test set: Average loss: 0.0078, Accuracy: 6932/10000 (69%)
69.32
Train Epoch: 2 Loss: 2.290055

Test set: Average loss: 0.0078, Accuracy: 6431/10000 (64%)
64.31
Train Epoch: 3 Loss: 2.282984

Test set: Average loss: 0.0077, Accuracy: 5828/10000 (58%)
58.28
Train Epoch: 4 Loss: 2.277974

Test set: Average loss: 0.0077, Accuracy: 5233/10000 (52%)
52.33
Train Epoch: 5 Loss: 2.275911

Test set: Average loss: 0.0077, Accuracy: 5925/10000 (59%)
59.25
Train Epoch: 6 Loss: 2.274642

Test set: Average loss: 0.0077, Accuracy: 6326/10000 (63%)
63.26
Train Epoch: 7 Loss: 2.274062

Test set: Average loss: 0.0077, Accuracy: 6385/10000 (64%)
63.85
Train Epoch: 8 Loss: 2.273649

Test set: Average loss: 0.0077, Accuracy: 6451/10000 (65%)
64.51
Train Epoch: 9 Loss: 2.273745

Test set: Average loss: 0.0077, Accuracy: 6391/10000 (64%)
63.91
Train Epoch: 10 Loss: 2.273628

Test set: Average loss: 0.0077, Accuracy: 6405/10000 (64%)
64.05
Train Epoch: 11 Loss: 2.273569

Test se

[32m[I 2021-02-23 21:11:13,471][0m Trial 0 finished with value: 82.21 and parameters: {'useL1': 1.3299979099384567, 'useL2': 1.2171163641616152, 'useBN': 8.793668416014829e-05, 'useDropout': 0.09885545807242042}. Best is trial 0 with value: 82.21.[0m



Test set: Average loss: 0.0077, Accuracy: 8221/10000 (82%)
82.21
Train Epoch: 1 Loss: 2.302455

Test set: Average loss: 0.0078, Accuracy: 5418/10000 (54%)
54.18
Train Epoch: 2 Loss: 2.294991

Test set: Average loss: 0.0078, Accuracy: 5470/10000 (55%)
54.7
Train Epoch: 3 Loss: 2.287803

Test set: Average loss: 0.0078, Accuracy: 5607/10000 (56%)
56.07
Train Epoch: 4 Loss: 2.284765

Test set: Average loss: 0.0078, Accuracy: 5563/10000 (56%)
55.63
Train Epoch: 5 Loss: 2.279413

Test set: Average loss: 0.0077, Accuracy: 5231/10000 (52%)
52.31
Train Epoch: 6 Loss: 2.277336

Test set: Average loss: 0.0077, Accuracy: 5262/10000 (53%)
52.62
Train Epoch: 7 Loss: 2.276395

Test set: Average loss: 0.0077, Accuracy: 5732/10000 (57%)
57.32
Train Epoch: 8 Loss: 2.274555

Test set: Average loss: 0.0077, Accuracy: 6058/10000 (61%)
60.58
Train Epoch: 9 Loss: 2.273979

Test set: Average loss: 0.0077, Accuracy: 6087/10000 (61%)
60.87
Train Epoch: 10 Loss: 2.274008

Test set: Average loss: 0.0077, Accurac

[32m[I 2021-02-23 21:13:29,217][0m Trial 1 finished with value: 82.37 and parameters: {'useL1': 0.7884187681395458, 'useL2': 1.414916436850826, 'useBN': 5.332709397669999e-06, 'useDropout': 0.4047077278036265}. Best is trial 1 with value: 82.37.[0m



Test set: Average loss: 0.0077, Accuracy: 8237/10000 (82%)
82.37
Train Epoch: 1 Loss: 2.302356

Test set: Average loss: 0.0078, Accuracy: 6749/10000 (67%)
67.49
Train Epoch: 2 Loss: 2.291683

Test set: Average loss: 0.0078, Accuracy: 6548/10000 (65%)
65.48
Train Epoch: 3 Loss: 2.284290

Test set: Average loss: 0.0077, Accuracy: 5974/10000 (60%)
59.74
Train Epoch: 4 Loss: 2.278826

Test set: Average loss: 0.0077, Accuracy: 5230/10000 (52%)
52.3
Train Epoch: 5 Loss: 2.276105

Test set: Average loss: 0.0077, Accuracy: 5411/10000 (54%)
54.11
Train Epoch: 6 Loss: 2.275342

Test set: Average loss: 0.0077, Accuracy: 6333/10000 (63%)
63.33
Train Epoch: 7 Loss: 2.274183

Test set: Average loss: 0.0077, Accuracy: 6335/10000 (63%)
63.35
Train Epoch: 8 Loss: 2.273837

Test set: Average loss: 0.0077, Accuracy: 6473/10000 (65%)
64.73
Train Epoch: 9 Loss: 2.273517

Test set: Average loss: 0.0077, Accuracy: 6465/10000 (65%)
64.65
Train Epoch: 10 Loss: 2.273623

Test set: Average loss: 0.0077, Accurac

[32m[I 2021-02-23 21:15:44,984][0m Trial 2 finished with value: 85.8 and parameters: {'useL1': 1.0954845697545692, 'useL2': 0.7319594457514821, 'useBN': 8.376489584144275e-05, 'useDropout': 0.1905706043379012}. Best is trial 2 with value: 85.8.[0m



Test set: Average loss: 0.0077, Accuracy: 8580/10000 (86%)
85.8
Train Epoch: 1 Loss: 2.302399

Test set: Average loss: 0.0078, Accuracy: 7056/10000 (71%)
70.56
Train Epoch: 2 Loss: 2.289455

Test set: Average loss: 0.0078, Accuracy: 6395/10000 (64%)
63.95
Train Epoch: 3 Loss: 2.281791

Test set: Average loss: 0.0077, Accuracy: 5677/10000 (57%)
56.77
Train Epoch: 4 Loss: 2.277287

Test set: Average loss: 0.0077, Accuracy: 5345/10000 (53%)
53.45
Train Epoch: 5 Loss: 2.275430

Test set: Average loss: 0.0077, Accuracy: 6470/10000 (65%)
64.7
Train Epoch: 6 Loss: 2.273951

Test set: Average loss: 0.0077, Accuracy: 6276/10000 (63%)
62.76
Train Epoch: 7 Loss: 2.273802

Test set: Average loss: 0.0077, Accuracy: 6283/10000 (63%)
62.83
Train Epoch: 8 Loss: 2.273604

Test set: Average loss: 0.0077, Accuracy: 6336/10000 (63%)
63.36
Train Epoch: 9 Loss: 2.273712

Test set: Average loss: 0.0077, Accuracy: 6350/10000 (64%)
63.5
Train Epoch: 10 Loss: 2.273501

Test set: Average loss: 0.0077, Accuracy:

[32m[I 2021-02-23 21:18:00,880][0m Trial 3 finished with value: 79.0 and parameters: {'useL1': 0.49997322857943116, 'useL2': 1.2458204131995336, 'useBN': 5.245638752026432e-05, 'useDropout': 0.03903115994955283}. Best is trial 2 with value: 85.8.[0m



Test set: Average loss: 0.0077, Accuracy: 7900/10000 (79%)
79.0
Train Epoch: 1 Loss: 2.302373

Test set: Average loss: 0.0078, Accuracy: 6045/10000 (60%)
60.45
Train Epoch: 2 Loss: 2.293587

Test set: Average loss: 0.0078, Accuracy: 5973/10000 (60%)
59.73
Train Epoch: 3 Loss: 2.286195

Test set: Average loss: 0.0078, Accuracy: 5795/10000 (58%)
57.95
Train Epoch: 4 Loss: 2.282235

Test set: Average loss: 0.0077, Accuracy: 5682/10000 (57%)
56.82
Train Epoch: 5 Loss: 2.278370

Test set: Average loss: 0.0077, Accuracy: 5369/10000 (54%)
53.69
Train Epoch: 6 Loss: 2.276321

Test set: Average loss: 0.0077, Accuracy: 5659/10000 (57%)
56.59
Train Epoch: 7 Loss: 2.275139

Test set: Average loss: 0.0077, Accuracy: 6255/10000 (63%)
62.55
Train Epoch: 8 Loss: 2.274055

Test set: Average loss: 0.0077, Accuracy: 6319/10000 (63%)
63.19
Train Epoch: 9 Loss: 2.273958

Test set: Average loss: 0.0077, Accuracy: 6240/10000 (62%)
62.4
Train Epoch: 10 Loss: 2.273910

Test set: Average loss: 0.0077, Accuracy

[32m[I 2021-02-23 21:20:16,889][0m Trial 4 finished with value: 85.53 and parameters: {'useL1': 1.445213304579667, 'useL2': 0.20290042232917543, 'useBN': 2.8811004858937098e-05, 'useDropout': 0.30324713649000123}. Best is trial 2 with value: 85.8.[0m



Test set: Average loss: 0.0077, Accuracy: 8553/10000 (86%)
85.53
Train Epoch: 1 Loss: 2.302396

Test set: Average loss: 0.0078, Accuracy: 6052/10000 (61%)
60.52
Train Epoch: 2 Loss: 2.293695

Test set: Average loss: 0.0078, Accuracy: 5924/10000 (59%)
59.24
Train Epoch: 3 Loss: 2.286338

Test set: Average loss: 0.0078, Accuracy: 5802/10000 (58%)
58.02
Train Epoch: 4 Loss: 2.282177

Test set: Average loss: 0.0077, Accuracy: 5674/10000 (57%)
56.74
Train Epoch: 5 Loss: 2.277806

Test set: Average loss: 0.0077, Accuracy: 5252/10000 (53%)
52.52
Train Epoch: 6 Loss: 2.276273

Test set: Average loss: 0.0077, Accuracy: 5615/10000 (56%)
56.15
Train Epoch: 7 Loss: 2.275402

Test set: Average loss: 0.0077, Accuracy: 6199/10000 (62%)
61.99
Train Epoch: 8 Loss: 2.274058

Test set: Average loss: 0.0077, Accuracy: 6286/10000 (63%)
62.86
Train Epoch: 9 Loss: 2.274046

Test set: Average loss: 0.0077, Accuracy: 6206/10000 (62%)
62.06
Train Epoch: 10 Loss: 2.273834

Test set: Average loss: 0.0077, Accura

[32m[I 2021-02-23 21:22:32,996][0m Trial 5 finished with value: 85.0 and parameters: {'useL1': 1.0915451311091195, 'useL2': 0.656132167917855, 'useBN': 4.944518662634461e-05, 'useDropout': 0.31883186013787307}. Best is trial 2 with value: 85.8.[0m



Test set: Average loss: 0.0077, Accuracy: 8500/10000 (85%)
85.0
Train Epoch: 1 Loss: 2.302483

Test set: Average loss: 0.0078, Accuracy: 5020/10000 (50%)
50.2
Train Epoch: 2 Loss: 2.295223

Test set: Average loss: 0.0078, Accuracy: 5327/10000 (53%)
53.27
Train Epoch: 3 Loss: 2.288407

Test set: Average loss: 0.0078, Accuracy: 5430/10000 (54%)
54.3
Train Epoch: 4 Loss: 2.285362

Test set: Average loss: 0.0078, Accuracy: 5380/10000 (54%)
53.8
Train Epoch: 5 Loss: 2.280049

Test set: Average loss: 0.0077, Accuracy: 5146/10000 (51%)
51.46
Train Epoch: 6 Loss: 2.277508

Test set: Average loss: 0.0077, Accuracy: 4988/10000 (50%)
49.88
Train Epoch: 7 Loss: 2.276504

Test set: Average loss: 0.0077, Accuracy: 5408/10000 (54%)
54.08
Train Epoch: 8 Loss: 2.274813

Test set: Average loss: 0.0077, Accuracy: 5792/10000 (58%)
57.92
Train Epoch: 9 Loss: 2.274299

Test set: Average loss: 0.0077, Accuracy: 5857/10000 (59%)
58.57
Train Epoch: 10 Loss: 2.273946

Test set: Average loss: 0.0077, Accuracy: 

[32m[I 2021-02-23 21:24:49,061][0m Trial 6 finished with value: 79.58 and parameters: {'useL1': 0.7043677929346749, 'useL2': 1.4058901137639848, 'useBN': 9.832279247522227e-05, 'useDropout': 0.4481277117848583}. Best is trial 2 with value: 85.8.[0m



Test set: Average loss: 0.0077, Accuracy: 7958/10000 (80%)
79.58
Train Epoch: 1 Loss: 2.302445

Test set: Average loss: 0.0078, Accuracy: 5394/10000 (54%)
53.94
Train Epoch: 2 Loss: 2.295125

Test set: Average loss: 0.0078, Accuracy: 5478/10000 (55%)
54.78
Train Epoch: 3 Loss: 2.287757

Test set: Average loss: 0.0078, Accuracy: 5570/10000 (56%)
55.7
Train Epoch: 4 Loss: 2.284721

Test set: Average loss: 0.0078, Accuracy: 5560/10000 (56%)
55.6
Train Epoch: 5 Loss: 2.279564

Test set: Average loss: 0.0077, Accuracy: 5232/10000 (52%)
52.32
Train Epoch: 6 Loss: 2.277420

Test set: Average loss: 0.0077, Accuracy: 5317/10000 (53%)
53.17
Train Epoch: 7 Loss: 2.276284

Test set: Average loss: 0.0077, Accuracy: 5775/10000 (58%)
57.75
Train Epoch: 8 Loss: 2.274524

Test set: Average loss: 0.0077, Accuracy: 6047/10000 (60%)
60.47
Train Epoch: 9 Loss: 2.273998

Test set: Average loss: 0.0077, Accuracy: 6094/10000 (61%)
60.94
Train Epoch: 10 Loss: 2.273950

Test set: Average loss: 0.0077, Accuracy

[32m[I 2021-02-23 21:27:05,247][0m Trial 7 finished with value: 81.51 and parameters: {'useL1': 1.4687190611794578, 'useL2': 0.8719185738624136, 'useBN': 3.782153898763246e-05, 'useDropout': 0.40604501524697706}. Best is trial 2 with value: 85.8.[0m



Test set: Average loss: 0.0077, Accuracy: 8151/10000 (82%)
81.51
Train Epoch: 1 Loss: 2.302352

Test set: Average loss: 0.0078, Accuracy: 6454/10000 (65%)
64.54
Train Epoch: 2 Loss: 2.292426

Test set: Average loss: 0.0078, Accuracy: 6204/10000 (62%)
62.04
Train Epoch: 3 Loss: 2.285414

Test set: Average loss: 0.0078, Accuracy: 6030/10000 (60%)
60.3
Train Epoch: 4 Loss: 2.281284

Test set: Average loss: 0.0077, Accuracy: 5637/10000 (56%)
56.37
Train Epoch: 5 Loss: 2.276997

Test set: Average loss: 0.0077, Accuracy: 5574/10000 (56%)
55.74
Train Epoch: 6 Loss: 2.275547

Test set: Average loss: 0.0077, Accuracy: 6260/10000 (63%)
62.6
Train Epoch: 7 Loss: 2.274502

Test set: Average loss: 0.0077, Accuracy: 6247/10000 (62%)
62.47
Train Epoch: 8 Loss: 2.273790

Test set: Average loss: 0.0077, Accuracy: 6363/10000 (64%)
63.63
Train Epoch: 9 Loss: 2.273838

Test set: Average loss: 0.0077, Accuracy: 6316/10000 (63%)
63.16
Train Epoch: 10 Loss: 2.273766

Test set: Average loss: 0.0077, Accuracy

[32m[I 2021-02-23 21:29:20,415][0m Trial 8 finished with value: 86.03 and parameters: {'useL1': 0.26182515280403906, 'useL2': 0.349625716329634, 'useBN': 6.638627796272163e-05, 'useDropout': 0.24968739428956332}. Best is trial 8 with value: 86.03.[0m



Test set: Average loss: 0.0077, Accuracy: 8603/10000 (86%)
86.03
Train Epoch: 1 Loss: 2.302336

Test set: Average loss: 0.0078, Accuracy: 6806/10000 (68%)
68.06
Train Epoch: 2 Loss: 2.291413

Test set: Average loss: 0.0078, Accuracy: 6529/10000 (65%)
65.29
Train Epoch: 3 Loss: 2.284100

Test set: Average loss: 0.0077, Accuracy: 5947/10000 (59%)
59.47
Train Epoch: 4 Loss: 2.278512

Test set: Average loss: 0.0077, Accuracy: 5215/10000 (52%)
52.15
Train Epoch: 5 Loss: 2.276135

Test set: Average loss: 0.0077, Accuracy: 5548/10000 (55%)
55.48
Train Epoch: 6 Loss: 2.275214

Test set: Average loss: 0.0077, Accuracy: 6466/10000 (65%)
64.66
Train Epoch: 7 Loss: 2.274267

Test set: Average loss: 0.0077, Accuracy: 6473/10000 (65%)
64.73
Train Epoch: 8 Loss: 2.273848

Test set: Average loss: 0.0077, Accuracy: 6456/10000 (65%)
64.56
Train Epoch: 9 Loss: 2.273612

Test set: Average loss: 0.0077, Accuracy: 6453/10000 (65%)
64.53
Train Epoch: 10 Loss: 2.273530

Test set: Average loss: 0.0077, Accura

[32m[I 2021-02-23 21:31:36,304][0m Trial 9 finished with value: 85.54 and parameters: {'useL1': 1.3831164197303696, 'useL2': 0.01714319674534926, 'useBN': 2.270824174542475e-05, 'useDropout': 0.1709245212520233}. Best is trial 8 with value: 86.03.[0m



Test set: Average loss: 0.0077, Accuracy: 8554/10000 (86%)
85.54


In [15]:
df = study.trials_dataframe()
df.head(5)
optuna.visualization.plot_optimization_history(study)

In [16]:
study.best_trial

FrozenTrial(number=8, values=[86.03], datetime_start=datetime.datetime(2021, 2, 23, 21, 27, 5, 248685), datetime_complete=datetime.datetime(2021, 2, 23, 21, 29, 20, 415602), params={'useL1': 0.26182515280403906, 'useL2': 0.349625716329634, 'useBN': 6.638627796272163e-05, 'useDropout': 0.24968739428956332}, distributions={'useL1': UniformDistribution(high=1.5, low=0.001), 'useL2': UniformDistribution(high=1.5, low=0.001), 'useBN': UniformDistribution(high=0.0001, low=1e-06), 'useDropout': UniformDistribution(high=0.55, low=0.01)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=8, state=TrialState.COMPLETE, value=None)

Again, we can check learning curves in tensorboard ..



In [17]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [18]:
%tensorboard --logdir 'runs_optuna'

<IPython.core.display.Javascript object>

In [19]:
from tensorboard import notebook
notebook.display(height=1000) 

Selecting TensorBoard with logdir runs_optuna (started 0:00:01 ago; port 6007, pid 861).


<IPython.core.display.Javascript object>