### Используя сверточные слои и архитектуру, получите на Fashion-MNIST качество не ниже 89,5%

In [1]:
import matplotlib.pyplot as plt
import torch
import numpy as np
import torchvision as tv
import time
%matplotlib inline
from torch import nn

In [2]:
# Fashion-MNIST train/test splits, downloaded into ./fashion when missing.
# Both splits share one ToTensor transform.
fashion_root = './fashion'
to_tensor = tv.transforms.ToTensor()
train_dataset = tv.datasets.FashionMNIST(
    root=fashion_root, train=True, transform=to_tensor, download=True
)
test_dataset = tv.datasets.FashionMNIST(
    root=fashion_root, train=False, transform=to_tensor, download=True
)


In [3]:
BATCH_SIZE = 512
# Training batches are reshuffled on every pass over the data.
train_iter = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation only counts correct predictions, so sample order does not matter;
# keep the test loader in a fixed order instead of paying for a shuffle each epoch.
test_iter = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

### Возьмем пока Lenet

In [4]:
# LeNet-style baseline: two conv/pool stages followed by a small dense head.
# Shape comments assume a 1 x 28 x 28 input image.
lenet_features = [
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2),  # -> 6 x 28 x 28
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),                               # -> 6 x 14 x 14
    nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5),            # -> 12 x 10 x 10
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),                               # -> 12 x 5 x 5
]
lenet_classifier = [
    nn.Flatten(),                                                        # -> 300
    nn.Linear(in_features=300, out_features=50),
    nn.ReLU(),
    nn.Linear(in_features=50, out_features=10),                          # one logit per class
]
model = nn.Sequential(*lenet_features, *lenet_classifier)

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Show which device was selected.
dev

device(type='cuda')

In [7]:
# Place the baseline network on the selected device (nn.Module.to returns the module itself).
model = model.to(device=dev)

In [8]:
def train(net, train_iter, test_iter, trainer, num_epochs, dev):
    """Train ``net`` for ``num_epochs`` epochs and print one summary line per epoch.

    Args:
        net: model to optimise; inputs are moved to ``dev`` before the forward pass.
        train_iter: iterable of ``(X, y)`` training batches.
        test_iter: iterable of ``(X, y)`` batches passed to ``evaluate_accuracy``.
        trainer: optimizer exposing ``zero_grad()`` and ``step()``.
        num_epochs: number of passes over ``train_iter``.
        dev: device the batches are moved to.

    Notes:
        The loss is cross-entropy with ``reduction='sum'``, so each step uses the
        gradient of the *summed* batch loss; the printed loss is that sum divided
        by the number of samples seen in the epoch. The printed time covers both
        the training pass and the test evaluation.
    """
    criterion = nn.CrossEntropyLoss(reduction='sum')
    for epoch_idx in range(num_epochs):
        epoch_start = time.time()
        loss_total = 0.0
        hits_total = 0.0
        seen = 0

        net.train()
        for features, labels in train_iter:
            features = features.to(dev)
            labels = labels.to(dev)

            trainer.zero_grad()
            logits = net(features)
            batch_loss = criterion(logits, labels)
            batch_loss.backward()
            trainer.step()

            # Running totals for the per-epoch report.
            loss_total += batch_loss.item()
            hits_total += (logits.argmax(axis=1) == labels).sum().item()
            seen += labels.shape[0]

        # Switch to inference behaviour before measuring test accuracy.
        net.eval()
        test_acc = evaluate_accuracy(test_iter, net, dev)
        elapsed = time.time() - epoch_start
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
              'time %.1f sec'
              % (epoch_idx + 1, loss_total / seen, hits_total / seen, test_acc,
                 elapsed))

In [9]:
def evaluate_accuracy(data_iter, net, dev):
    """Return the fraction of samples in ``data_iter`` that ``net`` classifies correctly.

    The network is evaluated in eval mode (Dropout disabled, BatchNorm using its
    running statistics) and without building autograd graphs. The training/eval
    mode ``net`` had on entry is restored before returning.

    Args:
        data_iter: iterable of ``(X, y)`` batches; ``y`` holds the class indices.
        net: model mapping a batch ``X`` to per-class scores along dimension 1.
        dev: device the batches are moved to before the forward pass.

    Returns:
        Accuracy as a float in ``[0, 1]``.

    Raises:
        ZeroDivisionError: if ``data_iter`` yields no samples.
    """
    was_training = net.training
    net.eval()
    # Integer accumulator kept on the device so each batch does not force a host sync.
    correct = torch.zeros(1, dtype=torch.long, device=dev)
    n = 0
    try:
        with torch.no_grad():
            for X, y in data_iter:
                X, y = X.to(dev), y.to(dev)
                correct += (net(X).argmax(dim=1) == y).sum()
                n += y.shape[0]
    finally:
        # Leave the model in whatever mode the caller had it in.
        net.train(was_training)
    return correct.item() / n

In [10]:
# First training stage for the LeNet baseline: SGD, 30 epochs.
lr = 0.001
num_epochs = 30
trainer = torch.optim.SGD(params=model.parameters(), lr=lr)
train(net=model, train_iter=train_iter, test_iter=test_iter,
      trainer=trainer, num_epochs=num_epochs, dev=dev)

epoch 1, loss 2.1281, train acc 0.216, test acc 0.660, time 5.4 sec
epoch 2, loss 0.7738, train acc 0.705, test acc 0.765, time 5.4 sec
epoch 3, loss 0.5879, train acc 0.777, test acc 0.809, time 5.4 sec
epoch 4, loss 0.5040, train acc 0.814, test acc 0.834, time 5.4 sec
epoch 5, loss 0.4327, train acc 0.840, test acc 0.841, time 5.4 sec
epoch 6, loss 0.3998, train acc 0.852, test acc 0.860, time 5.4 sec
epoch 7, loss 0.3732, train acc 0.862, test acc 0.855, time 5.4 sec
epoch 8, loss 0.3590, train acc 0.867, test acc 0.864, time 5.3 sec
epoch 9, loss 0.3382, train acc 0.874, test acc 0.868, time 5.4 sec
epoch 10, loss 0.3295, train acc 0.877, test acc 0.868, time 5.4 sec
epoch 11, loss 0.3199, train acc 0.881, test acc 0.873, time 5.4 sec
epoch 12, loss 0.3048, train acc 0.885, test acc 0.873, time 5.3 sec
epoch 13, loss 0.3018, train acc 0.887, test acc 0.877, time 5.5 sec
epoch 14, loss 0.2961, train acc 0.889, test acc 0.876, time 5.5 sec
epoch 15, loss 0.2891, train acc 0.892, tes

In [11]:
# Second stage for the same network: a new SGD optimizer with the learning rate halved (0.001 -> 0.0005).
lr = 0.0005
num_epochs = 30
trainer = torch.optim.SGD(params=model.parameters(), lr=lr)
train(net=model, train_iter=train_iter, test_iter=test_iter,
      trainer=trainer, num_epochs=num_epochs, dev=dev)

epoch 1, loss 0.2103, train acc 0.921, test acc 0.892, time 5.6 sec
epoch 2, loss 0.2079, train acc 0.921, test acc 0.889, time 5.4 sec
epoch 3, loss 0.2054, train acc 0.922, test acc 0.891, time 5.6 sec
epoch 4, loss 0.2039, train acc 0.923, test acc 0.892, time 5.4 sec
epoch 5, loss 0.2041, train acc 0.923, test acc 0.891, time 5.4 sec
epoch 6, loss 0.2021, train acc 0.923, test acc 0.894, time 5.4 sec
epoch 7, loss 0.2005, train acc 0.924, test acc 0.893, time 5.4 sec
epoch 8, loss 0.1995, train acc 0.924, test acc 0.893, time 5.3 sec
epoch 9, loss 0.1987, train acc 0.924, test acc 0.895, time 5.4 sec
epoch 10, loss 0.1974, train acc 0.926, test acc 0.893, time 5.3 sec
epoch 11, loss 0.1954, train acc 0.925, test acc 0.891, time 5.2 sec
epoch 12, loss 0.1960, train acc 0.925, test acc 0.892, time 5.3 sec
epoch 13, loss 0.1934, train acc 0.927, test acc 0.890, time 5.4 sec
epoch 14, loss 0.1918, train acc 0.927, test acc 0.894, time 5.4 sec
epoch 15, loss 0.1917, train acc 0.928, tes

* Результаты сравнимы с MLP

### Добавим в DENSE BatchNorm и dropout

In [29]:
# Same convolutional stack as the LeNet baseline; the dense head additionally gets
# BatchNorm on its input and hidden activations plus Dropout before the output layer.
# Shape comments assume a 1 x 28 x 28 input image.
model2_features = [
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2),  # -> 6 x 28 x 28
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),                               # -> 6 x 14 x 14
    nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5),            # -> 12 x 10 x 10
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),                               # -> 12 x 5 x 5
]
model2_classifier = [
    nn.Flatten(),                                                        # -> 300
    nn.BatchNorm1d(num_features=300),
    nn.Linear(in_features=300, out_features=50),
    nn.ReLU(),
    nn.BatchNorm1d(num_features=50),
    nn.Dropout(p=0.3),
    nn.Linear(in_features=50, out_features=10),                          # one logit per class
]
model2 = nn.Sequential(*model2_features, *model2_classifier)

In [31]:
# Move model2 to the device selected in `dev` instead of forcing CUDA, so the cell also
# works when the notebook fell back to the CPU. `.to()` returns the module, so the cell
# still displays the architecture summary.
model2.to(dev)

Sequential(
  (0): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (1): ReLU()
  (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (3): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (4): ReLU()
  (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (6): Flatten()
  (7): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): Linear(in_features=300, out_features=50, bias=True)
  (9): ReLU()
  (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (11): Dropout(p=0.3, inplace=False)
  (12): Linear(in_features=50, out_features=10, bias=True)
)

In [32]:
# First training stage for model2: SGD, 30 epochs.
lr = 0.001
num_epochs = 30
trainer = torch.optim.SGD(params=model2.parameters(), lr=lr)
train(net=model2, train_iter=train_iter, test_iter=test_iter,
      trainer=trainer, num_epochs=num_epochs, dev=dev)

epoch 1, loss 0.6542, train acc 0.759, test acc 0.798, time 5.5 sec
epoch 2, loss 0.4395, train acc 0.840, test acc 0.844, time 5.4 sec
epoch 3, loss 0.3884, train acc 0.858, test acc 0.843, time 5.6 sec
epoch 4, loss 0.3675, train acc 0.865, test acc 0.871, time 5.6 sec
epoch 5, loss 0.3480, train acc 0.873, test acc 0.876, time 5.5 sec
epoch 6, loss 0.3345, train acc 0.877, test acc 0.876, time 5.7 sec
epoch 7, loss 0.3263, train acc 0.879, test acc 0.881, time 5.5 sec
epoch 8, loss 0.3192, train acc 0.881, test acc 0.876, time 5.7 sec
epoch 9, loss 0.3106, train acc 0.885, test acc 0.883, time 5.6 sec
epoch 10, loss 0.3041, train acc 0.888, test acc 0.875, time 5.6 sec
epoch 11, loss 0.3012, train acc 0.889, test acc 0.882, time 5.6 sec
epoch 12, loss 0.2944, train acc 0.892, test acc 0.889, time 5.6 sec
epoch 13, loss 0.2906, train acc 0.893, test acc 0.885, time 5.6 sec
epoch 14, loss 0.2903, train acc 0.894, test acc 0.881, time 5.5 sec
epoch 15, loss 0.2838, train acc 0.896, tes

In [33]:
# Second stage for model2: a new SGD optimizer with a lower learning rate (0.001 -> 0.0006).
lr = 0.0006
num_epochs = 30
trainer = torch.optim.SGD(params=model2.parameters(), lr=lr)
train(net=model2, train_iter=train_iter, test_iter=test_iter,
      trainer=trainer, num_epochs=num_epochs, dev=dev)

epoch 1, loss 0.2413, train acc 0.911, test acc 0.902, time 5.5 sec
epoch 2, loss 0.2382, train acc 0.911, test acc 0.900, time 5.6 sec
epoch 3, loss 0.2371, train acc 0.913, test acc 0.898, time 5.6 sec
epoch 4, loss 0.2391, train acc 0.912, test acc 0.898, time 5.6 sec
epoch 5, loss 0.2351, train acc 0.913, test acc 0.853, time 5.5 sec
epoch 6, loss 0.2376, train acc 0.912, test acc 0.901, time 5.5 sec
epoch 7, loss 0.2340, train acc 0.914, test acc 0.897, time 5.5 sec
epoch 8, loss 0.2320, train acc 0.914, test acc 0.898, time 5.5 sec
epoch 9, loss 0.2343, train acc 0.913, test acc 0.897, time 5.5 sec
epoch 10, loss 0.2316, train acc 0.915, test acc 0.897, time 5.6 sec
epoch 11, loss 0.2320, train acc 0.914, test acc 0.901, time 5.5 sec
epoch 12, loss 0.2304, train acc 0.915, test acc 0.903, time 5.4 sec
epoch 13, loss 0.2296, train acc 0.915, test acc 0.900, time 5.4 sec
epoch 14, loss 0.2316, train acc 0.915, test acc 0.904, time 5.5 sec
epoch 15, loss 0.2290, train acc 0.915, tes

* Вот уже достигли искомых результатов

### Добавим еще один слой конволюции

In [80]:
# model2 with one extra size-preserving 5x5 convolution inserted before the second pooling.
# Shape comments assume a 1 x 28 x 28 input image.
model3_features = [
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2),    # -> 6 x 28 x 28
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),                                 # -> 6 x 14 x 14
    nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5),              # -> 12 x 10 x 10
    nn.ReLU(),
    nn.Conv2d(in_channels=12, out_channels=12, kernel_size=5, padding=2),  # -> 12 x 10 x 10
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),                                 # -> 12 x 5 x 5
]
model3_classifier = [
    nn.Flatten(),                                                          # -> 300
    nn.BatchNorm1d(num_features=300),
    nn.Linear(in_features=300, out_features=50),
    nn.ReLU(),
    nn.BatchNorm1d(num_features=50),
    nn.Dropout(p=0.3),
    nn.Linear(in_features=50, out_features=10),                            # one logit per class
]
model3 = nn.Sequential(*model3_features, *model3_classifier)

In [82]:
# Move model3 to the device selected in `dev` (instead of forcing CUDA) so the model and the
# batches that `train` moves to `dev` always live on the same device, including CPU-only runs.
model3 = model3.to(dev)

# First training stage for model3: SGD, 30 epochs.
lr, num_epochs = 0.001, 30
trainer = torch.optim.SGD(model3.parameters(), lr=lr)
train(model3, train_iter, test_iter, trainer, num_epochs, dev)

epoch 1, loss 0.7881, train acc 0.703, test acc 0.783, time 5.9 sec
epoch 2, loss 0.5158, train acc 0.808, test acc 0.823, time 5.9 sec
epoch 3, loss 0.4485, train acc 0.835, test acc 0.840, time 6.0 sec
epoch 4, loss 0.4114, train acc 0.848, test acc 0.851, time 5.9 sec
epoch 5, loss 0.3772, train acc 0.860, test acc 0.862, time 5.9 sec
epoch 6, loss 0.3613, train acc 0.868, test acc 0.864, time 5.9 sec
epoch 7, loss 0.3482, train acc 0.871, test acc 0.854, time 5.9 sec
epoch 8, loss 0.3349, train acc 0.876, test acc 0.864, time 5.9 sec
epoch 9, loss 0.3287, train acc 0.879, test acc 0.871, time 5.9 sec
epoch 10, loss 0.3164, train acc 0.884, test acc 0.884, time 6.0 sec
epoch 11, loss 0.3110, train acc 0.885, test acc 0.882, time 5.9 sec
epoch 12, loss 0.3022, train acc 0.889, test acc 0.867, time 6.0 sec
epoch 13, loss 0.2972, train acc 0.890, test acc 0.882, time 5.8 sec
epoch 14, loss 0.2916, train acc 0.893, test acc 0.889, time 5.9 sec
epoch 15, loss 0.2861, train acc 0.895, tes

In [83]:
# Second stage for model3: a new SGD optimizer with the learning rate halved (0.001 -> 0.0005).
lr = 0.0005
num_epochs = 30
trainer = torch.optim.SGD(params=model3.parameters(), lr=lr)
train(net=model3, train_iter=train_iter, test_iter=test_iter,
      trainer=trainer, num_epochs=num_epochs, dev=dev)

epoch 1, loss 0.2319, train acc 0.914, test acc 0.893, time 5.9 sec
epoch 2, loss 0.2278, train acc 0.915, test acc 0.899, time 5.9 sec
epoch 3, loss 0.2271, train acc 0.917, test acc 0.898, time 5.9 sec
epoch 4, loss 0.2242, train acc 0.917, test acc 0.897, time 5.9 sec
epoch 5, loss 0.2236, train acc 0.917, test acc 0.900, time 5.9 sec
epoch 6, loss 0.2219, train acc 0.918, test acc 0.898, time 5.9 sec
epoch 7, loss 0.2228, train acc 0.917, test acc 0.903, time 6.0 sec
epoch 8, loss 0.2204, train acc 0.919, test acc 0.900, time 5.9 sec
epoch 9, loss 0.2176, train acc 0.919, test acc 0.897, time 5.9 sec
epoch 10, loss 0.2181, train acc 0.919, test acc 0.898, time 5.9 sec
epoch 11, loss 0.2177, train acc 0.920, test acc 0.899, time 5.9 sec
epoch 12, loss 0.2163, train acc 0.921, test acc 0.896, time 5.9 sec
epoch 13, loss 0.2171, train acc 0.919, test acc 0.904, time 5.9 sec
epoch 14, loss 0.2160, train acc 0.920, test acc 0.902, time 6.0 sec
epoch 15, loss 0.2130, train acc 0.921, tes

* Снова результат улучшился

### Добавим еще один сверточный слой (с пулингом)

In [111]:
# model3's convolutional stack plus one more conv/pool stage; the dense head keeps the same
# layout with its input width changed to 324. Shape comments assume a 1 x 28 x 28 input image.
# They take the place of the disabled shape-tracing snippet that was kept here as a bare
# string literal and echoed as cell output.
model4 = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5, padding=2),      # -> 6 x 28 x 28
    nn.ReLU(),
    nn.MaxPool2d(2, stride=2),                      # -> 6 x 14 x 14
    nn.Conv2d(6, 12, kernel_size=5),                # -> 12 x 10 x 10
    nn.ReLU(),
    nn.Conv2d(12, 12, kernel_size=5, padding=2),    # -> 12 x 10 x 10
    nn.ReLU(),
    nn.MaxPool2d(2, stride=2),                      # -> 12 x 5 x 5
    nn.Conv2d(12, 36, kernel_size=4, padding=2),    # -> 36 x 6 x 6 (4x4 kernel with padding 2 grows the map by 1)
    nn.ReLU(),
    nn.MaxPool2d(2, stride=2),                      # -> 36 x 3 x 3
    nn.Flatten(),                                   # -> 324
    nn.BatchNorm1d(324),
    nn.Linear(324, 50),
    nn.ReLU(),
    nn.BatchNorm1d(50),
    nn.Dropout(0.3),
    nn.Linear(50, 10)                               # one logit per class
)

'\nX = train_dataset[0][0]\nX = X.reshape(1, 1, 28, 28)\nprint(X.shape)\nfor l in model4:\n    X = l(X)\n    print("Layer {}. X shape: {}".format(l, X.shape))\n'

In [112]:
# Move model4 to the device selected in `dev` (instead of forcing CUDA) so the model and the
# batches that `train` moves to `dev` always live on the same device, including CPU-only runs.
model4 = model4.to(dev)

# First training stage for model4: SGD, 30 epochs.
lr, num_epochs = 0.001, 30
trainer = torch.optim.SGD(model4.parameters(), lr=lr)
train(model4, train_iter, test_iter, trainer, num_epochs, dev)

epoch 1, loss 0.9617, train acc 0.644, test acc 0.676, time 6.1 sec
epoch 2, loss 0.6890, train acc 0.742, test acc 0.772, time 6.2 sec
epoch 3, loss 0.5641, train acc 0.792, test acc 0.801, time 6.0 sec
epoch 4, loss 0.5410, train acc 0.805, test acc 0.691, time 6.0 sec
epoch 5, loss 0.5125, train acc 0.812, test acc 0.794, time 6.1 sec
epoch 6, loss 0.4611, train acc 0.834, test acc 0.840, time 6.1 sec
epoch 7, loss 0.4168, train acc 0.848, test acc 0.838, time 6.1 sec
epoch 8, loss 0.3918, train acc 0.856, test acc 0.858, time 6.1 sec
epoch 9, loss 0.3711, train acc 0.865, test acc 0.861, time 6.2 sec
epoch 10, loss 0.3566, train acc 0.870, test acc 0.867, time 6.0 sec
epoch 11, loss 0.3454, train acc 0.875, test acc 0.844, time 6.1 sec
epoch 12, loss 0.3352, train acc 0.877, test acc 0.858, time 6.0 sec
epoch 13, loss 0.3311, train acc 0.879, test acc 0.855, time 6.1 sec
epoch 14, loss 0.3225, train acc 0.882, test acc 0.864, time 6.1 sec
epoch 15, loss 0.3157, train acc 0.886, tes

In [113]:
# Second stage for model4: a new SGD optimizer with the learning rate halved (0.001 -> 0.0005).
lr = 0.0005
num_epochs = 30
trainer = torch.optim.SGD(params=model4.parameters(), lr=lr)
train(net=model4, train_iter=train_iter, test_iter=test_iter,
      trainer=trainer, num_epochs=num_epochs, dev=dev)

epoch 1, loss 0.2314, train acc 0.915, test acc 0.896, time 6.1 sec
epoch 2, loss 0.2272, train acc 0.916, test acc 0.902, time 6.0 sec
epoch 3, loss 0.2246, train acc 0.917, test acc 0.898, time 5.9 sec
epoch 4, loss 0.2207, train acc 0.919, test acc 0.904, time 6.1 sec
epoch 5, loss 0.2193, train acc 0.919, test acc 0.884, time 6.1 sec
epoch 6, loss 0.2188, train acc 0.919, test acc 0.895, time 6.2 sec
epoch 7, loss 0.2187, train acc 0.920, test acc 0.886, time 6.1 sec
epoch 8, loss 0.2158, train acc 0.921, test acc 0.902, time 6.0 sec
epoch 9, loss 0.2147, train acc 0.921, test acc 0.898, time 6.1 sec
epoch 10, loss 0.2120, train acc 0.921, test acc 0.901, time 6.1 sec
epoch 11, loss 0.2138, train acc 0.922, test acc 0.870, time 6.1 sec
epoch 12, loss 0.2114, train acc 0.922, test acc 0.902, time 6.1 sec
epoch 13, loss 0.2081, train acc 0.923, test acc 0.900, time 6.1 sec
epoch 14, loss 0.2095, train acc 0.922, test acc 0.905, time 6.1 sec
epoch 15, loss 0.2068, train acc 0.924, tes

* Существенно ничего не поменялось

### Добавим пару линейных слоев

In [117]:
# model4's convolutional stack with a deeper dense head: three hidden layers
# (200 -> 100 -> 50), each followed by ReLU, BatchNorm and Dropout.
# Shape comments assume a 1 x 28 x 28 input image.
model5_features = [
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2),    # -> 6 x 28 x 28
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),                                 # -> 6 x 14 x 14
    nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5),              # -> 12 x 10 x 10
    nn.ReLU(),
    nn.Conv2d(in_channels=12, out_channels=12, kernel_size=5, padding=2),  # -> 12 x 10 x 10
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),                                 # -> 12 x 5 x 5
    nn.Conv2d(in_channels=12, out_channels=36, kernel_size=4, padding=2),  # -> 36 x 6 x 6
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),                                 # -> 36 x 3 x 3
]
model5_classifier = [
    nn.Flatten(),                                                          # -> 324
    nn.BatchNorm1d(num_features=324),
    nn.Linear(in_features=324, out_features=200),
    nn.ReLU(),
    nn.BatchNorm1d(num_features=200),
    nn.Dropout(p=0.3),
    nn.Linear(in_features=200, out_features=100),
    nn.ReLU(),
    nn.BatchNorm1d(num_features=100),
    nn.Dropout(p=0.3),
    nn.Linear(in_features=100, out_features=50),
    nn.ReLU(),
    nn.BatchNorm1d(num_features=50),
    nn.Dropout(p=0.3),
    nn.Linear(in_features=50, out_features=10),                            # one logit per class
]
model5 = nn.Sequential(*model5_features, *model5_classifier)

In [118]:
# Move model5 to the device selected in `dev` (instead of forcing CUDA) so the model and the
# batches that `train` moves to `dev` always live on the same device, including CPU-only runs.
model5 = model5.to(dev)

# First training stage for model5: SGD, 30 epochs.
lr, num_epochs = 0.001, 30
trainer = torch.optim.SGD(model5.parameters(), lr=lr)
train(model5, train_iter, test_iter, trainer, num_epochs, dev)

epoch 1, loss 0.9665, train acc 0.634, test acc 0.752, time 6.2 sec
epoch 2, loss 0.6162, train acc 0.770, test acc 0.515, time 6.2 sec
epoch 3, loss 0.5359, train acc 0.807, test acc 0.816, time 6.2 sec
epoch 4, loss 0.4724, train acc 0.829, test acc 0.483, time 6.4 sec
epoch 5, loss 0.4373, train acc 0.842, test acc 0.823, time 6.2 sec
epoch 6, loss 0.4110, train acc 0.850, test acc 0.796, time 6.2 sec
epoch 7, loss 0.3913, train acc 0.858, test acc 0.845, time 6.2 sec
epoch 8, loss 0.3765, train acc 0.865, test acc 0.875, time 6.2 sec
epoch 9, loss 0.3582, train acc 0.871, test acc 0.845, time 6.2 sec
epoch 10, loss 0.3537, train acc 0.873, test acc 0.874, time 6.2 sec
epoch 11, loss 0.3408, train acc 0.877, test acc 0.872, time 6.3 sec
epoch 12, loss 0.3281, train acc 0.883, test acc 0.867, time 6.1 sec
epoch 13, loss 0.3166, train acc 0.886, test acc 0.886, time 6.1 sec
epoch 14, loss 0.3373, train acc 0.880, test acc 0.863, time 6.0 sec
epoch 15, loss 0.3124, train acc 0.887, tes

In [119]:
# Second stage for model5: a new SGD optimizer with the learning rate halved (0.001 -> 0.0005).
lr = 0.0005
num_epochs = 30
trainer = torch.optim.SGD(params=model5.parameters(), lr=lr)
train(net=model5, train_iter=train_iter, test_iter=test_iter,
      trainer=trainer, num_epochs=num_epochs, dev=dev)

epoch 1, loss 0.2242, train acc 0.921, test acc 0.906, time 6.2 sec
epoch 2, loss 0.2251, train acc 0.920, test acc 0.899, time 6.4 sec
epoch 3, loss 0.2212, train acc 0.921, test acc 0.898, time 6.2 sec
epoch 4, loss 0.2195, train acc 0.921, test acc 0.907, time 6.2 sec
epoch 5, loss 0.2166, train acc 0.923, test acc 0.899, time 6.2 sec
epoch 6, loss 0.2132, train acc 0.924, test acc 0.902, time 6.2 sec
epoch 7, loss 0.2092, train acc 0.925, test acc 0.901, time 6.2 sec
epoch 8, loss 0.2082, train acc 0.926, test acc 0.905, time 6.3 sec
epoch 9, loss 0.2077, train acc 0.926, test acc 0.899, time 6.1 sec
epoch 10, loss 0.2050, train acc 0.926, test acc 0.887, time 6.2 sec
epoch 11, loss 0.2026, train acc 0.928, test acc 0.902, time 6.1 sec
epoch 12, loss 0.2049, train acc 0.927, test acc 0.900, time 6.0 sec
epoch 13, loss 0.2006, train acc 0.929, test acc 0.887, time 6.2 sec
epoch 14, loss 0.1997, train acc 0.929, test acc 0.897, time 6.2 sec
epoch 15, loss 0.1948, train acc 0.930, tes

* Есть ощущение, что моделька стала только больше переобучаться

### Добавим еще слой конволюции к модели с одним скрытым линейным слоем

In [153]:
# model4 with a fifth convolution (36 -> 144 channels) and one more pooling stage; the dense
# head keeps a single hidden layer. Shape comments assume a 1 x 28 x 28 input image.
# They take the place of the disabled shape-tracing snippet that was kept here as a bare
# string literal and echoed as cell output.
model6 = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5, padding=2),      # -> 6 x 28 x 28
    nn.ReLU(),
    nn.MaxPool2d(2, stride=2),                      # -> 6 x 14 x 14
    nn.Conv2d(6, 12, kernel_size=5),                # -> 12 x 10 x 10
    nn.ReLU(),
    nn.Conv2d(12, 12, kernel_size=5, padding=2),    # -> 12 x 10 x 10
    nn.ReLU(),
    nn.MaxPool2d(2, stride=2),                      # -> 12 x 5 x 5
    nn.Conv2d(12, 36, kernel_size=4, padding=2),    # -> 36 x 6 x 6
    nn.ReLU(),
    nn.MaxPool2d(2, stride=2),                      # -> 36 x 3 x 3
    nn.Conv2d(36, 144, kernel_size=4, padding=2),   # -> 144 x 4 x 4
    nn.ReLU(),
    nn.MaxPool2d(2, stride=2),                      # -> 144 x 2 x 2
    nn.Flatten(),                                   # -> 576
    nn.BatchNorm1d(576),
    nn.Linear(576, 50),
    nn.ReLU(),
    nn.BatchNorm1d(50),
    nn.Dropout(0.3),
    nn.Linear(50, 10)                               # one logit per class
)

'\nX = train_dataset[0][0]\nX = X.reshape(1, 1, 28, 28)\nprint(X.shape)\nfor l in model6:\n    X = l(X)\n    print("Layer {}. X shape: {}".format(l, X.shape))\n'

In [154]:
# Move model6 to the device selected in `dev` (instead of forcing CUDA) so the model and the
# batches that `train` moves to `dev` always live on the same device, including CPU-only runs.
model6 = model6.to(dev)

# First training stage for model6: SGD, 30 epochs.
lr, num_epochs = 0.001, 30
trainer = torch.optim.SGD(model6.parameters(), lr=lr)
train(model6, train_iter, test_iter, trainer, num_epochs, dev)

epoch 1, loss 1.0226, train acc 0.613, test acc 0.414, time 6.5 sec
epoch 2, loss 0.6473, train acc 0.756, test acc 0.699, time 6.4 sec
epoch 3, loss 0.5544, train acc 0.796, test acc 0.790, time 6.4 sec
epoch 4, loss 0.4794, train acc 0.823, test acc 0.811, time 6.4 sec
epoch 5, loss 0.4373, train acc 0.839, test acc 0.838, time 6.4 sec
epoch 6, loss 0.3982, train acc 0.853, test acc 0.811, time 6.4 sec
epoch 7, loss 0.3761, train acc 0.861, test acc 0.860, time 6.4 sec
epoch 8, loss 0.3506, train acc 0.872, test acc 0.860, time 6.3 sec
epoch 9, loss 0.3385, train acc 0.876, test acc 0.865, time 6.4 sec
epoch 10, loss 0.3255, train acc 0.881, test acc 0.866, time 6.4 sec
epoch 11, loss 0.3132, train acc 0.885, test acc 0.884, time 6.4 sec
epoch 12, loss 0.3055, train acc 0.888, test acc 0.858, time 6.3 sec
epoch 13, loss 0.2949, train acc 0.891, test acc 0.869, time 6.5 sec
epoch 14, loss 0.2895, train acc 0.894, test acc 0.862, time 6.4 sec
epoch 15, loss 0.2810, train acc 0.896, tes

In [155]:
# Second stage for model6: a new SGD optimizer with the learning rate halved (0.001 -> 0.0005).
lr = 0.0005
num_epochs = 30
trainer = torch.optim.SGD(params=model6.parameters(), lr=lr)
train(net=model6, train_iter=train_iter, test_iter=test_iter,
      trainer=trainer, num_epochs=num_epochs, dev=dev)

epoch 1, loss 0.2202, train acc 0.918, test acc 0.895, time 6.6 sec
epoch 2, loss 0.2150, train acc 0.921, test acc 0.892, time 6.6 sec
epoch 3, loss 0.2104, train acc 0.922, test acc 0.896, time 6.7 sec
epoch 4, loss 0.2067, train acc 0.923, test acc 0.904, time 6.7 sec
epoch 5, loss 0.2062, train acc 0.924, test acc 0.902, time 6.7 sec
epoch 6, loss 0.2009, train acc 0.926, test acc 0.897, time 6.8 sec
epoch 7, loss 0.1978, train acc 0.927, test acc 0.901, time 6.9 sec
epoch 8, loss 0.1968, train acc 0.927, test acc 0.904, time 6.8 sec
epoch 9, loss 0.1923, train acc 0.929, test acc 0.903, time 6.8 sec
epoch 10, loss 0.1909, train acc 0.930, test acc 0.903, time 6.5 sec
epoch 11, loss 0.1901, train acc 0.930, test acc 0.899, time 6.7 sec
epoch 12, loss 0.1863, train acc 0.931, test acc 0.897, time 6.5 sec
epoch 13, loss 0.1842, train acc 0.931, test acc 0.904, time 6.4 sec
epoch 14, loss 0.1819, train acc 0.932, test acc 0.904, time 6.3 sec
epoch 15, loss 0.1797, train acc 0.932, tes

In [156]:
# Third stage for model6: 80 more epochs of SGD at the same learning rate (0.0005).
lr = 0.0005
num_epochs = 80
trainer = torch.optim.SGD(params=model6.parameters(), lr=lr)
train(net=model6, train_iter=train_iter, test_iter=test_iter,
      trainer=trainer, num_epochs=num_epochs, dev=dev)

epoch 1, loss 0.1511, train acc 0.942, test acc 0.899, time 6.6 sec
epoch 2, loss 0.1499, train acc 0.944, test acc 0.902, time 6.5 sec
epoch 3, loss 0.1446, train acc 0.945, test acc 0.903, time 6.6 sec
epoch 4, loss 0.1437, train acc 0.947, test acc 0.903, time 6.6 sec
epoch 5, loss 0.1436, train acc 0.945, test acc 0.902, time 6.7 sec
epoch 6, loss 0.1427, train acc 0.946, test acc 0.897, time 6.6 sec
epoch 7, loss 0.1375, train acc 0.948, test acc 0.902, time 6.6 sec
epoch 8, loss 0.1335, train acc 0.950, test acc 0.898, time 6.8 sec
epoch 9, loss 0.1409, train acc 0.946, test acc 0.903, time 6.5 sec
epoch 10, loss 0.1364, train acc 0.947, test acc 0.903, time 6.5 sec
epoch 11, loss 0.1346, train acc 0.949, test acc 0.898, time 6.5 sec
epoch 12, loss 0.1306, train acc 0.950, test acc 0.899, time 6.3 sec
epoch 13, loss 0.1832, train acc 0.933, test acc 0.898, time 6.4 sec
epoch 14, loss 0.1480, train acc 0.943, test acc 0.894, time 6.4 sec
epoch 15, loss 0.1307, train acc 0.950, tes

* Более сложная модель переобучается на трейне, а на тесте показывает даже худший результат

### Возьмем модельку, что была попроще, а давала лучший результат, и попробуем поучить ее Адамом

In [147]:
# Re-create model3 (same layers as before, newly initialised weights) for the Adam experiment.
# Shape comments assume a 1 x 28 x 28 input image.
model3_features = [
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2),    # -> 6 x 28 x 28
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),                                 # -> 6 x 14 x 14
    nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5),              # -> 12 x 10 x 10
    nn.ReLU(),
    nn.Conv2d(in_channels=12, out_channels=12, kernel_size=5, padding=2),  # -> 12 x 10 x 10
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),                                 # -> 12 x 5 x 5
]
model3_classifier = [
    nn.Flatten(),                                                          # -> 300
    nn.BatchNorm1d(num_features=300),
    nn.Linear(in_features=300, out_features=50),
    nn.ReLU(),
    nn.BatchNorm1d(num_features=50),
    nn.Dropout(p=0.3),
    nn.Linear(in_features=50, out_features=10),                            # one logit per class
]
model3 = nn.Sequential(*model3_features, *model3_classifier)

In [148]:
# Move the re-created model3 to the device selected in `dev` (instead of forcing CUDA) so the
# model and the batches that `train` moves to `dev` always live on the same device.
model3 = model3.to(dev)

# First training stage with Adam instead of SGD: 30 epochs.
lr, num_epochs = 0.001, 30
trainer = torch.optim.Adam(model3.parameters(), lr=lr)
train(model3, train_iter, test_iter, trainer, num_epochs, dev)

epoch 1, loss 0.7799, train acc 0.753, test acc 0.833, time 5.9 sec
epoch 2, loss 0.4229, train acc 0.854, test acc 0.860, time 5.9 sec
epoch 3, loss 0.3573, train acc 0.875, test acc 0.875, time 5.9 sec
epoch 4, loss 0.3195, train acc 0.888, test acc 0.881, time 6.0 sec
epoch 5, loss 0.3015, train acc 0.893, test acc 0.884, time 6.0 sec
epoch 6, loss 0.2843, train acc 0.899, test acc 0.881, time 6.0 sec
epoch 7, loss 0.2770, train acc 0.901, test acc 0.889, time 6.0 sec
epoch 8, loss 0.2650, train acc 0.905, test acc 0.892, time 6.0 sec
epoch 9, loss 0.2578, train acc 0.907, test acc 0.891, time 6.1 sec
epoch 10, loss 0.2506, train acc 0.910, test acc 0.882, time 6.4 sec
epoch 11, loss 0.2486, train acc 0.910, test acc 0.893, time 6.3 sec
epoch 12, loss 0.2400, train acc 0.912, test acc 0.898, time 6.1 sec
epoch 13, loss 0.2321, train acc 0.916, test acc 0.896, time 6.0 sec
epoch 14, loss 0.2300, train acc 0.917, test acc 0.899, time 6.0 sec
epoch 15, loss 0.2279, train acc 0.917, tes

In [149]:

# Second Adam stage for model3 with the learning rate halved (0.001 -> 0.0005).
# A new Adam instance is built, so it starts without the previous optimizer's state.
lr = 0.0005
num_epochs = 30
trainer = torch.optim.Adam(params=model3.parameters(), lr=lr)
train(net=model3, train_iter=train_iter, test_iter=test_iter,
      trainer=trainer, num_epochs=num_epochs, dev=dev)

epoch 1, loss 0.1720, train acc 0.937, test acc 0.905, time 6.1 sec
epoch 2, loss 0.1667, train acc 0.939, test acc 0.900, time 6.0 sec
epoch 3, loss 0.1659, train acc 0.940, test acc 0.904, time 6.0 sec
epoch 4, loss 0.1615, train acc 0.940, test acc 0.907, time 6.0 sec
epoch 5, loss 0.1640, train acc 0.940, test acc 0.905, time 6.0 sec
epoch 6, loss 0.1610, train acc 0.941, test acc 0.903, time 6.0 sec
epoch 7, loss 0.1591, train acc 0.941, test acc 0.905, time 6.0 sec
epoch 8, loss 0.1581, train acc 0.942, test acc 0.903, time 6.0 sec
epoch 9, loss 0.1548, train acc 0.943, test acc 0.901, time 6.0 sec
epoch 10, loss 0.1556, train acc 0.942, test acc 0.906, time 5.9 sec
epoch 11, loss 0.1557, train acc 0.942, test acc 0.905, time 5.9 sec
epoch 12, loss 0.1531, train acc 0.944, test acc 0.908, time 6.1 sec
epoch 13, loss 0.1521, train acc 0.944, test acc 0.906, time 6.0 sec
epoch 14, loss 0.1522, train acc 0.944, test acc 0.904, time 6.1 sec
epoch 15, loss 0.1503, train acc 0.944, tes

In [150]:

# Third Adam stage for model3 with a smaller learning rate (0.0005 -> 0.0001).
# A new Adam instance is built, so it starts without the previous optimizer's state.
lr = 0.0001
num_epochs = 30
trainer = torch.optim.Adam(params=model3.parameters(), lr=lr)
train(net=model3, train_iter=train_iter, test_iter=test_iter,
      trainer=trainer, num_epochs=num_epochs, dev=dev)

epoch 1, loss 0.1274, train acc 0.953, test acc 0.908, time 6.1 sec
epoch 2, loss 0.1234, train acc 0.954, test acc 0.906, time 5.9 sec
epoch 3, loss 0.1227, train acc 0.955, test acc 0.908, time 5.9 sec
epoch 4, loss 0.1208, train acc 0.956, test acc 0.908, time 5.9 sec
epoch 5, loss 0.1201, train acc 0.956, test acc 0.907, time 6.0 sec
epoch 6, loss 0.1198, train acc 0.955, test acc 0.909, time 6.0 sec
epoch 7, loss 0.1193, train acc 0.956, test acc 0.908, time 6.0 sec
epoch 8, loss 0.1194, train acc 0.956, test acc 0.908, time 6.0 sec
epoch 9, loss 0.1195, train acc 0.956, test acc 0.908, time 6.0 sec
epoch 10, loss 0.1187, train acc 0.956, test acc 0.908, time 6.0 sec
epoch 11, loss 0.1182, train acc 0.956, test acc 0.908, time 6.1 sec
epoch 12, loss 0.1191, train acc 0.955, test acc 0.907, time 6.0 sec
epoch 13, loss 0.1181, train acc 0.957, test acc 0.908, time 6.1 sec
epoch 14, loss 0.1178, train acc 0.957, test acc 0.907, time 6.0 sec
epoch 15, loss 0.1195, train acc 0.955, tes

* Видимо, мы пришли в какой-то минимум: 90,8% на тесте


### В качестве вывода можно отметить, что с какого-то момента усложнение архитектуры ведет к переобучению, или я неправильно усложняю