In [9]:
import time

import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torchvision as tv

 Загрузка данных

In [12]:
BATCH_SIZE=256
train_dataset = tv.datasets.MNIST('.', train=True, transform=tv.transforms.ToTensor(), download=True)
test_dataset = tv.datasets.MNIST('.', train=False, transform=tv.transforms.ToTensor(), download=True)
train = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE)
test = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

### Базовая модель!

In [13]:
num_epochs = 10
loss = torch.nn.CrossEntropyLoss()

In [15]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 280),
    torch.nn.ReLU(),
    torch.nn.Linear(280, 10)
)

In [16]:
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=280, bias=True)
  (2): ReLU()
  (3): Linear(in_features=280, out_features=10, bias=True)
)

In [17]:
trainer = torch.optim.SGD(model.parameters(), lr=.01)

In [18]:
def train_model():
    for ep in range(num_epochs):
        train_iters, train_passed  = 0, 0
        train_loss, train_acc = 0., 0.
        start=time.time()
        
        model.train()
        for X, y in train:
            trainer.zero_grad()
            y_pred = model(X)
            l = loss(y_pred, y)
            l.backward()
            trainer.step()
            train_loss += l.item()
            train_acc += (y_pred.argmax(dim=1) == y).sum().item()
            train_iters += 1
            train_passed += len(X)
        
        test_iters, test_passed  = 0, 0
        test_loss, test_acc = 0., 0.
        model.eval()
        for X, y in test:
            y_pred = model(X)
            l = loss(y_pred, y)
            test_loss += l.item()
            test_acc += (y_pred.argmax(dim=1) == y).sum().item()
            test_iters += 1
            test_passed += len(X)
            
        print("ep: {}, taked: {:.3f}, train_loss: {}, train_acc: {}, test_loss: {}, test_acc: {}".format(
            ep, time.time() - start, train_loss / train_iters, train_acc / train_passed,
            test_loss / test_iters, test_acc / test_passed)
        )

In [19]:
train_model()

ep: 0, taked: 2.543, train_loss: 2.030889265587989, train_acc: 0.5610833333333334, test_loss: 1.6681602120399475, test_acc: 0.7438
ep: 1, taked: 2.522, train_loss: 1.318907409779569, train_acc: 0.7762, test_loss: 0.9911847203969956, test_acc: 0.817
ep: 2, taked: 2.480, train_loss: 0.8534280244340288, train_acc: 0.82825, test_loss: 0.7022526800632477, test_acc: 0.8494
ep: 3, taked: 2.521, train_loss: 0.6567650067045333, train_acc: 0.8514166666666667, test_loss: 0.5716592539101839, test_acc: 0.8672
ep: 4, taked: 2.487, train_loss: 0.5582369789164117, train_acc: 0.8644833333333334, test_loss: 0.4990766070783138, test_acc: 0.8769
ep: 5, taked: 2.498, train_loss: 0.49953448525134553, train_acc: 0.87405, test_loss: 0.4530746672302485, test_acc: 0.8835
ep: 6, taked: 2.502, train_loss: 0.4605356029373534, train_acc: 0.8808833333333334, test_loss: 0.4213456977158785, test_acc: 0.8879
ep: 7, taked: 2.542, train_loss: 0.4326506283689053, train_acc: 0.8852666666666666, test_loss: 0.398091431707143

### Модель та же, оптимизатор ADAM

In [20]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 2.845, train_loss: 0.2791221217509914, train_acc: 0.9196333333333333, test_loss: 0.13947316671255977, test_acc: 0.9548
ep: 1, taked: 2.556, train_loss: 0.11146213900139357, train_acc: 0.9656833333333333, test_loss: 0.12139254375360906, test_acc: 0.9642
ep: 2, taked: 2.574, train_loss: 0.07755408009236797, train_acc: 0.97605, test_loss: 0.12023709187051282, test_acc: 0.9651
ep: 3, taked: 2.600, train_loss: 0.058777775043780185, train_acc: 0.9814, test_loss: 0.09742028797627426, test_acc: 0.972
ep: 4, taked: 2.651, train_loss: 0.053700930397323474, train_acc: 0.9821333333333333, test_loss: 0.1184852261794731, test_acc: 0.9684
ep: 5, taked: 2.574, train_loss: 0.04526746704421462, train_acc: 0.9854833333333334, test_loss: 0.12313873084785883, test_acc: 0.9685
ep: 6, taked: 2.566, train_loss: 0.04122199494827618, train_acc: 0.9860833333333333, test_loss: 0.11060350458719767, test_acc: 0.9721
ep: 7, taked: 2.531, train_loss: 0.04367182032215072, train_acc: 0.9853666666666666, t

In [23]:
trainer = torch.optim.Adam(model.parameters(), lr=.01, weight_decay=.00001)
train_model()

ep: 0, taked: 2.600, train_loss: 0.07343536421280117, train_acc: 0.9766333333333334, test_loss: 0.13257493253913707, test_acc: 0.9649
ep: 1, taked: 2.874, train_loss: 0.06114110290568243, train_acc: 0.98055, test_loss: 0.1466785636759596, test_acc: 0.965
ep: 2, taked: 2.931, train_loss: 0.05811400160272705, train_acc: 0.9813166666666666, test_loss: 0.1297523963934509, test_acc: 0.9679
ep: 3, taked: 3.031, train_loss: 0.052610865409703965, train_acc: 0.9823833333333334, test_loss: 0.14272555520101377, test_acc: 0.9669
ep: 4, taked: 2.829, train_loss: 0.05364435728282688, train_acc: 0.9824833333333334, test_loss: 0.14242301852318634, test_acc: 0.9684
ep: 5, taked: 2.937, train_loss: 0.05164318871387142, train_acc: 0.9831833333333333, test_loss: 0.12712270568372333, test_acc: 0.9683
ep: 6, taked: 2.802, train_loss: 0.044573068147168514, train_acc: 0.9853166666666666, test_loss: 0.1497264314966742, test_acc: 0.9657
ep: 7, taked: 2.983, train_loss: 0.04395138074871787, train_acc: 0.98561666

## Расширенная модель c новыми вложенными слоями, оптимизатор ADAM 

In [26]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 10)
)
trainer = torch.optim.Adam(model.parameters(), lr=.01)

In [27]:
train_model()

ep: 0, taked: 2.864, train_loss: 0.33688838737125093, train_acc: 0.89765, test_loss: 0.2208868912421167, test_acc: 0.9352
ep: 1, taked: 2.830, train_loss: 0.1323227901308936, train_acc: 0.9611166666666666, test_loss: 0.13094604073558003, test_acc: 0.9639
ep: 2, taked: 2.839, train_loss: 0.09295476523842267, train_acc: 0.9734166666666667, test_loss: 0.1277302926609991, test_acc: 0.9688
ep: 3, taked: 2.834, train_loss: 0.0827249093971988, train_acc: 0.9754166666666667, test_loss: 0.12029314061292098, test_acc: 0.9704
ep: 4, taked: 2.859, train_loss: 0.0697178323693732, train_acc: 0.9793666666666667, test_loss: 0.17237889487532812, test_acc: 0.9614
ep: 5, taked: 2.830, train_loss: 0.06685882171695219, train_acc: 0.9799166666666667, test_loss: 0.15388997847912833, test_acc: 0.9678
ep: 6, taked: 2.810, train_loss: 0.07211822369433146, train_acc: 0.9800666666666666, test_loss: 0.14082879549387145, test_acc: 0.9699
ep: 7, taked: 2.823, train_loss: 0.05568225264073686, train_acc: 0.98381666666

In [28]:
trainer = torch.optim.Adam(model.parameters(), lr=.01, weight_decay=.00001)
train_model()

ep: 0, taked: 2.817, train_loss: 0.07399510852249458, train_acc: 0.98145, test_loss: 0.15103355180472136, test_acc: 0.9687
ep: 1, taked: 2.819, train_loss: 0.05989445929039032, train_acc: 0.984, test_loss: 0.1374051682651043, test_acc: 0.9686
ep: 2, taked: 2.831, train_loss: 0.04733505429937802, train_acc: 0.9868166666666667, test_loss: 0.14348005824722349, test_acc: 0.9715
ep: 3, taked: 2.837, train_loss: 0.05404503925485497, train_acc: 0.9851166666666666, test_loss: 0.15068912280894436, test_acc: 0.9692
ep: 4, taked: 2.840, train_loss: 0.051904101006964104, train_acc: 0.9857833333333333, test_loss: 0.1602115301277081, test_acc: 0.9675
ep: 5, taked: 2.842, train_loss: 0.0472360121111009, train_acc: 0.98715, test_loss: 0.13363377345914956, test_acc: 0.9755
ep: 6, taked: 2.818, train_loss: 0.043474548553770524, train_acc: 0.9877833333333333, test_loss: 0.15744542738557357, test_acc: 0.9696
ep: 7, taked: 2.818, train_loss: 0.04628770930415138, train_acc: 0.9874166666666667, test_loss: 0.

In [30]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 420),
    torch.nn.ReLU(),
    torch.nn.Linear(420, 256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 10)
)
trainer = torch.optim.Adam(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 3.409, train_loss: 0.3868577084642775, train_acc: 0.8830666666666667, test_loss: 0.18940905279014258, test_acc: 0.9488
ep: 1, taked: 3.361, train_loss: 0.1495398185037552, train_acc: 0.9595, test_loss: 0.14540453211520799, test_acc: 0.9638
ep: 2, taked: 3.358, train_loss: 0.1182873045668957, train_acc: 0.9681, test_loss: 0.15267380598234012, test_acc: 0.9635
ep: 3, taked: 3.344, train_loss: 0.10211818856286242, train_acc: 0.9722333333333333, test_loss: 0.18451977834047284, test_acc: 0.9544
ep: 4, taked: 3.368, train_loss: 0.0883698287033575, train_acc: 0.9760333333333333, test_loss: 0.1517584828354302, test_acc: 0.9702
ep: 5, taked: 3.351, train_loss: 0.07825563163991939, train_acc: 0.9782666666666666, test_loss: 0.1243143544328376, test_acc: 0.9744
ep: 6, taked: 3.365, train_loss: 0.07498621378970788, train_acc: 0.9807333333333333, test_loss: 0.16913745864585508, test_acc: 0.9682
ep: 7, taked: 3.344, train_loss: 0.14683258662277715, train_acc: 0.9685166666666667, test_lo

###  batchnorm-слои

In [31]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(512),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(256),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(128),
    torch.nn.Linear(128, 10)
)
trainer = torch.optim.Adam(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 3.086, train_loss: 0.19915773247546972, train_acc: 0.9393, test_loss: 0.13939199645537884, test_acc: 0.9573
ep: 1, taked: 2.989, train_loss: 0.09899997916865222, train_acc: 0.9696666666666667, test_loss: 0.11155526147922501, test_acc: 0.965
ep: 2, taked: 2.972, train_loss: 0.07025942366094666, train_acc: 0.9783666666666667, test_loss: 0.10770108699216507, test_acc: 0.9665
ep: 3, taked: 2.963, train_loss: 0.05529376497700256, train_acc: 0.9822833333333333, test_loss: 0.13164326834958046, test_acc: 0.9638
ep: 4, taked: 2.966, train_loss: 0.04481163256841295, train_acc: 0.9856833333333334, test_loss: 0.09870837096950709, test_acc: 0.9733
ep: 5, taked: 2.951, train_loss: 0.03749513915700323, train_acc: 0.9876666666666667, test_loss: 0.10809384790991317, test_acc: 0.9698
ep: 6, taked: 2.935, train_loss: 0.031934654292591076, train_acc: 0.9891666666666666, test_loss: 0.117412916617468, test_acc: 0.9679
ep: 7, taked: 2.915, train_loss: 0.02717084988699037, train_acc: 0.990966666

In [32]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(512),
    torch.nn.Linear(512, 420),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(420),
    torch.nn.Linear(420, 256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(256),    
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(128),
    torch.nn.Linear(128, 10)
)
trainer = torch.optim.Adam(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 3.704, train_loss: 0.21234287453141618, train_acc: 0.93455, test_loss: 0.1438282665796578, test_acc: 0.9574
ep: 1, taked: 3.588, train_loss: 0.10895801110866855, train_acc: 0.9665166666666667, test_loss: 0.12667159093543887, test_acc: 0.9617
ep: 2, taked: 3.600, train_loss: 0.07661025325787511, train_acc: 0.9767833333333333, test_loss: 0.10266554319532588, test_acc: 0.9691
ep: 3, taked: 3.577, train_loss: 0.05759265441843804, train_acc: 0.98275, test_loss: 0.10103279988688882, test_acc: 0.9704
ep: 4, taked: 3.595, train_loss: 0.050247343142635806, train_acc: 0.9841666666666666, test_loss: 0.11500746359670302, test_acc: 0.9675
ep: 5, taked: 3.586, train_loss: 0.0412614957100038, train_acc: 0.9871, test_loss: 0.08598435341918957, test_acc: 0.9761
ep: 6, taked: 3.598, train_loss: 0.034423861003783036, train_acc: 0.9894, test_loss: 0.10051494324070517, test_acc: 0.9745
ep: 7, taked: 3.577, train_loss: 0.029406855189479254, train_acc: 0.99045, test_loss: 0.08387665416958043, t

In [34]:
trainer = torch.optim.Adam(model.parameters(), lr=.01, weight_decay=.0001)
train_model()

ep: 0, taked: 3.591, train_loss: 0.1977269309751214, train_acc: 0.9409166666666666, test_loss: 0.19732959512621165, test_acc: 0.9393
ep: 1, taked: 3.549, train_loss: 0.1340564734243015, train_acc: 0.9599833333333333, test_loss: 0.13256317247869448, test_acc: 0.9581
ep: 2, taked: 3.352, train_loss: 0.11004191231616633, train_acc: 0.9673833333333334, test_loss: 0.15802564211189746, test_acc: 0.9537
ep: 3, taked: 3.299, train_loss: 0.09915403906414483, train_acc: 0.9710666666666666, test_loss: 0.13424973711953497, test_acc: 0.9599
ep: 4, taked: 3.246, train_loss: 0.0942427350723363, train_acc: 0.9717, test_loss: 0.13192429919727147, test_acc: 0.9608
ep: 5, taked: 3.181, train_loss: 0.08797780507977339, train_acc: 0.9730333333333333, test_loss: 0.12880568424589, test_acc: 0.9594
ep: 6, taked: 3.198, train_loss: 0.08429147851593952, train_acc: 0.9742666666666666, test_loss: 0.14558879599208013, test_acc: 0.9564
ep: 7, taked: 3.183, train_loss: 0.08071565100170196, train_acc: 0.9755, test_lo

#### Dropout

In [35]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 420),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.2),
    torch.nn.Linear(420, 256),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.2),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.2),
    torch.nn.Linear(128, 10)
)
trainer = torch.optim.Adam(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 2.930, train_loss: 0.3543879445166664, train_acc: 0.8937, test_loss: 0.17920149893034248, test_acc: 0.9499
ep: 1, taked: 2.970, train_loss: 0.19751442868975883, train_acc: 0.94685, test_loss: 0.14825560890603812, test_acc: 0.9622
ep: 2, taked: 2.829, train_loss: 0.16453508638004039, train_acc: 0.95595, test_loss: 0.13827614121255466, test_acc: 0.9635
ep: 3, taked: 2.839, train_loss: 0.15545102543732586, train_acc: 0.959, test_loss: 0.13475088756822515, test_acc: 0.9653
ep: 4, taked: 2.821, train_loss: 0.16738111031578576, train_acc: 0.9571, test_loss: 0.12200984655355569, test_acc: 0.9716
ep: 5, taked: 2.810, train_loss: 0.16532057692554403, train_acc: 0.9583, test_loss: 0.14407126301521203, test_acc: 0.9658
ep: 6, taked: 2.830, train_loss: 0.15878582276562428, train_acc: 0.9607166666666667, test_loss: 0.12117016334232175, test_acc: 0.9717
ep: 7, taked: 2.821, train_loss: 0.15263730226718683, train_acc: 0.96185, test_loss: 0.16454848424532997, test_acc: 0.9685
ep: 8, take

## Batch слой и Dropout слой

In [36]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 420),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(420),
    torch.nn.Dropout(0.2),

    torch.nn.Linear(420, 256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(256),
    torch.nn.Dropout(0.2),

    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(128),
    torch.nn.Dropout(0.2),

    torch.nn.Linear(128, 10)
)
trainer = torch.optim.Adam(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 3.268, train_loss: 0.23374723142131845, train_acc: 0.9284333333333333, test_loss: 0.13304949760204182, test_acc: 0.9585
ep: 1, taked: 3.272, train_loss: 0.1313686349726421, train_acc: 0.9598833333333333, test_loss: 0.10853744776686654, test_acc: 0.9641
ep: 2, taked: 3.245, train_loss: 0.100949395981003, train_acc: 0.9691, test_loss: 0.10122985987109132, test_acc: 0.9689
ep: 3, taked: 3.291, train_loss: 0.08352582965521736, train_acc: 0.9745, test_loss: 0.10269346039858647, test_acc: 0.9686
ep: 4, taked: 3.181, train_loss: 0.07224755988238339, train_acc: 0.9777666666666667, test_loss: 0.08616987960485858, test_acc: 0.9756
ep: 5, taked: 3.260, train_loss: 0.06260753465776747, train_acc: 0.9801333333333333, test_loss: 0.09228396331891417, test_acc: 0.9743
ep: 6, taked: 3.260, train_loss: 0.05875541496665237, train_acc: 0.9810166666666666, test_loss: 0.07375505996024004, test_acc: 0.9771
ep: 7, taked: 3.259, train_loss: 0.04936894037503194, train_acc: 0.9841, test_loss: 0.084

# Вывод: 
- максимальная точность для тестовой выборки 0.9764 и модели содержащей как batch (нормализацию) и Dropout (забывания ) слои