In [74]:
%matplotlib inline

import matplotlib.pyplot as plt
import torch
import torchvision as tv
from torchmetrics import Accuracy
import numpy as np
import pandas as pd
import time

## Многослойная нейронная сеть

### Загрузка данных

In [75]:
# Mini-batch size used by both the train and test DataLoaders created below.
BATCH_SIZE = 256

In [76]:
# MNIST training split, stored in (and downloaded to, if missing) the current directory.
# ToTensor() converts every image to a torch tensor.
train_mnist = tv.datasets.MNIST(
    root='.',
    download=True,
    train=True,
    transform=tv.transforms.ToTensor(),
)

# MNIST test split, loaded with the same settings as the training split.
test_mnist = tv.datasets.MNIST(
    root='.',
    download=True,
    train=False,
    transform=tv.transforms.ToTensor(),
)

In [77]:
# Reshuffle the training set at the start of every epoch so the optimizer does not
# see the exact same sequence of mini-batches each time. The test loader keeps a
# fixed order because sample ordering does not change the evaluation metrics.
train = torch.utils.data.DataLoader(train_mnist, batch_size=BATCH_SIZE, shuffle=True)
test = torch.utils.data.DataLoader(test_mnist, batch_size=BATCH_SIZE)

In [78]:
# Shape of the first element of the first training sample (the image tensor).
train_mnist[0][0].shape

torch.Size([1, 28, 28])

In [79]:
# Shape of the first element of the first test sample (the image tensor).
test_mnist[0][0].shape

torch.Size([1, 28, 28])

### Построение модели

In [80]:
class NeuralNetwork(torch.nn.Module):
    """Baseline perceptron: flatten the input, then 784 -> 256 -> 10 with one ReLU."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        self.flatten = nn.Flatten()
        # Submodule order fixes the Sequential indices (0: Linear, 1: ReLU, 2: Linear).
        layers = [
            nn.Linear(784, 256),
            nn.ReLU(),
            nn.Linear(256, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, X):
        """Flatten ``X`` and return the 10 raw output scores (logits) per sample."""
        return self.linear_relu_stack(self.flatten(X))

In [81]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
# (Assign "cpu" unconditionally here to force CPU execution.)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Использую {device}-устройство\n")

# Instantiate the baseline network on the selected device and show its layers.
model = NeuralNetwork().to(device)
print(model)

Использую cuda-устройство

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=10, bias=True)
  )
)


In [82]:
# Hyperparameters: `learning_rate` is passed to the optimizers below,
# `num_epochs` is read by `train_model`.
learning_rate = 1e-3
num_epochs = 10

# `train_model` looks up `loss` and `optimizer` as notebook globals.
loss = torch.nn.CrossEntropyLoss()
# Baseline optimizer: SGD over the parameters of `model`.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

### Обучение модели

In [83]:
def train_model(model_nn, optimizer_nn=None):
    """Train ``model_nn`` for ``num_epochs`` epochs, evaluating on ``test`` after each epoch.

    Relies on notebook globals: the ``train`` / ``test`` DataLoaders, the ``loss``
    criterion, ``device``, ``num_epochs`` and, unless ``optimizer_nn`` is passed,
    ``optimizer``.

    Args:
        model_nn: Network to train. Batches are moved to ``device`` before the
            forward pass, so the model is expected to be on ``device`` as well.
        optimizer_nn: Optimizer that updates ``model_nn``'s parameters. When
            omitted, the global ``optimizer`` is looked up at call time, so the
            existing ``train_model(model)`` calls behave as before.

    Returns:
        None. One summary line per epoch is printed (elapsed time, mean per-batch
        loss and accuracy for both the train and the test loader).
    """
    # Resolve the optimizer lazily: notebook cells rebind the global `optimizer`
    # before each experiment, and that name must be read at call time.
    if optimizer_nn is None:
        optimizer_nn = optimizer

    train_accuracy = Accuracy(task='multiclass', num_classes=10).to(device)
    test_accuracy = Accuracy(task='multiclass', num_classes=10).to(device)

    for epoch in range(num_epochs):
        start = time.time()

        # ---- training pass ----
        train_loss = .0
        train_iters = 0
        model_nn.train()
        for X, y in train:
            X, y = X.to(device), y.to(device)
            y_pred = model_nn(X)

            optimizer_nn.zero_grad()
            batch_loss = loss(y_pred, y)
            batch_loss.backward()
            optimizer_nn.step()

            train_loss += batch_loss.item()
            train_iters += 1
            train_accuracy.update(y_pred, y)
        train_acc_out = train_accuracy.compute()
        train_accuracy.reset()

        # ---- evaluation pass ----
        test_loss = .0
        test_iters = 0
        model_nn.eval()
        # No parameters are updated here, so skip autograd bookkeeping entirely.
        with torch.no_grad():
            for X, y in test:
                X, y = X.to(device), y.to(device)
                y_pred = model_nn(X)

                batch_loss = loss(y_pred, y)

                test_loss += batch_loss.item()
                test_iters += 1
                test_accuracy.update(y_pred, y)
        test_acc_out = test_accuracy.compute()
        test_accuracy.reset()

        print(f"ep: {epoch+1} | time: {time.time()-start:.2f} сек | train_loss: {train_loss/train_iters:.2f} | train_acc: {train_acc_out*100:.2f}%"
              f" | test_loss: {test_loss/test_iters:.2f} | test_acc: {test_acc_out*100:.2f}%")


In [84]:
# Train the baseline network; the global `optimizer` at this point is the SGD instance built over `model`.
train_model(model)

ep: 1 | time: 4.15 сек | train_loss: 2.28 | train_acc: 16.76% | test_loss: 2.25 | test_acc: 24.21%
ep: 2 | time: 4.30 сек | train_loss: 2.23 | train_acc: 33.37% | test_loss: 2.20 | test_acc: 41.83%
ep: 3 | time: 4.26 сек | train_loss: 2.18 | train_acc: 46.99% | test_loss: 2.15 | test_acc: 52.88%
ep: 4 | time: 4.20 сек | train_loss: 2.13 | train_acc: 56.60% | test_loss: 2.09 | test_acc: 61.18%
ep: 5 | time: 4.17 сек | train_loss: 2.07 | train_acc: 64.05% | test_loss: 2.03 | test_acc: 66.95%
ep: 6 | time: 4.07 сек | train_loss: 2.01 | train_acc: 68.11% | test_loss: 1.97 | test_acc: 70.11%
ep: 7 | time: 4.04 сек | train_loss: 1.94 | train_acc: 70.57% | test_loss: 1.89 | test_acc: 72.18%
ep: 8 | time: 4.14 сек | train_loss: 1.87 | train_acc: 72.03% | test_loss: 1.82 | test_acc: 73.44%
ep: 9 | time: 3.94 сек | train_loss: 1.79 | train_acc: 73.05% | test_loss: 1.74 | test_acc: 74.43%
ep: 10 | time: 4.16 сек | train_loss: 1.71 | train_acc: 73.99% | test_loss: 1.66 | test_acc: 75.47%


#### Точность модели на тестовых данных составляет 75.47%

### Попробуем заменить SGD на Adam

In [85]:
class NeuralNetworkAdam(torch.nn.Module):
    """Same 784 -> 256 -> 10 architecture as the baseline, kept as a separate class for the Adam run."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        self.flatten = nn.Flatten()
        # Submodule order fixes the Sequential indices (0: Linear, 1: ReLU, 2: Linear).
        layers = [
            nn.Linear(784, 256),
            nn.ReLU(),
            nn.Linear(256, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, X):
        """Flatten ``X`` and return the 10 raw output scores (logits) per sample."""
        return self.linear_relu_stack(self.flatten(X))

model_adam = NeuralNetworkAdam().to(device)

# Rebinds the global `optimizer` that `train_model` reads: from here on it is Adam over `model_adam`.
optimizer = torch.optim.Adam(model_adam.parameters(), lr=learning_rate)

In [86]:
# Train the same architecture with the Adam optimizer bound in the previous cell.
train_model(model_adam)

ep: 1 | time: 4.45 сек | train_loss: 0.46 | train_acc: 88.37% | test_loss: 0.24 | test_acc: 93.10%
ep: 2 | time: 4.35 сек | train_loss: 0.21 | train_acc: 93.91% | test_loss: 0.17 | test_acc: 95.00%
ep: 3 | time: 4.25 сек | train_loss: 0.16 | train_acc: 95.55% | test_loss: 0.13 | test_acc: 96.10%
ep: 4 | time: 4.29 сек | train_loss: 0.12 | train_acc: 96.53% | test_loss: 0.11 | test_acc: 96.71%
ep: 5 | time: 4.47 сек | train_loss: 0.10 | train_acc: 97.18% | test_loss: 0.10 | test_acc: 97.11%
ep: 6 | time: 4.44 сек | train_loss: 0.08 | train_acc: 97.71% | test_loss: 0.09 | test_acc: 97.28%
ep: 7 | time: 4.48 сек | train_loss: 0.07 | train_acc: 98.06% | test_loss: 0.09 | test_acc: 97.33%
ep: 8 | time: 4.45 сек | train_loss: 0.06 | train_acc: 98.40% | test_loss: 0.08 | test_acc: 97.35%
ep: 9 | time: 4.44 сек | train_loss: 0.05 | train_acc: 98.66% | test_loss: 0.08 | test_acc: 97.43%
ep: 10 | time: 4.39 сек | train_loss: 0.04 | train_acc: 98.90% | test_loss: 0.08 | test_acc: 97.43%


#### Adam дает заметно лучшую оценку на тестовых данных, нежели SGD: 97.43% против 75.47% за те же 10 эпох

### Добавим больше слоев

In [87]:
class NeuralNetworkAdamMoreLays(torch.nn.Module):
    """Deeper perceptron: 784 -> 512 -> 256 -> 128 -> 10 with a ReLU after every hidden layer."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        self.flatten = nn.Flatten()

        widths = (784, 512, 256, 128)
        layers = []
        # One (Linear, ReLU) pair per consecutive pair of widths.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Output layer: no activation, the network returns logits.
        layers.append(nn.Linear(widths[-1], 10))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, X):
        """Flatten ``X`` and return the 10 raw output scores (logits) per sample."""
        return self.linear_relu_stack(self.flatten(X))

model_adam_more_lays = NeuralNetworkAdamMoreLays().to(device)

# Rebinds the global `optimizer` that `train_model` reads: Adam over the deeper model's parameters.
optimizer = torch.optim.Adam(model_adam_more_lays.parameters(), lr=learning_rate)

In [88]:
# Train the deeper network with the Adam optimizer bound in the previous cell.
train_model(model_adam_more_lays)

ep: 1 | time: 4.55 сек | train_loss: 0.42 | train_acc: 87.99% | test_loss: 0.20 | test_acc: 93.53%
ep: 2 | time: 4.48 сек | train_loss: 0.15 | train_acc: 95.54% | test_loss: 0.12 | test_acc: 96.22%
ep: 3 | time: 4.45 сек | train_loss: 0.09 | train_acc: 97.25% | test_loss: 0.10 | test_acc: 97.03%
ep: 4 | time: 4.57 сек | train_loss: 0.07 | train_acc: 98.00% | test_loss: 0.09 | test_acc: 97.03%
ep: 5 | time: 4.60 сек | train_loss: 0.05 | train_acc: 98.45% | test_loss: 0.09 | test_acc: 97.27%
ep: 6 | time: 4.63 сек | train_loss: 0.04 | train_acc: 98.65% | test_loss: 0.08 | test_acc: 97.56%
ep: 7 | time: 4.78 сек | train_loss: 0.04 | train_acc: 98.86% | test_loss: 0.09 | test_acc: 97.65%
ep: 8 | time: 4.72 сек | train_loss: 0.03 | train_acc: 99.12% | test_loss: 0.09 | test_acc: 97.45%
ep: 9 | time: 4.64 сек | train_loss: 0.02 | train_acc: 99.24% | test_loss: 0.09 | test_acc: 97.49%
ep: 10 | time: 4.61 сек | train_loss: 0.02 | train_acc: 99.44% | test_loss: 0.09 | test_acc: 97.64%


#### Результат немного лучше: точность на тестовых данных выросла с 97.43% до 97.64%

### Добавим BatchNorm-слои

In [97]:
class NeuralNetworkBatchNorm(torch.nn.Module):
    """784 -> 512 -> 256 -> 128 -> 10 perceptron with BatchNorm1d after each hidden ReLU."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        self.flatten = nn.Flatten()

        widths = (784, 512, 256, 128)
        layers = []
        # Each hidden stage is Linear -> ReLU -> BatchNorm1d over the stage's output width.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.extend((
                nn.Linear(fan_in, fan_out),
                nn.ReLU(),
                nn.BatchNorm1d(fan_out),
            ))
        # Output layer: no activation or normalization, the network returns logits.
        layers.append(nn.Linear(widths[-1], 10))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, X):
        """Flatten ``X`` and return the 10 raw output scores (logits) per sample."""
        return self.linear_relu_stack(self.flatten(X))

model_batchnorm = NeuralNetworkBatchNorm().to(device)

# Rebinds the global `optimizer` that `train_model` reads: Adam over the BatchNorm model's parameters.
optimizer = torch.optim.Adam(model_batchnorm.parameters(), lr=learning_rate)

In [98]:
# Train the BatchNorm network with the Adam optimizer bound in the previous cell.
train_model(model_batchnorm)

ep: 1 | time: 4.61 сек | train_loss: 0.20 | train_acc: 94.31% | test_loss: 0.11 | test_acc: 96.39%
ep: 2 | time: 4.50 сек | train_loss: 0.08 | train_acc: 97.72% | test_loss: 0.10 | test_acc: 96.86%
ep: 3 | time: 4.62 сек | train_loss: 0.05 | train_acc: 98.57% | test_loss: 0.08 | test_acc: 97.55%
ep: 4 | time: 4.57 сек | train_loss: 0.03 | train_acc: 99.01% | test_loss: 0.09 | test_acc: 97.12%
ep: 5 | time: 4.67 сек | train_loss: 0.03 | train_acc: 99.16% | test_loss: 0.10 | test_acc: 97.02%
ep: 6 | time: 4.69 сек | train_loss: 0.02 | train_acc: 99.22% | test_loss: 0.09 | test_acc: 97.27%
ep: 7 | time: 4.74 сек | train_loss: 0.02 | train_acc: 99.43% | test_loss: 0.10 | test_acc: 97.54%
ep: 8 | time: 4.55 сек | train_loss: 0.01 | train_acc: 99.64% | test_loss: 0.09 | test_acc: 97.63%
ep: 9 | time: 4.71 сек | train_loss: 0.01 | train_acc: 99.54% | test_loss: 0.08 | test_acc: 97.71%
ep: 10 | time: 4.65 сек | train_loss: 0.01 | train_acc: 99.61% | test_loss: 0.08 | test_acc: 97.93%


#### С батч-нормализацией модель показывает наилучший на данный момент результат на тестовых данных: 97.93%

### Ради интереса попробуем Dropout

In [101]:
class NeuralNetworkDropout(torch.nn.Module):
    """Wide perceptron 784 -> 2560 -> 1280 -> 640 -> 10 with Dropout(0.5) after each hidden ReLU."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        self.flatten = nn.Flatten()

        widths = (784, 2560, 1280, 640)
        layers = []
        # Each hidden stage is Linear -> ReLU -> Dropout with drop probability 0.5.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.extend((
                nn.Linear(fan_in, fan_out),
                nn.ReLU(),
                nn.Dropout(0.5),
            ))
        # Output layer: no activation or dropout, the network returns logits.
        layers.append(nn.Linear(widths[-1], 10))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, X):
        """Flatten ``X`` and return the 10 raw output scores (logits) per sample."""
        return self.linear_relu_stack(self.flatten(X))

model_dropout = NeuralNetworkDropout().to(device)

# Rebinds the global `optimizer` that `train_model` reads: Adam over the Dropout model's parameters.
# NOTE(review): the learning rate is hard-coded to 1e-3 here instead of using `learning_rate`,
# presumably so it can be varied for this experiment alone; confirm.
optimizer = torch.optim.Adam(model_dropout.parameters(), lr=1e-3)

In [102]:
# Train the Dropout network with the Adam optimizer bound in the previous cell.
train_model(model_dropout)

ep: 1 | time: 4.41 сек | train_loss: 0.33 | train_acc: 89.90% | test_loss: 0.16 | test_acc: 94.67%
ep: 2 | time: 4.77 сек | train_loss: 0.14 | train_acc: 95.93% | test_loss: 0.11 | test_acc: 96.74%
ep: 3 | time: 4.87 сек | train_loss: 0.11 | train_acc: 96.72% | test_loss: 0.10 | test_acc: 96.87%
ep: 4 | time: 4.79 сек | train_loss: 0.09 | train_acc: 97.35% | test_loss: 0.08 | test_acc: 97.55%
ep: 5 | time: 4.93 сек | train_loss: 0.08 | train_acc: 97.61% | test_loss: 0.08 | test_acc: 97.60%
ep: 6 | time: 5.06 сек | train_loss: 0.07 | train_acc: 97.88% | test_loss: 0.08 | test_acc: 97.84%
ep: 7 | time: 5.14 сек | train_loss: 0.07 | train_acc: 97.99% | test_loss: 0.08 | test_acc: 97.81%
ep: 8 | time: 4.81 сек | train_loss: 0.06 | train_acc: 98.26% | test_loss: 0.08 | test_acc: 97.89%
ep: 9 | time: 4.97 сек | train_loss: 0.06 | train_acc: 98.23% | test_loss: 0.07 | test_acc: 98.08%
ep: 10 | time: 5.03 сек | train_loss: 0.05 | train_acc: 98.46% | test_loss: 0.07 | test_acc: 98.10%


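#### При шаге обучения 0.001 модель показывает хороший результат (98.10% на тестовых данных). Если повысить шаг до 0.01, то оценка на тестовых данных будет гораздо лучше, чем на обучающих — вероятно, потому что Dropout активен только в режиме обучения и занижает метрики, измеряемые на обучающей выборке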