<a href="https://colab.research.google.com/github/dahlia52/Advanced-Statistical-Data-Analysis/blob/main/softmax_classification_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision.transforms as transforms
from torchvision.datasets import MNIST, CIFAR10, CIFAR100
from torch.utils.data import DataLoader

import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Directory where the MNIST files are downloaded and cached.
path = './datasets/'

# Convert each image into a tensor.
transform = transforms.Compose([transforms.ToTensor()])

# Training and test splits of MNIST (downloaded on first use).
train_data = MNIST(root=path, train=True, transform=transform, download=True)
test_data = MNIST(root=path, train=False, transform=transform, download=True)

batch_size = 100

# Mini-batch loaders; only the training split is shuffled.
train_loader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)
test_loader = DataLoader(
    dataset=test_data,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
)

for split in (train_data, test_data):
    print(split)

# Number of features in one flattened sample (28*28 = 784 for MNIST)
# and number of target classes.
input_shape = train_data[0][0].numel()
output_shape = len(train_data.classes)

print(input_shape, output_shape)

Dataset MNIST
    Number of datapoints: 60000
    Root location: ./datasets/
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
           )
Dataset MNIST
    Number of datapoints: 10000
    Root location: ./datasets/
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
           )
784 10




In [None]:
# Choose the compute device: Apple MPS first, then CUDA, otherwise the CPU.
if torch.backends.mps.is_available():
    device_name = "mps:0"
elif torch.cuda.is_available():
    device_name = "cuda:0"
else:
    device_name = "cpu:0"

device = torch.device(device_name)

print(device)

cuda:0


## Experiment 1 : SGD optimizer 적용

In [None]:
class SoftmaxClassifier(nn.Module):
    """Fully connected classifier that maps an image batch to class logits.

    Architecture: Flatten -> Linear(512) -> Linear(256) -> Linear(128) ->
    Linear(out_features), with a LeakyReLU after every hidden layer and no
    dropout. ``forward`` returns raw logits; no softmax is applied inside
    the model.

    Args:
        in_features: Number of features per sample after flattening. When
            omitted, the module-level ``input_shape`` is used.
        out_features: Number of output classes. When omitted, the
            module-level ``output_shape`` is used.
    """

    def __init__(self, in_features=None, out_features=None):
        super().__init__()

        # Fall back to the notebook-level dataset dimensions so that the
        # existing no-argument call ``SoftmaxClassifier()`` keeps working.
        if in_features is None:
            in_features = input_shape
        if out_features is None:
            out_features = output_shape

        self.layers = nn.Sequential(
            nn.Flatten(),  # e.g. MNIST: 1*28*28 -> 784
            nn.Linear(in_features, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 128),
            nn.LeakyReLU(),
            nn.Linear(128, out_features),
        )

    def forward(self, x):
        """Return the class logits for the input batch ``x``."""
        return self.layers(x)

In [None]:
# Cross-entropy on the raw model outputs (logits) against integer class labels.
# reduction='sum' adds up the per-sample losses of a mini-batch instead of
# averaging them (100 losses -> one summed loss).
loss = nn.CrossEntropyLoss(reduction='sum')

model = SoftmaxClassifier()
model = model.to(device)

# Experiment 1 minimizes the loss with plain SGD and no weight decay.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [None]:
# Train for `num_epoch` epochs; after every epoch evaluate on the test split.
# The per-sample average loss of each epoch is recorded in `train_loss_list`
# and `test_loss_list`.
num_epoch = 100
log_interval = 1  # print metrics every `log_interval` epochs
train_loss_list, test_loss_list = [], []

for i in range(num_epoch):

    # ---- train ----
    model.train()

    total_loss = 0
    count = 0

    for x, y in train_loader:
        # MNIST: x.shape = (100, 1, 28, 28), y.shape = (100,)
        x, y = x.to(device), y.to(device)

        # Call the module itself (not .forward) so registered hooks also run.
        logits = model(x)
        # The loss takes integer class labels directly; no one-hot encoding
        # of y is needed.
        cost = loss(logits, y)

        total_loss += cost.item()

        # To experiment with an L1 penalty, add
        # 0.001 * torch.norm(param, 1) for every parameter to `cost` here.

        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        # Softmax is monotonic, so the argmax of the logits is the
        # predicted class.
        pred = torch.argmax(logits, dim=-1)
        count += (pred == y).sum().item()

    acc = count / len(train_data)
    avg_loss = total_loss / len(train_data)  # loss is summed, so divide by N

    train_loss_list.append(avg_loss)

    if i % log_interval == 0:
        print("\nEpoch %d Train: Loss %.3f / Accuracy %.3f" % (i, avg_loss, acc))

    # ---- test ----
    model.eval()

    total_loss = 0
    count = 0

    with torch.no_grad():  # no gradient tracking during evaluation
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)

            logits = model(x)
            # The loss must be computed from the logits of THIS test batch;
            # the previous code reused the last training batch's logits.
            cost = loss(logits, y)

            total_loss += cost.item()

            pred = torch.argmax(logits, dim=-1)
            count += (pred == y).sum().item()

    acc = count / len(test_data)
    avg_loss = total_loss / len(test_data)

    test_loss_list.append(avg_loss)

    if i % log_interval == 0:
        print("Epoch %d Test: Loss %.3f / Accuracy %.3f" % (i, avg_loss, acc))




Epoch 0 Train: Loss 0.777 / Accuracy 0.770
Epoch 0 Test: Loss 7.848 / Accuracy 0.918

Epoch 1 Train: Loss 0.207 / Accuracy 0.939
Epoch 1 Test: Loss 8.536 / Accuracy 0.947

Epoch 2 Train: Loss 0.132 / Accuracy 0.961
Epoch 2 Test: Loss 9.992 / Accuracy 0.964

Epoch 3 Train: Loss 0.095 / Accuracy 0.972
Epoch 3 Test: Loss 11.483 / Accuracy 0.972

Epoch 4 Train: Loss 0.075 / Accuracy 0.978
Epoch 4 Test: Loss 11.486 / Accuracy 0.973

Epoch 5 Train: Loss 0.059 / Accuracy 0.983
Epoch 5 Test: Loss 12.593 / Accuracy 0.978

Epoch 6 Train: Loss 0.048 / Accuracy 0.986
Epoch 6 Test: Loss 12.784 / Accuracy 0.972

Epoch 7 Train: Loss 0.039 / Accuracy 0.988
Epoch 7 Test: Loss 13.683 / Accuracy 0.979

Epoch 8 Train: Loss 0.031 / Accuracy 0.991
Epoch 8 Test: Loss 14.669 / Accuracy 0.979

Epoch 9 Train: Loss 0.025 / Accuracy 0.993
Epoch 9 Test: Loss 15.751 / Accuracy 0.980

Epoch 10 Train: Loss 0.020 / Accuracy 0.995
Epoch 10 Test: Loss 16.674 / Accuracy 0.980

Epoch 11 Train: Loss 0.015 / Accuracy 0.996

In [None]:
# Print the shape of every parameter tensor, then the total number of
# scalar parameters in the model.
param_shapes = [p.size() for p in model.parameters()]

for shape in param_shapes:
    print(shape)

num_parameter = 0
for shape in param_shapes:
    num_parameter = num_parameter + np.prod(shape)

print(num_parameter)

## Experiment 2 : Adam Optimizer 적용

In [None]:
class SoftmaxClassifier(nn.Module):
    """Fully connected classifier that maps an image batch to class logits.

    Same architecture as in Experiment 1: Flatten -> Linear(512) ->
    Linear(256) -> Linear(128) -> Linear(out_features), with a LeakyReLU
    after every hidden layer and no dropout. ``forward`` returns raw logits;
    no softmax is applied inside the model.

    Args:
        in_features: Number of features per sample after flattening. When
            omitted, the module-level ``input_shape`` is used.
        out_features: Number of output classes. When omitted, the
            module-level ``output_shape`` is used.
    """

    def __init__(self, in_features=None, out_features=None):
        super().__init__()

        # Fall back to the notebook-level dataset dimensions so that the
        # existing no-argument call ``SoftmaxClassifier()`` keeps working.
        if in_features is None:
            in_features = input_shape
        if out_features is None:
            out_features = output_shape

        self.layers = nn.Sequential(
            nn.Flatten(),  # e.g. MNIST: 1*28*28 -> 784
            nn.Linear(in_features, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 128),
            nn.LeakyReLU(),
            nn.Linear(128, out_features),
        )

    def forward(self, x):
        """Return the class logits for the input batch ``x``."""
        return self.layers(x)

In [None]:
# Cross-entropy on the raw model outputs (logits) against integer class labels.
# reduction='sum' adds up the per-sample losses of a mini-batch instead of
# averaging them (100 losses -> one summed loss).
loss = nn.CrossEntropyLoss(reduction='sum')

model = SoftmaxClassifier()
model = model.to(device)

# Experiment 2 uses Adam. weight_decay adds an L2 penalty on the weights:
# minimize loss' = loss + weight_decay * l2_regularization.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [None]:
# Train for `num_epoch` epochs; after every epoch evaluate on the test split.
# The per-sample average loss of each epoch is recorded in `train_loss_list`
# and `test_loss_list`.
num_epoch = 100
log_interval = 1  # print metrics every `log_interval` epochs
train_loss_list, test_loss_list = [], []

for i in range(num_epoch):

    # ---- train ----
    model.train()

    total_loss = 0
    count = 0

    for x, y in train_loader:
        # MNIST: x.shape = (100, 1, 28, 28), y.shape = (100,)
        x, y = x.to(device), y.to(device)

        # Call the module itself (not .forward) so registered hooks also run.
        logits = model(x)
        # The loss takes integer class labels directly; no one-hot encoding
        # of y is needed.
        cost = loss(logits, y)

        total_loss += cost.item()

        # To experiment with an L1 penalty, add
        # 0.001 * torch.norm(param, 1) for every parameter to `cost` here.

        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        # Softmax is monotonic, so the argmax of the logits is the
        # predicted class.
        pred = torch.argmax(logits, dim=-1)
        count += (pred == y).sum().item()

    acc = count / len(train_data)
    avg_loss = total_loss / len(train_data)  # loss is summed, so divide by N

    train_loss_list.append(avg_loss)

    if i % log_interval == 0:
        print("\nEpoch %d Train: Loss %.3f / Accuracy %.3f" % (i, avg_loss, acc))

    # ---- test ----
    model.eval()

    total_loss = 0
    count = 0

    with torch.no_grad():  # no gradient tracking during evaluation
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)

            logits = model(x)
            # The loss must be computed from the logits of THIS test batch;
            # the previous code reused the last training batch's logits.
            cost = loss(logits, y)

            total_loss += cost.item()

            pred = torch.argmax(logits, dim=-1)
            count += (pred == y).sum().item()

    acc = count / len(test_data)
    avg_loss = total_loss / len(test_data)

    test_loss_list.append(avg_loss)

    if i % log_interval == 0:
        print("Epoch %d Test: Loss %.3f / Accuracy %.3f" % (i, avg_loss, acc))

## Experiment 3 : Adam Optimizer + Dropout 적용

In [None]:
class SoftmaxClassifier(nn.Module):
    """Fully connected classifier with dropout, producing class logits.

    Architecture: Flatten -> Linear(512) -> Linear(256) -> Linear(128) ->
    Linear(out_features). Every hidden layer is followed by a LeakyReLU and
    a Dropout layer. ``forward`` returns raw logits; no softmax is applied
    inside the model.

    Args:
        in_features: Number of features per sample after flattening. When
            omitted, the module-level ``input_shape`` is used.
        out_features: Number of output classes. When omitted, the
            module-level ``output_shape`` is used.
        dropout_p: Drop probability of every Dropout layer. The default of
            0.5 matches the default of ``nn.Dropout()``.
    """

    def __init__(self, in_features=None, out_features=None, dropout_p=0.5):
        super().__init__()

        # Fall back to the notebook-level dataset dimensions so that the
        # existing no-argument call ``SoftmaxClassifier()`` keeps working.
        if in_features is None:
            in_features = input_shape
        if out_features is None:
            out_features = output_shape

        self.layers = nn.Sequential(
            nn.Flatten(),  # e.g. MNIST: 1*28*28 -> 784
            nn.Linear(in_features, 512),
            nn.LeakyReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(512, 256),
            nn.LeakyReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(256, 128),
            nn.LeakyReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(128, out_features),
        )

    def forward(self, x):
        """Return the class logits for the input batch ``x``."""
        return self.layers(x)

In [None]:
# Cross-entropy on the raw model outputs (logits) against integer class labels.
# reduction='sum' adds up the per-sample losses of a mini-batch instead of
# averaging them (100 losses -> one summed loss).
loss = nn.CrossEntropyLoss(reduction='sum')

model = SoftmaxClassifier()
model = model.to(device)

# Experiment 3 uses Adam. weight_decay adds an L2 penalty on the weights:
# minimize loss' = loss + weight_decay * l2_regularization.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [None]:
# Train for `num_epoch` epochs; after every epoch evaluate on the test split.
# The per-sample average loss of each epoch is recorded in `train_loss_list`
# and `test_loss_list`.
num_epoch = 100
log_interval = 1  # print metrics every `log_interval` epochs
train_loss_list, test_loss_list = [], []

for i in range(num_epoch):

    # ---- train ----
    model.train()  # training mode: dropout layers are active

    total_loss = 0
    count = 0

    for x, y in train_loader:
        # MNIST: x.shape = (100, 1, 28, 28), y.shape = (100,)
        x, y = x.to(device), y.to(device)

        # Call the module itself (not .forward) so registered hooks also run.
        logits = model(x)
        # The loss takes integer class labels directly; no one-hot encoding
        # of y is needed.
        cost = loss(logits, y)

        total_loss += cost.item()

        # To experiment with an L1 penalty, add
        # 0.001 * torch.norm(param, 1) for every parameter to `cost` here.

        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        # Softmax is monotonic, so the argmax of the logits is the
        # predicted class.
        pred = torch.argmax(logits, dim=-1)
        count += (pred == y).sum().item()

    acc = count / len(train_data)
    avg_loss = total_loss / len(train_data)  # loss is summed, so divide by N

    train_loss_list.append(avg_loss)

    if i % log_interval == 0:
        print("\nEpoch %d Train: Loss %.3f / Accuracy %.3f" % (i, avg_loss, acc))

    # ---- test ----
    model.eval()  # evaluation mode: dropout layers are disabled

    total_loss = 0
    count = 0

    with torch.no_grad():  # no gradient tracking during evaluation
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)

            logits = model(x)
            # The loss must be computed from the logits of THIS test batch;
            # the previous code reused the last training batch's logits.
            cost = loss(logits, y)

            total_loss += cost.item()

            pred = torch.argmax(logits, dim=-1)
            count += (pred == y).sum().item()

    acc = count / len(test_data)
    avg_loss = total_loss / len(test_data)

    test_loss_list.append(avg_loss)

    if i % log_interval == 0:
        print("Epoch %d Test: Loss %.3f / Accuracy %.3f" % (i, avg_loss, acc))

- MNIST 데이터셋은 너무 쉽기 때문에 어떤 모델이든 성능이 높게 나오는 경우가 많음.

- without dropout : Train Loss는 빠르게 감소하지만, Test Loss는 줄어들다 어느 순간 증가함.

- with dropout : Train Loss는 상대적으로 덜 빠르게 감소하지만, Test Loss도 같이 감소함. -> 모든 경우에 잘 작동하는 것은 아니지만, 과적합을 방지해줌.