# Dropout 기법

In [1]:
from torch.utils.data import DataLoader, random_split
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import time

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Fixed marker string (presumably a quick "setup cell finished" check).
print('=3')

=3


In [None]:
# FashionMNIST training split, stored under ./data (download=True allows fetching it).
train_dataset = torchvision.datasets.FashionMNIST(
    root='./data',
    train=True,
    download=True,
)
# Images as float32 divided by 255.0; labels as int64.
X = np.divide(train_dataset.data.numpy().astype(np.float32), 255.0)
y = train_dataset.targets.numpy().astype(np.int64)

In [None]:
# Mini-batch size used by the training runs in this notebook.
batch_size = 64

# Wrap the NumPy arrays as tensors; every image becomes one row of 784 values.
y_train = torch.from_numpy(y)
X_train = torch.from_numpy(X).view(-1, 784)

dropout의 확률을 0.9로 주었을 때

In [None]:
class ClassifierModel(nn.Module):
    """Fully connected classifier: Linear(784, 128) -> ReLU -> Dropout -> Linear(128, 10).

    Args:
        dropout_rate: Probability passed to ``nn.Dropout``.
    """

    def __init__(self, dropout_rate):
        super().__init__()
        in_features, hidden_features, out_features = 28 * 28, 128, 10
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Return the raw class scores produced by the four layers in order."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(self.dropout(hidden))

In [None]:
def train(model, X_train, y_train, criterion, optimizer, batch_size=64, epochs=5):
    """Train ``model`` on shuffled mini-batches and print per-epoch statistics.

    Args:
        model: Module to optimise; it is put into training mode.
        X_train: Input tensor, indexed along dim 0.
        y_train: Label tensor aligned with ``X_train``.
        criterion: Callable ``criterion(outputs, labels)`` returning the loss.
        optimizer: Optimizer that updates the parameters of ``model``.
        batch_size: Samples per mini-batch. Defaults to 64 so that calls
            which omit it keep working.
        epochs: Number of passes over the data.

    Returns:
        None. For every epoch the summed batch loss and the training
        accuracy are printed, followed by the total elapsed time.
    """
    start = time.time()
    model.train()
    num_samples = X_train.size(0)

    # Send batches to the device the model lives on instead of relying on a
    # module-level ``device`` variable; fall back to the CPU for a model
    # without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(epochs):
        permutation = torch.randperm(num_samples)  # reshuffle the samples every epoch

        total_loss = 0.0
        correct = 0
        for i in range(0, num_samples, batch_size):
            idx = permutation[i:i + batch_size]
            images = X_train[idx].to(target_device)
            labels = y_train[idx].to(target_device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, preds = outputs.max(1)
            correct += preds.eq(labels).sum().item()

        # An empty dataset has no meaningful accuracy; avoid dividing by zero.
        accuracy = 100. * correct / num_samples if num_samples else float("nan")
        print(f"Epoch [{epoch+1}/{epochs}] - Loss: {total_loss:.4f}, Accuracy: {accuracy:.2f}%")

    print(f"\n⏱ Total Time: {time.time() - start:.2f} seconds")

In [None]:
%%time
# Experiment: the classifier with a dropout probability of 0.9.
criterion = nn.CrossEntropyLoss()
model_with_dropout = ClassifierModel(dropout_rate=0.9).to(device)
optimizer = optim.Adam(model_with_dropout.parameters(), lr=0.001)

train(
    model_with_dropout,
    X_train,
    y_train,
    criterion,
    optimizer,
    batch_size=batch_size,
    epochs=5,
)


In [None]:
%%time
# Baseline: the same classifier and optimizer settings with dropout probability 0.
model_without_dropout = ClassifierModel(dropout_rate=0).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_without_dropout.parameters(), lr=0.001)

# Pass batch_size explicitly so this run uses the same mini-batch size as the dropout run.
train(model_without_dropout, X_train, y_train, criterion, optimizer, batch_size=batch_size, epochs=5)

현재 이 데이터 셋은 학습이 잘 되는 데이터 셋으로, fully connected layer에서도 결과가 잘 나옴을 확인할 수 있는데요. 일부러 중간에 dropout layer를 추가하여 0.9의 확률 값을 주니 학습이 안 됨을 확인하였습니다. 다음은 overfitting이 나는 환경에서 dropout의 중요성을 알아보도록 하겠습니다.

## 과적합 경우

In [None]:
# NOTE(review): train_size / valid_size are computed here but the split below
# is driven by test_size=0.01, not by these two values.
train_size = int(0.99 * len(X))
valid_size = len(X) - train_size

# Hold out 1% of the samples for validation; random_state fixes the split.
train_data, valid_data, train_label, valid_label = train_test_split(
    X, y, test_size=0.01, random_state=42
)

# Tensor versions of both splits, one row of 784 values per sample.
X_train = torch.from_numpy(train_data).reshape(-1, 784)
X_valid = torch.from_numpy(valid_data).reshape(-1, 784)
y_train = torch.from_numpy(train_label)
y_valid = torch.from_numpy(valid_label)

In [None]:
class OverfitModel(nn.Module):
    """Fully connected classifier without dropout: Linear(784, 256) -> ReLU -> Linear(256, 10)."""

    def __init__(self):
        super().__init__()
        in_features, hidden_features, out_features = 28 * 28, 256, 10
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Return the raw class scores for the given input rows."""
        return self.fc2(self.relu(self.fc1(x)))

In [None]:
def train(model, X_train, y_train, X_valid, y_valid, criterion, optimizer, batch_size=64, epochs=200):
    """Train ``model`` and record per-epoch loss/accuracy on both data splits.

    Args:
        model: Module to optimise.
        X_train: Training inputs, indexed along dim 0.
        y_train: Training labels aligned with ``X_train``.
        X_valid: Validation inputs, indexed along dim 0.
        y_valid: Validation labels aligned with ``X_valid``.
        criterion: Callable ``criterion(outputs, labels)`` returning the loss.
        optimizer: Optimizer that updates the parameters of ``model``.
        batch_size: Samples per mini-batch (64 by default).
        epochs: Number of passes over the training data.

    Returns:
        ``(train_loss, valid_loss, train_acc, valid_acc)``: four lists with one
        entry per epoch. Losses are the batch losses averaged over the number
        of batches; accuracies are percentages.
    """
    train_loss, valid_loss, train_acc, valid_acc = [], [], [], []
    num_train = X_train.size(0)
    num_valid = X_valid.size(0)

    # Send batches to the device the model lives on instead of relying on a
    # module-level ``device`` variable; fall back to the CPU for a model
    # without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    def _ratio(numerator, denominator):
        # An empty split has no meaningful average; report NaN instead of raising.
        return numerator / denominator if denominator else float("nan")

    model.train()
    for epoch in range(epochs):
        permutation = torch.randperm(num_train)  # reshuffle the samples every epoch
        total_loss, train_batches = 0.0, 0
        correct_train, total_train = 0, 0

        for i in range(0, num_train, batch_size):
            idx = permutation[i:i + batch_size]
            images = X_train[idx].to(target_device)
            labels = y_train[idx].to(target_device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            train_batches += 1
            _, predicted = outputs.max(1)
            total_train += labels.size(0)
            correct_train += predicted.eq(labels).sum().item()

        # Average over the number of batches actually processed. The previous
        # `total / n // batch_size` floored the result, which recorded 0.0.
        train_loss.append(_ratio(total_loss, train_batches))
        train_acc.append(_ratio(100 * correct_train, total_train))

        # === validation ===
        model.eval()
        temp_loss, valid_batches = 0.0, 0
        correct_valid, total_valid = 0, 0
        with torch.no_grad():
            for i in range(0, num_valid, batch_size):
                images = X_valid[i:i + batch_size].to(target_device)
                labels = y_valid[i:i + batch_size].to(target_device)

                outputs = model(images)
                loss = criterion(outputs, labels)
                temp_loss += loss.item()
                valid_batches += 1

                _, predicted = outputs.max(1)
                total_valid += labels.size(0)
                correct_valid += predicted.eq(labels).sum().item()

        valid_loss.append(_ratio(temp_loss, valid_batches))
        valid_acc.append(_ratio(100 * correct_valid, total_valid))
        model.train()  # back to training mode for the next epoch

        print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss[-1]:.4f}, Valid Loss: {valid_loss[-1]:.4f}, Train Acc: {train_acc[-1]:.2f}%, Valid Acc: {valid_acc[-1]:.2f}%")
    return train_loss, valid_loss, train_acc, valid_acc

In [None]:
%%time
# Train the model that has no dropout layer.
model_overfit = OverfitModel().to(device)
# Create the loss in this cell so the experiment does not depend on a
# `criterion` left behind by an earlier cell.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_overfit.parameters(), lr=0.001)
train_loss, valid_loss, train_acc, valid_acc = train(model_overfit, X_train, y_train, X_valid, y_valid, criterion, optimizer, batch_size=batch_size, epochs=200)

In [None]:
# Plot the per-epoch loss curves of the run without dropout.
plt.title('Loss Graph without Dropout')
plt.plot(train_loss, label='Train Loss', color='blue')
plt.plot(valid_loss, label='Validation Loss', color='red')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid()
plt.legend()
plt.show()

In [None]:
# Plot the per-epoch accuracy curves of the run without dropout.
plt.title('Accuracy Graph without Dropout')
plt.plot(train_acc, label='Train Accuracy', color='blue')
plt.plot(valid_acc, label='Validation Accuracy', color='red')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.grid()
plt.legend()
plt.show()

dropout layer가 없는 fully connected layer에서 200번 정도의 학습을 하니 train set의 accuracy는 올라가고, loss는 점점 떨어졌습니다. 그러나 validation set의 accuracy와 loss는 어느 정도 값에서 수렴함을 볼 수 있었습니다.
이렇게 오버피팅을 만든 환경에서 dropout layer를 추가한 뒤 나머지 환경은 같게 한 실험을 살펴보도록 하겠습니다.

In [None]:
class DropModel(nn.Module):
    """Fully connected classifier with dropout: Linear(784, 256) -> ReLU -> Dropout -> Linear(256, 10).

    Args:
        dropout_rate: Probability passed to ``nn.Dropout``.
    """

    def __init__(self, dropout_rate):
        super().__init__()
        in_features, hidden_features, out_features = 28 * 28, 256, 10
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.relu = nn.ReLU()
        # The dropout layer is the only addition; the other layers match OverfitModel.
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Return the raw class scores; dropout is applied to the hidden activations."""
        hidden = self.relu(self.fc1(x))
        # Dropout sits between the hidden activation and the output layer.
        return self.fc2(self.dropout(hidden))

In [None]:
%%time
# Same experiment as the no-dropout run, now with a dropout probability of 0.5.
criterion = nn.CrossEntropyLoss()
model_dropout = DropModel(dropout_rate=0.5).to(device)
optimizer = optim.Adam(model_dropout.parameters(), lr=0.001)
train_loss, valid_loss, train_acc, valid_acc = train(
    model_dropout,
    X_train,
    y_train,
    X_valid,
    y_valid,
    criterion,
    optimizer,
    batch_size=batch_size,
    epochs=200,
)

In [None]:
# Q. Plot the per-epoch loss curves of the run that uses dropout.
plt.plot(train_loss, label='Train Loss', color='blue')
plt.plot(valid_loss, label='Validation Loss', color='red')
plt.legend()
# These curves come from the dropout model, so the title must say "with".
plt.title('Loss Graph with Dropout')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid()
plt.show()

In [None]:
# Q. Plot the per-epoch accuracy curves of the run that uses dropout.
plt.plot(train_acc, label='Train Accuracy', color='blue')
plt.plot(valid_acc, label='Validation Accuracy', color='red')
plt.legend()
# These curves come from the dropout model, so the title must say "with".
plt.title('Accuracy Graph with Dropout')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.grid()
plt.show()

좋은 데이터를 가지고 오버피팅이 발생하는 환경을 만드는 것이 조금 어렵긴 했지만, dropout layer 하나만으로도 오버피팅을 막고, train/validation 두 데이터 셋의 정확도도 비슷하게 나옴을 확인하였습니다.