<a href="https://colab.research.google.com/github/dhdbsrlw/MLVU/blob/main/hw1_2nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
"""
Change directory to where this file is located
"""
%cd /content/drive/MyDrive/AI/VIP Lab/Homework 1

/content/drive/MyDrive/AI/VIP Lab/Homework 1


# Import Modules

In [3]:
import copy
import numpy as np
import matplotlib.pyplot as plt
from mnist.data_utils import load_data

#Utils

In [4]:
def sigmoid(z):
    """
    Do NOT modify this function
    """
    return 1/(1+np.exp(-z))

def softmax(X):
    """
    Do NOT modify this function
    """
    logit = np.exp(X-np.amax(X, axis=1, keepdims=True))
    numer = logit
    denom = np.sum(logit, axis=1, keepdims=True)
    return numer/denom

def load_batch(X, Y, batch_size, shuffle=True):
    """
    Generates batches with the remainder dropped.

    Do NOT modify this function
    """
    if shuffle:
        permutation = np.random.permutation(X.shape[0])
        X = X[permutation, :]
        Y = Y[permutation, :]
    num_steps = int(X.shape[0])//batch_size
    step = 0
    while step<num_steps:
        X_batch = X[batch_size*step:batch_size*(step+1)]
        Y_batch = Y[batch_size*step:batch_size*(step+1)]
        step+=1
        yield X_batch, Y_batch

#2-Layer Neural Network

In [5]:
class TwoLayerNN:
    """ a neural network with 2 layers """

    def __init__(self, input_dim, num_hiddens, num_classes):
        """
        Do NOT modify this function.
        """
        self.input_dim = input_dim
        self.num_hiddens = num_hiddens
        self.num_classes = num_classes
        self.params = self.initialize_parameters(input_dim, num_hiddens, num_classes)

    def initialize_parameters(self, input_dim, num_hiddens, num_classes):
        """
        initializes parameters with Xavier Initialization.

        Question (a)
        - refer to https://paperswithcode.com/method/xavier-initialization for Xavier initialization

        Inputs
        - input_dim
        - num_hiddens: the number of hidden units
        - num_classes: the number of classes
        Returns
        - params: a dictionary with the initialized parameters.
        """
        np.random.seed(42)

        W1 = np.random.randn(input_dim, num_hiddens) * np.sqrt(6 / (input_dim + num_hiddens))
        # np.random.randn()함수는 표준정규분포표에서 임의의 수를 추출한뒤, 그 수를 바탕으로 하는 N차원 배열을 생성
        b1 = np.zeros((1, num_hiddens))
        W2 = np.random.randn(num_hiddens, num_classes) * np.sqrt(6 / (num_hiddens + num_classes))
        b2 = np.zeros((1, num_classes))
        params = {
            "W1": W1,
            "b1": b1,
            "W2": W2,
            "b2": b2
        }
        return params


    def forward(self, X):
        """
        Defines and performs the feed forward step of a two-layer neural network.
        Specifically, the network structue is given by

          y = softmax(sigmoid(X W1 + b1) W2 + b2)

        where X is the input matrix of shape (N, D), y is the class distribution matrix
        of shape (N, C), N is the number of examples (either the entire dataset or
        a mini-batch), D is the feature dimensionality, and C is the number of classes.

        Question (b)
        - ff_dict will be used to run backpropagation in backward method.

        Inputs
        - X: the input matrix of shape (N, D)

        Returns
        - y: the output of the model
        - ff_dict: a dictionary with all the fully connected units and activations.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']

        # 첫번째 레이어
        z1 = np.dot(X, W1) + b1
        a1 = sigmoid(z1)

        # 두번째 레이어
        z2 = np.dot(a1, W2) + b2
        y = softmax(z2)

        ff_dict = {'Z1': z1, 'A1': a1, 'Z2': z2, 'Y': y}

        return y, ff_dict



    def backward(self, X, Y, ff_dict):
        """
        Performs backpropagation over the two-layer neural network, and returns
        a dictionary of gradients of all model parameters.

        Question (c)

        Inputs:
         - X: the input matrix of shape (B, D), where B is the number of examples
              in a mini-batch, D is the feature dimensionality.
         - Y: the matrix of one-hot encoded ground truth classes of shape (B, C),
              where B is the number of examples in a mini-batch, C is the number
              of classes.
         - ff_dict: the dictionary containing all the fully connected units and
              activations.

        Returns:
         - grads: a dictionary containing the gradients of corresponding weights and biases.
        """
        A1 = ff_dict['A1']
        Z1 = ff_dict['Z1']
        Y_hat = ff_dict['Y']

        B = X.shape[0]
        dY = (Y_hat - Y) / B

        # 두번째 레이어의 Gradients
        dW2 = np.dot(A1.T, dY)
        db2 = np.sum(dY, axis=0, keepdims=True)

        dA1 = np.dot(dY, self.params['W2'].T)
        dZ1 = dA1 * A1 * (1 - A1)

        # 첫번째 레이어의 Gradients
        dW1 = np.dot(X.T, dZ1)
        db1 = np.sum(dZ1, axis=0, keepdims=True)

        # Gradients 저장
        grads = {
            'W1': dW1,
            'b1': db1,
            'W2': dW2,
            'b2': db2
        }

        return grads

    def compute_loss(self, Y, Y_hat):
        """
        Computes cross entropy loss.

        Do NOT modify this function.

        Inputs
            Y: ground truth labels
            Y_hat: predicted labels
        Returns
            loss:
        """
        loss = -(1/Y.shape[0]) * np.sum(np.multiply(Y, np.log(Y_hat)))
        return loss

    def train(self, X, Y, X_val, Y_val, lr, n_epochs, batch_size, log_interval=1):
        """
        Runs mini-batch gradient descent.

        Do NOT Modify this method.

        Inputs
        - X
        - Y
        - X_val: validation data
        - Y_Val: validation label
        - lr: learning rate
        - n_epochs: the number of epochs to run
        - batch_size
        - log_interval: the epoch interval to log the training progress.
        """
        for epoch in range(n_epochs):
            for X_batch, Y_batch in load_batch(X, Y, batch_size):
                self.train_step(X_batch, Y_batch, batch_size, lr)
            if epoch % log_interval==0:
                Y_hat, ff_dict = self.forward(X)
                train_loss = self.compute_loss(Y, Y_hat)
                train_acc = self.evaluate(Y, Y_hat)
                Y_hat, ff_dict = self.forward(X_val)
                valid_loss = self.compute_loss(Y_val, Y_hat)
                valid_acc = self.evaluate(Y_val, Y_hat)
                print('epoch {:02} - train loss/acc: {:.3f} {:.3f}, valid loss/acc: {:.3f} {:.3f}'.\
                      format(epoch, train_loss, train_acc, valid_loss, valid_acc))

    def train_step(self, X_batch, Y_batch, batch_size, lr):
        """
        Updates the parameters using gradient descent.

        Question (d)

        Inputs
        - X_batch
        - Y_batch
        - batch_size
        - lr: learning rate
        """
        # Forward
        y_pred, ff_dict = self.forward(X_batch)

        # Backward
        grads = self.backward(X_batch, Y_batch, ff_dict)

        # Update
        self.params['W1'] -= lr * grads['W1']
        self.params['b1'] -= lr * grads['b1']
        self.params['W2'] -= lr * grads['W2']
        self.params['b2'] -= lr * grads['b2']


    def evaluate(self, Y, Y_hat):
        """
        Computes classification accuracy.

        Question (e)

        Inputs
        - Y: A numpy array of shape (N, C) containing the one-hot encoded labels,
             where C is the number of classes.
        - Y_hat: A numpy array of shape (N, C) containing the softmax outputs,
             where C is the number of classes.

        Returns
            accuracy: the classification accuracy in float
        """
        Y_pred = np.argmax(Y_hat, axis=1) # 소프트맥스 결과값을 클래스 라벨로 변환 (가장 높은 probability 의 클래스로 변환된다.)
        Y_true = np.argmax(Y, axis=1) # 원-핫 인코딩 라벨을 클래스 라벨로 변환

        accuracy = np.mean(Y_pred == Y_true) # 맞은 개수로 평균

        return accuracy


#Load MNIST

In [6]:
X_train, Y_train, X_test, Y_test = load_data()

idxs = np.arange(len(X_train))
np.random.shuffle(idxs)
split_idx = int(np.ceil(len(idxs)*0.8))
X_valid, Y_valid = X_train[idxs[split_idx:]], Y_train[idxs[split_idx:]]
X_train, Y_train = X_train[idxs[:split_idx]], Y_train[idxs[:split_idx]]
print()
print('Set validation data aside')
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', Y_train.shape)
print('Validation data shape: ', X_valid.shape)
print('Validation labels shape: ', Y_valid.shape)

MNIST data loaded:
Training data shape: (60000, 784)
Training labels shape: (60000, 10)
Test data shape: (10000, 784)
Test labels shape: (10000, 10)

Set validation data aside
Training data shape:  (48000, 784)
Training labels shape:  (48000, 10)
Validation data shape:  (12000, 784)
Validation labels shape:  (12000, 10)


#Training & Evaluation

In [11]:
###
# Question (f)
# Tune the hyperparameters with validation data,
# and print the results by running the lines below.
###

# 하이퍼파라미터 튜닝 실험 횟수 (Random Search 횟수)
exp_num = 10
for i in range(exp_num):

  # 하이퍼파라미터 선언 및 범위 지정
  num_hiddens = 2 ** np.random.randint(5, 9)
  lr = 10 ** np.random.uniform(-6, -2)
  n_epochs = 5 * np.random.randint(1,21)
  batch_size = 2 ** np.random.randint(5, 9)

  print(f'======= TRY {i} | num_hiddens: {num_hiddens}, lr: {lr}, n_epochs: {n_epochs}, b_size = {batch_size} =======')

  # 모델 세팅
  model = TwoLayerNN(input_dim=784, num_hiddens=num_hiddens, num_classes=10)

  # 모델 학습 및 Val 데이터로 평가
  model.train(X_train, Y_train, X_valid, Y_valid, lr, n_epochs, batch_size)
  print()


epoch 00 - train loss/acc: 2.751 0.105, valid loss/acc: 2.749 0.106
epoch 01 - train loss/acc: 2.729 0.106, valid loss/acc: 2.727 0.106
epoch 02 - train loss/acc: 2.709 0.106, valid loss/acc: 2.706 0.107
epoch 03 - train loss/acc: 2.690 0.108, valid loss/acc: 2.687 0.108
epoch 04 - train loss/acc: 2.672 0.109, valid loss/acc: 2.670 0.109
epoch 05 - train loss/acc: 2.655 0.110, valid loss/acc: 2.653 0.110
epoch 06 - train loss/acc: 2.640 0.113, valid loss/acc: 2.638 0.112
epoch 07 - train loss/acc: 2.625 0.115, valid loss/acc: 2.623 0.114
epoch 08 - train loss/acc: 2.611 0.117, valid loss/acc: 2.609 0.116
epoch 09 - train loss/acc: 2.597 0.120, valid loss/acc: 2.595 0.117

epoch 00 - train loss/acc: 2.882 0.113, valid loss/acc: 2.890 0.111
epoch 01 - train loss/acc: 2.867 0.113, valid loss/acc: 2.875 0.111
epoch 02 - train loss/acc: 2.853 0.113, valid loss/acc: 2.860 0.111
epoch 03 - train loss/acc: 2.839 0.113, valid loss/acc: 2.846 0.111
epoch 04 - train loss/acc: 2.825 0.113, valid l

실험결과, TRY6 또는 TRY7 의 조합이 가장 우수한 성능을 보였다. \
따라서 아래 최종 테스트는 TRY6 의 세팅으로 수행하였다.

In [12]:
# model instantiation
model = TwoLayerNN(input_dim=784, num_hiddens=128, num_classes=10) # 데이터셋에 맞게 값 주입

In [13]:
# train the model & evaluate with validation data
lr, n_epochs, batch_size = 3.246747330056898e-05, 50, 128
model.train(X_train, Y_train, X_valid, Y_valid, lr, n_epochs, batch_size)

epoch 00 - train loss/acc: 2.738 0.105, valid loss/acc: 2.736 0.106
epoch 01 - train loss/acc: 2.705 0.107, valid loss/acc: 2.703 0.107
epoch 02 - train loss/acc: 2.676 0.108, valid loss/acc: 2.674 0.109
epoch 03 - train loss/acc: 2.650 0.111, valid loss/acc: 2.648 0.111
epoch 04 - train loss/acc: 2.626 0.115, valid loss/acc: 2.624 0.114
epoch 05 - train loss/acc: 2.604 0.118, valid loss/acc: 2.602 0.117
epoch 06 - train loss/acc: 2.584 0.123, valid loss/acc: 2.582 0.121
epoch 07 - train loss/acc: 2.565 0.126, valid loss/acc: 2.563 0.125
epoch 08 - train loss/acc: 2.547 0.132, valid loss/acc: 2.545 0.130
epoch 09 - train loss/acc: 2.530 0.136, valid loss/acc: 2.528 0.135
epoch 10 - train loss/acc: 2.514 0.140, valid loss/acc: 2.512 0.140
epoch 11 - train loss/acc: 2.499 0.145, valid loss/acc: 2.497 0.144
epoch 12 - train loss/acc: 2.484 0.150, valid loss/acc: 2.483 0.148
epoch 13 - train loss/acc: 2.470 0.154, valid loss/acc: 2.469 0.153
epoch 14 - train loss/acc: 2.457 0.158, valid lo

In [14]:
# evalute the model on test data
Y_hat, _ = model.forward(X_test)
test_loss = model.compute_loss(Y_test, Y_hat)
test_acc = model.evaluate(Y_test, Y_hat)
print("Final test loss = {:.3f}, acc = {:.3f}".format(test_loss, test_acc))

Final test loss = 2.181, acc = 0.284


# Extra Credit (Optional)

In [15]:
def initialize_parameters(self, input_dim, num_hiddens, num_classes):
    """
    initializes parameters with He Initialization.

    Question (g)
    - refer to https://paperswithcode.com/method/he-initialization for He initialization

    Inputs
    - input_dim
    - num_hiddens
    - num_classes
    Returns
    - params: a dictionary with the initialized parameters.
    """
    np.random.seed(42)

    W1 = np.random.randn(input_dim, num_hiddens) * np.sqrt(2 / input_dim)
    b1 = np.zeros((1, num_hiddens))
    W2 = np.random.randn(num_hiddens, num_classes) * np.sqrt(2 / num_hiddens)
    b2 = np.zeros((1, num_classes))

    params = {
        "W1": W1,
        "b1": b1,
        "W2": W2,
        "b2": b2
    }
    return params

def forward_relu(self, X):
    """
    Defines and performs the feed forward step of a two-layer neural network.
    Specifically, the network structue is given by

        y = softmax(relu(X W1 + b1) W2 + b2)

    where X is the input matrix of shape (N, D), y is the class distribution matrix
    of shape (N, C), N is the number of examples (either the entire dataset or
    a mini-batch), D is the feature dimensionality, and C is the number of classes.

    Question (g)

    Inputs
        X: the input matrix of shape (N, D)

    Returns
        y: the output of the model
        ff_dict: a dictionary containing all the fully connected units and activations.
    """
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']

    # 첫번째 레이어
    Z1 = np.dot(X, W1) + b1
    A1 = np.maximum(0, Z1)  # ReLU

    # 두번째 레이어
    Z2 = np.dot(A1, W2) + b2
    y = softmax(Z2)

    ff_dict = {'Z1': Z1, 'A1': A1, 'Z2': Z2, 'Y': y}

    return y, ff_dict

def backward_relu(self, X, Y, ff_dict):
    """
    Performs backpropagation over the two-layer neural network, and returns
    a dictionary of gradients of all model parameters.

    Question (g)

    Inputs:
        - X: the input matrix of shape (B, D), where B is the number of examples
            in a mini-batch, D is the feature dimensionality.
        - Y: the matrix of one-hot encoded ground truth classes of shape (B, C),
            where B is the number of examples in a mini-batch, C is the number
            of classes.
        - ff_dict: the dictionary containing all the fully connected units and
            activations.

    Returns:
        - grads: a dictionary containing the gradients of corresponding weights
            and biases.
    """
    A1 = ff_dict['A1']
    Y_hat = ff_dict['Y']
    B = X.shape[0]

    dY = (Y_hat - Y) / B

    dW2 = np.dot(A1.T, dY)
    db2 = np.sum(dY, axis=0, keepdims=True)

    dA1 = np.dot(dY, self.params['W2'].T)
    dZ1 = dA1 * (A1 > 0)

    dW1 = np.dot(X.T, dZ1)
    db1 = np.sum(dZ1, axis=0, keepdims=True)

    grads = {
        'W1': dW1,
        'b1': db1,
        'W2': dW2,
        'b2': db2
    }
    return grads


TwoLayerNNRelu = copy.copy(TwoLayerNN)
TwoLayerNNRelu.initialize_parameters = initialize_parameters
TwoLayerNNRelu.feed_forward = forward_relu
TwoLayerNNRelu.back_propagate = backward_relu

In [16]:
###
# Question (g)
# Tune the hyperparameters with validation data,
# and print the results by running the lines below.
###

# 하이퍼파라미터 튜닝 실험 횟수 (Random Search 횟수)
exp_num = 10
for i in range(exp_num):

  # 하이퍼파라미터 선언 및 범위 지정
  num_hiddens = 2 ** np.random.randint(5, 9)
  lr = 10 ** np.random.uniform(-6, -2)
  n_epochs = 5 * np.random.randint(1,10) # 축소
  batch_size = 2 ** np.random.randint(5, 9)

  print(f'======= TRY {i} | num_hiddens: {num_hiddens}, lr: {lr:.4f}, n_epochs: {n_epochs}, b_size = {batch_size} =======')

  # 모델 세팅
  model_relu = TwoLayerNNRelu(input_dim=784, num_hiddens=num_hiddens, num_classes=10)

  # 모델 학습 및 Val 데이터로 평가
  history = model_relu.train(X_train, Y_train, X_valid, Y_valid, lr, n_epochs, batch_size)
  print()


epoch 00 - train loss/acc: 2.411 0.099, valid loss/acc: 2.408 0.101
epoch 01 - train loss/acc: 2.406 0.099, valid loss/acc: 2.403 0.101
epoch 02 - train loss/acc: 2.401 0.100, valid loss/acc: 2.398 0.101
epoch 03 - train loss/acc: 2.396 0.100, valid loss/acc: 2.394 0.102
epoch 04 - train loss/acc: 2.392 0.101, valid loss/acc: 2.389 0.103
epoch 05 - train loss/acc: 2.387 0.101, valid loss/acc: 2.385 0.103
epoch 06 - train loss/acc: 2.383 0.102, valid loss/acc: 2.381 0.104
epoch 07 - train loss/acc: 2.379 0.103, valid loss/acc: 2.377 0.105
epoch 08 - train loss/acc: 2.374 0.105, valid loss/acc: 2.372 0.106
epoch 09 - train loss/acc: 2.370 0.106, valid loss/acc: 2.368 0.108
epoch 10 - train loss/acc: 2.366 0.107, valid loss/acc: 2.365 0.109
epoch 11 - train loss/acc: 2.363 0.109, valid loss/acc: 2.361 0.111
epoch 12 - train loss/acc: 2.359 0.111, valid loss/acc: 2.357 0.113
epoch 13 - train loss/acc: 2.355 0.112, valid loss/acc: 2.354 0.114
epoch 14 - train loss/acc: 2.351 0.113, valid lo

TRY3 의 조합으로 최종 테스트를 수행해보았다.

In [17]:
# model instantiation
model_relu = TwoLayerNNRelu(input_dim=784, num_hiddens=256, num_classes=10)

In [18]:
# train the model & evaluate with validation data
lr, n_epochs, batch_size = 0.0067, 35, 256
history = model_relu.train(X_train, Y_train, X_valid, Y_valid, lr, n_epochs, batch_size)

epoch 00 - train loss/acc: 2.104 0.461, valid loss/acc: 2.107 0.453
epoch 01 - train loss/acc: 1.917 0.624, valid loss/acc: 1.922 0.614
epoch 02 - train loss/acc: 1.749 0.689, valid loss/acc: 1.756 0.680
epoch 03 - train loss/acc: 1.598 0.726, valid loss/acc: 1.607 0.715
epoch 04 - train loss/acc: 1.464 0.750, valid loss/acc: 1.474 0.741
epoch 05 - train loss/acc: 1.347 0.760, valid loss/acc: 1.358 0.752
epoch 06 - train loss/acc: 1.244 0.777, valid loss/acc: 1.256 0.768
epoch 07 - train loss/acc: 1.156 0.791, valid loss/acc: 1.169 0.780
epoch 08 - train loss/acc: 1.080 0.802, valid loss/acc: 1.093 0.792
epoch 09 - train loss/acc: 1.014 0.810, valid loss/acc: 1.027 0.800
epoch 10 - train loss/acc: 0.957 0.816, valid loss/acc: 0.970 0.808
epoch 11 - train loss/acc: 0.907 0.823, valid loss/acc: 0.921 0.816
epoch 12 - train loss/acc: 0.864 0.830, valid loss/acc: 0.877 0.824
epoch 13 - train loss/acc: 0.826 0.832, valid loss/acc: 0.839 0.825
epoch 14 - train loss/acc: 0.792 0.838, valid lo

In [19]:
Y_hat, _ = model_relu.forward(X_test)
test_loss = model_relu.compute_loss(Y_test, Y_hat)
test_acc = model_relu.evaluate(Y_test, Y_hat)
print("Final test loss = {:.3f}, acc = {:.3f}".format(test_loss, test_acc))

Final test loss = 0.486, acc = 0.882
