# 12. 딥러닝 들여다보기

## 12-2. 신경망 구성 (1) 개요

### MNIST Revisited

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

# MNIST 데이터를 로드. 다운로드하지 않았다면 다운로드까지 자동으로 진행됩니다. 
mnist = keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()   

# 모델에 맞게 데이터 가공
x_train_norm, x_test_norm = x_train / 255.0, x_test / 255.0
x_train_reshaped = x_train_norm.reshape(-1, x_train_norm.shape[1]*x_train_norm.shape[2])
x_test_reshaped = x_test_norm.reshape(-1, x_test_norm.shape[1]*x_test_norm.shape[2])

# 딥러닝 모델 구성 - 2 Layer Perceptron
model=keras.models.Sequential()
model.add(keras.layers.Dense(50, activation='sigmoid', input_shape=(784,)))  # 입력층 d=784, 은닉층 레이어 H=50
model.add(keras.layers.Dense(10, activation='softmax'))   # 출력층 레이어 K=10
model.summary()

# 모델 구성과 학습
model.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])
model.fit(x_train_reshaped, y_train, epochs=10)

# 모델 테스트 결과
test_loss, test_accuracy = model.evaluate(x_test_reshaped,y_test, verbose=2)
print(f"test_loss: {test_loss} ")
print(f"test_accuracy: {test_accuracy}")

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                39250     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                510       
Total params: 39,760
Trainable params: 39,760
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
313/313 - 0s - loss: 0.1078 - accuracy: 0.9680
test_loss: 0.10780690610408783 
test_accuracy: 0.9679999947547913


In [2]:
# 입력층 데이터의 모양(shape)
print(x_train_reshaped.shape)

# 테스트를 위해 x_train_reshaped의 앞 5개의 데이터를 가져온다.
X = x_train_reshaped[:5]

(60000, 784)


In [3]:
weight_init_std = 0.1
input_size = 784
hidden_size=50

# 인접 레이어간 관계를 나타내는 파라미터 W를 생성하고 random 초기화
W1 = weight_init_std * np.random.randn(input_size, hidden_size)  
# 바이어스 파라미터 b를 생성하고 Zero로 초기화
b1 = np.zeros(hidden_size)

a1 = np.dot(X, W1) + b1   # 은닉층 출력

print(f'W1.shape:',W1.shape)
print(f'X.shape:',X.shape)
print(f'b1.shape:', b1.shape)
print(f'a1.shape:',a1.shape)

W1.shape: (784, 50)
X.shape: (5, 784)
b1.shape: (50,)
a1.shape: (5, 50)


<span style = "color:green; font-size:150%"> 
<b>Q. 첫 번째 데이터의 은닉층 출력을 확인해 봅시다.  50dim의 벡터가 나오나요?</span>

In [4]:
# 첫 번째 데이터의 은닉층 출력을 확인해 봅시다.  50dim의 벡터가 나오나요?
a1[0]

array([-0.03903154,  1.95237002, -1.23297817,  0.7774001 ,  0.98683625,
       -0.792677  ,  1.3778527 ,  0.47579772,  0.59490477,  0.56227145,
        1.07699341, -0.94802499, -0.80901878,  1.66652714,  0.52353009,
        0.19155294,  0.78148131,  1.52314006,  1.82990768,  0.25271733,
       -0.75604994,  0.61529505,  1.69565685, -0.17649813,  0.82615092,
        0.41822442,  0.17899841,  0.48505913, -3.32445536,  0.28745757,
        2.24056095, -0.94598001, -0.29811733,  0.34079453,  0.38055451,
       -1.23991348,  1.99856564,  1.20417594, -1.0651277 , -0.84259215,
        1.56570276,  0.03720417,  1.18520024, -1.85563565,  0.42752201,
        1.97149099, -0.36325767,  1.0874221 , -0.6808314 ,  0.50196858])

## 12-3. 신경망 구성 (2) 활성화 함수와 손실 함수

## 활성화 함수 (Activation Functions)

In [5]:
# 위 수식의 sigmoid 함수를 구현해 봅니다.
def sigmoid(x):
    return 1 / (1 + np.exp(-x))  


z1 = sigmoid(a1)
print(z1[0])  # sigmoid의 출력은 모든 element가 0에서 1사이

[0.49024335 0.87570484 0.2256606  0.68511951 0.72846257 0.31159415
 0.79864591 0.61675508 0.64448973 0.63697795 0.74592459 0.27928219
 0.30809963 0.84111225 0.62797285 0.54774234 0.68599928 0.82100041
 0.86175073 0.56284522 0.31950448 0.64914773 0.84496664 0.45598966
 0.69554045 0.60305829 0.5446305  0.6189418  0.03474169 0.57137359
 0.90383323 0.27969399 0.42601778 0.58438351 0.59400684 0.22445105
 0.8806464  0.76926683 0.25633076 0.30098913 0.82717014 0.50929997
 0.76588153 0.13521256 0.60528179 0.87777117 0.41017121 0.74789598
 0.33607577 0.62292184]


In [6]:
# 단일 레이어 구현 함수
def affine_layer_forward(X, W, b):
    y = np.dot(X, W) + b
    cache = (X, W, b)
    return y, cache

print('푸슝~3')

푸슝~3


In [7]:
input_size = 784
hidden_size = 50
output_size = 10

W1 = weight_init_std * np.random.randn(input_size, hidden_size)
b1 = np.zeros(hidden_size)
W2 = weight_init_std * np.random.randn(hidden_size, output_size)
b2 = np.zeros(output_size)

a1, cache1 = affine_layer_forward(X, W1, b1)
z1 = sigmoid(a1)
a2, cache2 = affine_layer_forward(z1, W2, b2)    # z1이 다시 두번째 레이어의 입력이 됩니다. 

print(a2[0])  # 최종 출력이 output_size만큼의 벡터가 되었습니다.

[-0.28615831  0.07776111  0.75183561  0.20877389  0.04245447 -0.52126731
 -0.60078501 -1.00608647 -0.03605047 -0.60720834]


In [8]:
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T 

    x = x - np.max(x) # 오버플로 대책
    return np.exp(x) / np.sum(np.exp(x))

In [9]:
y_hat = softmax(a2)
y_hat[0]  # 10개의 숫자 중 하나일 확률이 되었습니다.

array([0.08124258, 0.11690468, 0.22939225, 0.13326927, 0.11284918,
       0.06422101, 0.05931207, 0.03954785, 0.10432879, 0.05893231])

In [10]:
# 정답 라벨을 One-hot 인코딩하는 함수
def _change_one_hot_label(X, num_category):
    T = np.zeros((X.size, num_category))
    for idx, row in enumerate(T):
        row[X[idx]] = 1
        
    return T

Y_digit = y_train[:5]
t = _change_one_hot_label(Y_digit, 10)
t     # 정답 라벨의 One-hot 인코딩

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [11]:
print(y_hat[0])
print(t[0])

[0.08124258 0.11690468 0.22939225 0.13326927 0.11284918 0.06422101
 0.05931207 0.03954785 0.10432879 0.05893231]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


In [12]:
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    # 훈련 데이터가 원-핫 벡터라면 정답 레이블의 인덱스로 반환
    if t.size == y.size:
        t = t.argmax(axis=1)
             
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t])) / batch_size

Loss = cross_entropy_error(y_hat, t)
Loss

2.568821561442879

## 12-4. 경사하강법

In [13]:
batch_num = y_hat.shape[0]
dy = (y_hat - t) / batch_num
dy    # softmax값의 출력으로 Loss를 미분한 값

array([[ 0.01624852,  0.02338094,  0.04587845,  0.02665385,  0.02256984,
        -0.1871558 ,  0.01186241,  0.00790957,  0.02086576,  0.01178646],
       [-0.18162585,  0.01956423,  0.04788864,  0.02375783,  0.02388226,
         0.01265623,  0.01267405,  0.00842566,  0.02060655,  0.0121704 ],
       [ 0.01627226,  0.02360201,  0.04363288,  0.02702262, -0.17821905,
         0.01356558,  0.01445195,  0.00811647,  0.02082361,  0.01073167],
       [ 0.01653792, -0.17985412,  0.04743667,  0.02872768,  0.02020412,
         0.01531879,  0.01053251,  0.00744132,  0.02278625,  0.01086886],
       [ 0.01933804,  0.02206629,  0.04957146,  0.02450896,  0.02691587,
         0.01356752,  0.01273533,  0.00646114,  0.01667243, -0.19183704]])

In [14]:
dW2 = np.dot(z1.T, dy)    
dW2

array([[-0.08374543, -0.10109025,  0.16569379,  0.09173103, -0.00825372,
        -0.08594283,  0.04287603,  0.02666851,  0.07140906, -0.1193462 ],
       [-0.07390999, -0.00872776,  0.12052884,  0.06846813, -0.10091666,
        -0.1017346 ,  0.03328316,  0.02068431,  0.05410746, -0.0117829 ],
       [-0.11413739, -0.06111407,  0.18277707,  0.10113685, -0.08382722,
        -0.05362497,  0.04895666,  0.02987097,  0.07878913, -0.12882704],
       [-0.08543287, -0.0190473 ,  0.09660665,  0.05365513, -0.00779545,
        -0.10632293,  0.02545976,  0.01627549,  0.04272643, -0.01612491],
       [-0.11087242,  0.02784476,  0.12254285,  0.0672307 , -0.0700382 ,
        -0.07684512,  0.03370296,  0.02054433,  0.05282884, -0.0669387 ],
       [-0.05085929,  0.00989253,  0.08903689,  0.04922727, -0.04702115,
        -0.06663896,  0.02428489,  0.0147263 ,  0.03825346, -0.06090194],
       [-0.09532622, -0.08221955,  0.17370211,  0.09708879, -0.04026627,
        -0.12324761,  0.04567141,  0.02859816

In [15]:
dW2 = np.dot(z1.T, dy)
db2 = np.sum(dy, axis=0)

In [16]:
def sigmoid_grad(x):
    return (1.0 - sigmoid(x)) * sigmoid(x)

In [17]:
dz1 = np.dot(dy, W2.T)
da1 = sigmoid_grad(a1) * dz1
dW1 = np.dot(X.T, da1)
db1 = np.sum(dz1, axis=0)

In [18]:
learning_rate = 0.1

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    W1 = W1 - learning_rate*dW1
    b1 = b1 - learning_rate*db1
    W2 = W2 - learning_rate*dW2
    b2 = b2 - learning_rate*db2
    return W1, b1, W2, b2

## 2-5. 오차역전파법이란?

In [19]:
def affine_layer_backward(dy, cache):
    X, W, b = cache
    dX = np.dot(dy, W.T)
    dW = np.dot(X.T, dy)
    db = np.sum(dy, axis=0)
    return dX, dW, db

In [20]:
# 파라미터 초기화
W1 = weight_init_std * np.random.randn(input_size, hidden_size)
b1 = np.zeros(hidden_size)
W2 = weight_init_std * np.random.randn(hidden_size, output_size)
b2 = np.zeros(output_size)

# Forward Propagation
a1, cache1 = affine_layer_forward(X, W1, b1)
z1 = sigmoid(a1)
a2, cache2 = affine_layer_forward(z1, W2, b2)

# 추론과 오차(Loss) 계산
y_hat = softmax(a2)
t = _change_one_hot_label(Y_digit, 10)   # 정답 One-hot 인코딩
Loss = cross_entropy_error(y_hat, t)

print(y_hat)
print(t)
print('Loss: ', Loss)
        
dy = (y_hat - t) / X.shape[0]
dz1, dW2, db2 = affine_layer_backward(dy, cache2)
da1 = sigmoid_grad(a1) * dz1
dX, dW1, db1 = affine_layer_backward(da1, cache1)

# 경사하강법을 통한 파라미터 업데이트    
learning_rate = 0.1
W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)

[[0.08028677 0.06087052 0.08344577 0.11040949 0.05554446 0.11852898
  0.15350876 0.09561458 0.05476104 0.18702962]
 [0.1016505  0.06378328 0.08203009 0.105985   0.05120713 0.13454029
  0.14194351 0.11028396 0.05217102 0.15640522]
 [0.0851558  0.06356971 0.08847663 0.09885321 0.05675833 0.14484107
  0.13038777 0.09239194 0.04760213 0.19196342]
 [0.0887097  0.05853812 0.07841847 0.10751438 0.06407255 0.12552467
  0.11938749 0.11441481 0.06117691 0.1822429 ]
 [0.10985657 0.05697674 0.0878311  0.09822158 0.05057798 0.13862127
  0.15189127 0.09627654 0.05836421 0.15138274]]
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
Loss:  2.4027572754628346


## 12-6. 모델 학습 Step-by-Step

In [21]:
W1 = weight_init_std * np.random.randn(input_size, hidden_size)
b1 = np.zeros(hidden_size)
W2 = weight_init_std * np.random.randn(hidden_size, output_size)
b2 = np.zeros(output_size)

def train_step(X, Y, W1, b1, W2, b2, learning_rate=0.1, verbose=False):
    a1, cache1 = affine_layer_forward(X, W1, b1)
    z1 = sigmoid(a1)
    a2, cache2 = affine_layer_forward(z1, W2, b2)
    y_hat = softmax(a2)
    t = _change_one_hot_label(Y, 10)
    Loss = cross_entropy_error(y_hat, t)

    if verbose:
        print('---------')
        print(y_hat)
        print(t)
        print('Loss: ', Loss)
        
    dy = (y_hat - t) / X.shape[0]
    dz1, dW2, db2 = affine_layer_backward(dy, cache2)
    da1 = sigmoid_grad(a1) * dz1
    dX, dW1, db1 = affine_layer_backward(da1, cache1)
    
    W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)
    
    return W1, b1, W2, b2, Lossb

In [22]:
X = x_train_reshaped[:5]
Y = y_train[:5]

# train_step을 다섯 번 반복 돌립니다.
for i in range(5):
    W1, b1, W2, b2, _ = train_step(X, Y, W1, b1, W2, b2, learning_rate=0.1, verbose=True)

---------
[[0.10132424 0.04694082 0.1593532  0.07336761 0.05198382 0.07863765
  0.14074116 0.18650205 0.07199905 0.0891504 ]
 [0.09371504 0.04729875 0.17022429 0.08511428 0.061696   0.07386892
  0.12763935 0.19195799 0.07288136 0.07560401]
 [0.09392293 0.05261715 0.15017474 0.08098165 0.04561189 0.08376934
  0.13906608 0.17435851 0.08082462 0.09867309]
 [0.09327881 0.05243794 0.15517492 0.07971568 0.0445909  0.07672965
  0.17316253 0.1804449  0.07106796 0.0733967 ]
 [0.0991319  0.05225459 0.15501479 0.07882603 0.0580723  0.06916437
  0.16237059 0.16714562 0.08093793 0.07708188]]
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
Loss:  2.7018000060266028


NameError: name 'Lossb' is not defined

## 12-7. 추론 과정 구현과 정확도(Accuracy) 계산

In [None]:
# X = x_train[:100] 에 대해 모델 추론을 시도합니다. 
X = x_train_reshaped[:100]
Y = y_test[:100]
result = predict(W1, b1, W2, b2, X)
result[0]

In [None]:
def accuracy(W1, b1, W2, b2, x, y):
    y_hat = predict(W1, b1, W2, b2, x)
    y_hat = np.argmax(y_hat, axis=1)

    accuracy = np.sum(y_hat == y) / float(x.shape[0])
    return accuracy

In [None]:
acc = accuracy(W1, b1, W2, b2, X, Y)

t = _change_one_hot_label(Y, 10)
print(result[0])
print(t[0])
print(acc)

## 12-8. 전체 학습 사이클 수행 - 종합문제

In [None]:
import tensorflow as tf
from tensorflow import keras

# mnist를 불러오고 train_data, train_label, test_data, test_label로 나눠주세요.

#우리의 모델은 MLP이기 때문에 데이터를 255로 나누고 1차원(60000, n)으로 만들어주세요.

# 초기화된 파라미터를 정의하는 함수를 만들고 초기값을 만드세요.
def init_params(input_size, hidden_size, output_size, weight_init_std=0.01):
    #W1, b1, W2, b2를 모두 정의해주세요.
    
    return W1, b1, W2, b2

W1, b1, W2, b2 = init_params(input_size = "YOUR CODE", hidden_size = 50, output_size = "YOUR CODE")

<span style = "color:green; font-size:150%"> 
<b>Q. MLP를 정의하세요.</span>

In [None]:
# MLP를 정의하세요.
def affine_layer_forward(X, W, b):
  #[[YOUR CODE]]
  return y, cache

# relu를 정의하세요 (np.maximum을 활용하세요)
def relu(x):
  #[[YOUR CODE]]
  return result

# softmax를 정의하세요
def softmax(x):
  #[[YOUR CODE]]
  return result

# one-hot 인코딩을 정의하세요
def _change_one_hot_label(X, num_category):
    #[[YOUR CODE]]
    return T

# cross entropy loss함수를 정의하세요
def cross_entropy_error(y, t):
     #[[YOUR CODE]]
    return result

<span style = "color:green; font-size:150%"> 
<b>Q. MLP의 backward pass를 정의하세요</span>

In [None]:
# MLP의 backward pass를 정의하세요
def affine_layer_backward(dy, cache):
  #[[YOUR CODE]]
  return dX, dW, db

# relu 함수의 backward pass를 정의하세요. (np.where 함수를 활용하세요)
def relu_grad(x):
  #[[YOUR CODE]]
  return result

#파라미터를 업데이트하는 함수를 정의하세요.
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
  #[[YOUR CODE]]
  return W1, b1, W2, b2

# train_step을 정의합니다.
def train_step(X, Y, W1, b1, W2, b2, learning_rate=0.1, verbose=False):
  #[[YOUR CODE]]

  if verbose:
      print('---------')
      print(y_hat)
      print(t)
      print('Loss: ', Loss)
      
  #[[YOUR CODE]]
  
  W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)
  
  return W1, b1, W2, b2, Loss

# 예측값을 만드는 함수를 정의하세요
def predict(W1, b1, W2, b2, X):
  #[[YOUR CODE]]
  return y

#정확도를 나타내는 함수를 정의하세요
def accuracy(W1, b1, W2, b2, x, y):
  #[[YOUR CODE]]
  return accuracy

In [None]:
# 하이퍼파라미터
iters_num = 50000  # 반복 횟수를 적절히 설정한다.
train_size = x_train.shape[0]
batch_size = 100   # 미니배치 크기
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# 1에폭당 반복 수
iter_per_epoch = max(train_size / batch_size, 1)

W1, b1, W2, b2 = init_params(784, 50, 10)

for i in range(iters_num):
  # 미니배치 획득
  batch_mask = np.random.choice(train_size, batch_size)
  x_batch = x_train_reshaped[batch_mask]
  y_batch = y_train[batch_mask]
  

  W1, b1, W2, b2, Loss = train_step("YOUR CODE")

  # 학습 경과 기록
  train_loss_list.append(Loss)
  
  # 1에폭당 정확도 계산
  # train_accuracy와 test_accuracy를 완성해주세요
  if i % iter_per_epoch == 0:
      print('Loss: ', Loss)
      train_acc = accuracy("YOUR CODE")
      test_acc = accuracy("YOUR CODE")
      train_acc_list.append(train_acc)
      test_acc_list.append(test_acc)
      print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))

In [None]:
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 6 

# Accuracy 그래프 그리기
markers = {'train': 'o', 'test': 's'}
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, label='train acc')
plt.plot(x, test_acc_list, label='test acc', linestyle='--')
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()

In [None]:
# Loss 그래프 그리기
x = np.arange(len(train_loss_list))
plt.plot(x, train_loss_list, label='train acc')
plt.xlabel("epochs")
plt.ylabel("Loss")
plt.ylim(0, 3.0)
plt.legend(loc='best')
plt.show()