In [1]:
import numpy as np

In [2]:
class MulLayer:
    def __init__(self):
        self.x = None
        self.y = None
        
    def forward(self, x, y):
        self.x = x
        self.y = y
        out = x * y
        return out
    
    def backward(self, dout):
        dx = dout * self.y # x <-> y 바꿈
        dy = dout * self.x # y <-> x 바꿈
        return dx, dy

In [3]:
apple = 100
apple_num = 2
tax = 1.1

# layers
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# forward propagation
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

print(price) # 220

220.00000000000003


In [4]:
# 역전파
# .backward()가 받는 인수 = 순전파의 출력에 대한 미분값 (ex. dprice)
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print(dapple, dapple_num, dtax)

2.2 110.00000000000001 200


In [5]:
class AddLayer:
    def __init__(self):
        pass
    
    def forward(self, x, y):
        out = x + y
        return out
    
    def backward(self, dout):
        dx = dout * 1
        dy = dout * 1
        return dx, dy

In [6]:
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# 계층들
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# 순전파
apple_price = mul_apple_layer.forward(apple, apple_num) # (1)
orange_price = mul_orange_layer.forward(orange, orange_num) # (2)
all_price = add_apple_orange_layer.forward(apple_price, orange_price) # (3)
price = mul_tax_layer.forward(all_price, tax) # (4)

# 역전파
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice) # (4)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price) # (3)
dorange, dorange_num = mul_orange_layer.backward(dorange_price) # (2)
dapple, dapple_num = mul_apple_layer.backward(dapple_price) # (1)

print(price)
print(dall_price, dtax)
print(dapple_num, dapple, dorange, dorange_num, dtax)

715.0000000000001
1.1 650
110.00000000000001 2.2 3.3000000000000003 165.0 650


In [7]:
class Relu():
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out
    
    def backward(self, dout):
        dout[self.mask] = 0 # 0 이하인 값들은 0으로 흘려보내고
        dx = dout # 0보다 큰 값들은 dout 그대로 돌린다
        return dx
        

In [8]:
class Sigmoid():
    
    def __init__(self):
        self.out = None
        
    def forward(self, x):
        out = 1 / (1 + np.exp(-1*x))
        self.out = out
        return out
    
    def backward(self, dout):
        dx = dout * (1 - dout)
        return dx
    

# Affine Layer

In [13]:
# 순전파 때의 편향 덧셈 (편향) = element-wise
X_dot_W = np.array([[0,0,0], [10,10,10]])
B = np.array([1,2,3])

print(X_dot_W + B)
print(B.shape)

[[ 1  2  3]
 [11 12 13]]
(3,)


In [14]:
# 역전파 때는, 각 데이터의 역전파 값들이 편향 원소에 모여야 함
# 최초 순전파 당시 더해졌던 형상대로 돌아가야 함 = shape(3,)
dY = np.array([[1,2,3], [4,5,6]])
dB = np.sum(dY, axis=0)
print(dB)
print(dB.shape)

[5 7 9]
(3,)


In [15]:
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None
    
    def forward(self, x):
        self.x = x
        return np.dot(x, self.W) + self.b
    
    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx

In [17]:
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None
    
    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss
    
    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx

# TwoLayerNet
오차역전파법 구현

In [32]:
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    
    batch_size = y.shape[0]
    
    delta = 1e-7
    return -np.sum(np.log(y + delta))/batch_size

In [30]:
def softmax(x):
    exp_x = np.exp(x)
    sum_exp_x = np.sum(exp_x, axis=-1).reshape(-1, 1)
    y = exp_x/sum_exp_x
    return y

In [22]:
def numerical_gradient_no_batch(f, x):
    h = 1e-4 # 0.0001
    grad = np.zeros_like(x) # x와 형상이 같은 배열을 생성
    
    for idx in range(x.size):
        tmp_val = x[idx]
        
        # f(x+h) 계산
        x[idx] = tmp_val + h
        fxh1 = f(x)
        
        # f(x-h) 계산
        x[idx] = tmp_val - h 
        fxh2 = f(x) 
        
        grad[idx] = (fxh1 - fxh2) / (2*h)
        x[idx] = tmp_val # 값 복원
        
    return grad
        

def numerical_gradient(f, X):
    if X.ndim == 1:
        return numerical_gradient_no_batch(f, X)
    else:
        grad = np.zeros_like(X)
        
        for idx, x in enumerate(X):
            grad[idx] = numerical_gradient_no_batch(f, x)
            
        return grad

In [38]:
#from common.layers import *
#from common.gradient import numerical_gradient
from collections import OrderedDict

class TwoLayerNet:
    
    def __init__(self, input_size, hidden_size, output_size,
                weight_init_std=0.01):
        
        # 가중치 초기화
        self.params = {}
        self.params['W1'] = weight_init_std * \
                            np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * \
                            np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)
        
        # 계층 생성
        self.layers = OrderedDict()
        
        self.layers['Affine1'] = \
            Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = \
            Affine(self.params['W2'], self.params['b2'])
        self.lastLayer = SoftmaxWithLoss()
        
    def predict(self, x):
        # self.layers는 순서가 있는 dictionary
        # for loop으로 호출하면 선언한 순서대로 튀어나온다
        for layer in self.layers.values():
            x = layer.forward(x)
        return x
        
    # x: 입력 데이터, t: 정답 레이블
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t) # softmax layer forward -> cross entropy return
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1) # 최대 probability가 위치한 column index (0, 1, 2, ... = target labels)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
            
        accuracy = np.sum(y==t) / float(x.shape[0])
        return accuracy
    
    # x: 입력 데이터, t: 정답 레이블
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads
    
    def gradient(self, x, t):
        # forward
        self.loss(x, t)
        
        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)
        
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            # reversed order
            dout = layer.backward(dout)
            
        # result
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db
        
        return grads
    

### load MNIST data

In [25]:
import pickle

def load_mnist(normalize=True, flatten=True, one_hot_label=False):
    def _change_one_hot_label(X):
        T = np.zeros((X.size, 10))
        for idx, row in enumerate(T):
            row[X[idx]] = 1

        return T

    with open('mnist.pkl', 'rb') as f:
        dataset = pickle.load(f)
        
    if normalize:
        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].astype(np.float32)
            dataset[key] /= 255.0
            
    if one_hot_label:
        dataset['train_label'] = _change_one_hot_label(dataset['train_label'])
        dataset['test_label'] = _change_one_hot_label(dataset['test_label'])    
    
    if not flatten:
         for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].reshape(-1, 1, 28, 28)

    return (dataset['train_img'], dataset['train_label']), (dataset['test_img'], dataset['test_label']) 


In [26]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [27]:
# one-hot encoded
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, flatten=True, 
                                                  one_hot_label=True)

# target(y) = 2차원 어레이, one-hot encoded
print(t_train, t_train.ndim)

iter_num = 1000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = max(train_size/batch_size, 1)

# model 생성
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]] 2


In [36]:
for i in range(iter_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    grad = network.numerical_gradient(x_batch, t_batch)
    
    for key in ("W1", "b1", "W2", "b2"):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    train_acc = network.accuracy(x_train, t_train)
    test_acc = network.accuracy(x_test, t_test)
    
    train_acc_list.append(train_acc)
    test_acc_list.append(test_acc)
    print(f"{i} / train acc:{train_acc} / test acc:{test_acc}")
    
    assert i != 4, "test finished"

0 / train acc:0.0817 / test acc:0.0812
1 / train acc:0.08238333333333334 / test acc:0.0819
2 / train acc:0.08366666666666667 / test acc:0.0834
3 / train acc:0.08401666666666667 / test acc:0.0836
4 / train acc:0.08525 / test acc:0.084


AssertionError: test finished

# 수치 미분과 오차역전파법으로 구한 기울기 비교하기

In [39]:
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, flatten=True, 
                                                  one_hot_label=True)

# model 생성
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

x_batch = x_train[:3]
t_batch = t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# 각 가중치 차이의 절대값을 구한 다음 평균
[np.average(np.abs(grad_backprop[key] - grad_numerical[key])) for key in grad_numerical.keys()]

[0.0004012244757606656,
 0.0028917560652828066,
 0.005763962639086577,
 0.14095289149098872]

### 연산속도 차이 비교

In [55]:
# 수치미분
%time
network.numerical_gradient(x_batch, t_batch)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.25 µs


KeyboardInterrupt: 

In [54]:
# 오차역전파법
%time
network.gradient(x_batch, t_batch)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


{'W1': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 'b1': array([-1.67886250e-04, -3.68743963e-03, -4.20856262e-04, -2.56916930e-03,
         3.49523768e-04, -4.01858413e-03, -2.27692384e-03,  1.25058204e-03,
        -2.64706792e-04,  7.02420727e-04, -3.95671804e-04,  1.70496759e-03,
        -1.84736181e-03,  7.60534417e-05,  7.46392967e-04, -2.06933849e-03,
        -2.59758117e-03, -2.39086902e-03, -1.25058240e-03, -4.60239973e-03,
        -1.84078793e-04, -2.25814821e-04, -3.68330883e-03, -1.94489372e-03,
        -1.30879116e-05, -3.70103018e-03, -2.50005122e-04,  1.94883145e-04,
        -6.18794075e-04, -1.16671033e-04,  3.93069261e-06, -3.22482111e-03,
        -1.76743270e-03,  1.46932567e-05, -3.00168264e-04, -2.53749780e-04,
         1.09447702e-03,  9.84235278e-04,  7.47908826e-

In [44]:
# 오차역전파법 적용
for i in range(iter_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    grad = network.gradient(x_batch, t_batch)
    
    for key in ("W1", "b1", "W2", "b2"):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    train_acc = network.accuracy(x_train, t_train)
    test_acc = network.accuracy(x_test, t_test)
    
    train_acc_list.append(train_acc)
    test_acc_list.append(test_acc)
    print(f"{i} / train acc:{train_acc} / test acc:{test_acc}")
    
    assert i != 20, "test finished"

0 / train acc:0.30296666666666666 / test acc:0.3058
1 / train acc:0.29481666666666667 / test acc:0.296
2 / train acc:0.21598333333333333 / test acc:0.2185
3 / train acc:0.24855 / test acc:0.2478
4 / train acc:0.26886666666666664 / test acc:0.2687
5 / train acc:0.24491666666666667 / test acc:0.2405
6 / train acc:0.26243333333333335 / test acc:0.2652
7 / train acc:0.26958333333333334 / test acc:0.2784
8 / train acc:0.31411666666666666 / test acc:0.3236
9 / train acc:0.34926666666666667 / test acc:0.3549
10 / train acc:0.33968333333333334 / test acc:0.3443
11 / train acc:0.3439333333333333 / test acc:0.343
12 / train acc:0.3369666666666667 / test acc:0.342
13 / train acc:0.31083333333333335 / test acc:0.3136
14 / train acc:0.33225 / test acc:0.336
15 / train acc:0.33813333333333334 / test acc:0.3471
16 / train acc:0.35246666666666665 / test acc:0.3579
17 / train acc:0.31138333333333335 / test acc:0.3172
18 / train acc:0.3123666666666667 / test acc:0.3167
19 / train acc:0.31548333333333334

AssertionError: test finished