In [1]:
import time
import numpy as np

In [2]:
def _t(x):
    return np.transpose(x)

def _m(A, B):
    return np.matmul(A, B)

forward일때 call을 하면서 필요한걸 미리 저장해놓고
back일때 grad()을 계산함 (layer의 출력과 layer의 입력 사이의 미분을 구함)))
grad_W = loss function을 W로 미분한걸 저장, 지금까지 미분한거에 현재 미분한거(grad())까지 추가
dh = 지금까지의 미분을 다 곱한것
grad_b = loss function을 b로 미분한것

In [3]:
class Sigmoid:
    def __init__(self):
        self.last_o = 1
    
    def __call__(self, x):
        self.last_o = 1 / (1.0 + np.exp(-x))
        return self.last_o
        
    def grad(self):    # sigmoid(x)(1-sigmoid(x))
        return self.last_o * (1 - self.last_o)

In [4]:
class MeanSquaredError:
    def __init__(self):
        # gradient 저장
        self.dh = 1
        self.last_diff = 1
        
    def __call__(self, h, y):    # 1/2 * mean ((h - y)^2)
        self.last_diff = h - y
        return 1 / 2 * np.mean(np.square(h - y))
        
    def grad(self):    # h - y
        return self.last_diff

In [5]:
class Neuron:
    def __init__(self, W, b, a_obj):
        # Model parameters
        self.W = W
        self.b = b
        self.a = a_obj()
        
        # gradient 저장
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)
        self.dh = np.zeros_like(_t(self.W))    # 이전 입력에 대한 gradient
        
        self.last_x = np.zeros((self.W.shape[0]))    # W로 미분했을때 이전 입력을 가지고 있어야 미분 가능하기 때문에 마지막 x를 저장
        self.last_h = np.zeros((self.W.shape[1]))
        
    def __call__(self, x):
        self.last_x = x    # call 이 되면서 last x 를 저장
        self.last_h = _m(_t(self.W), x) + self.b    # call 이 되면서 last h 를 저장
        return self.a(self.last_h)
    
    def grad(self):    # y = Wx + b  ->  dy/dh = W    앞의 입력으로 미분 했을때
        return self.W * self.a.grad()
    
    def grad_W(self, dh):    # W로 미분 했을때
        grad = np.ones_like(self.W)
        grad_a = self.a.grad()
        for j in range(grad.shape[1]):    # y = w^Tx + b    ->   dy/dw = x
            grad[:, j] = dh[j] * grad_a[j] * self.last_x
        return grad
        
    def grad_b(self, dh):    # y = Wx + b  ->  dy/dh = 1
        return dh * self.a.grad() * 1
        

In [9]:
class DNN:
    def __init__(self, hidden_depth, num_neuron, num_input, output, activation=Sigmoid):
        def init_var(i, o):
            return np.random.normal(0.0, 0.01, (i, o)), np.zeros((o,))
        
        self.sequence = list()
        
        # First hidden layer
        W, b = init_var(num_input, num_neuron)
        self.sequence.append(Neuron(W, b, activation))
        
        # Hidden layers
        for index in range(hidden_depth):
            W, b = init_var(num_neuron, num_neuron)
            self.sequence.append(Neuron(W, b, activation))
            
        # Output layer
        W, b = init_var(num_neuron, output)
        self.sequence.append(Neuron(W, b, activation))
        
    def __call__(self, x):
        for layer in self.sequence:
            x = layer(x)        
        return x
        
    def calc_gradient(self, loss_obj):
        loss_obj.dh = loss_obj.grad()    # loss function의 gradient 계산
        self.sequence.append(loss_obj)    # 저장

        # back-prop loop    loss function부터 gradient를 앞으로 하나씩 연계로 계산
        for i in range(len(self.sequence) - 1, 0, -1):
            l1 = self.sequence[i]
            l0 = self.sequence[i-1]

            l0.dh = _m(l0.grad(), l1.dh)    # loss를 현재 layer로 미분한 gradient = 다음 layer를 현재 layer로 미분한 gradient * loss를 이전 layer로 미분한 gradient
            l0.dW = l0.grad_W(l1.dh)   # loss function을 W로 미분
            l0.db = l0.grad_b(l1.dh)    # loss function을 b로 미분

        self.sequence.remove(loss_obj)
        

In [10]:
def gradient_descent(network, x, y, loss_obj, alpha=0.01):
    loss = loss_obj(network(x), y)  # Forward inference
    network.calc_gradient(loss_obj)  # Back-propagation
    for layer in network.sequence:
        layer.W += -alpha * layer.dW
        layer.b += -alpha * layer.db
    return loss

In [11]:
x = np.random.normal(0.0, 1.0, (10,))
y = np.random.normal(0.0, 1.0, (2,))

t = time.time()
dnn = DNN(hidden_depth=5, num_neuron=32, num_input=10, output=2, activation=Sigmoid)
loss_obj = MeanSquaredError()
for epoch in range(100):
    loss = gradient_descent(dnn, x, y, loss_obj, alpha=0.01)
    print('Epoch {}: Test loss {}'.format(epoch, loss))
    
print('{} seconds elapsed.'.format(time.time() - t))

Epoch 0: Test loss 0.1924968081013336
Epoch 1: Test loss 0.19034308706316233
Epoch 2: Test loss 0.18821382280591967
Epoch 3: Test loss 0.18610913015035885
Epoch 4: Test loss 0.18402910194260352
Epoch 5: Test loss 0.18197380967455612
Epoch 6: Test loss 0.17994330412979975
Epoch 7: Test loss 0.17793761605138703
Epoch 8: Test loss 0.17595675682801415
Epoch 9: Test loss 0.17400071919519672
Epoch 10: Test loss 0.17206947794819638
Epoch 11: Test loss 0.17016299066359358
Epoch 12: Test loss 0.16828119842655415
Epoch 13: Test loss 0.16642402656100092
Epoch 14: Test loss 0.16459138536006804
Epoch 15: Test loss 0.16278317081438703
Epoch 16: Test loss 0.1609992653359283
Epoch 17: Test loss 0.15923953847529512
Epoch 18: Test loss 0.15750384763053943
Epoch 19: Test loss 0.1557920387457433
Epoch 20: Test loss 0.15410394699777466
Epoch 21: Test loss 0.15243939746979274
Epoch 22: Test loss 0.1507982058102361
Epoch 23: Test loss 0.14918017887618024
Epoch 24: Test loss 0.14758511536009933
Epoch 25: Test