* gradient를 알면 매개변수의 미소 변화에 따른 영향력 차이를 알기 쉽다
* 덧셈의 역전파는 순방향 입력 신호값이 필요 없지만
* 곱셈의 역전파는 순방향 입력 신호값이 필요해서 변수로 저장해둔다


## 5.4 단순한 계층 구현하기

In [2]:
# Multiplication layer
class MulLayer:
  """Multiply node of a computational graph.

  The forward operands are cached on the instance because the backward
  pass needs them: each input's gradient is the upstream gradient times
  the *other* input.
  """

  def __init__(self):
    # Forward operands; populated by forward().
    self.x, self.y = None, None

  def forward(self, x, y):
    """Remember both operands and return their product."""
    self.x, self.y = x, y
    return x * y

  def backward(self, dout):
    """Return ``(dx, dy)`` for the upstream gradient ``dout``."""
    grad_x = dout * self.y
    grad_y = dout * self.x
    return grad_x, grad_y


In [3]:
# Forward pass: compute apple * apple_num * tax with two chained
# multiplication layers.
apple = 100
apple_num = 2
tax = 1.1

# One layer instance per multiply node; each instance caches its own
# forward operands for the later backward pass.
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

print(price)

220.00000000000003


In [5]:
# Backward pass: visit the layers in the reverse order of the forward pass.
# dprice is the gradient of the final output with respect to itself (1) and
# is the value fed into the last layer's backward().
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print(dapple, dapple_num, dtax)

2.2 110.00000000000001 200


* backward가 받는 argument는 순전파 출력에 대한 미분   
  ex) If forward() -> apple, then backward(dapple)

In [6]:
# Addition layer
class AddLayer:
  """Add node of a computational graph.

  Keeps no state: the backward pass forwards the upstream gradient to
  both inputs unchanged, so the forward operands are not needed.
  """

  def __init__(self):
    """Nothing to initialise; the layer is stateless."""

  def forward(self, x, y):
    """Return the sum of the two operands."""
    total = x + y
    return total

  def backward(self, dout):
    """Return ``(dx, dy)``, both equal to the upstream gradient ``dout``."""
    return dout, dout

In [9]:
# Computational graph implementation:
# price = (apple * apple_num + orange * orange_num) * tax
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# One layer instance per node of the graph.
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# Forward pass.
apple_price = mul_apple_layer.forward(apple, apple_num)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax)

# Backward pass: call backward() on the layers in the reverse order of the
# forward pass, feeding each layer the gradient of its own forward output.
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print(price)
print(dapple, dapple_num, dorange, dorange_num, dtax)

715.0000000000001
2.2 110.00000000000001 3.3000000000000003 165.0 650


## 5.5 활성화 함수 계층 구현하기

In [10]:
class ReLU:
  """ReLU activation layer for NumPy arrays.

  forward() zeroes every element that is <= 0 and remembers where those
  elements were; backward() blocks the gradient at the same positions.
  """

  def __init__(self):
    # Boolean array, True where the last forward input was <= 0.
    self.mask = None

  def forward(self, x):
    """Return a copy of ``x`` with all non-positive entries set to 0.

    ``x`` must be a NumPy array; it is not modified.
    """
    self.mask = (x <= 0)
    out = x.copy()
    out[self.mask] = 0
    return out

  def backward(self, dout):
    """Return the gradient w.r.t. the forward input.

    The upstream gradient ``dout`` is passed through where the forward
    input was positive and zeroed elsewhere. The result is a new array:
    ``dout`` itself is left untouched so the caller can keep using it
    (e.g. when the same gradient feeds several branches).
    """
    dx = dout.copy()
    dx[self.mask] = 0
    return dx


Note that $$ \begin{align*}\frac{\partial L}{\partial x} = \frac{\partial L}{\partial y} y^2 \exp(-x) &= \frac{\partial L}{\partial y}\frac{1}{(1 + \exp(-x))^2} \exp(-x) \\
&= \frac{\partial L}{\partial y}\frac{1}{1 + \exp(-x)} \frac{\exp(-x)}{1 + \exp(-x)}
\\
&= \frac{\partial L}{\partial y} y(1-y)
\end{align*}
$$

In [13]:
import numpy as np

# The sigmoid could be broken down into one node per elementary operation,
# but it is implemented as a single layer here for simplicity.
class sigmoid:
  """Sigmoid activation layer.

  The forward output y is cached because the backward pass only needs y:
  dL/dx = dL/dy * y * (1 - y).
  """

  def __init__(self):
    # Output of the last forward() call.
    self.out = None

  def forward(self, x):
    """Return 1 / (1 + exp(-x)) and cache it for backward()."""
    self.out = 1 / (1 + np.exp(-x))
    return self.out

  def backward(self, dout):
    """Return ``dout * y * (1 - y)`` using the cached forward output y."""
    scaled = dout * self.out
    return scaled * (1 - self.out)

## 5.6 Affine/Softmax 계층 구현하기
* $X$와 $\frac{\partial L}{\partial X}$의 shape은 같다!


In [21]:
# W has shape (3, 2).
W = np.array([[1,2],
             [3,4],
             [5,6]])

# X has shape (2, 3): two identical rows.
X = np.array(([2,2,2],[2,2,2]))

# (2, 3) dot (3, 2) -> (2, 2)
np.dot(X, W)

array([[18, 24],
       [18, 24]])

In [17]:
# Watch out for the bias: B has shape (3,) and is broadcast, i.e. added to
# every row of the (2, 3) array X_dot_W.
X_dot_W = np.array([[0,0,0],[10, 10, 10]])
B = np.array([1,2,3])

X_dot_W + B

array([[ 1,  2,  3],
       [11, 12, 13]])

In [18]:
class Affine:
  """Affine (fully connected) layer: ``out = x . W + b``.

  backward() stores the parameter gradients on the instance as ``dW`` and
  ``db`` and also returns them together with the input gradient.
  """

  def __init__(self, W, b):
    self.W, self.b = W, b
    # Forward input, cached for the weight gradient.
    self.x = None
    # Parameter gradients, filled in by backward().
    self.dW = self.db = None

  def forward(self, x):
    """Cache ``x`` and return ``np.dot(x, W) + b``."""
    self.x = x
    projected = np.dot(x, self.W)
    return projected + self.b

  def backward(self, dout):
    """Return ``(dx, dW, db)`` for the upstream gradient ``dout``.

    dx = dout . W^T, dW = x^T . dout, and db sums ``dout`` over axis 0
    because the bias is added to every row in the forward pass.
    """
    grad_x = np.dot(dout, self.W.T)
    self.dW = np.dot(self.x.T, dout)
    self.db = np.sum(dout, axis=0)
    return grad_x, self.dW, self.db


In [24]:
# Test of the Affine layer, reusing W and X from the dot-product cell above.
# B is rebound to a length-2 bias so it matches the two columns of W.
B = np.array([1,2])

net = Affine(W, B)
y = net.forward(X)
# Upstream gradient of all ones with the same (2, 2) shape as y.
dx, dW, db = net.backward(np.array([[1,1], [1,1]]))
print(y, dx, dW, db)

[[19 26]
 [19 26]] [[ 3  7 11]
 [ 3  7 11]] [[4 4]
 [4 4]
 [4 4]] [2 2]


* NN이 수행하는 작업은 학습 및 추론이다. 그러나 추론에서는 softmax를 쓸 필요가 없다   
* softmax를 거치지 않은 output은 score라 부르고, score를 softmax에 통과시킨 output은 확률이라 부른다   


In [26]:
class SoftmaxWithLoss:
  """Softmax activation combined with cross-entropy loss.

  forward() converts scores to probabilities and returns the mean
  cross-entropy against the targets; backward() returns the gradient of
  that loss with respect to the scores. Targets may be one-hot arrays or
  integer class labels.
  """

  def __init__(self):
    self.loss = None  # loss returned by the last forward()
    self.y = None     # softmax output of the last forward()
    self.t = None     # targets given to the last forward()

  # softmax and cross_entropy_error take no ``self``; they must be static
  # so that ``self.softmax(x)`` does not pass the instance as ``x``.
  @staticmethod
  def softmax(x):
    """Return the softmax of ``x``.

    A 2-D input is normalised row by row; any other input is normalised
    over the whole array.
    """
    if x.ndim == 2:
      x = x.T
      x = x - np.max(x, axis=0)  # guard against overflow in exp
      exp_x = np.exp(x)
      y = exp_x / np.sum(exp_x, axis=0)
      return y.T

    x = x - np.max(x)  # guard against overflow in exp
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x)

  @staticmethod
  def cross_entropy_error(y, t):
    """Return the mean cross-entropy of probabilities ``y`` against ``t``.

    A 1-D ``y`` is treated as a batch of one sample. ``t`` may be one-hot
    (same size as ``y``) or hold integer class labels.
    """
    if y.ndim == 1:
      t = t.reshape(1, t.size)
      y = y.reshape(1, y.size)

    # One-hot targets are converted to the index of the correct label.
    if t.size == y.size:
      t = t.argmax(axis=1)

    batch_size = y.shape[0]
    # 1e-7 keeps log() finite when a probability is exactly 0.
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

  def forward(self, x, t):
    """Store targets and softmax output, and return the loss for scores ``x``."""
    self.t = t
    self.y = self.softmax(x)
    self.loss = self.cross_entropy_error(self.y, self.t)
    return self.loss

  def backward(self, dout=1):
    """Return the gradient of the loss with respect to the scores.

    The gradient is ``(y - t) / batch_size`` scaled by ``dout``, where t
    is taken as one-hot. A 1-D ``y`` counts as a batch of one sample,
    matching cross_entropy_error().
    """
    batch_size = 1 if self.y.ndim == 1 else self.y.shape[0]

    if self.t.size == self.y.size:
      # One-hot targets.
      dx = (self.y - self.t.reshape(self.y.shape)) / batch_size
    else:
      # Integer labels: subtract 1 at the correct class of every sample.
      labels = self.t.reshape(-1)
      dx = self.y.reshape(batch_size, -1).copy()
      dx[np.arange(batch_size), labels] -= 1
      dx = (dx / batch_size).reshape(self.y.shape)

    return dx * dout

In [29]:
def softmax(x):
    """Return the softmax of the NumPy array ``x``.

    A 2-D input is normalised row by row; any other input is normalised
    over the whole array. The maximum is subtracted before exponentiating
    so that large scores do not overflow.
    """
    if x.ndim == 2:
        # Work on the transpose so that each column is one sample.
        cols = x.T
        cols = cols - np.max(cols, axis=0)
        exps = np.exp(cols)
        probs = exps / np.sum(exps, axis=0)
        return probs.T

    shifted = x - np.max(x)  # overflow guard
    exps = np.exp(shifted)
    return exps / np.sum(exps)

# Two identical rows of equal scores: X.ndim is 2, so softmax() takes its
# 2-D branch and normalises each row separately.
X = np.array(([2,2,2],[2,2,2]))
print(X.ndim)
softmax(X)

2


array([[0.33333333, 0.33333333, 0.33333333],
       [0.33333333, 0.33333333, 0.33333333]])

In [14]:
import numpy as np

# Broadcasting demo: b has shape (4,), which lines up with the last axis of
# a (shape (3, 2, 4)), so both + and * produce a (3, 2, 4) result.
a = np.random.randint(1, 10, size=(3, 2, 4))
b = np.random.randint(1, 10, size=(4,))
c = a + b
d = a * b
print(c)
print(c.shape)
print("----------------")
print(d)
print(d.shape)

[[[ 8 16  4 15]
  [ 4 16  5 12]]

 [[ 8 14  7  9]
  [ 5  8  8 16]]

 [[ 8 14  5  9]
  [12  9  8 15]]]
(3, 2, 4)
----------------
[[[15 63  3 56]
  [ 3 63  6 32]]

 [[15 49 12  8]
  [ 6  7 15 64]]

 [[15 49  6  8]
  [27 14 15 56]]]
(3, 2, 4)


In [12]:
# Broadcasting demo: b has shape (2, 1). Against a's trailing axes (2, 4)
# the size-1 axis is stretched to 4, so the result is again (3, 2, 4).
a = np.random.randint(1, 10, size=(3, 2, 4))
b = np.random.randint(1, 10, size=(2,1))
c = a + b
d = a * b
print(c)
print(c.shape)
print("----------------")
print(d)
print(d.shape)

[[[17 18 14 16]
  [14 11  6  7]]

 [[12 16 13 13]
  [12 10  7  6]]

 [[12 12 14 17]
  [14 12 11  8]]]
(3, 2, 4)
----------------
[[[72 81 45 63]
  [45 30  5 10]]

 [[27 63 36 36]
  [35 25 10  5]]

 [[27 27 45 72]
  [45 35 30 15]]]
(3, 2, 4)


In [7]:
# Broadcasting failure: b has shape (4, 1). Shapes are compared from the
# last axis backwards, so b's 4 meets a's 2 (neither is 1) and a + b raises
# ValueError.
a = np.random.randint(1, 10, size=(3, 2, 4))
b = np.random.randint(1, 10, size=(4,1))
c = a + b
print(c)
print(c.shape)

ValueError: operands could not be broadcast together with shapes (3,2,4) (4,1) 

In [8]:
# Broadcasting failure: b has shape (2, 2). Its last axis (2) does not match
# a's last axis (4) and neither is 1, so a + b raises ValueError.
a = np.random.randint(1, 10, size=(3, 2, 4))
b = np.random.randint(1, 10, size=(2,2))
c = a + b
print(c)
print(c.shape)

ValueError: operands could not be broadcast together with shapes (3,2,4) (2,2) 