# GRUレイヤの実装

In [21]:
import numpy as np
from common.functions import sigmoid

## [演習]
* 以下のGRUレイヤのクラスを完成させましょう

In [22]:
class GRU:
    """Single-time-step GRU layer with a hand-written backward pass.

    The parameters of the three internal transforms are packed side by side
    along the last axis, in the order: update gate (z), reset gate (r),
    candidate hidden state (h_hat).
    """

    def __init__(self, Wx, Wh, b):
        """Store the packed parameters and allocate matching gradient buffers.

        Wx: weights applied to the input x (three blocks packed column-wise).
        Wh: weights applied to the hidden state h (three blocks packed
            column-wise).
        b:  biases (three blocks packed end to end).
        """
        self.params = [Wx, Wh, b]
        # One zero-filled buffer per parameter; backward() overwrites them in place.
        self.grads = [np.zeros_like(p) for p in self.params]
        # Filled by forward() with the values backward() needs.
        self.cache = None

    def forward(self, x, h_prev):
        """Run one forward step and return the next hidden state.

        x:      input for this step.
        h_prev: previous hidden state; must be 2-D, and its second dimension
                H determines where the packed parameters are split.
        """
        Wx, Wh, b = self.params
        _, H = h_prev.shape

        # Column ranges of the update / reset / candidate blocks.
        gate_cols = (slice(0, H), slice(H, 2 * H), slice(2 * H, None))
        (Wxz, Whz, bz), (Wxr, Whr, br), (Wxh, Whh, bh) = (
            (Wx[:, cols], Wh[:, cols], b[cols]) for cols in gate_cols
        )

        z = sigmoid(np.dot(x, Wxz) + np.dot(h_prev, Whz) + bz)
        r = sigmoid(np.dot(x, Wxr) + np.dot(h_prev, Whr) + br)
        # The reset gate scales h_prev before it feeds the candidate state.
        h_hat = np.tanh(np.dot(x, Wxh) + np.dot(r * h_prev, Whh) + bh)
        # Blend: z keeps the old state, (1 - z) lets the candidate through.
        h_next = z * h_prev + (1 - z) * h_hat

        self.cache = (x, h_prev, z, r, h_hat)
        return h_next

    def backward(self, dh_next):
        """Back-propagate through the step recorded by the last forward().

        dh_next: gradient of the loss with respect to the returned h_next.

        Writes the parameter gradients into self.grads (in place) and returns
        the tuple (dx, dh_prev).
        """
        Wx, Wh, _ = self.params

        H = Wh.shape[0]
        gate_cols = (slice(0, H), slice(H, 2 * H), slice(2 * H, None))
        (Wxz, Whz), (Wxr, Whr), (Wxh, Whh) = (
            (Wx[:, cols], Wh[:, cols]) for cols in gate_cols
        )
        x, h_prev, z, r, h_hat = self.cache

        # --- gradients at the pre-activation of each transform ---
        # candidate state: through the (1 - z) blend, then through tanh
        da_h = dh_next * (1 - z) * (1 - h_hat ** 2)
        # gradient flowing into the product (r * h_prev)
        d_rh = np.dot(da_h, Whh.T)
        # update gate: d h_next / d z = h_prev - h_hat, then through sigmoid
        da_z = (dh_next * h_prev - dh_next * h_hat) * z * (1 - z)
        # reset gate: through the product with h_prev, then through sigmoid
        da_r = d_rh * h_prev * r * (1 - r)

        # --- parameter gradients, packed in the same z / r / h_hat order ---
        dWx = np.hstack([np.dot(x.T, da) for da in (da_z, da_r, da_h)])
        dWh = np.hstack((
            np.dot(h_prev.T, da_z),
            np.dot(h_prev.T, da_r),
            np.dot((r * h_prev).T, da_h),
        ))
        # Bias gradients are summed over the batch axis.
        db = np.hstack((da_z, da_r, da_h)).sum(axis=0)

        # --- gradient with respect to the input ---
        dx = np.dot(da_h, Wxh.T)
        dx += np.dot(da_z, Wxz.T)
        dx += np.dot(da_r, Wxr.T)

        # --- gradient with respect to the previous hidden state ---
        dh_prev = dh_next * z               # direct path through the blend
        dh_prev += r * d_rh                 # path through the candidate state
        dh_prev += np.dot(da_z, Whz.T)      # path through the update gate
        dh_prev += np.dot(da_r, Whr.T)      # path through the reset gate

        # Overwrite the existing buffers so outside references stay valid.
        for buffer, value in zip(self.grads, (dWx, dWh, db)):
            buffer[...] = value

        return dx, dh_prev

In [23]:
D = 10  # dimensionality of the input data
H = 5  # number of hidden units

# Random weights scaled by 1/sqrt(fan-in); the three gate blocks are packed
# column-wise, hence the 3 * H columns.
Wx = np.random.randn(D, 3 * H) / np.sqrt(D)
Wh = np.random.randn(H, 3 * H) / np.sqrt(H)
b = np.zeros(3 * H)

# Build the layer
gru = GRU(Wx, Wh, b)

# Forward pass
N = 4  # batch size
x = np.random.randn(N, D)
h_prev = np.random.randn(N, H)
h_next = gru.forward(x, h_prev)
print("h_next=", h_next, end="\n\n")

# Backward pass
dh_next = np.random.randn(N, H)
dx, dh_prev = gru.backward(dh_next)
print("dx=", dx, end="\n\n")
print("dh_prev=", dh_prev, end="\n\n")


h_next= [[ 4.51401979e-01 -1.29109321e+00  9.84114593e-01  1.22735029e-01
  -2.94885676e-01]
 [ 1.56395596e+00 -3.26754157e-01  2.32750488e-01 -6.07076498e-02
   6.35643358e-04]
 [-1.89844346e+00 -1.58757302e-01  9.47706042e-01 -9.31687181e-01
  -5.77341633e-01]
 [ 6.06044453e-01 -1.57070124e+00  1.32888480e-01  1.74545966e+00
   4.55822471e-01]]

dx= [[-0.17225414  0.052994   -0.52891957 -0.59867723 -0.51577556 -0.28619732
  -0.63013771 -0.63884887  0.12398139 -0.65989108]
 [-0.2278402  -0.06104341  0.05960301  0.78291398  0.29824403  0.00524307
   0.33018403  0.71947909 -0.07163497  0.24064119]
 [ 0.11395631  0.05504316 -0.21254997  0.4135878  -0.17866303 -0.38857536
  -0.01524115  0.20411398  0.45281973  0.29995602]
 [ 0.26597511  0.16795122  0.17889276 -0.09128597 -0.02345679  0.14893604
  -0.03197249 -0.11606175  0.00810124  0.15092468]]

dh_prev= [[-0.72327791 -0.79538018 -0.47835197  2.47354608  1.53650683]
 [-0.50397399  1.02908499 -0.84061175  0.12630191 -0.61426331]
 [-1.0794