In [None]:
## for comet
#from comet_ml import Experiment

import numpy
from tqdm import tqdm
from collections import OrderedDict
from dataset import ptb
import sys
import os
import pickle
import random
import datetime

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

In [None]:
# Backend selection: set `device` to 'cpu' or 'gpu'.
#device = 'cpu'
device = 'gpu'

# `np` is the array module used by the code below. It starts out as NumPy and
# is rebound to CuPy when the GPU backend is selected.
np = numpy
if device == 'gpu':
    # cupy / cupyx are imported only on the GPU path.
    import cupy
    import cupyx
    np = cupy
print(f'Use {device}.')

In [None]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def softmax(x, ndim=-1):
    """Softmax of ``x`` along one axis.

    Args:
        x: input array of any rank >= 1.
        ndim: axis along which to normalize (default: the last axis).

    Returns:
        A float32 array with the same shape as ``x`` whose entries sum to 1
        along axis ``ndim``.
    """
    # Compute in float64 and cast back at the end to limit rounding error.
    x = x.astype('float64')
    # Subtract the per-slice maximum so exp() cannot overflow.
    shifted = x - x.max(axis=ndim, keepdims=True)
    exp_x = np.exp(shifted)
    # keepdims lets the division broadcast correctly for any rank and axis.
    out = exp_x / np.sum(exp_x, axis=ndim, keepdims=True)
    return out.astype('f')

def cross_entropy_error(y, t, onehot=False):
    """Mean cross-entropy between predicted probabilities and targets.

    Args:
        y: probabilities, either a single vector or a (batch, classes) array.
        t: targets. Class indices when ``onehot`` is False, otherwise an
            array with the same shape as ``y``.
        onehot: whether ``t`` is one-hot encoded.

    Returns:
        The loss averaged over the batch dimension.
    """
    if y.ndim == 1:
        # Promote a single sample to a batch of one.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]
    if not onehot:
        # Pick the probability assigned to the target class of each row.
        out = -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
    else:
        # Element-wise product selects the target-class log-probabilities;
        # a matrix product would be a shape mismatch for (N, C) inputs.
        out = -np.sum(t * np.log(y + 1e-7)) / batch_size
    return out

def numerical_diff(f, x, i):
    """Central-difference estimate of the derivative of ``f`` at ``x`` with respect to ``x[i]``."""
    step = 1e-4
    offset = np.zeros_like(x)
    offset[i] = step
    upper = f(x + offset)
    lower = f(x - offset)
    return (upper - lower) / (2 * step)

def numerical_diff2(f, x, i, j):
    """Central-difference estimate of the derivative of ``f`` at ``x`` with respect to ``x[i, j]``."""
    step = 1e-4
    offset = np.zeros_like(x)
    offset[i, j] = step
    upper = f(x + offset)
    lower = f(x - offset)
    return (upper - lower) / (2 * step)

def numerical_gradient(f, x):
    """Numerically estimate the gradient of ``f`` with respect to a 2-D array ``x``.

    Args:
        f: callable taking an array shaped like ``x`` and returning a scalar.
        x: 2-D array at which to evaluate the gradient.

    Returns:
        An array with the shape of ``x`` holding the central-difference
        estimate for every element.
    """
    # Extended precision is not available in every array backend/platform;
    # use it when present and fall back to float64 otherwise.
    acc_dtype = getattr(np, 'float128', np.float64)
    grad = np.zeros(x.shape, dtype=acc_dtype)
    n, m = x.shape
    for i in range(n):
        for j in range(m):
            grad[i, j] = numerical_diff2(f, x, i, j)
    return grad

def clip_grads(grads, max_norm):
    """Rescale ``grads`` in place so that their joint L2 norm does not exceed ``max_norm``.

    The norm is taken over all arrays in ``grads`` together; nothing is
    changed when the norm is already within the limit.
    """
    total_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    scale = max_norm / (total_norm + 1e-6)
    if not scale < 1:
        return
    for g in grads:
        g *= scale

def to_cpu(x):
    """Return ``x`` unchanged if it is exactly a NumPy array, else convert it with ``cupy.asnumpy``."""
    return x if type(x) == numpy.ndarray else cupy.asnumpy(x)

def to_gpu(x):
    """Return ``x`` unchanged if it is exactly a CuPy array, else build one with ``cupy.array``."""
    already_on_gpu = type(x) == cupy.ndarray
    return x if already_on_gpu else cupy.array(x)

def to_device(x, device=device):
    """Move ``x`` to the GPU when ``device`` is ``'gpu'``; otherwise move it to the CPU."""
    return to_gpu(x) if device == 'gpu' else to_cpu(x)

In [None]:
class Adam:
    """Adam optimizer state for a single parameter array."""

    def __init__(self, shape, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=10**(-8)):
        """Create zeroed first/second moment buffers of the given ``shape``."""
        self.alpha, self.epsilon = alpha, epsilon
        self.beta1, self.beta2 = beta1, beta2
        self.m = np.zeros(shape)
        self.v = np.zeros(shape)
        self.t = 0

    def update(self, w, dw):
        """Apply one Adam step to ``w`` in place using gradient ``dw``."""
        self.t += 1
        b1, b2 = self.beta1, self.beta2
        # Exponential moving averages of the gradient and its square.
        self.m = (b1 * self.m) + (1 - b1) * dw
        self.v = (b2 * self.v) + (1 - b2) * dw**2
        # Bias-corrected estimates.
        m_hat = self.m / (1 - b1 ** self.t)
        v_hat = self.v / (1 - b2 ** self.t)
        w -= self.alpha * (m_hat / (np.sqrt(v_hat) + self.epsilon))

class AdamContainer:
    """One ``Adam`` optimizer per parameter array across a list of layers.

    The parameter and gradient arrays are collected once, by reference, at
    construction time; ``update`` reads the gradients from those same
    array objects.
    """

    def __init__(self, layers, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=10**(-8)):
        # Flatten the per-layer lists, preserving layer order.
        self.params = [p for layer_params in [layer.params for layer in layers]
                       for p in layer_params]
        self.grads = [g for layer_grads in [layer.grads for layer in layers]
                      for g in layer_grads]
        self.adams = [Adam(p.shape, alpha, beta1, beta2, epsilon) for p in self.params]

    def update(self):
        """Apply one Adam step to every collected parameter."""
        for optimizer, weight, gradient in zip(self.adams, self.params, self.grads):
            optimizer.update(weight, gradient)

In [None]:
class BaseLayer:
    """Common base for layers: holds ``params`` and ``grads`` lists.

    Subclasses are expected to replace ``params`` / ``grads`` with lists of
    arrays before ``to_cpu`` / ``to_gpu`` are called.
    """

    def __init__(self):
        self.params = None
        self.grads = None

    def to_cpu(self):
        """Replace every parameter and gradient with its CPU counterpart.

        The entries of ``self.params`` / ``self.grads`` are rebound, so any
        object that captured the old arrays by reference keeps pointing at
        the old ones.
        """
        # Assign by index: rebinding the loop variable would have no effect.
        # `to_cpu` here resolves to the module-level helper, not this method.
        for i, param in enumerate(self.params):
            self.params[i] = to_cpu(param)
        for i, grad in enumerate(self.grads):
            self.grads[i] = to_cpu(grad)

    def to_gpu(self):
        """Replace every parameter and gradient with its GPU counterpart.

        The entries of ``self.params`` / ``self.grads`` are rebound, so any
        object that captured the old arrays by reference keeps pointing at
        the old ones.
        """
        for i, param in enumerate(self.params):
            self.params[i] = to_gpu(param)
        for i, grad in enumerate(self.grads):
            self.grads[i] = to_gpu(grad)

class Affine(BaseLayer):
    """Fully connected layer computing ``x @ w + b``."""

    def __init__(self, w, b):
        self.params = [w, b]
        self.grads = [np.zeros_like(w), np.zeros_like(b)]

    def forward(self, x):
        """Return ``np.dot(x, w) + b`` and remember ``x`` for the backward pass."""
        w, b = self.params
        self.x = x
        return np.dot(x, w) + b

    def backward(self, dout):
        """Back-propagate ``dout``.

        Stores the weight/bias gradients in ``self.dw`` / ``self.db`` and
        copies them into the existing ``self.grads`` arrays.

        Returns:
            The gradient with respect to the input of ``forward``.
        """
        w = self.params[0]
        dx = np.dot(dout, w.T)
        # Write into the existing buffers instead of rebinding the list
        # entries, so holders of the original gradient arrays see the update.
        self.grads[0][...] = self.dw = np.dot(self.x.T, dout)
        self.grads[1][...] = self.db = np.sum(dout, axis=0)
        return dx

class TimeAffine(BaseLayer):
    """Applies the same ``Affine`` transform at every time step of a sequence."""

    def __init__(self, w, b):
        self.params = [w, b]
        self.grads = [np.zeros_like(w), np.zeros_like(b)]
        self.layers = None
        self.cache = None

    def forward(self, xs):
        """Transform ``xs`` of shape (N, T, D) into a float32 array of shape (N, T, V)."""
        w, b = self.params
        N, T, D = xs.shape
        D, V = w.shape

        out = np.empty((N, T, V), dtype='f')
        self.layers = []

        # One Affine layer per time step, all sharing the same w and b.
        for t in range(T):
            layer = Affine(w, b)
            out[:,t,:] = layer.forward(xs[:,t,:])
            self.layers.append(layer)

        self.cache = (N, T, D, V)
        return out

    def backward(self, dout):
        """Back-propagate ``dout`` of shape (N, T, V).

        The per-step weight and bias gradients are summed over time and
        written in place into ``self.grads``; the sums are also exposed as
        ``self.dw`` and ``self.db``.

        Returns:
            A float32 array of shape (N, T, D): the gradient with respect to
            the input of ``forward``.
        """
        N, T, D, V = self.cache

        out = np.empty((N, T, D), dtype='f')
        # Accumulate directly rather than materializing a (D, T, V) buffer.
        dw = 0
        db = 0
        for t in range(T):
            layer = self.layers[t]
            out[:,t,:] = layer.backward(dout[:,t,:])
            dw = dw + layer.dw
            db = db + layer.db
        # In-place write: rebinding self.grads[i] would leave anything that
        # captured the original gradient arrays looking at stale zeros.
        self.grads[0][...] = self.dw = dw
        self.grads[1][...] = self.db = db

        return out

class ReLU(BaseLayer):
    """Rectified linear unit: passes positive inputs, zeroes the rest."""

    def __init__(self):
        self.params = []
        self.grads = []

    def forward(self, x):
        """Return a copy of ``x`` with every non-positive entry set to 0."""
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        """Return ``dout`` with the gradient zeroed where the forward input was <= 0.

        Works on a copy so the caller's ``dout`` array is left untouched.
        """
        dx = dout.copy()
        dx[self.mask] = 0
        return dx

class Sigmoid(BaseLayer):
    """Element-wise sigmoid activation layer."""

    def __init__(self):
        self.params, self.grads = [], []

    def forward(self, x):
        """Apply ``sigmoid`` to ``x`` and cache the result for ``backward``."""
        activated = sigmoid(x)
        self.out = activated
        return activated

    def backward(self, dout):
        """Multiply ``dout`` by the sigmoid derivative ``out * (1 - out)``."""
        y = self.out
        return dout * y * (1 - y)
    
class Softmax(BaseLayer):
    """Softmax activation layer (uses the module-level ``softmax`` helper)."""

    def __init__(self):
        self.params, self.grads = [], []

    def forward(self, x):
        """Return ``softmax(x)`` and cache it for ``backward``."""
        probs = softmax(x)
        self.y = probs
        return probs

    def backward(self, dout):
        """Back-propagate ``dout`` through the softmax, reducing over axis 1."""
        weighted = self.y * dout
        row_total = np.sum(weighted, axis=1, keepdims=True)
        weighted -= self.y * row_total
        return weighted

class SoftmaxWithLoss(BaseLayer):
    """Softmax followed by cross-entropy loss."""

    def __init__(self):
        self.params = []
        self.grads = []

    def forward(self, x, t):
        """Return the cross-entropy loss of ``softmax(x)`` against targets ``t``.

        ``t`` is passed to ``cross_entropy_error`` with its default
        (class-index) target format.
        """
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1, onehot=False):
        """Return the gradient of the loss with respect to the scores.

        Args:
            dout: upstream gradient, applied in both target formats.
            onehot: whether the stored targets are one-hot encoded.
        """
        batch_size = self.t.shape[0]
        if not onehot:
            dx = self.y.copy()
            # Subtract 1 at the target class of every row.
            dx[np.arange(batch_size), self.t] -= 1
        else:
            dx = self.y - self.t
        # Scale by the upstream gradient regardless of the target format.
        dx *= dout
        dx = dx / batch_size
        return dx

class TimeSoftmaxWithLoss(BaseLayer):
    """Applies ``SoftmaxWithLoss`` at every time step and averages the losses."""

    def __init__(self):
        self.params, self.grads = [], []
        self.layers = None
        self.cache = None

    def forward(self, xs, ts):
        """Return the mean over time of the per-step losses.

        ``xs`` has shape (N, T, V). ``ts`` is either (N, T) class indices or
        a rank-3 one-hot array, which is converted to indices via argmax.
        """
        N, T, V = xs.shape

        if ts.ndim == 3:
            # One-hot targets: reduce to class indices.
            ts = ts.argmax(axis=2).reshape(N, T)

        step_losses = np.empty(T, dtype='f')
        step_layers = []
        self.layers = step_layers

        for t in range(T):
            step = SoftmaxWithLoss()
            step_losses[t] = step.forward(xs[:, t, :], ts[:, t])
            step_layers.append(step)

        self.cache = (N, T, V)
        return step_losses.sum() / T

    def backward(self):
        """Return the (N, T, V) float32 gradient of the averaged loss; also stored in ``self.dx``."""
        N, T, V = self.cache
        grad = np.empty((N, T, V), dtype='f')
        for t in range(T):
            grad[:, t, :] = self.layers[t].backward()
        self.dx = grad / T
        return self.dx

class Dropout(BaseLayer):
    """Dropout whose kept activations are rescaled by ``1 / (1 - ratio)``.

    When ``self.train`` is False the layer is the identity.
    """

    def __init__(self, ratio=0.5):
        self.params = []
        self.grads = []
        self.ratio = ratio
        self.mask = None
        self.train = True

    def forward(self, x):
        """Apply a fresh random mask in training mode; return ``x`` unchanged otherwise."""
        if self.train:
            self.mask = np.random.rand(*x.shape) > self.ratio
            # Scale the kept units so no rescaling is needed in eval mode.
            self.mask = self.mask.astype('f') / (1.0 - self.ratio)
            return x * self.mask
        else:
            return x

    def backward(self, dout):
        """Back-propagate through the mask used by the last training-mode forward.

        In eval mode, or before any mask has been drawn, forward is the
        identity, so the gradient passes through unchanged.
        """
        if not self.train or self.mask is None:
            return dout
        return dout * self.mask

class Embedding(BaseLayer):
    """Row lookup into a weight matrix ``w``."""

    def __init__(self, w):
        self.params = [w]
        self.grads = [np.zeros_like(w)]
        self.idx = None

    def forward(self, idx):
        """Return the rows ``w[idx]`` and remember ``idx`` for ``backward``."""
        self.idx = idx
        return self.params[0][idx]

    def backward(self, dout):
        """Scatter-add ``dout`` into the gradient rows selected by the last ``idx``.

        The gradient buffer ``self.grads[0]`` is zeroed and refilled in
        place; it is also exposed as ``self.dw``. Returns None.
        """
        # Pick the accumulate-at-index primitive matching the active backend.
        scatter_add = cupyx.scatter_add if device == 'gpu' else np.add.at

        grad_w = self.grads[0]
        grad_w[...] = 0
        scatter_add(grad_w, self.idx, dout)
        # grad_w is the very object stored in self.grads[0].
        self.grads[0] = self.dw = grad_w
        return None

class TimeEmbedding(BaseLayer):
    """Applies an ``Embedding`` lookup at every time step of an (N, T) index array."""

    def __init__(self, w):
        self.params = [w]
        self.grads = [np.zeros_like(w)]
        self.layers = None

    def forward(self, xs):
        """Look up ``xs`` of shape (N, T) and return a float32 array of shape (N, T, D)."""
        w = self.params[0]
        N, T = xs.shape
        V, D = w.shape

        out = np.empty((N, T, D), dtype='f')
        self.layers = []

        # One Embedding layer per time step, all sharing the same w.
        for t in range(T):
            layer = Embedding(w)
            out[:,t,:] = layer.forward(xs[:,t])
            self.layers.append(layer)

        return out

    def backward(self, dout):
        """Accumulate the per-step embedding gradients for ``dout`` of shape (N, T, D).

        The summed gradient is written in place into ``self.grads[0]`` and
        exposed as ``self.dw``. Returns None.
        """
        N, T, D = dout.shape

        # Accumulate into the existing buffer: rebinding self.grads[0] would
        # leave anything that captured the original array with stale zeros.
        dw = self.grads[0]
        dw[...] = 0
        for t in range(T):
            layer = self.layers[t]
            layer.backward(dout[:,t,:])
            dw += layer.dw
        self.dw = dw

        return None

class RNN(BaseLayer):
    """Single tanh RNN step: ``h_next = tanh(h_prev @ wh + x @ wx + b)``."""

    def __init__(self, wx, wh, b):
        self.params = [wx, wh, b]
        self.grads = [np.zeros_like(wx), np.zeros_like(wh), np.zeros_like(b)]

    def forward(self, x, h_prev):
        """Compute the next hidden state and cache the values ``backward`` needs."""
        wx, wh, b = self.params
        t = np.dot(h_prev, wh) + np.dot(x, wx) + b
        h_next = np.tanh(t)
        self.cache = (x, h_prev, h_next)
        return h_next

    def backward(self, dh_next):
        """Back-propagate ``dh_next`` through one step.

        Parameter gradients are written in place into ``self.grads`` and
        exposed as ``self.dwx`` / ``self.dwh`` / ``self.db``.

        Returns:
            ``(dx, dh_prev)``: gradients with respect to the step input and
            the previous hidden state.
        """
        wx, wh, b = self.params
        x, h_prev, h_next = self.cache

        # d tanh(t)/dt = 1 - tanh(t)^2, where tanh(t) is the cached forward
        # output h_next (not the incoming gradient).
        dt = dh_next * (1 - h_next ** 2)
        db = np.sum(dt, axis=0)
        dwh = np.dot(h_prev.T, dt)
        dh_prev = np.dot(dt, wh.T)
        dwx = np.dot(x.T, dt)
        dx = np.dot(dt, wx.T)
        self.grads[0][...] = self.dwx = dwx
        self.grads[1][...] = self.dwh = dwh
        self.grads[2][...] = self.db = db

        return dx, dh_prev

class TimeRNN(BaseLayer):
    """Unrolls ``RNN`` steps over the time axis of an (N, T, D) input.

    With ``stateful=True`` the hidden state is carried over between
    ``forward`` calls until ``reset_state`` is called.
    """

    def __init__(self, wx, wh, b, stateful=False):
        self.params = [wx, wh, b]
        self.grads = [np.zeros_like(p) for p in self.params]
        self.layers = None
        self.h = None
        self.dh = None
        self.stateful = stateful

    def set_state(self, h):
        """Set the hidden state used at the start of the next ``forward``."""
        self.h = h

    def reset_state(self):
        """Forget the stored hidden state."""
        self.h = None

    def forward(self, xs):
        """Run all T steps and return the hidden states as a float32 (N, T, H) array."""
        wx, wh, b = self.params
        N, T, D = xs.shape
        D, H = wx.shape

        self.layers = []
        hs = np.empty((N, T, H), dtype='f')
        # Start from zeros unless a carried-over state is both allowed and present.
        if not (self.stateful and self.h is not None):
            self.h = np.zeros((N, H), dtype='f')

        for t in range(T):
            step = RNN(*self.params)
            self.h = step.forward(xs[:, t, :], self.h)
            hs[:, t, :] = self.h
            self.layers.append(step)

        return hs

    def backward(self, dhs):
        """Back-propagate through time.

        Sums the per-step parameter gradients into ``self.grads`` (in
        place), stores the gradient flowing out of the first step in
        ``self.dh`` and returns the float32 (N, T, D) input gradient.
        """
        wx, wh, b = self.params
        N, T, H = dhs.shape
        D, H = wx.shape

        dxs = np.empty((N, T, D), dtype='f')
        dh = 0
        totals = [0, 0, 0]
        for t in range(T - 1, -1, -1):
            step = self.layers[t]
            # Gradient from the output at t plus the one flowing back from t+1.
            dx, dh = step.backward(dhs[:, t, :] + dh)
            dxs[:, t, :] = dx
            for k, step_grad in enumerate(step.grads):
                totals[k] += step_grad

        for k, total in enumerate(totals):
            self.grads[k][...] = total
        self.dwx = self.grads[0]
        self.dwh = self.grads[1]
        self.db = self.grads[2]
        self.dh = dh

        return dxs

class LSTM(BaseLayer):
    """Single LSTM step.

    The pre-activation matrix is split along axis 1 into four blocks of
    width H, in the order forget gate, candidate, input gate, output gate.
    """

    def __init__(self, wx, wh, b):
        self.params = [wx, wh, b]
        self.grads = [np.zeros_like(p) for p in self.params]
        self.cache = None

    def forward(self, x, h_prev, c_prev):
        """Return ``(h_next, c_next)`` for one step and cache intermediates."""
        wx, wh, b = self.params
        _, H = h_prev.shape

        gates = np.dot(h_prev, wh) + np.dot(x, wx) + b

        # Slice each block and apply its activation.
        f = sigmoid(gates[:, :H])
        g = np.tanh(gates[:, H:2*H])
        i = sigmoid(gates[:, 2*H:3*H])
        o = sigmoid(gates[:, 3*H:])

        c_next = f * c_prev + g * i
        h_next = o * np.tanh(c_next)

        self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
        return h_next, c_next

    def backward(self, dh_next, dc_next):
        """Back-propagate one step.

        Parameter gradients are written in place into ``self.grads`` and
        exposed as ``self.dwx`` / ``self.dwh`` / ``self.db``.

        Returns:
            ``(dx, dh_prev, dc_prev)``.
        """
        wx, wh, b = self.params
        x, h_prev, c_prev, i, f, g, o, c_next = self.cache

        tanh_c = np.tanh(c_next)
        # Total gradient reaching the cell state.
        ds = dc_next + (dh_next * o) * (1 - tanh_c ** 2)

        dc_prev = ds * f

        # Gradient w.r.t. each gate output, then through its activation.
        df = ds * c_prev
        df *= f * (1 - f)
        dg = ds * i
        dg *= 1 - g ** 2
        di = ds * g
        di *= i * (1 - i)
        do = dh_next * tanh_c
        do *= o * (1 - o)

        # Same block order as the forward slicing.
        dA = np.hstack((df, dg, di, do))

        dwx = np.dot(x.T, dA)
        dwh = np.dot(h_prev.T, dA)
        db = dA.sum(axis=0)
        dx = np.dot(dA, wx.T)
        dh_prev = np.dot(dA, wh.T)

        self.grads[0][...] = self.dwx = dwx
        self.grads[1][...] = self.dwh = dwh
        self.grads[2][...] = self.db = db

        return dx, dh_prev, dc_prev

class TimeLSTM(BaseLayer):
    """Unrolls ``LSTM`` steps over the time axis of an (N, T, D) input.

    With ``stateful=True`` the hidden and cell states are carried over
    between ``forward`` calls until ``reset_state`` is called.
    """

    def __init__(self, wx, wh, b, stateful=False):
        self.params = [wx, wh, b]
        self.grads = [np.zeros_like(p) for p in self.params]
        self.layers = None
        self.h = None
        self.dh = None
        self.c = None
        self.stateful = stateful

    def set_state(self, h, c=None):
        """Set the hidden state and (optionally) the cell state for the next ``forward``."""
        self.h = h
        self.c = c

    def reset_state(self):
        """Forget the stored hidden and cell states."""
        self.h = None
        self.c = None

    def forward(self, xs):
        """Run all T steps and return the hidden states as a float32 (N, T, H) array."""
        _, wh, _ = self.params
        N, T, D = xs.shape
        H = wh.shape[0]

        self.layers = []
        hs = np.empty((N, T, H), dtype='f')
        # Each state starts from zeros unless carrying over is allowed and
        # a value is already stored.
        if not (self.stateful and self.h is not None):
            self.h = np.zeros((N, H), dtype='f')
        if not (self.stateful and self.c is not None):
            self.c = np.zeros((N, H), dtype='f')

        for t in range(T):
            step = LSTM(*self.params)
            self.h, self.c = step.forward(xs[:, t, :], self.h, self.c)
            hs[:, t, :] = self.h
            self.layers.append(step)

        return hs

    def backward(self, dhs):
        """Back-propagate through time.

        Sums the per-step parameter gradients into ``self.grads`` (in
        place), stores the hidden-state gradient flowing out of the first
        step in ``self.dh`` and returns the float32 (N, T, D) input gradient.
        """
        wx, _, _ = self.params
        N, T, H = dhs.shape
        D = wx.shape[0]

        dxs = np.empty((N, T, D), dtype='f')
        dh = 0
        dc = 0
        totals = [0, 0, 0]
        for t in range(T - 1, -1, -1):
            step = self.layers[t]
            dx, dh, dc = step.backward(dhs[:, t, :] + dh, dc)
            dxs[:, t, :] = dx
            for k, step_grad in enumerate(step.grads):
                totals[k] += step_grad

        for k, total in enumerate(totals):
            self.grads[k][...] = total
        self.dwx = self.grads[0]
        self.dwh = self.grads[1]
        self.db = self.grads[2]
        self.dh = dh

        return dxs

class BaseNetwork:
    """Shared training loop and parameter (de)serialization for networks.

    Subclasses provide ``self.layers`` (an ordered mapping of layers),
    ``self.lastLayer``, ``self.adam``, ``self.max_grad`` and a ``loss`` method.
    """

    def __init__(self):
        self.layers = None
        self.lastLayer = None
        self.adam = None

    def train(self, x, t):
        """Run one forward/backward pass and one optimizer update; return the loss."""
        loss = self.loss(x, t)

        # Walk the layers back to front, threading the gradient through.
        dout = self.lastLayer.backward()
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        self.update()
        return loss

    def update(self):
        """Clip each layer's gradients to ``self.max_grad``, then step the optimizer."""
        per_layer_grads = [layer.grads for layer in self.layers.values()]
        for layer_grads in per_layer_grads:
            clip_grads(layer_grads, self.max_grad)
        self.adam.update()

    def save(self, state_file_name = 'network.state.pkl'):
        """Pickle CPU copies of the optimizer's parameter list to ``state_file_name``."""
        cpu_params = [to_cpu(p) for p in self.adam.params]

        with open(state_file_name, 'wb') as f:
            pickle.dump(cpu_params, f)
            print(f'Saved: {state_file_name}')

    def load(self, state_file_name = 'network.state.pkl'):
        """Load parameters saved by ``save``, copying them into the existing arrays.

        Does nothing when the file does not exist.
        """
        if not os.path.exists(state_file_name):
            return
        with open(state_file_name, 'rb') as f:
            # NOTE: pickle.load executes arbitrary code; only load trusted files.
            saved = pickle.load(f)
            for i in range(len(saved)):
                self.adam.params[i][:] = to_device(saved[i], device)
            print(f'Loaded: {state_file_name}')

In [None]:
class Network(BaseNetwork):
    """Language model: embedding, two stateful LSTM layers with dropout, and an affine output."""

    def __init__(self, vocab_size, wordvec_size, hidden_size, max_grad=5.0, dropout_ratio=0.1):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn
        self.max_grad = max_grad

        def scaled(rows, cols, divisor):
            # Random-normal float32 matrix divided by `divisor`.
            return (rn(rows, cols) / divisor).astype('f')

        def zero_bias(size):
            return np.zeros(size).astype('f')

        # The random draws happen in this order.
        embed_w = scaled(V, D, 100)
        lstm1_wx = scaled(D, 4 * H, np.sqrt(D))
        lstm1_wh = scaled(H, 4 * H, np.sqrt(H))
        lstm1_b = zero_bias(4 * H)
        lstm2_wx = scaled(H, 4 * H, np.sqrt(H))
        lstm2_wh = scaled(H, 4 * H, np.sqrt(H))
        lstm2_b = zero_bias(4 * H)
        ## if use weight tying (require: H == D), comment-out this affine_w
        affine_w = scaled(H, V, np.sqrt(H))
        affine_b = zero_bias(V)

        self.layers = OrderedDict([
            ('Embedding', TimeEmbedding(embed_w)),
            ('Dropout1', Dropout(dropout_ratio)),
            ('LSTM1', TimeLSTM(lstm1_wx, lstm1_wh, lstm1_b, stateful=True)),
            ('Dropout2', Dropout(dropout_ratio)),
            ('LSTM2', TimeLSTM(lstm2_wx, lstm2_wh, lstm2_b, stateful=True)),
            ('Dropout3', Dropout(dropout_ratio)),
            ('Affine', TimeAffine(affine_w, affine_b)),
        ])
        #self.layers['Affine'] = TimeAffine(embed_w.T, affine_b)  # weight tying
        self.lastLayer = TimeSoftmaxWithLoss()

        self.lstm_layers = [self.layers[name] for name in ('LSTM1', 'LSTM2')]
        self.drop_layers = [self.layers[name] for name in ('Dropout1', 'Dropout2', 'Dropout3')]

        self.adam = AdamContainer(list(self.layers.values()))

    def predict(self, x, train=False):
        """Run ``x`` through every layer; ``train`` sets the dropout layers' mode."""
        for drop in self.drop_layers:
            drop.train = train
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t, train=True):
        """Return the loss of the prediction for ``x`` against targets ``t``."""
        scores = self.predict(x, train)
        return self.lastLayer.forward(scores, t)

    def reset_rnn_state(self):
        """Clear the carried-over state of both LSTM layers."""
        for lstm in self.lstm_layers:
            lstm.reset_state()

In [None]:
class Encoder(BaseLayer):
    """Sequence encoder: embedding followed by a non-stateful LSTM.

    ``forward`` returns only the hidden state of the last time step.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_w = (rn(V, D) / 100).astype('f')
        lstm_wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')

        self.layers = OrderedDict([
            ('Embedding', TimeEmbedding(embed_w)),
            ('LSTM', TimeLSTM(lstm_wx, lstm_wh, lstm_b, stateful=False)),
        ])

        # Flat views over the sub-layers' parameter and gradient arrays.
        sublayers = list(self.layers.values())
        self.params = [p for layer in sublayers for p in layer.params]
        self.grads = [g for layer in sublayers for g in layer.grads]
        self.hs = None

    def forward(self, xs):
        """Encode ``xs``; keep all hidden states in ``self.hs`` and return the last one."""
        out = xs
        for layer in self.layers.values():
            out = layer.forward(out)
        self.hs = out
        return out[:, -1, :]

    def backward(self, dh):
        """Back-propagate ``dh``, the gradient of the last hidden state.

        Every other time step receives a zero gradient. Returns whatever
        the first sub-layer's ``backward`` returns.
        """
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh
        for layer in reversed(list(self.layers.values())):
            dhs = layer.backward(dhs)
        return dhs

In [None]:
class Decoder(BaseLayer):
    """Sequence decoder: embedding, stateful LSTM and affine output layer."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_w = (rn(V, D) / 100).astype('f')
        lstm_wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')

        affine_w = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.layers = OrderedDict([
            ('Embedding', TimeEmbedding(embed_w)),
            ('LSTM', TimeLSTM(lstm_wx, lstm_wh, lstm_b, stateful=True)),
            ('Affine', TimeAffine(affine_w, affine_b)),
        ])

        # Flat views over the sub-layers' parameter and gradient arrays.
        sublayers = list(self.layers.values())
        self.params = [p for layer in sublayers for p in layer.params]
        self.grads = [g for layer in sublayers for g in layer.grads]

    def forward(self, xs, h):
        """Seed the LSTM with hidden state ``h`` and return the scores for ``xs``."""
        self.layers['LSTM'].set_state(h)
        out = xs
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def backward(self, dh):
        """Back-propagate through all layers and return the LSTM's ``dh``."""
        for layer in reversed(list(self.layers.values())):
            dh = layer.backward(dh)
        return self.layers['LSTM'].dh

    def generate(self, h, start_id, sample_size):
        """Greedily generate ``sample_size`` ids starting from ``start_id``.

        Each step feeds the previous argmax back in as the next input.
        """
        self.layers['LSTM'].set_state(h)
        sampled = []
        sample_id = start_id

        for _ in range(sample_size):
            score = np.array(sample_id).reshape(1,1)
            for layer in self.layers.values():
                score = layer.forward(score)
            sample_id = np.argmax(score.flatten())
            sampled.append(int(sample_id))

        return np.array(sampled)

class PeekyDecoder(BaseLayer):
    """Decoder that feeds the encoder state ``h`` to every step.

    ``h`` is concatenated in front of both the LSTM input and the affine
    input at each time step.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_w = (rn(V, D) / 100).astype('f')
        lstm_wx = (rn(H + D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')

        affine_w = (rn(H + H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.layers = OrderedDict([
            ('Embedding', TimeEmbedding(embed_w)),
            ('LSTM', TimeLSTM(lstm_wx, lstm_wh, lstm_b, stateful=True)),
            ('Affine', TimeAffine(affine_w, affine_b)),
        ])

        # Flat views over the sub-layers' parameter and gradient arrays.
        sublayers = list(self.layers.values())
        self.params = [p for layer in sublayers for p in layer.params]
        self.grads = [g for layer in sublayers for g in layer.grads]
        self.cache = None

    def forward(self, xs, h):
        """Return the scores for ``xs`` of shape (N, T), given encoder state ``h`` of shape (N, H)."""
        N, T = xs.shape
        N, H = h.shape

        embedding = self.layers['Embedding']
        lstm = self.layers['LSTM']
        affine = self.layers['Affine']

        lstm.set_state(h)

        embedded = embedding.forward(xs)
        # Repeat h along a new time axis: hs[n, t] == h[n].
        hs = np.repeat(h, T, axis=0).reshape(N, T, H)

        lstm_in = np.concatenate((hs, embedded), axis=2)
        lstm_out = lstm.forward(lstm_in)

        affine_in = np.concatenate((hs, lstm_out), axis=2)
        score = affine.forward(affine_in)

        self.cache = H
        return score

    def backward(self, dh):
        """Back-propagate the score gradient and return the gradient for ``h``."""
        H = self.cache

        # The first H features of each concatenated input belong to h.
        d_affine_in = self.layers['Affine'].backward(dh)
        dhs0 = d_affine_in[:, :, :H]
        d_lstm_out = d_affine_in[:, :, H:]

        d_lstm_in = self.layers['LSTM'].backward(d_lstm_out)
        dhs1 = d_lstm_in[:, :, :H]
        d_embedded = d_lstm_in[:, :, H:]

        self.layers['Embedding'].backward(d_embedded)

        dhs = dhs0 + dhs1
        # h was also the LSTM's initial state, hence the extra LSTM.dh term.
        dh = self.layers['LSTM'].dh + np.sum(dhs, axis=1)

        return dh

    def generate(self, h, start_id, sample_size):
        """Greedily generate ``sample_size`` ids starting from ``start_id``."""
        embedding = self.layers['Embedding']
        lstm = self.layers['LSTM']
        affine = self.layers['Affine']

        sampled = []
        sample_id = start_id
        lstm.set_state(h)

        H = h.shape[1]
        peeky_h = h.reshape(1, 1, H)
        for _ in range(sample_size):
            x = np.array(sample_id).reshape(1,1)
            step = embedding.forward(x)
            step = lstm.forward(np.concatenate((peeky_h, step), axis=2))
            score = affine.forward(np.concatenate((peeky_h, step), axis=2))
            sample_id = np.argmax(score.flatten())
            sampled.append(int(sample_id))

        return np.array(sampled)

class Seq2SeqNetwork(BaseNetwork):
    """Sequence-to-sequence model pairing ``Encoder`` with ``PeekyDecoder``."""

    def __init__(self, vocab_size, wordvec_size, hidden_size, max_grad=5.0):
        V, D, H = vocab_size, wordvec_size, hidden_size
        self.max_grad = max_grad
        self.layers = OrderedDict([
            ('Encoder', Encoder(V, D, H)),
            ('Decoder', PeekyDecoder(V, D, H)),
        ])
        self.lastLayer = TimeSoftmaxWithLoss()
        self.adam = AdamContainer(list(self.layers.values()))

    def generate(self, xs, start_id, sample_size):
        """Encode ``xs`` and let the decoder generate ``sample_size`` ids."""
        encoded = self.layers['Encoder'].forward(xs)
        return self.layers['Decoder'].generate(encoded, start_id, sample_size)

    def loss(self, xs, ts):
        """Return the loss for source ``xs`` and target ``ts``.

        The decoder input is ``ts`` without its last column, and the
        expected output is ``ts`` without its first column.
        """
        decoder_xs = ts[:, :-1]
        decoder_ts = ts[:, 1:]
        encoded = self.layers['Encoder'].forward(xs)
        score = self.layers['Decoder'].forward(decoder_xs, encoded)
        return self.lastLayer.forward(score, decoder_ts)

In [None]:
class WeightSum(BaseLayer):
    """Weighted sum of ``hs`` (N, T, H) over the time axis using weights ``a`` (N, T)."""

    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None

    def forward(self, hs, a):
        """Return the (N, H) sum over time of ``hs`` weighted by ``a``."""
        N, T, H = hs.shape

        # Trailing unit axis so the weights broadcast across H.
        ar = a.reshape(N, T, 1)
        context = np.sum(hs * ar, axis=1)

        self.cache = (hs, ar)
        return context

    def backward(self, dc):
        """Return ``(dhs, da)``: gradients with respect to ``hs`` and ``a``."""
        hs, ar = self.cache
        N, T, H = hs.shape

        # The sum over time fans the gradient out to every time step.
        dt = dc.reshape(N, 1, H).repeat(T, axis=1)
        dhs = dt * ar
        da = np.sum(dt * hs, axis=2)

        return dhs, da

class AttentionWeight(BaseLayer):
    """Softmax over the dot products between ``h`` and every time step of ``hs``."""

    def __init__(self):
        self.params, self.grads = [], []
        self.softmax = Softmax()
        self.cache = None

    def forward(self, hs, h):
        """Return (N, T) attention weights for ``hs`` (N, T, H) and ``h`` (N, H)."""
        N, T, H = hs.shape

        # Unit time axis so h broadcasts against every step of hs.
        hr = h.reshape(N, 1, H)
        scores = np.sum(hs * hr, axis=2)
        weights = self.softmax.forward(scores)

        self.cache = (hs, hr)
        return weights

    def backward(self, da):
        """Return ``(dhs, dh)``: gradients with respect to ``hs`` and ``h``."""
        hs, hr = self.cache
        N, T, H = hs.shape

        ds = self.softmax.backward(da)
        # The sum over H fans the gradient out to every feature.
        dt = ds.reshape(N, T, 1).repeat(H, axis=2)
        dhs = dt * hr
        dh = np.sum(dt * hs, axis=1)

        return dhs, dh

class Attention(BaseLayer):
    """Attention for one query: ``AttentionWeight`` followed by ``WeightSum``."""

    def __init__(self):
        self.params, self.grads = [], []
        self.attention_weight_layer = AttentionWeight()
        self.weight_sum_layer = WeightSum()
        self.attention_weight = None

    def forward(self, hs, h):
        """Return the context vector for query ``h``; the weights are kept in ``self.attention_weight``."""
        weights = self.attention_weight_layer.forward(hs, h)
        context = self.weight_sum_layer.forward(hs, weights)
        self.attention_weight = weights
        return context

    def backward(self, dout):
        """Return ``(dhs, dh)``; ``hs`` receives gradient from both sub-layers."""
        dhs_from_sum, da = self.weight_sum_layer.backward(dout)
        dhs_from_weight, dh = self.attention_weight_layer.backward(da)
        return dhs_from_sum + dhs_from_weight, dh

class TimeAttention(BaseLayer):
    """Applies ``Attention`` at every decoder time step against the encoder states."""

    def __init__(self):
        self.params, self.grads = [], []
        self.layers = None
        self.attention_weights = None

    def forward(self, hs_enc, hs_dec):
        """Return one context vector per decoder step, shaped like ``hs_dec``.

        The attention weights of every step are collected in
        ``self.attention_weights``.
        """
        N, T, H = hs_dec.shape
        out = np.empty_like(hs_dec)
        steps, weights = [], []
        self.layers = steps
        self.attention_weights = weights

        for t in range(T):
            step = Attention()
            out[:, t, :] = step.forward(hs_enc, hs_dec[:, t, :])
            steps.append(step)
            weights.append(step.attention_weight)

        return out

    def backward(self, dout):
        """Return ``(dhs_enc, dhs_dec)``; encoder gradients are summed over decoder steps."""
        N, T, H = dout.shape
        dhs_dec = np.empty_like(dout)
        dhs_enc = 0

        for t in range(T):
            dhs, dh = self.layers[t].backward(dout[:, t, :])
            dhs_enc += dhs
            dhs_dec[:, t, :] = dh

        return dhs_enc, dhs_dec

class AttentionEncoder(Encoder):
    """Encoder variant that returns the hidden states of all time steps."""

    def forward(self, xs):
        """Run ``xs`` through every sub-layer and return the final layer's full output."""
        out = xs
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def backward(self, dhs):
        """Back-propagate ``dhs`` through the sub-layers in reverse order."""
        for layer in reversed(list(self.layers.values())):
            dhs = layer.backward(dhs)
        return dhs

class AttentionDecoder(BaseLayer):
    """Decoder with attention over the encoder hidden states.

    The affine layer receives the attention context concatenated with the
    decoder LSTM output.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_w = (rn(V, D) / 100).astype('f')
        lstm_wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')

        affine_w = (rn(2 * H, V) / np.sqrt(2 * H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.layers = OrderedDict()
        self.layers['Embedding'] = TimeEmbedding(embed_w)
        self.layers['LSTM'] = TimeLSTM(lstm_wx, lstm_wh, lstm_b, stateful=True)
        self.layers['Attention'] = TimeAttention()
        self.layers['Affine'] = TimeAffine(affine_w, affine_b)

        # Flat views over the sub-layers' parameter and gradient arrays.
        self.params = []
        for params in [layer.params for layer in self.layers.values()]:
            for param in params:
                self.params.append(param)
        self.grads = []
        for grads in [layer.grads for layer in self.layers.values()]:
            for grad in grads:
                self.grads.append(grad)

    def forward(self, xs, enc_hs):
        """Return the scores for ``xs`` given all encoder hidden states ``enc_hs``."""
        # The last encoder state seeds the decoder LSTM.
        h = enc_hs[:, -1]
        self.layers['LSTM'].set_state(h)

        out = self.layers['Embedding'].forward(xs)
        dec_hs = self.layers['LSTM'].forward(out)
        c = self.layers['Attention'].forward(enc_hs, dec_hs)
        out = np.concatenate((c, dec_hs), axis=2)
        score = self.layers['Affine'].forward(out)
        return score

    def backward(self, dscore):
        """Back-propagate ``dscore`` and return the gradient for ``enc_hs``."""
        dout = self.layers['Affine'].backward(dscore)
        N, T, H2 = dout.shape
        H = H2 // 2
        # First half belongs to the context, second half to the LSTM output.
        dc, ddec_hs0 = dout[:, :, :H], dout[: ,: ,H:]
        denc_hs, ddec_hs1 = self.layers['Attention'].backward(dc)
        ddec_hs = ddec_hs0 + ddec_hs1
        dout = self.layers['LSTM'].backward(ddec_hs)
        # The last encoder state was also the LSTM's initial state.
        denc_hs[:, -1] += self.layers['LSTM'].dh
        self.layers['Embedding'].backward(dout)

        return denc_hs

    def generate(self, enc_hs, start_id, sample_size):
        """Greedily generate ``sample_size`` ids starting from ``start_id``.

        Returns:
            An array of the sampled ids.
        """
        sampled = []
        sample_id = start_id
        h = enc_hs[:,  -1]
        self.layers['LSTM'].set_state(h)

        for _ in range(sample_size):
            x = np.array(sample_id).reshape(1, 1)
            out = self.layers['Embedding'].forward(x)
            dec_hs = self.layers['LSTM'].forward(out)
            c = self.layers['Attention'].forward(enc_hs, dec_hs)
            out = np.concatenate((c, dec_hs), axis=2)
            score = self.layers['Affine'].forward(out)

            sample_id = np.argmax(score.flatten())
            # Store plain Python ints, as the other decoders do, so the
            # final array is built from scalars rather than array objects.
            sampled.append(int(sample_id))

        return np.array(sampled)

class AttentionSeq2SeqNetwork(BaseNetwork):
    """Sequence-to-sequence network pairing AttentionEncoder with AttentionDecoder.

    NOTE(review): training, saving and loading are not defined here; they are
    presumably inherited from BaseNetwork - confirm against that class.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size, max_grad=5.0):
        """Build the encoder, the decoder, the loss layer and the optimizer container.

        The encoder and the decoder receive the same
        (vocab_size, wordvec_size, hidden_size) triple.
        """
        self.max_grad = max_grad
        dims = (vocab_size, wordvec_size, hidden_size)
        # Encoder is constructed before the decoder, as in the original order.
        self.layers = OrderedDict()
        self.layers['Encoder'] = AttentionEncoder(*dims)
        self.layers['Decoder'] = AttentionDecoder(*dims)
        self.lastLayer = TimeSoftmaxWithLoss()
        self.adam = AdamContainer(list(self.layers.values()))

    def generate(self, xs, start_id, sample_size):
        """Encode `xs`, then let the decoder generate `sample_size` ids from `start_id`."""
        enc_hs = self.layers['Encoder'].forward(xs)
        return self.layers['Decoder'].generate(enc_hs, start_id, sample_size)

    def loss(self, xs, ts):
        """Return the loss of predicting ``ts[:, 1:]`` from `xs` and ``ts[:, :-1]``."""
        # The decoder input drops the last column; the target drops the first.
        decoder_xs = ts[:, :-1]
        decoder_ts = ts[:, 1:]
        enc_hs = self.layers['Encoder'].forward(xs)
        score = self.layers['Decoder'].forward(decoder_xs, enc_hs)
        return self.lastLayer.forward(score, decoder_ts)

In [None]:
def load_data(file_path=None):
    """Load a word-id corpus from a text file, or fall back to the PTB training set.

    Args:
        file_path: path of a plain-text file. When it is None or does not
            exist, ``ptb.load_data('train')`` is returned instead.

    Returns:
        A ``(corpus, word_to_id, id_to_word)`` tuple. For a text file, `corpus`
        is an array of word ids in reading order, and ids are assigned in order
        of first appearance.
    """
    if file_path is None or not os.path.exists(file_path):
        return ptb.load_data('train')

    # These punctuation characters are removed before the text is lower-cased
    # and split on whitespace.
    strip_table = str.maketrans({',':'','.':'','-':'',"'":'',':':'',';':'','!':'','?':''})
    # Context manager so the file handle is closed deterministically.
    with open(file_path) as text_file:
        text = text_file.read()
    words = text.translate(strip_table).lower().strip().split()

    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            tmp_id = len(word_to_id)
            word_to_id[word] = tmp_id
            id_to_word[tmp_id] = word

    corpus = np.array([word_to_id[w] for w in words])

    return corpus, word_to_id, id_to_word

def make_corpus(str):
    """Tokenise a string into word ids.

    The text is lower-cased, stripped and split on whitespace; every distinct
    word receives the next free id the first time it appears.

    Returns:
        ``(corpus, word_to_id, id_to_word)`` where `corpus` is a plain list of
        ids in reading order.
    """
    word_to_id, id_to_word = {}, {}
    corpus = []
    for token in str.lower().strip().split():
        # setdefault assigns len(word_to_id) only when the token is new.
        token_id = word_to_id.setdefault(token, len(word_to_id))
        id_to_word.setdefault(token_id, token)
        corpus.append(token_id)
    return corpus, word_to_id, id_to_word

In [None]:
def cos_similarity(x, y, eps=1e-8):
    """Return the cosine similarity of the vectors `x` and `y`.

    Args:
        x, y: 1-D arrays of equal length.
        eps: lower bound applied to each vector norm. It only takes effect for
            (near-)zero vectors, where the plain formula divides by zero and
            yields NaN; a zero vector now gives a similarity of 0.

    Returns:
        The dot product of the two normalised vectors.
    """
    # Clamping (rather than adding eps) leaves ordinary vectors bit-identical.
    x = x / np.maximum(np.sqrt(np.sum(x ** 2)), eps)
    y = y / np.maximum(np.sqrt(np.sum(y ** 2)), eps)
    return np.dot(x, y)

def most_similar(word_to_id, id_to_word, word_matrix, queries=['you', 'say', 'good'], top=5):
    """Print, for every query word, the words with the highest cosine similarity.

    Args:
        word_to_id: mapping word -> row index of `word_matrix`.
        id_to_word: inverse mapping.
        word_matrix: one vector per word id.
        queries: words to look up; unknown words are reported and skipped.
            The default list is only read, never modified.
        top: number of neighbours printed per query (the query itself is skipped).
    """
    for query in queries:
        if query not in word_to_id:
            print(f'{query} is not found.')
            continue

        print(f'\n{query}')
        query_vec = word_matrix[word_to_id[query]]

        # Similarity of every vocabulary row against the query vector.
        vocab_size = len(word_matrix)
        similarity = np.zeros(vocab_size)
        for row in range(vocab_size):
            similarity[row] = cos_similarity(word_matrix[row], query_vec)

        # Negating before argsort walks the ids from most to least similar.
        shown = 0
        for ranked in (-1 * similarity).argsort():
            word_id = ranked.item()
            candidate = id_to_word[word_id]
            if candidate == query:
                continue
            print(f' {candidate}: {similarity[word_id]}')
            shown += 1
            if shown >= top:
                break

def generate_seq(network, start_id, sample_size=100):
    """Sample a sequence of word ids from `network`, starting with `start_id`.

    Each step feeds the previous id as a (1, 1) array to ``network.predict``,
    turns the flattened score into probabilities with `softmax`, and draws the
    next id from that distribution.

    Returns:
        A list of ids beginning with `start_id`; it stops growing once it
        holds `sample_size` entries.
    """
    generated = [start_id]

    token = start_id
    while len(generated) < sample_size:
        model_input = np.array(token).reshape(1, 1)
        probs = softmax(network.predict(model_input).flatten())
        # `token` stays a one-element array; it is reshaped again next step.
        token = np.random.choice(len(probs), size=1, p=probs)
        generated.append(token.item())

    return generated

def print_generate_seq(network, word_to_id, id_to_word, start_word='you', load_state=True):
    """Generate a word sequence from `start_word` and print it as text.

    The network's RNN state is reset before and after generation. Every
    '<eos>' token in the joined text is replaced by a full stop and a newline.
    `load_state` is accepted but not used by this function.
    """
    # Look the start word up first so an unknown word fails before any reset.
    first_id = word_to_id[start_word]
    network.reset_rnn_state()
    generated_ids = generate_seq(network, first_id)
    network.reset_rnn_state()
    words = [id_to_word[word_id] for word_id in generated_ids]
    print(' '.join(words).replace('<eos>', '.\n'))

In [None]:
def generate_question():
    """Create one random addition problem as fixed-width strings.

    Returns:
        ``(question, correct)``: `question` is ``"a+b"`` padded with spaces to
        7 characters, `correct` is ``"=<a+b>"`` padded to 6 characters, with
        both operands drawn uniformly from 0..999.
    """
    lhs = random.randint(0, 999)
    rhs = random.randint(0, 999)
    question = f'{lhs}+{rhs}'.ljust(7)
    correct = f'={lhs + rhs}'.ljust(5+1)
    return question, correct

def make_question_data(data_size, char_to_id):
    """Build `data_size` random addition problems encoded as id arrays.

    Args:
        data_size: number of problems to generate.
        char_to_id: mapping from every character used by `generate_question`
            to its integer id.

    Returns:
        ``(questions, corrects)``: two 2-D arrays of ids. Each question row is
        reversed along the character axis; the answers are left as generated.
    """
    questions, corrects = [], []
    for _ in range(data_size):
        question_text, answer_text = generate_question()
        questions.append([char_to_id[ch] for ch in question_text])
        corrects.append([char_to_id[ch] for ch in answer_text])
    question_ids = np.array(questions)
    answer_ids = np.array(corrects)
    # Only the question side is flipped along the character axis.
    return question_ids[:, ::-1], answer_ids

In [None]:
def generate_date_question(max_daydelta=10000):
    """Create one random date-format conversion problem.

    A date within `max_daydelta` days of today (either direction) is drawn.

    Returns:
        ``(question, correct)``: `question` is the date formatted as
        ``'%Y %B %d %A'`` and padded to 29 characters; `correct` is ``'='``
        followed by the ``'%Y-%m-%d'`` form, padded to 12 characters.
    """
    shift = random.randint(-max_daydelta, max_daydelta)
    target = datetime.date.today() + datetime.timedelta(days=shift)
    question = target.strftime('%Y %B %d %A').ljust(29)
    # '=' plus the 10-character date plus one trailing pad character.
    correct = ('=' + target.strftime('%Y-%m-%d')).ljust(1+10+1)
    return question, correct

def make_date_questions(question_num):
    """Build `question_num` date-conversion problems plus their vocabulary.

    Vocabulary ids: the digits '0'..'9' get 0..9; the letters follow,
    interleaved as a, A, b, B, ... from id 10; '=', '-' and ' ' take the last
    three ids (62, 63, 64).

    Returns:
        ``(questions, corrects, char_to_id, id_to_char)``. `questions` and
        `corrects` are 2-D id arrays; each question row is reversed along the
        character axis.
    """
    # Symbols listed in id order; enumerate() then hands out the ids.
    symbols = [f'{number}' for number in range(10)]
    for alphabet in range(26):
        symbols.append(chr(ord("a") + alphabet))
        symbols.append(chr(ord("A") + alphabet))
    symbols.extend(['=', '-', ' '])
    char_to_id = {symbol: index for index, symbol in enumerate(symbols)}

    questions, corrects = [], []
    for _ in range(question_num):
        question_text, answer_text = generate_date_question()
        questions.append([char_to_id[ch] for ch in question_text])
        corrects.append([char_to_id[ch] for ch in answer_text])
    question_ids = np.array(questions)
    answer_ids = np.array(corrects)

    id_to_char = {index: symbol for symbol, index in char_to_id.items()}

    return question_ids[:, ::-1], answer_ids, char_to_id, id_to_char

In [None]:
class Program1:
    """Language-model experiment: trains `Network` on a word-id corpus.

    Attributes:
        params: ordered hyper-parameters. `corpus_size` is clamped to the
            loaded corpus and `vocab_size` is added when the program runs.
        losses: the loss of every training iteration, in order.
        ppl_list: the perplexity of every finished epoch.
    """

    def __init__(self):
        self.params = OrderedDict({'batch_size': 64, 'corpus_size': 202646,
                                  'wordvec_size': 512, 'hidden_size': 512,
                                  'time_size': 32, 'max_epoch': 100})
        self.losses = []
        self.ppl_list = []
        
    def __call__(self):
        """Load the corpus, train for `max_epoch` epochs and save the network.

        Before and after training a sample text and the nearest neighbours of
        the default query words are printed. Whenever an epoch (from the second
        one on) reaches a perplexity no worse than all previous epochs, the
        network is also saved to 'network.state.ppl_min.pkl'.
        """
        ## for comet
        #experiment = Experiment()
        #experiment.log_parameters(self.params)
        
        params = self.params
        corpus, word_to_id, id_to_word = load_data('dataset/tinyshakespare.txt')
        #corpus, word_to_id, id_to_word = load_data()
        print(f'max_corpus_size: {len(corpus)}')
        corpus = corpus[:params['corpus_size']]
        params['corpus_size'] = len(corpus)
        # Array-level reduction: the builtin max() would iterate the corpus
        # element by element (and sync with the device per element on CuPy).
        params['vocab_size'] = int(corpus.max() + 1)
        
        self.corpus = corpus
        self.word_to_id = word_to_id
        self.id_to_word = id_to_word
        
        # Next-word prediction: the target is the input shifted by one position.
        xs = corpus[:-1]
        ts = corpus[1:]
        data_size = len(xs)
        
        for key, value in params.items():
            print(f'{key}: {value}')
        # Look the values up by key instead of relying on the dict's insertion order.
        batch_size = params['batch_size']
        corpus_size = params['corpus_size']
        wordvec_size = params['wordvec_size']
        hidden_size = params['hidden_size']
        time_size = params['time_size']
        max_epoch = params['max_epoch']
        vocab_size = params['vocab_size']

        self.net = Network(vocab_size, wordvec_size, hidden_size)
        self.net.load()
        
        print()
        print_generate_seq(self.net, word_to_id, id_to_word)
        print()
        most_similar(word_to_id, id_to_word, self.net.layers['Embedding'].params[0])
        print()
        
        max_iters = data_size // (batch_size * time_size)
        time_idx = 0
        total_loss = 0
        loss_count = 0
        
        # Every batch row reads the corpus from its own, evenly spaced offset.
        jump = (corpus_size - 1) // batch_size
        offsets = [i * jump for i in range(batch_size)]
        
        for epoch in range(max_epoch):
            for ite in tqdm(range(max_iters)):
                batch_x = np.empty((batch_size, time_size), dtype='i')
                batch_t = np.empty((batch_size, time_size), dtype='i')
        
                for t in range(time_size):
                    for i, offset in enumerate(offsets):
                        # Wrap around so reads past the end restart at the beginning.
                        batch_x[i, t] = xs[(offset + time_idx) % data_size]
                        batch_t[i, t] = ts[(offset + time_idx) % data_size]
                    time_idx += 1
                
                loss = self.net.train(batch_x, batch_t)
                self.losses.append(loss.tolist())
                total_loss += loss
                loss_count += 1
                
                ## for comet
                #experiment.log_metric('loss', loss, step=epoch*max_iters-(max_iters-ite))
            
            # Perplexity = exp(mean loss over the epoch).
            ppl = float(np.exp(total_loss / loss_count))
            print(f'| epoch {epoch+1} | total_loss {total_loss} | perplexity {ppl}')
            self.ppl_list.append(ppl)
            if len(self.ppl_list) > 1 and min(self.ppl_list[:-1]) >= ppl:
                self.net.save('network.state.ppl_min.pkl')
            
            # Each epoch restarts from the beginning of the corpus.
            time_idx, total_loss, loss_count = 0, 0, 0
            
            ## for comet
            #experiment.log_metric('perplexity', ppl, step=epoch*max_iters)
        
        print()
        print_generate_seq(self.net, word_to_id, id_to_word)
        print()
        most_similar(word_to_id, id_to_word, self.net.layers['Embedding'].params[0])
        
        self.net.save()

        ## for comet
        #experiment.end()

In [None]:
class Program2:
    """Addition experiment: trains `Seq2SeqNetwork` to map "a+b" to "=<sum>".

    Attributes:
        char_to_id / id_to_char: the fixed 13-symbol vocabulary (digits, '+',
            '=', space) and its inverse.
        params: ordered hyper-parameters. A `sample_size` of -1 makes every
            epoch draw one fresh batch instead of reusing a fixed data set.
        losses: the loss of every training iteration, in order.
    """

    def __init__(self):
        self.char_to_id = {'0':0, '1':1, '2':2, '3':3, '4':4, '5':5, '6':6, '7':7, '8':8, '9':9, '+':10, '=':11, ' ':12}
        self.id_to_char = dict(zip(self.char_to_id.values(), self.char_to_id.keys()))
        self.params = OrderedDict({'vocab_size': len(self.char_to_id), 'wordvec_size': 16, 'hidden_size': 256,
                                   'max_grad': 5.0, 'max_epoch': 100, 'sample_size': 50000, 'batch_size': 128})
        self.losses = []
        
    def fit(self, x, t, batch_size):
        """Train for one pass over `x`/`t` in consecutive mini-batches.

        A trailing partial batch is dropped. Every iteration's loss is appended
        to `self.losses`.

        Returns:
            The loss of the last mini-batch, or None when the data holds fewer
            than `batch_size` samples and no iteration ran.
        """
        data_size = len(x)
        max_iters = data_size // batch_size
        
        # Defined up front so an empty pass returns None instead of raising
        # UnboundLocalError.
        loss = None
        for iters in tqdm(range(max_iters)):
            batch_x = x[iters*batch_size:(iters+1)*batch_size]
            batch_t = t[iters*batch_size:(iters+1)*batch_size]

            loss = self.net.train(batch_x, batch_t)
            self.losses.append(loss.tolist())
            
        return loss
        
    def __call__(self):
        """Train for `max_epoch` epochs, spot-check three problems per epoch, then save and examine."""
        ## for comet
        #experiment = Experiment()
        #experiment.log_parameters(self.params)
        
        for key, value in self.params.items():
            print(f'{key}: {value}')
        vocab_size, wordvec_size, hidden_size, max_grad, max_epoch, sample_size, batch_size = self.params.values()

        self.net = Seq2SeqNetwork(vocab_size, wordvec_size, hidden_size, max_grad)
        self.net.load()
        
        if sample_size != -1:
            questions, corrects = make_question_data(sample_size, self.char_to_id)
            
        for epoch in range(max_epoch):
            if sample_size == -1:
                # No fixed data set: draw exactly one new batch per epoch.
                questions, corrects = make_question_data(batch_size, self.char_to_id)
            loss = self.fit(questions, corrects, batch_size)
            
            correct_num = 0
            test_num = 3
            for _ in range(test_num):
                test_q, test_c = make_question_data(1, self.char_to_id)
                answer = self.net.generate(test_q, start_id=self.char_to_id['='], sample_size=4)
                # Questions are stored reversed; flip back for display.
                question = ''.join([self.id_to_char[x] for x in to_cpu(test_q[0, ::-1])])
                # Drop the leading '=' and the last padding character.
                correct = ''.join([self.id_to_char[x] for x in to_cpu(test_c[0, 1:-1])])
                answer = ''.join([self.id_to_char[x] for x in to_cpu(answer)])
                is_correct = answer == correct
                if is_correct:
                    correct_num += 1
                print(f'{"○" if is_correct else "×"} {question} = {answer}[{correct}]')
            print(f'[{epoch}] Corrects {correct_num} / {test_num}\tloss: {loss}')
            
            ## for comet
            #experiment.log_metric('loss', loss, step=epoch)
        
        self.net.save()
        self.exam()

        ## for comet
        #experiment.end()
    
    def exam(self, question_num=100):
        """Answer `question_num` fresh problems and print each result plus the accuracy."""
        correct_num = 0
        questions, corrects = make_question_data(question_num, self.char_to_id)
        for i in range(question_num):
            answer = self.net.generate(np.expand_dims(questions[i], axis=0), start_id=self.char_to_id['='], sample_size=4)
            question = ''.join([self.id_to_char[x] for x in to_cpu(questions[i, ::-1])])
            correct = ''.join([self.id_to_char[x] for x in to_cpu(corrects[i, 1:-1])])
            answer = ''.join([self.id_to_char[x] for x in to_cpu(answer)])
            is_correct = answer == correct
            if is_correct:
                correct_num += 1
            print(f'{"○" if is_correct else "×"} {question} = {answer}[{correct}]')
        print(f'Score: {correct_num} / {question_num}\tAccuracy: {(correct_num / question_num)}')

In [None]:
class Program3:
    """Date-format experiment: trains `AttentionSeq2SeqNetwork` on generated dates.

    Attributes:
        questions / corrects: the training set produced by `make_date_questions`.
        char_to_id / id_to_char: the vocabulary returned with the training set.
        params: ordered hyper-parameters.
        net: the network under training.
        losses: the loss of every training iteration, in order.
    """

    def __init__(self):
        sample_size = 50000
        self.questions, self.corrects, self.char_to_id, self.id_to_char = make_date_questions(sample_size)
        self.params = OrderedDict({'vocab_size': len(self.char_to_id), 'wordvec_size': 16, 'hidden_size': 256,
                                   'max_grad': 5.0, 'max_epoch': 80, 'sample_size': sample_size, 'batch_size': 128})
        self.net = AttentionSeq2SeqNetwork(self.params['vocab_size'], self.params['wordvec_size'], self.params['hidden_size'], self.params['max_grad'])
        self.losses = []
        
    def fit(self, x, t, batch_size):
        """Train for one pass over `x`/`t` in consecutive mini-batches.

        A trailing partial batch is dropped. Every iteration's loss is appended
        to `self.losses`.

        Returns:
            The loss of the last mini-batch, or None when the data holds fewer
            than `batch_size` samples and no iteration ran.
        """
        data_size = len(x)
        max_iters = data_size // batch_size
        
        # Defined up front so an empty pass returns None instead of raising
        # UnboundLocalError.
        loss = None
        for iters in tqdm(range(max_iters)):
            batch_x = x[iters*batch_size:(iters+1)*batch_size]
            batch_t = t[iters*batch_size:(iters+1)*batch_size]

            loss = self.net.train(batch_x, batch_t)
            self.losses.append(loss.tolist())
            
        return loss
    
    def visualize(self, attention_map, row_labels, column_labels):
        """Render `attention_map` as a grey-scale grid and save it to 'figure.png'.

        Args:
            attention_map: 2-D array of weights, drawn with a 0..1 colour range.
            row_labels: tick labels placed along the x axis.
            column_labels: tick labels placed along the y axis.
        """
        # matplotlib needs host memory, so convert explicitly and build the
        # tick positions with numpy instead of the switchable global `np`.
        attention_map = to_cpu(attention_map)

        fig, ax = plt.subplots()
        ax.pcolor(attention_map, cmap=plt.cm.Greys_r, vmin=0.0, vmax=1.0)

        ax.patch.set_facecolor('black')
        # +0.5 centres each tick on its cell.
        ax.set_yticks(numpy.arange(attention_map.shape[0])+0.5, minor=False)
        ax.set_xticks(numpy.arange(attention_map.shape[1])+0.5, minor=False)
        ax.invert_yaxis()
        ax.set_xticklabels(row_labels, minor=False)
        ax.set_yticklabels(column_labels, minor=False)

        plt.savefig('figure.png')
        plt.clf()
        plt.close()
        print('Saved: figure.png')
    
    def save_figure(self):
        """Save the attention map of the first correctly answered training question.

        The current network is written to a temporary file and loaded into a
        second Program3 that is built while the module-level `np` points at
        numpy, so the search and the plotting run with host arrays. The
        previous `np`/`device` globals are restored and the temporary file is
        removed even when an error occurs on the way.

        If no question is answered correctly, no figure is written.
        """
        global np, device
        device_changed = False
        
        temp_file_name = 'temp.state.pkl'
        self.net.save(temp_file_name)
        
        try:
            if device == 'gpu':
                print('Change to cpu.')
                device = 'cpu'
                np = numpy
                device_changed = True
            
            _program = Program3()
            _program.net.load(temp_file_name)
            
            for i in tqdm(range(len(_program.questions))):
                # Indexing with [[i]] keeps the leading batch axis of size 1.
                question = to_cpu(_program.questions[[i]])
                correct = to_cpu(_program.corrects[[i]])
                answer = _program.net.generate(question, start_id=self.char_to_id['='], sample_size=10)
                # NOTE(review): presumably run so the attention layer records the
                # weights of this question/answer pair - confirm in TimeAttention.
                _program.net.loss(question, correct)
                question = [_program.id_to_char[x] for x in question[0, ::-1]]
                correct = [_program.id_to_char[x] for x in correct[0, 1:-1]]
                answer = [_program.id_to_char[x] for x in answer]
                if answer != correct:
                    continue
                
                attention_map = np.array(_program.net.layers['Decoder'].layers['Attention'].attention_weights)
                attention_map = attention_map[:, 0, :].reshape(attention_map.shape[0], attention_map.shape[2])
                # Drop the last step and undo the question reversal along the columns.
                attention_map = attention_map[:-1, ::-1]
                _program.visualize(attention_map, question, correct)
                break
        finally:
            # Always undo the global switch and clean up, even after a failure.
            if device_changed:
                print('Change to gpu.')
                device = 'gpu'
                np = cupy
            
            if os.path.exists(temp_file_name):
                os.remove(temp_file_name)
        
    def __call__(self):
        """Train for `max_epoch` epochs, spot-check three dates per epoch, then save, examine and plot."""
        ## for comet
        #experiment = Experiment()
        #experiment.log_parameters(self.params)
        
        for key, value in self.params.items():
            print(f'{key}: {value}')
        vocab_size, wordvec_size, hidden_size, max_grad, max_epoch, sample_size, batch_size = self.params.values()

        self.net.load()
        
        for epoch in range(max_epoch):
            loss = self.fit(self.questions, self.corrects, batch_size)
            
            correct_num = 0
            test_num = 3
            for _ in range(test_num):
                questions, corrects = make_date_questions(1)[:2]
                answer = self.net.generate(questions, start_id=self.char_to_id['='], sample_size=10)
                # Questions are stored reversed; flip back for display.
                question = ''.join([self.id_to_char[x] for x in to_cpu(questions[0, ::-1])])
                # Drop the leading '=' and the last padding character.
                correct = ''.join([self.id_to_char[x] for x in to_cpu(corrects[0, 1:-1])])
                answer = ''.join([self.id_to_char[x] for x in to_cpu(answer)])
                is_correct = answer == correct
                if is_correct:
                    correct_num += 1
                print(f'{"○" if is_correct else "×"} {question} = {answer} [{correct}]')
            print(f'[{epoch}] Corrects {correct_num} / {test_num}\tloss: {loss}')
            
            ## for comet
            #experiment.log_metric('loss', loss, step=epoch)
        
        self.net.save()
        self.exam()
        self.save_figure()

        ## for comet
        #experiment.end()
    
    def exam(self, question_num=100):
        """Answer `question_num` fresh date questions and print each result plus the accuracy."""
        correct_num = 0
        for _ in range(question_num):
            questions, corrects = make_date_questions(1)[:2]
            answer = self.net.generate(questions, start_id=self.char_to_id['='], sample_size=10)
            question = ''.join([self.id_to_char[x] for x in to_cpu(questions[0, ::-1])])
            correct = ''.join([self.id_to_char[x] for x in to_cpu(corrects[0, 1:-1])])
            answer = ''.join([self.id_to_char[x] for x in to_cpu(answer)])
            is_correct = answer == correct
            if is_correct:
                correct_num += 1
            print(f'{"○" if is_correct else "×"} {question} = {answer} [{correct}]')
        print(f'Score: {correct_num} / {question_num}\tAccuracy: {(correct_num / question_num)}')

In [None]:
# Build the Program1 experiment; it only runs when this file is the main module.
# `program` stays bound at module level because the plotting cell that follows
# reads `program.losses`.
program = Program1()
if __name__ == '__main__':
    program()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Plot the per-iteration training loss recorded in `program.losses`.
print('loss')
plt.plot(program.losses, label='loss')
# legend() must be called; the bare attribute `plt.legend` does nothing,
# so the 'loss' label was never drawn.
plt.legend()
plt.show()

In [None]:
# Build the Program2 experiment; it only runs when this file is the main module.
# Rebinding `program` replaces the previous experiment object, and the plotting
# cell that follows reads `program.losses` from this one.
program = Program2()
if __name__ == '__main__':
    program()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Plot the per-iteration training loss recorded in `program.losses`.
print('loss')
plt.plot(program.losses, label='loss')
# legend() must be called; the bare attribute `plt.legend` does nothing,
# so the 'loss' label was never drawn.
plt.legend()
plt.show()

In [None]:
# Build the Program3 experiment; it only runs when this file is the main module.
# Note that the constructor itself already generates the training questions and
# builds the network, outside the __main__ guard.
program = Program3()
if __name__ == '__main__':
    program()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from PIL import Image

# Loss curve of the Program3 run.
plt.plot(program.losses, label='loss')
plt.legend()
plt.show()

# figure.png is written by Program3.visualize, which save_figure() only calls
# once it finds a correctly answered question - so the file may not exist.
figure_path = 'figure.png'
if os.path.exists(figure_path):
    # The context manager closes the image file once it has been drawn.
    with Image.open(figure_path) as img:
        fig = plt.figure(dpi=200)
        ax = fig.add_subplot(1, 1, 1) # (row, col, num)
        ax.set_xticks([])
        ax.set_yticks([])
        plt.imshow(img)
        plt.show()
else:
    print(f'{figure_path} not found; no attention map to display.')