In [1]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

n_in = 64
n_mid = 32
n_latent = 2
n_out = 64
eta = 0.001
batch_size = 32
epochs = 50

In [2]:
digits = load_digits()
x = digits.data / 16.0
t = digits.target

x_train, x_test = train_test_split(x, test_size=0.2, random_state=42)

In [3]:
class MiddleLayer:
    def __init__(self, n_in, n_out):
        self.w = np.random.randn(n_in, n_out) * np.sqrt(2 / n_in)
        self.b = np.zeros(n_out)
    def forward(self, x):
        self.x = x
        self.u = np.dot(x, self.w) + self.b
        self.y = np.maximum(0, self.u)
        return self.y
    def backward(self, grad_y):
        delta = grad_y * (self.u > 0)
        self.grad_w = np.dot(self.x.T, delta)
        self.grad_b = np.sum(delta, axis=0)
        grad_x = np.dot(delta, self.w.T)
        return grad_x
    def update(self, eta):
        self.w -= eta * self.grad_w
        self.b -= eta * self.grad_b

In [4]:
class ParamsLayer:
    def __init__(self, n_in, n_out):
        self.w = np.random.randn(n_in, n_out) / np.sqrt(n_in)
        self.b = np.zeros(n_out)
    def forward(self, x):
        self.x = x
        self.u = np.dot(x, self.w) + self.b
        self.y = self.u
        return self.y
    def backward(self, grad_y):
        self.grad_w = np.dot(self.x.T, grad_y)
        self.grad_b = np.sum(grad_y, axis=0)
        grad_x = np.dot(grad_y, self.w.T)
        return grad_x
    def update(self, eta):
        self.w -= eta * self.grad_w
        self.b -= eta * self.grad_b

In [5]:
class LatentLayer:
    def forward(self, mu, log_var):
        self.mu = mu
        self.log_var = log_var
        self.epsilon = np.random.randn(*mu.shape)
        self.z = mu + np.exp(0.5 * log_var) * self.epsilon
        return self.z
    def backward(self, grad_z):
        grad_mu = grad_z
        grad_log_var = grad_z * self.epsilon * 0.5 * np.exp(0.5 * self.log_var)
        return grad_mu, grad_log_var

In [6]:
class OutputLayer:
    def __init__(self, n_in, n_out):
        self.w = np.random.randn(n_in, n_out) / np.sqrt(n_in)
        self.b = np.zeros(n_out)
    def forward(self, x):
        self.x = x
        self.u = np.dot(x, self.w) + self.b
        self.y = 1 / (1 + np.exp(-self.u))
        return self.y
    def backward(self, t):
        delta = (self.y - t)
        self.grad_w = np.dot(self.x.T, delta)
        self.grad_b = np.sum(delta, axis=0)
        grad_x = np.dot(delta, self.w.T)
        return grad_x
    def update(self, eta):
        self.w -= eta * self.grad_w
        self.b -= eta * self.grad_b

In [7]:
enc_mid = MiddleLayer(n_in, n_mid)
enc_mu = ParamsLayer(n_mid, n_latent)
enc_logvar = ParamsLayer(n_mid, n_latent)
latent_layer = LatentLayer()

dec_mid = MiddleLayer(n_latent, n_mid)
dec_out = OutputLayer(n_mid, n_out)

In [8]:
def forward(x):
    h = enc_mid.forward(x)
    mu = enc_mu.forward(h)
    log_var = enc_logvar.forward(h)
    z = latent_layer.forward(mu, log_var)
    h_dec = dec_mid.forward(z)
    y = dec_out.forward(h_dec)
    return y, mu, log_var, z

In [9]:
def vae_loss(y, x, mu, log_var):
    rec_loss = -np.sum(x * np.log(y + 1e-7) + (1 - x) * np.log(1 - y + 1e-7), axis=1)
    kl_loss = -0.5 * np.sum(1 + log_var - mu**2 - np.exp(log_var), axis=1)
    return np.mean(rec_loss + kl_loss), np.mean(rec_loss), np.mean(kl_loss)

In [10]:
def backward(x, y, mu, log_var, z):
    grad_y = dec_out.backward(x)
    grad_h_dec = dec_mid.backward(grad_y)

    grad_z = grad_h_dec
    grad_mu, grad_log_var = latent_layer.backward(grad_z)

    grad_h_mu = enc_mu.backward(grad_mu + (mu / x.shape[0]))

    grad_h_logvar = enc_logvar.backward(grad_log_var + 0.5 * (np.exp(log_var) - 1) / x.shape[0])
    grad_h = grad_h_mu + grad_h_logvar
    enc_mid.backward(grad_h)

In [11]:
def update():
    enc_mid.update(eta)
    enc_mu.update(eta)
    enc_logvar.update(eta)
    dec_mid.update(eta)
    dec_out.update(eta)

In [12]:
for epoch in range(epochs):
    idx = np.random.permutation(len(x_train))
    for i in range(0, len(x_train), batch_size):
        x_mb = x_train[idx[i:i+batch_size]]
        y, mu, log_var, z = forward(x_mb)
        loss, rec_loss, kl_loss = vae_loss(y, x_mb, mu, log_var)
        backward(x_mb, y, mu, log_var, z)
        update()

    # 평가 및 출력
    y_test, mu_test, log_var_test, z_test = forward(x_test)
    test_loss, test_rec_loss, test_kl_loss = vae_loss(y_test, x_test, mu_test, log_var_test)
    print(f'Epoch {epoch+1}, Test Loss: {test_loss:.4f}, Rec: {test_rec_loss:.4f}, KL: {test_kl_loss:.4f}')


Epoch 1, Test Loss: 33.4018, Rec: 26.9222, KL: 6.4796
Epoch 2, Test Loss: 32.6112, Rec: 25.9380, KL: 6.6732
Epoch 3, Test Loss: 31.9322, Rec: 25.6519, KL: 6.2803
Epoch 4, Test Loss: 32.0274, Rec: 25.4162, KL: 6.6112
Epoch 5, Test Loss: 32.3629, Rec: 25.3857, KL: 6.9772
Epoch 6, Test Loss: 30.9910, Rec: 25.3420, KL: 5.6490
Epoch 7, Test Loss: 32.4015, Rec: 25.0574, KL: 7.3441
Epoch 8, Test Loss: 31.0488, Rec: 24.8779, KL: 6.1709
Epoch 9, Test Loss: 31.9982, Rec: 24.8006, KL: 7.1976
Epoch 10, Test Loss: 30.8647, Rec: 24.8175, KL: 6.0471
Epoch 11, Test Loss: 31.2139, Rec: 24.4452, KL: 6.7687
Epoch 12, Test Loss: 32.0736, Rec: 24.4425, KL: 7.6311
Epoch 13, Test Loss: 30.8612, Rec: 24.1225, KL: 6.7387
Epoch 14, Test Loss: 30.6096, Rec: 24.2802, KL: 6.3294
Epoch 15, Test Loss: 30.5848, Rec: 24.0515, KL: 6.5333
Epoch 16, Test Loss: 31.1975, Rec: 23.9947, KL: 7.2028
Epoch 17, Test Loss: 30.2954, Rec: 23.8194, KL: 6.4760
Epoch 18, Test Loss: 30.6222, Rec: 23.9140, KL: 6.7082
Epoch 19, Test Loss