In [1]:
import numpy as np

class Linear:
    """Fully connected (affine) layer computing ``mat @ weights + bias``.

    Attributes:
        weights: array of shape (input_node, output_node), drawn uniformly
            from [0, 0.01).
        bias: array of shape (1, output_node), initialised to zeros.
        grad_weights, grad_bias: gradients stored by the latest ``backward``.
        mat: input cached by the latest ``feedforward``.
        grad_squared_w, grad_squared_b: accumulator slots that start as None
            and are filled in by the Adagrad optimizer.
    """

    def __init__(self, input_node, output_node):
        self.weights = np.random.rand(input_node, output_node) * 0.01
        self.bias = np.zeros((1, output_node))
        self.grad_weights = None
        self.grad_bias = None
        self.mat = None

        # Adagrad state slots (allocated by the optimizer on first use)
        self.grad_squared_w = None
        self.grad_squared_b = None

    def feedforward(self, mat):
        """Cache ``mat`` for the backward pass and return ``mat @ weights + bias``."""
        self.mat = mat
        return np.dot(mat, self.weights) + self.bias

    def backward(self, grad_output, clip_value=1.0):
        """Store parameter gradients and return the gradient w.r.t. the input.

        Args:
            grad_output: gradient of the loss w.r.t. this layer's output.
            clip_value: maximum norm (``np.linalg.norm``) allowed for the
                weight gradient; a larger gradient is rescaled to this norm.
                Pass ``None`` to disable clipping. The bias gradient is never
                clipped.

        Returns:
            ``grad_output @ weights.T``, the gradient w.r.t. the cached input.
        """
        self.grad_weights = np.dot(self.mat.T, grad_output)
        self.grad_bias = np.sum(grad_output, axis=0)

        if clip_value is not None:
            grad_norm = np.linalg.norm(self.grad_weights)
            if grad_norm > clip_value:
                self.grad_weights *= (clip_value / grad_norm)

        return np.dot(grad_output, self.weights.T)

    def update(self, optimizer):
        """Let ``optimizer`` apply the stored gradients to this layer."""
        optimizer.update(self)


In [2]:
class Momentum:
    """Gradient descent that steps along a running average of past gradients.

    The averages live on the layer itself as ``velocity_w`` and ``velocity_b``
    and are created the first time a layer is seen without ``velocity_w``.
    """

    def __init__(self, lr=0.01, beta=0.9):
        self.lr, self.beta = lr, beta

    def update(self, layer):
        """Refresh the layer's velocities, then move its weights and bias."""
        needs_state = not hasattr(layer, 'velocity_w')
        if needs_state:
            layer.velocity_w = np.zeros_like(layer.weights)
            layer.velocity_b = np.zeros_like(layer.bias)

        # velocity <- beta * velocity + (1 - beta) * gradient
        keep = self.beta
        layer.velocity_w = keep * layer.velocity_w + (1 - keep) * layer.grad_weights
        layer.velocity_b = keep * layer.velocity_b + (1 - keep) * layer.grad_bias

        step_w = self.lr * layer.velocity_w
        step_b = self.lr * layer.velocity_b
        layer.weights -= step_w
        layer.bias -= step_b


In [3]:
class RMSProp:
    """Optimizer that divides each step by the root of a running mean of squared gradients.

    The running means are kept on the layer as ``rms_w`` and ``rms_b`` and are
    created the first time a layer is seen without ``rms_w``.
    """

    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8):
        self.lr, self.rho, self.epsilon = lr, rho, epsilon

    def update(self, layer):
        """Refresh the layer's squared-gradient averages and step its parameters."""
        if not hasattr(layer, 'rms_w'):
            layer.rms_w = np.zeros_like(layer.weights)
            layer.rms_b = np.zeros_like(layer.bias)

        grad_w = layer.grad_weights
        grad_b = layer.grad_bias
        decay = self.rho

        # rms <- rho * rms + (1 - rho) * gradient^2
        layer.rms_w = decay * layer.rms_w + (1 - decay) * (grad_w ** 2)
        layer.rms_b = decay * layer.rms_b + (1 - decay) * (grad_b ** 2)

        # epsilon keeps the denominator away from zero
        denom_w = np.sqrt(layer.rms_w) + self.epsilon
        denom_b = np.sqrt(layer.rms_b) + self.epsilon
        layer.weights -= self.lr * grad_w / denom_w
        layer.bias -= self.lr * grad_b / denom_b


In [4]:
class Adam:
    """Adam optimizer with bias-corrected first and second moment estimates.

    Per-layer state (``m_w``, ``v_w``, ``m_b``, ``v_b`` and the step counter
    ``t``) is stored on the layer and created the first time a layer is seen
    without ``m_w``.
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr, self.epsilon = lr, epsilon
        self.beta1, self.beta2 = beta1, beta2

    def update(self, layer):
        """Advance the layer's moment estimates one step and update its parameters."""
        if not hasattr(layer, 'm_w'):
            for slot in ('m_w', 'v_w'):
                setattr(layer, slot, np.zeros_like(layer.weights))
            for slot in ('m_b', 'v_b'):
                setattr(layer, slot, np.zeros_like(layer.bias))
            layer.t = 0

        layer.t += 1

        b1, b2 = self.beta1, self.beta2
        grad_w, grad_b = layer.grad_weights, layer.grad_bias

        # Running averages of the gradient (m) and the squared gradient (v).
        layer.m_w = b1 * layer.m_w + (1 - b1) * grad_w
        layer.v_w = b2 * layer.v_w + (1 - b2) * (grad_w ** 2)
        layer.m_b = b1 * layer.m_b + (1 - b1) * grad_b
        layer.v_b = b2 * layer.v_b + (1 - b2) * (grad_b ** 2)

        # Bias correction: the averages start at zero, so rescale by 1 - beta^t.
        m_scale = 1 - b1 ** layer.t
        v_scale = 1 - b2 ** layer.t
        m_hat_w = layer.m_w / m_scale
        v_hat_w = layer.v_w / v_scale
        m_hat_b = layer.m_b / m_scale
        v_hat_b = layer.v_b / v_scale

        layer.weights -= self.lr * m_hat_w / (np.sqrt(v_hat_w) + self.epsilon)
        layer.bias -= self.lr * m_hat_b / (np.sqrt(v_hat_b) + self.epsilon)


In [5]:
class SGD:
    """Plain gradient descent: parameter <- parameter - lr * gradient."""

    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, layer):
        """Subtract the scaled gradients from the layer's weights and bias."""
        step_w = self.lr * layer.grad_weights
        step_b = self.lr * layer.grad_bias
        layer.weights -= step_w
        layer.bias -= step_b

In [6]:
class Adagrad:
    """Adagrad optimizer: scales each step by the root of the summed squared gradients.

    The accumulators are stored on the layer as ``grad_squared_w`` and
    ``grad_squared_b``. They are created on first use, whether the layer
    declares them as ``None`` or does not declare them at all.
    """

    def __init__(self, lr=0.01, epsilon=1e-8):
        self.lr = lr
        self.epsilon = epsilon

    def update(self, layer):
        """Accumulate squared gradients on ``layer`` and update its parameters.

        Args:
            layer: object exposing ``weights``, ``bias``, ``grad_weights`` and
                ``grad_bias``.
        """
        # Lazily create per-layer state; getattr tolerates layers that never
        # declared the attributes.
        if getattr(layer, 'grad_squared_w', None) is None:
            layer.grad_squared_w = np.zeros_like(layer.weights)
        if getattr(layer, 'grad_squared_b', None) is None:
            layer.grad_squared_b = np.zeros_like(layer.bias)

        # Running sum of squared gradients (never decays).
        layer.grad_squared_w += layer.grad_weights ** 2
        layer.grad_squared_b += layer.grad_bias ** 2

        # Parameter step; epsilon keeps the denominator away from zero.
        layer.weights -= self.lr * layer.grad_weights / (np.sqrt(layer.grad_squared_w) + self.epsilon)
        layer.bias -= self.lr * layer.grad_bias / (np.sqrt(layer.grad_squared_b) + self.epsilon)


In [7]:
class MSE:
    """Mean squared error and its gradient with respect to the predictions."""

    @staticmethod
    def loss(y_true, y_pred):
        """Return the mean of the squared differences over every element."""
        return np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2)

    @staticmethod
    def gradient(y_true, y_pred):
        """Return d(loss)/d(y_pred), elementwise.

        ``loss`` averages over all elements, so the gradient is divided by the
        total element count rather than by the number of rows. For 1-D or
        single-column targets the two counts are equal.
        """
        diff = np.asarray(y_pred) - np.asarray(y_true)
        return 2 * diff / diff.size

In [None]:
class NeuralNetwork:
    """Sequential stack of layers trained by mini-batch gradient descent on MSE.

    Args:
        layers: ordered list of layer objects exposing ``feedforward`` and
            ``backward``; layers that also expose ``update`` get their
            parameters updated.
        optimizer: object passed to each layer's ``update`` method.
    """

    def __init__(self, layers, optimizer):
        self.layers = layers
        self.optimizer = optimizer

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        for layer in self.layers:
            x = layer.feedforward(x)
        return x

    def backward(self, y_true, y_pred):
        """Propagate the MSE gradient through the layers from last to first."""
        grad = MSE.gradient(y_true, y_pred)
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

    def update_weights(self):
        """Ask every layer that has an ``update`` method to apply the optimizer."""
        for layer in self.layers:
            if hasattr(layer, 'update'):
                layer.update(self.optimizer)

    def train(self, x_train, y_train, epochs=100, batch_size=32, shuffle=False, log_every=10):
        """Run mini-batch training and periodically print the full-data loss.

        Args:
            x_train: input array whose first axis indexes samples.
            y_train: targets with the same number of samples as ``x_train``.
            epochs: number of passes over the data.
            batch_size: samples per mini-batch; must be positive. The last
                batch of an epoch may be smaller.
            shuffle: when True, reshuffle the samples at the start of each epoch.
            log_every: print the loss on epochs divisible by this value;
                ``0`` or ``None`` disables printing.

        Raises:
            ValueError: if ``batch_size`` is not positive or the sample counts
                of ``x_train`` and ``y_train`` differ.
        """
        n_samples = x_train.shape[0]
        # A negative batch size would otherwise make the batch loop silently empty.
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        if len(y_train) != n_samples:
            raise ValueError(
                f"x_train has {n_samples} samples but y_train has {len(y_train)}"
            )

        for epoch in range(epochs):
            # Shuffle only when the shuffle option is enabled
            if shuffle:
                indices = np.arange(n_samples)
                np.random.shuffle(indices)
                x_train = x_train[indices]
                y_train = y_train[indices]

            for start_idx in range(0, n_samples, batch_size):
                end_idx = start_idx + batch_size
                x_batch = x_train[start_idx:end_idx]
                y_batch = y_train[start_idx:end_idx]

                # Forward
                y_pred = self.forward(x_batch)
                # Backward
                self.backward(y_batch, y_pred)
                # Update
                self.update_weights()

            # Evaluate the full-data loss only on epochs that are reported,
            # so unreported epochs skip the extra forward pass.
            if log_every and epoch % log_every == 0:
                loss = MSE.loss(y_train, self.forward(x_train))
                print(f"Epoch {epoch}, Loss: {loss:.4f}")

In [9]:
class ReLU:
    """Rectified linear activation: elementwise ``max(0, x)``."""

    def __init__(self):
        self.input = None

    def feedforward(self, x):
        """Cache ``x`` for the backward pass and return it with negatives zeroed."""
        self.input = x
        rectified = np.maximum(0, x)
        return rectified

    def backward(self, grad_output):
        """Pass the gradient through only where the cached input was positive."""
        positive_mask = self.input > 0
        return grad_output * positive_mask

In [12]:
# Prepare data: 500 random 2-feature samples whose targets follow the
# noise-free linear rule y = 2*x1 + 3*x2 + 1.
# NOTE(review): no random seed is set in the visible cells, so the data and
# the initial weights differ between runs — confirm whether that is intended.
x_train = np.random.rand(500, 2)
y_train = np.dot(x_train, np.array([[2.0], [3.0]])) + 1.0

# Build the model: 2 inputs -> 4 hidden units -> ReLU -> 1 output
model = NeuralNetwork(
    layers=[
        Linear(2, 4),
        ReLU(),
        Linear(4, 1)
    ],
    optimizer=Adam(lr=0.1)  # can also be swapped for SGD(lr=0.1)
)

# Train (with a batch size of 64)
model.train(x_train, y_train, epochs=100, batch_size=64)


Epoch 0, Loss: 1.8006
Epoch 10, Loss: 0.0006
Epoch 20, Loss: 0.0000
Epoch 30, Loss: 0.0000
Epoch 40, Loss: 0.0000
Epoch 50, Loss: 0.0000
Epoch 60, Loss: 0.0000
Epoch 70, Loss: 0.0000
Epoch 80, Loss: 0.0000
Epoch 90, Loss: 0.0000


In [28]:
from sklearn.datasets import load_diabetes
import numpy as np

# Load the data
diabetes = load_diabetes()
X = diabetes.data  # 442 samples, 10 features
y = diabetes.target.reshape(-1, 1)  # target values (continuous regression target), as a column

# Standardize each feature column to zero mean and unit standard deviation
X = (X - X.mean(axis=0)) / X.std(axis=0)
y = y / y.max()  # scale the targets too, so the largest value becomes 1

# NOTE(review): this rebinds `model`, replacing any model built in an earlier cell.
model = NeuralNetwork(
    layers=[
        Linear(10, 32),  # 10 features -> hidden layer with 32 nodes
        ReLU(),
        Linear(32, 1)    # single output
    ],
    optimizer=Adam(lr=0.01)
)

# Train
model.train(X, y, epochs=10000, batch_size=32)
# Final prediction and final loss on the training data
y_pred = model.forward(X)
final_loss = MSE.loss(y, y_pred)
print(f"최종 Loss: {final_loss:.4f}")


Epoch 0, Loss: 0.0596
Epoch 10, Loss: 0.0231
Epoch 20, Loss: 0.0219
Epoch 30, Loss: 0.0216
Epoch 40, Loss: 0.0206
Epoch 50, Loss: 0.0207
Epoch 60, Loss: 0.0200
Epoch 70, Loss: 0.0205
Epoch 80, Loss: 0.0198
Epoch 90, Loss: 0.0205
Epoch 100, Loss: 0.0201
Epoch 110, Loss: 0.0190
Epoch 120, Loss: 0.0228
Epoch 130, Loss: 0.0218
Epoch 140, Loss: 0.0186
Epoch 150, Loss: 0.0176
Epoch 160, Loss: 0.0174
Epoch 170, Loss: 0.0164
Epoch 180, Loss: 0.0164
Epoch 190, Loss: 0.0159
Epoch 200, Loss: 0.0154
Epoch 210, Loss: 0.0155
Epoch 220, Loss: 0.0153
Epoch 230, Loss: 0.0144
Epoch 240, Loss: 0.0151
Epoch 250, Loss: 0.0143
Epoch 260, Loss: 0.0148
Epoch 270, Loss: 0.0148
Epoch 280, Loss: 0.0148
Epoch 290, Loss: 0.0144
Epoch 300, Loss: 0.0145
Epoch 310, Loss: 0.0137
Epoch 320, Loss: 0.0128
Epoch 330, Loss: 0.0149
Epoch 340, Loss: 0.0139
Epoch 350, Loss: 0.0127
Epoch 360, Loss: 0.0143
Epoch 370, Loss: 0.0130
Epoch 380, Loss: 0.0124
Epoch 390, Loss: 0.0119
Epoch 400, Loss: 0.0124
Epoch 410, Loss: 0.0126
Epo