#### library import

In [1]:
import numpy as np
import tensorflow as tf # for dataset

#### 1. Linear & Relu

In [64]:
class Linear(object):
    def __init__(self, input_dim, output_dim, name = 'linear'):
        self.name = name
        self.input_dim = input_dim
        self.output_dim = output_dim
    
        self.weight = np.random.randn(input_dim, output_dim)
        self.weight_gradient = np.zeros_like(self.weight)
        self.b = np.random.randn(output_dim)
        self.db = np.zeros_like(self.b)
        
        self.v_w, self.v_b = 0, 0
        self.m, self.v = 0, 0
        
        self.x = None
        
    def __call__(self, x):
        self.x = x
        return self.forward(x)
        
    def forward(self, x):
        res = np.dot(x, self.weight) + self.b
        return res 
    
    def backward(self, dout):
        # batch  x feature_out dot feature_out x feature_in  >>> batch x feature_in
        dx = np.dot(dout, self.weight.T)
        
        # feature_in x batch dot batch x feature_out >>> feature_in x feature_out (weight 매트릭스와 같은 모양)
        self.weight_gradient = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        
        return dx
    
class ReLU(object):
    def __init__(self, name='relu'):
        self.name = name
        self.mask = None
        
    def __call__(self, x):
        return self.forward(x)
    
    def forward(self, x):
        self.mask = x < 0
        out = x.copy()
        out[self.mask] = 0
        return out
    
    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx

#### 2. Cross entropy loss

In [12]:
class CrossEntropyLoss(object):
    
    def __init__(self):
        self.target = None
        self.pred_prob = None
        self.batch_size = None
        
    def __call__(self, pred, y):
        return self.forward(pred, y)
    
    def softmax(self, pred):
        # avoid overflow
        z = pred - np.max(pred, axis=1).reshape(-1, 1)
        numerator = np.exp(z)
        denominator = np.sum(numerator, axis=1).reshape(-1, 1)
        softmax = numerator/denominator
        
        return softmax
    
    # batch size는 batchsize로 나눠서 한 값만 내보낸다.
    def forward(self, pred, y):
        self.target = y
        self.batch_size = len(y)
        
        res =[]
        # pred >>> 5, 10
        self.pred_prob = self.softmax(pred)
        predicted = self.pred_prob[np.arange(self.batch_size), y]
        loss = -np.sum(np.log(predicted + 1e-7)) / self.batch_size
        return loss
    
    def backward(self, dout=1):        
        dx = self.pred_prob.copy()
        dx[np.arange(self.batch_size), self.target] -= 1
        dx /= self.batch_size
        
        return dx

#### 3. Optimizer

##### 3-1. Root Mean Square Prop (RMSprop)

- Geoffrey Hinton이 제안
- average of squared gradients를 이용해서 업데이트!

In [4]:
class RMSprop(object):
    def __init__(self, layers, learning_rate = 1e-3):
        self.beta, self.eps = 0.9, 1e-8
        self.learning_rate = learning_rate
        self.layers = layers
        
    def zero_grad(self):
        for layer in self.layers:
            if layer.name in ['linear', 'tmp']:
                layer.weight_gradient = np.zeros_like(layer.weight)
                layer.db = np.zeros_like(layer.b)
                
    def backward(self, dout):
        self.layers.reverse()    # 뒤에서 부터 backward 하기 위해 순서 바꿔주기
        for layer in self.layers:
            dout = layer.backward(dout)
        self.layers.reverse()
    
    def step(self):
        for layer in self.layers:
            if layer.name in ['linear']:
                layer.v_w = self.beta * layer.v_w + (1-self.beta) * layer.weight_gradient**2
                layer.v_b = self.beta * layer.v_b + (1-self.beta) * layer.db**2
                
                layer.weight = layer.weight - (self.learning_rate / np.sqrt(layer.v_w + self.eps)) * layer.weight_gradient
                layer.b = layer.b - (self.learning_rate / np.sqrt(layer.v_b + self.eps)) * layer.db
            

##### 3-2. Stochastic Gradient Descent (SGD)

- Gradient Descent 업데이트가 예전엔 모든 샘플에 대해서 한번에 다 구해서 진행.
    - 해당 방법이 너무 느려...
- 따라서 batch에 맞춰서 Gradient를 구하고 업데이트 진행
    - 사실상 Gradient Descent라고 말하면 모든 사람이 batch에 따라 진행하므로 해당 방버이 Gradient Descent라고 생각해도 무방
   

In [46]:
class SGD(object):
    def __init__(self, layers, learning_rate = 1e-3, momentum=False):
        self.beta, self.eps = 0.9, 1e-8
        self.learning_rate = learning_rate
        self.layers = layers
        self.momentum = momentum
        
    def zero_grad(self):
        for layer in self.layers:
            if layer.name in ['linear', 'tmp']:
                layer.weight_gradient = np.zeros_like(layer.weight)
                layer.db = np.zeros_like(layer.b)
                
    def backward(self, dout):
        self.layers.reverse()    # 뒤에서 부터 backward 하기 위해 순서 바꿔주기
        for layer in self.layers:
            dout = layer.backward(dout)
        self.layers.reverse()
    
    def step(self):
        for layer in self.layers:
            if layer.name in ['linear', 'tmp']:
                
                if self.momentum:
                    # implementation for momentum but still need to be fixed
                    layer.v_w = (self.beta * layer.v_w) + (1 - self.beta) * layer.weight_gradient
                    layer.v_b = (self.beta * layer.v_b) +  (1 - self.beta) * layer.db
                    layer.weight = layer.weight - (self.learning_rate * layer.v_w)
                    layer.b = layer.b - (self.learning_rate * layer.v_b)
                
                else:
                # classic SGD
                    layer.weight = layer.weight - (self.learning_rate * layer.weight_gradient)
                    layer.b = layer.b - (self.learning_rate * layer.db)

##### 3-3. Adaptive moment estimation (Adam)

- RMSprop 와 SGD 알고리즘을 합친 것!

In [61]:
class Adam(object):
    def __init__(self, layers, learning_rate = 1e-3):
        self.beta1, self.beta2, self.eps = 0.9, 0.999, 1e-7
        self.learning_rate = learning_rate
        self.layers = layers
        
    def zero_grad(self):
        for layer in self.layers:
            if layer.name in ['linear', 'tmp']:
                layer.weight_gradient = np.zeros_like(layer.weight)
                layer.db = np.zeros_like(layer.b)
                
    def backward(self, dout):
        self.layers.reverse()    # 뒤에서 부터 backward 하기 위해 순서 바꿔주기
        for layer in self.layers:
            dout = layer.backward(dout)
        self.layers.reverse()
    
    def step(self):
        for layer in self.layers:
            if layer.name in ['linear', 'tmp']:
                layer.m += (1-self.beta1) * (layer.weight_gradient - layer.m)
                layer.v += (1-self.beta2) * (layer.weight_gradient**2 - layer.v)
                layer.weight -= self.learning_rate * layer.m / (np.sqrt(layer.v) + self.eps)
                

#### 4. Model class

In [6]:
class Sequential(object):
    def __init__(self, layers: list):
        self.layers = layers
    
    def __call__(self, x):
        feature = x.copy()
        for layer in self.layers:
            feature = layer(feature)
        
        return feature    # >>> logits

#### 5. Main

##### Dataset

In [7]:
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, y_train = x_train, y_train
x_test, y_test = x_test, y_test

#### RMSprop!

- 75 epoch
    - train: 95%
    - test: 90%

In [7]:
EPOCH = 100
batch_size=8
iteration = int(len(x_train) / batch_size)
test_iteration = int(len(x_test) / batch_size)

model = Sequential([Linear(784, 512),
          ReLU(), 
          Linear(512, 512),
          ReLU(),
          Linear(512, 256),
          ReLU(),
          Linear(256, 128),
          ReLU(),
          Linear(128, 10)
         ])

criterion = CrossEntropyLoss()
optimizer = RMSprop(model.layers, learning_rate=0.00001)

for epo in range(EPOCH):
    train_acc = []
    loss_list = []
    idx = 0
    
    for each in range(iteration):
        if each+1 == iteration:
            x = x_train[idx:].reshape(len(x_train)-idx, -1)
            target = y_train[idx:]
        else:
            x = x_train[idx: idx+batch_size].reshape(batch_size, -1)
            target = y_train[idx: idx+batch_size]
        output = model(x)

        loss = criterion(output, target)
        loss_list.append(loss)
        
        predicted = output.argmax(axis=1)
        train_acc.append((predicted == target).sum()/batch_size)

        dout = criterion.backward()
        optimizer.zero_grad()
        optimizer.backward(dout)
        optimizer.step()
        
        idx += batch_size
        
    print(f'{epo+1}th train acc: {np.mean(train_acc)*100}, loss: {np.mean(loss_list)}')
    
    idx = 0
    test_acc = []
    for each in range(test_iteration):
        if each+1 == test_iteration:
            x = x_test[idx:].reshape(len(x_test)-idx, -1)
            target = y_test[idx:]
        else:
            x = x_test[idx: idx+batch_size].reshape(batch_size, -1)
            target = y_test[idx: idx+batch_size]

        output = model(x)
        loss = criterion(output, target)
        predicted = output.argmax(axis=1)
        test_acc.append((predicted == target).sum()/batch_size)
        
        idx += batch_size
    print(f'{epo+1}th test acc: {np.mean(test_acc)*100}')
    print('='*15)

1th train acc: 31.72833333333333, loss: 11.00409250410843
1th test acc: 51.980000000000004
2th train acc: 59.56166666666667, loss: 6.517889186758365
2th test acc: 66.92
3th train acc: 69.395, loss: 4.932943104580797
3th test acc: 73.48
4th train acc: 74.42166666666667, loss: 4.12274015816596
4th test acc: 77.02
5th train acc: 77.31333333333333, loss: 3.656658556034081
5th test acc: 79.28
6th train acc: 79.49666666666667, loss: 3.304746798804825
6th test acc: 81.01
7th train acc: 81.08166666666666, loss: 3.0492749811521356
7th test acc: 82.07
8th train acc: 82.23333333333333, loss: 2.863648245086932
8th test acc: 82.89
9th train acc: 83.19166666666666, loss: 2.7091831608069152
9th test acc: 83.67
10th train acc: 84.045, loss: 2.571642077065404
10th test acc: 84.36
11th train acc: 84.70333333333333, loss: 2.4655312800382623
11th test acc: 84.87
12th train acc: 85.37, loss: 2.358077308365207
12th test acc: 85.35000000000001
13th train acc: 86.00833333333333, loss: 2.2551901304882565
13th 

KeyboardInterrupt: 

#### SGD 


In [52]:
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, y_train = x_train[:8000], y_train[:8000]
x_test, y_test = x_test[:2000], y_test[:2000]

In [60]:
np.warnings.filterwarnings('ignore')

EPOCH = 100
batch_size=8
iteration = int(len(x_train) / batch_size)
test_iteration = int(len(x_test) / batch_size)

model = Sequential([Linear(784, 512),
          ReLU(), 
          Linear(512, 256),
          ReLU(),
          #Linear(256, 10),
          #ReLU(),
          Linear(256, 256),
          ReLU(),
          Linear(256, 128),
          ReLU(),
          #Linear(128, 64),
          #ReLU(),
          Linear(128, 10)
         ])

criterion = CrossEntropyLoss()
optimizer = SGD(model.layers, learning_rate=0.00000001, momentum=False)

for epo in range(EPOCH):
    train_acc = []
    loss_list = []
    idx = 0
    
    for each in range(iteration):
        if each+1 == iteration:
            x = x_train[idx:].reshape(len(x_train)-idx, -1)
            target = y_train[idx:]
        else:
            x = x_train[idx: idx+batch_size].reshape(batch_size, -1)
            target = y_train[idx: idx+batch_size]
        output = model(x)
        
        loss = criterion(output, target)
        loss_list.append(loss)
        
        predicted = output.argmax(axis=1)
        train_acc.append((predicted == target).sum()/batch_size)

        dout = criterion.backward()
        optimizer.zero_grad()
        optimizer.backward(dout)
        optimizer.step()
        
        idx += batch_size
        
    print(f'{epo+1}th train acc: {np.mean(train_acc)*100}, loss: {np.mean(loss_list)}')
    
    idx = 0
    test_acc = []
    for each in range(test_iteration):
        if each+1 == test_iteration:
            x = x_test[idx:].reshape(len(x_test)-idx, -1)
            target = y_test[idx:]
        else:
            x = x_test[idx: idx+batch_size].reshape(batch_size, -1)
            target = y_test[idx: idx+batch_size]

        output = model(x)
        loss = criterion(output, target)
        predicted = output.argmax(axis=1)
        test_acc.append((predicted == target).sum()/batch_size)
        
        idx += batch_size
    print(f'{epo+1}th test acc: {np.mean(test_acc)*100}')
    print('='*15)

1th train acc: 59.925, loss: 6.459326772196549
1th test acc: 67.85
2th train acc: 78.4125, loss: 3.4794938202381314
2th test acc: 71.3
3th train acc: 83.28750000000001, loss: 2.6937366523789135
3th test acc: 73.85000000000001
4th train acc: 85.9375, loss: 2.2666071149785183
4th test acc: 75.7
5th train acc: 88.425, loss: 1.86566948317343
5th test acc: 76.75
6th train acc: 90.1625, loss: 1.585617569500529
6th test acc: 77.7
7th train acc: 91.1375, loss: 1.4284661359286857
7th test acc: 77.64999999999999
8th train acc: 92.0875, loss: 1.2753442262945816
8th test acc: 77.60000000000001
9th train acc: 93.325, loss: 1.0758827913764726
9th test acc: 78.4
10th train acc: 94.2625, loss: 0.9247756437112382
10th test acc: 78.14999999999999
11th train acc: 94.4875, loss: 0.8885099282715819
11th test acc: 78.35
12th train acc: 95.3, loss: 0.7575504002950456
12th test acc: 78.7
13th train acc: 95.9375, loss: 0.6547975398826863
13th test acc: 78.60000000000001
14th train acc: 96.0625, loss: 0.6346499

KeyboardInterrupt: 

#### Adam

In [None]:
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, y_train = x_train[:8000], y_train[:8000]
x_test, y_test = x_test[:2000], y_test[:2000]

In [67]:
np.warnings.filterwarnings('ignore')

EPOCH = 100
batch_size=8
iteration = int(len(x_train) / batch_size)
test_iteration = int(len(x_test) / batch_size)

model = Sequential([Linear(784, 512),
          ReLU(), 
          Linear(512, 256),
          ReLU(),
          #Linear(256, 10),
          #ReLU(),
          Linear(256, 256),
          ReLU(),
          Linear(256, 128),
          ReLU(),
          #Linear(128, 64),
          #ReLU(),
          Linear(128, 10)
         ])

criterion = CrossEntropyLoss()
optimizer = Adam(model.layers, learning_rate=0.00001)

for epo in range(EPOCH):
    train_acc = []
    loss_list = []
    idx = 0
    
    for each in range(iteration):
        if each+1 == iteration:
            x = x_train[idx:].reshape(len(x_train)-idx, -1)
            target = y_train[idx:]
        else:
            x = x_train[idx: idx+batch_size].reshape(batch_size, -1)
            target = y_train[idx: idx+batch_size]
        output = model(x)
        
        loss = criterion(output, target)
        loss_list.append(loss)
        
        predicted = output.argmax(axis=1)
        train_acc.append((predicted == target).sum()/batch_size)

        dout = criterion.backward()
        optimizer.zero_grad()
        optimizer.backward(dout)
        optimizer.step()
        
        idx += batch_size
        
    print(f'{epo+1}th train acc: {np.mean(train_acc)*100}, loss: {np.mean(loss_list)}')
    
    idx = 0
    test_acc = []
    for each in range(test_iteration):
        if each+1 == test_iteration:
            x = x_test[idx:].reshape(len(x_test)-idx, -1)
            target = y_test[idx:]
        else:
            x = x_test[idx: idx+batch_size].reshape(batch_size, -1)
            target = y_test[idx: idx+batch_size]

        output = model(x)
        loss = criterion(output, target)
        predicted = output.argmax(axis=1)
        test_acc.append((predicted == target).sum()/batch_size)
        
        idx += batch_size
    print(f'{epo+1}th test acc: {np.mean(test_acc)*100}')
    print('='*15)

1th train acc: 12.35, loss: 14.127510825714968
1th test acc: 15.9
2th train acc: 19.400000000000002, loss: 12.991185075272405
2th test acc: 19.3
3th train acc: 23.825, loss: 12.2779593382925
3th test acc: 22.45
4th train acc: 28.6375, loss: 11.502275980277632
4th test acc: 25.75
5th train acc: 33.7375, loss: 10.680253096978758
5th test acc: 29.549999999999997
6th train acc: 38.5125, loss: 9.910614024870497
6th test acc: 33.0
7th train acc: 42.6125, loss: 9.24977209908121
7th test acc: 36.1
8th train acc: 46.5375, loss: 8.617136840856094
8th test acc: 38.7
9th train acc: 50.125, loss: 8.038900155790463
9th test acc: 40.699999999999996
10th train acc: 53.300000000000004, loss: 7.527150615697537
10th test acc: 43.6
11th train acc: 55.8875, loss: 7.110094888141491
11th test acc: 46.25
12th train acc: 58.099999999999994, loss: 6.753482019651538
12th test acc: 48.6
13th train acc: 59.95, loss: 6.45529724825881
13th test acc: 50.64999999999999
14th train acc: 62.3625, loss: 6.066448188266941


92th test acc: 75.3
93th train acc: 95.3875, loss: 0.7434470665129571
93th test acc: 75.44999999999999
94th train acc: 95.3625, loss: 0.7474765904506967
94th test acc: 75.44999999999999
95th train acc: 95.55, loss: 0.7172551609176498
95th test acc: 75.44999999999999
96th train acc: 95.8875, loss: 0.6628565877581655
96th test acc: 75.4
97th train acc: 95.7, loss: 0.6930780172912124
97th test acc: 75.4
98th train acc: 96.0, loss: 0.6447237310819005
98th test acc: 75.55
99th train acc: 96.0875, loss: 0.6306203962562489
99th test acc: 75.44999999999999
100th train acc: 96.2125, loss: 0.6104727765675511
100th test acc: 75.55
