## Momentum

In [146]:
class Momentum:

  def __init__(self, learning_rate = 0.01, momentum=0.9):
    self.learning_rate = learning_rate
    self.momentum = momentum
    self.v = None

  def update(self, params, grads):
    if self.v is None:
      self.v = {}
      for key, val in params.items():
        self.v[key] = np.zeros_like(val)
    
    for key in params.keys():
      self.v[key]  = self.momentum * self.v[key] - self.learning_rate * grads[key]
      params[key] += self.v[key]

## MNIST 분류

### Modules Import

In [109]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict

### 데이터 로드

In [110]:
np.random.seed(42)

mnist = tf.keras.datasets.mnist

(X_train, y_train), (X_test, y_test) = mnist.load_data()
num_classes = 10

### 데이터 전처리

In [111]:
np.random.seed(42)

mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
num_classes = 10

x_train = x_train[:10000]
x_test = x_test[:3000]

y_train = y_train[:10000]
y_test = y_test[:3000]

In [112]:
x_train, x_test = x_train.reshape(-1,28*28).astype(np.float32), x_test.reshape(-1,28*28).astype(np.float32)

x_train = x_train / .255
x_test = x_test / .255

y_train = np.eye(num_classes)[y_train]

In [113]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(10000, 784)
(10000, 10)
(3000, 784)
(3000,)


### Hyper Parameters

In [114]:
epochs = 1000
learning_rate = 1e-2
batch_size = 256
train_size = x_train.shape[0]
iter_pre_epoch = max(train_size / batch_size, 1)

### Util Functions

In [115]:
def softmax(x):
  if x.ndim == 2:
    x = x.T
    x = x - np.max(x,axis=0)
    y = np.exp(x) / np.sum(np.exp(x), axis =0)
    return y.T

  x = x - np.max(x)
  return np.exp(x) / np.sum(np.exp(x))

def mean_squared_error(y, t):
  return .5 * np.sum((y-t)**2)

def cross_entropy_error(pred_y, true_y):
  if pred_y.ndim == 1:
    true_y = true_y.reshape(1, true_y.size)
    pred_y = pred_y.reshape(1, pred_y.size)

  if true_y.size == pred_y.size:
    true_y = true_y.argmax(axis=1)

  batch_size = pred_y.shape[0]
  return -np.sum(np.log(pred_y[np.arange(batch_size), true_y] + 1e-7)) / batch_size

### Util Classes

#### ReLU

In [116]:
class ReLU:

  def __init__(self):
    self.mask = None

  def forward(self, input_data):
    self.mask = (input_data <= 0)
    out = input_data.copy()
    out[self.mask] = 0

    return out

  def backward(self, dout):
    dout[self.mask] = 0
    dx = dout

    return dx

#### Sigmoid

In [117]:
class Sigmoid:

  def __init__(self):
    self.out = None

  def forward(self, input_data):
    out = 1 / (1.0 + np.exp(-input_data))
    self.out = out

    return out

  def backward(self, dout):
    dx = dout * (1.0 - self.out) * self.out #self.dout 은 오타?
    return dx

#### Layer

In [118]:
class Layer:
  def __init__(self,W,b):
    self.W = W
    self.b = b

    self.input_data = None
    self.input_data_shape = None

    self.dW = None
    self.db = None

  def forward(self, input_data):
    self.input_data_shape = input_data.shape

    input_data = input_data.reshape(input_data.shape[0], -1)
    self.input_data = input_data
    out = np.dot(self.input_data, self.W) + self.b

    return out

  def backward(self, dout):
    dx = np.dot(dout, self.W.T)
    self.dW = np.dot(self.input_data.T, dout)
    self.db = np.sum(dout,axis=0)

    dx = dx.reshape(*self.input_data_shape)

    return dx

#### Batch Normalization

In [119]:
class BatchNormalization:

  def __init__(self, gamma, beta, momentum=0.9, running_mean=None, running_var = None):
    self.gamma = gamma
    self.beta = beta
    self.momentum = momentum
    self.input_shape = None

    self.running_mean = running_mean
    self.running_var = running_var

    self.batch_size = None
    self.xc = None
    self.std = None
    self.dgamma = None
    self.dbeta = None
  
  def forward(self, input_data, is_train=True):
    self.input_shape = input_data.shape
    if input_data.ndim != 2:
      N, C, H, W = input_data.shape
      input_data = input_data.reshape(N,-1)

    out = self.__forward(input_data, is_train)

    return out.reshape(*self.input_shape)

  def __forward(self, input_data, is_train):
    if self.running_mean is None:
      N, D = input_data.shape
      self.running_mean = np.zeros(D)
      self.running_var = np.zeros(D)

    if is_train:
      mu = input_data.mean(axis=0)
      xc = input_data - mu
      var = np.mean(xc**2, axis=0)
      std = np.sqrt(var + 10e-7)
      xn = xc / std

      self.batch_size = input_data.shape[0]
      self.xc = xc
      self.xn = xn
      self.std = std
      self.running_mean = self.momentum * self.running_mean + (1-self.momentum) * mu
      self.running_var = self.momentum * self.running_var + (1-self.momentum) * var
    else:
      xc = input_data - self.running_mean
      xn = xc / ((np.sqrt(self.running_var + 10e-7)))

    out = self.gamma * xn + self.beta
    return out

  def backward(self, dout):
    if dout.ndim != 2:
      N, C, H, W = dout.shape
      dout = dout.reshape(N, -1)

    dx = self.__backward(dout)

    dx = dx.reshape(*self.input_shape)
    return dx

  def __backward(self, dout):
    dbeta = dout.sum(axis=0)
    dgamma = np.sum(self.xn * dout, axis=0)
    dxn = self.gamma * dout
    dxc = dxn / self.std
    dstd = -np.sum((dxn * self.xc) / (self.std * self.std), axis=0)
    dvar = .5 * dstd / self.std
    dxc += (2.0 / self.batch_size) * self.xc * dvar
    dmu = np.sum(dxc, axis=0)
    dx = dxc- dmu / self.batch_size

    self.dgamma = dgamma
    self.dbeta = dbeta
    return dx

#### Dropout


In [120]:
class Dropout:
  def __init__(self, dropout_ratio = .5):
    self.dropout_ratio = dropout_ratio
    self.mask = None

  def forward(self,input_data, is_train=True):
    if is_train:
      self.mask = np.random.rand(*input_data.shape) > self.dropout_ratio
      return input_data * self.mask

    else:
      return input_data * (1.0 - self.dropout_ratio)

  def backward(self,dout):
    return dout * self.mask

#### Softmax

In [121]:
class Softmax:

  def __init__(self):
    self.loss = None
    self.y = None
    self.t = None

  def forward(self, input_data, t):
    self.t = t
    self.y = softmax(input_data)
    self.loss = cross_entropy_error(self.y, self.t)
    return self.loss

  def backward(self, dout=1):
    batch_size = self.t.shape[0]

    if self.t.size == self.y.size:
      dx = (self.y - self.t) / batch_size
    else:
      dx = self.y.copy()
      dx[np.arange(batch_size), self.t] -= 1
      dx = dx / batch_size

    return dx

## Model

In [131]:
class MyModel:

  def __init__(self, input_size, hidden_size_list, output_size,
               activation='relu', decay_lambda = 0,
               use_dropout=False, dropout_ratio = 0.5, use_batchnorm=False):
    self.input_size = input_size
    self.output_size = output_size
    self.hidden_size_list = hidden_size_list
    self.hidden_layer_num = len(hidden_size_list)
    self.use_dropout = use_dropout
    self.decay_lambda = decay_lambda
    self.use_batchnorm = use_batchnorm
    self.params = {}

    self.__init_weight(activation)
    
    activation_layer = {'sigmoid':Sigmoid, 'relu':ReLU}
    self.layers = OrderedDict()
    for idx in range(1, self.hidden_layer_num+1):
      self.layers['Layer' + str(idx)] = Layer(self.params['W' + str(idx)],
                                              self.params['b' + str(idx)])
      if self.use_batchnorm:
        self.params['gamma' + str(idx)] = np.ones(hidden_size_list[idx-1])
        self.params['beta' + str(idx)] = np.ones(hidden_size_list[idx-1])
        self.layers['BatchNorm' + str(idx)] = BatchNormalization(self.params['gamma' + str(idx)], self.params['beta' + str(idx)])

      self.layers['Activation_function' + str(idx)] = activation_layer[activation]()

      if self.use_dropout:
        self.layers['Dropout' + str(idx)] = Dropout(dropout_ratio)

    idx = self.hidden_layer_num + 1
    self.layers['Layer' + str(idx)] = Layer(self.params['W' + str(idx)], self.params['b' + str(idx)])
    self.last_layer = Softmax()

  def __init_weight(self, activation):
    all_size_list = [self.input_size] + self.hidden_size_list + [self.output_size]

    for idx in range(1, len(all_size_list)):
      scale = None
      if activation.lower() == 'relu':
        scale = np.sqrt(2.0 / all_size_list[idx-1])
      elif activation.lower() == 'sigmoid':
        scale = np.sqrt(1.0 / all_size_list[idx-1])

      self.params['W' + str(idx)] = scale * np.random.randn(all_size_list[idx-1], all_size_list[idx])
      self.params['b' + str(idx)] =  np.zeros(all_size_list[idx])

    
  def predict(self, x, is_train=False):
    for key, layer in self.layers.items():
      if 'Dropout' in key or 'BatchNorm' in key:
        x = layer.forward(x, is_train)
      else:
        x = layer.forward(x)

    return x

  def loss(self, x, t, is_train=False):
    y = self.predict(x,is_train)

    weight_decay = 0
    for idx in range(1, self.hidden_layer_num + 2):
      W = self.params['W' + str(idx)]
      weight_decay += 0.5 * self.decay_lambda * np.sum(W**2)

    return self.last_layer.forward(y,t) + weight_decay

  def accuracy(self,x,t):
    y = self.predict(x, is_train=False)
    y = np.argmax(y, axis=1)
    if t.ndim != 1:
      t = np.argmax(t,axis=1)
    
    accuracy = np.sum(y ==t) / float(x.shape[0])
    return accuracy

  def gradient(self, x, t):
    self.loss(x, t, is_train=True)

    dout = 1
    dout = self.last_layer.backward(dout)

    layers = list(self.layers.values())
    layers.reverse()
    for layer in layers:
      dout = layer.backward(dout)

    grads = {}
    for idx in range(1, self.hidden_layer_num+2):
      grads['W' + str(idx)] = self.layers['Layer' + str(idx)].dW + self.decay_lambda * self.params['W' + str(idx)]
      grads['b' + str(idx)] = self.layers['Layer' + str(idx)].db

      if self.use_batchnorm and idx != self.hidden_layer_num+1:
        grads['gamma' + str(idx)] = self.layers['BatchNorm' + str(idx)].dgamma
        grads['beta' + str(idx)] = self.layers['BatchNorm' + str(idx)].dbeta

    return grads

## 모델 생성 및 학습 1

In [149]:
np.random.seed(42)

mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
num_classes = 10

x_train = x_train[:10000]
x_test = x_test[:3000]

y_train = y_train[:10000]
y_test = y_test[:3000]

x_train, x_test = x_train.reshape(-1,28*28).astype(np.float32), x_test.reshape(-1,28*28).astype(np.float32)

x_train = x_train / .255
x_test = x_test / .255

y_train = np.eye(num_classes)[y_train]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(10000, 784)
(10000, 10)
(3000, 784)
(3000,)


In [150]:
epochs = 1000
learning_rate = 1e-3
batch_size = 100
train_size = x_train.shape[0]
iter_pre_epoch = max(train_size / batch_size, 1)

In [151]:
decay_lambda_1 = 0.15
model_1 = MyModel(input_size=784, hidden_size_list=[100, 100, 100, 100], output_size=10,
                  decay_lambda=decay_lambda_1, use_batchnorm=True)
optimizer = Momentum(learning_rate=learning_rate)

model_1_train_loss_list = []
model_1_train_acc_list = []
model_1_test_acc_list = []

In [152]:
for epoch in range(epochs):
  batch_mask = np.random.choice(train_size, batch_size)
  x_batch = x_train[batch_mask]
  y_batch = y_train[batch_mask]

  grads = model_1.gradient(x_batch, y_batch)
  optimizer.update(model_1.params, grads)

  loss = model_1.loss(x_batch, y_batch)
  model_1_train_loss_list.append(loss)

  train_acc = model_1.accuracy(x_train, y_train)
  test_acc = model_1.accuracy(x_test, y_test)
  model_1_train_acc_list.append(train_acc)
  model_1_test_acc_list.append(test_acc)

  if epoch % 50 == 0:
    print('[Model 1] Epoch: {}, Train Loss: {:.4f} Train Accuracy: {:.4f} Test Accuracy: {:.4f}'.format(epoch+1, loss, train_acc, test_acc))

[Model 1] Epoch: 1, Train Loss: 76.7115 Train Accuracy: 0.0832 Test Accuracy: 0.0770
[Model 1] Epoch: 51, Train Loss: 55.0378 Train Accuracy: 0.7667 Test Accuracy: 0.7080
[Model 1] Epoch: 101, Train Loss: 47.2827 Train Accuracy: 0.8398 Test Accuracy: 0.7797
[Model 1] Epoch: 151, Train Loss: 40.5122 Train Accuracy: 0.8738 Test Accuracy: 0.8180
[Model 1] Epoch: 201, Train Loss: 34.8000 Train Accuracy: 0.8878 Test Accuracy: 0.8343
[Model 1] Epoch: 251, Train Loss: 30.0831 Train Accuracy: 0.9015 Test Accuracy: 0.8483
[Model 1] Epoch: 301, Train Loss: 25.8020 Train Accuracy: 0.9121 Test Accuracy: 0.8610
[Model 1] Epoch: 351, Train Loss: 22.2532 Train Accuracy: 0.9170 Test Accuracy: 0.8717
[Model 1] Epoch: 401, Train Loss: 19.2091 Train Accuracy: 0.9204 Test Accuracy: 0.8807
[Model 1] Epoch: 451, Train Loss: 16.5716 Train Accuracy: 0.9253 Test Accuracy: 0.8790
[Model 1] Epoch: 501, Train Loss: 14.2565 Train Accuracy: 0.9271 Test Accuracy: 0.8800
[Model 1] Epoch: 551, Train Loss: 12.2841 Trai

## 모델 생성 및 학습 2
 - layer 수를 3개로 뒀으나 1 보다 더 낮음

> 들여쓴 블록



In [None]:
np.random.seed(42)

mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
num_classes = 10

x_train = x_train[:10000]
x_test = x_test[:3000]

y_train = y_train[:10000]
y_test = y_test[:3000]

x_train, x_test = x_train.reshape(-1,28*28).astype(np.float32), x_test.reshape(-1,28*28).astype(np.float32)

x_train = x_train / .255
x_test = x_test / .255

y_train = np.eye(num_classes)[y_train]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(10000, 784)
(10000, 10)
(3000, 784)
(3000,)


In [None]:
epochs = 1000
learning_rate = 1e-3
batch_size = 100
train_size = x_train.shape[0]
iter_pre_epoch = max(train_size / batch_size, 1)

In [None]:
decay_lambda_1 = 0.15
model_1 = MyModel(input_size=784, hidden_size_list=[100, 100, 100], output_size=10,
                  decay_lambda=decay_lambda_1, use_batchnorm=True)
optimizer = Momentum(learning_rate=learning_rate)

model_1_train_loss_list = []
model_1_train_acc_list = []
model_1_test_acc_list = []

In [157]:
for epoch in range(epochs):
  batch_mask = np.random.choice(train_size, batch_size)
  x_batch = x_train[batch_mask]
  y_batch = y_train[batch_mask]

  grads = model_1.gradient(x_batch, y_batch)
  optimizer.update(model_1.params, grads)

  loss = model_1.loss(x_batch, y_batch)
  model_1_train_loss_list.append(loss)

  train_acc = model_1.accuracy(x_train, y_train)
  test_acc = model_1.accuracy(x_test, y_test)
  model_1_train_acc_list.append(train_acc)
  model_1_test_acc_list.append(test_acc)

  if epoch % 50 == 0:
    print('[Model 1] Epoch: {}, Train Loss: {:.4f} Train Accuracy: {:.4f} Test Accuracy: {:.4f}'.format(epoch+1, loss, train_acc, test_acc))

[Model 1] Epoch: 1, Train Loss: 74.7462 Train Accuracy: 0.1054 Test Accuracy: 0.1183
[Model 1] Epoch: 51, Train Loss: 16.0199 Train Accuracy: 0.8518 Test Accuracy: 0.8087
[Model 1] Epoch: 101, Train Loss: 3.3639 Train Accuracy: 0.8774 Test Accuracy: 0.8263
[Model 1] Epoch: 151, Train Loss: 1.2234 Train Accuracy: 0.8626 Test Accuracy: 0.8250
[Model 1] Epoch: 201, Train Loss: 0.9503 Train Accuracy: 0.8845 Test Accuracy: 0.8417
[Model 1] Epoch: 251, Train Loss: 0.6988 Train Accuracy: 0.9024 Test Accuracy: 0.8623
[Model 1] Epoch: 301, Train Loss: 0.8320 Train Accuracy: 0.8753 Test Accuracy: 0.8193
[Model 1] Epoch: 351, Train Loss: 0.7247 Train Accuracy: 0.8948 Test Accuracy: 0.8453
[Model 1] Epoch: 401, Train Loss: 0.9015 Train Accuracy: 0.8626 Test Accuracy: 0.7967
[Model 1] Epoch: 451, Train Loss: 0.8360 Train Accuracy: 0.8614 Test Accuracy: 0.7960
[Model 1] Epoch: 501, Train Loss: 0.7023 Train Accuracy: 0.9003 Test Accuracy: 0.8627
[Model 1] Epoch: 551, Train Loss: 0.7301 Train Accuracy

## 모델 생성 및 학습 3
 - learning_rate를 1e-2로 뒀으나 1보다 더 낮음

In [170]:
epochs = 1000
learning_rate = 1e-2
batch_size = 100
train_size = x_train.shape[0]
iter_pre_epoch = max(train_size / batch_size, 1)

In [171]:
decay_lambda_1 = 0.15
model_1 = MyModel(input_size=784, hidden_size_list=[100, 100, 100, 100], output_size=10,
                  decay_lambda=decay_lambda_1, use_batchnorm=True)
optimizer = Momentum(learning_rate=learning_rate)

model_1_train_loss_list = []
model_1_train_acc_list = []
model_1_test_acc_list = []

In [172]:
for epoch in range(epochs):
  batch_mask = np.random.choice(train_size, batch_size)
  x_batch = x_train[batch_mask]
  y_batch = y_train[batch_mask]

  grads = model_1.gradient(x_batch, y_batch)
  optimizer.update(model_1.params, grads)

  loss = model_1.loss(x_batch, y_batch)
  model_1_train_loss_list.append(loss)

  train_acc = model_1.accuracy(x_train, y_train)
  test_acc = model_1.accuracy(x_test, y_test)
  model_1_train_acc_list.append(train_acc)
  model_1_test_acc_list.append(test_acc)

  if epoch % 50 == 0:
    print('[Model 1] Epoch: {}, Train Loss: {:.4f} Train Accuracy: {:.4f} Test Accuracy: {:.4f}'.format(epoch+1, loss, train_acc, test_acc))

[Model 1] Epoch: 1, Train Loss: 76.6410 Train Accuracy: 0.0711 Test Accuracy: 0.0810
[Model 1] Epoch: 51, Train Loss: 16.1280 Train Accuracy: 0.8647 Test Accuracy: 0.8140
[Model 1] Epoch: 101, Train Loss: 3.3932 Train Accuracy: 0.8813 Test Accuracy: 0.8350
[Model 1] Epoch: 151, Train Loss: 1.3065 Train Accuracy: 0.8700 Test Accuracy: 0.8037
[Model 1] Epoch: 201, Train Loss: 0.9624 Train Accuracy: 0.8917 Test Accuracy: 0.8293
[Model 1] Epoch: 251, Train Loss: 0.7619 Train Accuracy: 0.8583 Test Accuracy: 0.8060
[Model 1] Epoch: 301, Train Loss: 0.7248 Train Accuracy: 0.8858 Test Accuracy: 0.8247
[Model 1] Epoch: 351, Train Loss: 0.7672 Train Accuracy: 0.8815 Test Accuracy: 0.8213
[Model 1] Epoch: 401, Train Loss: 0.8193 Train Accuracy: 0.8793 Test Accuracy: 0.8167
[Model 1] Epoch: 451, Train Loss: 0.7400 Train Accuracy: 0.8813 Test Accuracy: 0.8420
[Model 1] Epoch: 501, Train Loss: 0.6393 Train Accuracy: 0.8904 Test Accuracy: 0.8450
[Model 1] Epoch: 551, Train Loss: 0.7760 Train Accuracy

## 모델 생성 및 학습 4
 - decay_lambda를 0.75로 낮췄으나 1보다 낮음

In [173]:
epochs = 1000
learning_rate = 1e-3
batch_size = 100
train_size = x_train.shape[0]
iter_pre_epoch = max(train_size / batch_size, 1)

In [174]:
decay_lambda_1 = 0.75
model_1 = MyModel(input_size=784, hidden_size_list=[100, 100, 100,100], output_size=10,
                  decay_lambda=decay_lambda_1, use_batchnorm=True)
optimizer = Momentum(learning_rate=learning_rate)

model_1_train_loss_list = []
model_1_train_acc_list = []
model_1_test_acc_list = []

In [175]:
for epoch in range(epochs):
  batch_mask = np.random.choice(train_size, batch_size)
  x_batch = x_train[batch_mask]
  y_batch = y_train[batch_mask]

  grads = model_1.gradient(x_batch, y_batch)
  optimizer.update(model_1.params, grads)

  loss = model_1.loss(x_batch, y_batch)
  model_1_train_loss_list.append(loss)

  train_acc = model_1.accuracy(x_train, y_train)
  test_acc = model_1.accuracy(x_test, y_test)
  model_1_train_acc_list.append(train_acc)
  model_1_test_acc_list.append(test_acc)

  if epoch % 50 == 0:
    print('[Model 1] Epoch: {}, Train Loss: {:.4f} Train Accuracy: {:.4f} Test Accuracy: {:.4f}'.format(epoch+1, loss, train_acc, test_acc))

[Model 1] Epoch: 1, Train Loss: 320.2815 Train Accuracy: 0.0848 Test Accuracy: 0.0903
[Model 1] Epoch: 51, Train Loss: 158.6863 Train Accuracy: 0.7467 Test Accuracy: 0.6747
[Model 1] Epoch: 101, Train Loss: 71.1839 Train Accuracy: 0.8385 Test Accuracy: 0.7877
[Model 1] Epoch: 151, Train Loss: 32.2155 Train Accuracy: 0.8801 Test Accuracy: 0.8340
[Model 1] Epoch: 201, Train Loss: 14.9866 Train Accuracy: 0.8968 Test Accuracy: 0.8547
[Model 1] Epoch: 251, Train Loss: 7.2714 Train Accuracy: 0.9053 Test Accuracy: 0.8617
[Model 1] Epoch: 301, Train Loss: 3.8350 Train Accuracy: 0.9094 Test Accuracy: 0.8623
[Model 1] Epoch: 351, Train Loss: 2.3547 Train Accuracy: 0.9258 Test Accuracy: 0.8753
[Model 1] Epoch: 401, Train Loss: 1.6755 Train Accuracy: 0.9358 Test Accuracy: 0.8923
[Model 1] Epoch: 451, Train Loss: 1.2601 Train Accuracy: 0.9290 Test Accuracy: 0.8827
[Model 1] Epoch: 501, Train Loss: 1.2773 Train Accuracy: 0.9354 Test Accuracy: 0.8933
[Model 1] Epoch: 551, Train Loss: 1.0331 Train Acc

## 모델 생성 및 학습 5
 - train의 수를 6000으로 늘렸으나 1보다 낮음 결론적으로 1의 성능이 제일 뛰어남

In [177]:
np.random.seed(42)

mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
num_classes = 10

x_train = x_train[:60000]
x_test = x_test[:3000]

y_train = y_train[:60000]
y_test = y_test[:3000]

x_train, x_test = x_train.reshape(-1,28*28).astype(np.float32), x_test.reshape(-1,28*28).astype(np.float32)

x_train = x_train / .255
x_test = x_test / .255

y_train = np.eye(num_classes)[y_train]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(60000, 784)
(60000, 10)
(3000, 784)
(3000,)


In [178]:
epochs = 1000
learning_rate = 1e-3
batch_size = 100
train_size = x_train.shape[0]
iter_pre_epoch = max(train_size / batch_size, 1)

In [179]:
decay_lambda_1 = 1.5
model_1 = MyModel(input_size=784, hidden_size_list=[100, 100, 100,100], output_size=10,
                  decay_lambda=decay_lambda_1, use_batchnorm=True)
optimizer = Momentum(learning_rate=learning_rate)

model_1_train_loss_list = []
model_1_train_acc_list = []
model_1_test_acc_list = []

In [180]:
for epoch in range(epochs):
  batch_mask = np.random.choice(train_size, batch_size)
  x_batch = x_train[batch_mask]
  y_batch = y_train[batch_mask]

  grads = model_1.gradient(x_batch, y_batch)
  optimizer.update(model_1.params, grads)

  loss = model_1.loss(x_batch, y_batch)
  model_1_train_loss_list.append(loss)

  train_acc = model_1.accuracy(x_train, y_train)
  test_acc = model_1.accuracy(x_test, y_test)
  model_1_train_acc_list.append(train_acc)
  model_1_test_acc_list.append(test_acc)

  if epoch % 50 == 0:
    print('[Model 1] Epoch: {}, Train Loss: {:.4f} Train Accuracy: {:.4f} Test Accuracy: {:.4f}'.format(epoch+1, loss, train_acc, test_acc))

[Model 1] Epoch: 1, Train Loss: 627.7445 Train Accuracy: 0.0769 Test Accuracy: 0.0710
[Model 1] Epoch: 51, Train Loss: 151.3856 Train Accuracy: 0.7484 Test Accuracy: 0.7027
[Model 1] Epoch: 101, Train Loss: 26.4802 Train Accuracy: 0.8286 Test Accuracy: 0.7927
[Model 1] Epoch: 151, Train Loss: 5.8513 Train Accuracy: 0.8639 Test Accuracy: 0.8290
[Model 1] Epoch: 201, Train Loss: 2.4541 Train Accuracy: 0.8525 Test Accuracy: 0.7993
[Model 1] Epoch: 251, Train Loss: 1.6172 Train Accuracy: 0.8935 Test Accuracy: 0.8640
[Model 1] Epoch: 301, Train Loss: 1.3931 Train Accuracy: 0.8923 Test Accuracy: 0.8630
[Model 1] Epoch: 351, Train Loss: 1.4730 Train Accuracy: 0.8371 Test Accuracy: 0.8023
[Model 1] Epoch: 401, Train Loss: 1.4562 Train Accuracy: 0.9004 Test Accuracy: 0.8723
[Model 1] Epoch: 451, Train Loss: 1.2692 Train Accuracy: 0.8958 Test Accuracy: 0.8653
[Model 1] Epoch: 501, Train Loss: 1.3192 Train Accuracy: 0.9076 Test Accuracy: 0.8840
[Model 1] Epoch: 551, Train Loss: 1.3763 Train Accur