<a href="https://colab.research.google.com/github/falconlee236/DeepLearningFrom_Scratch/blob/main/ch04/Chapter_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 4 Neural network training

**4.2 loss function**

*4.2.1 sum of squares for error, SSE*

In [None]:
def sum_squares_error(y, t):
  """Return the sum-of-squares error between prediction y and target t.

  Computes 0.5 * sum((y - t) ** 2) over all elements. Both arguments must
  support element-wise subtraction (e.g. NumPy arrays of the same shape).
  """
  residual = y - t
  squared = residual ** 2
  return 0.5 * np.sum(squared)

In [None]:
import numpy as np

# The correct answer is 2 (one-hot encoded target: index 2 is set to 1)
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

# Example 1: the prediction gives the highest probability (0.6) to digit 2
y = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
# Correct prediction -> small error (about 0.0975)
sum_squares_error(np.array(y), np.array(t))

In [None]:
# Example 2: the prediction gives the highest probability (0.6) to digit 7
y = [0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]
# Evaluated against the target t from the previous cell -> larger error (about 0.5975)
sum_squares_error(np.array(y), np.array(t))

The smaller the value of the loss function, the closer the prediction is to the correct answer.

*4.2.2 cross entropy error, CEE*

In [None]:
def cross_entropy_error(y, t):
  """Return the cross-entropy error -sum(t * log(y)) for a single sample.

  y holds the predicted probabilities and t the target weights (one-hot in
  the examples of this notebook); both must support element-wise arithmetic.
  """
  eps = 1e-7  # small offset so that log(0) = -inf is never evaluated
  log_prob = np.log(y + eps)
  return -np.sum(t * log_prob)

In [None]:
# Target is digit 2; the prediction gives it the highest probability (0.6)
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
y = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
# Result is -log(0.6 + 1e-7), about 0.51
cross_entropy_error(np.array(y), np.array(t))

In [None]:
# Target is digit 2, but the prediction gives it only 0.1 (digit 7 gets 0.6)
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
y = [0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]
# Result is -log(0.1 + 1e-7), about 2.30 -> larger error for the wrong prediction
cross_entropy_error(np.array(y), np.array(t))

*4.2.3 mini-batch training*

In [None]:
# Change into the chapter directory (presumably inside a mounted Google Drive in Colab - verify)
%cd drive/MyDrive/DeepLearningFrom_Scratch/ch04/

In [None]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist

# Load MNIST; normalize / one_hot_label presumably scale the pixels and one-hot encode the labels (see dataset.mnist)
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

print(x_train.shape) # (60000, 784)
print(t_train.shape) # (60000, 10): one-hot labels over the 10 digit classes

In [None]:
# Draw a random mini-batch of 10 training samples (np.random.choice samples
# with replacement by default, so an index may appear more than once)
batch_size = 10
train_size = x_train.shape[0]
batch_mask = np.random.choice(train_size, batch_size)
x_batch, t_batch = x_train[batch_mask], t_train[batch_mask]

In [None]:
# Demo: draw 10 random integers from [0, 60000), as used above for the batch indices
np.random.choice(60000, 10)

In [None]:
# Case: t label is given by one-hot encoding
def cross_entropy_error(y, t):
  """Return the mean cross-entropy error for one-hot encoded targets.

  y: predicted probabilities, shape (classes,) or (batch, classes).
  t: one-hot targets with the same shape as y.
  A 1-D input is treated as a batch containing a single sample.
  """
  is_single_sample = y.ndim == 1
  if is_single_sample:
    y = y.reshape(1, y.size)
    t = t.reshape(1, t.size)

  n_samples = y.shape[0]
  # 1e-7 keeps log() away from log(0) = -inf
  total = np.sum(t * np.log(y + 1e-7))
  return -total / n_samples

In [None]:
# Case: t label is given by number label (one-hot targets are accepted as well)
def cross_entropy_error(y, t):
  """Return the mean cross-entropy error of predictions y against targets t.

  Args:
    y: Predicted probabilities, shape (classes,) or (batch, classes).
    t: Either integer class indices (one per sample) or one-hot targets
      with the same shape as y. Plain Python ints/lists are accepted.

  Returns:
    The cross-entropy averaged over the batch. A 1-D y is treated as a
    batch containing a single sample.
  """
  y = np.asarray(y)
  t = np.asarray(t)
  if y.ndim == 1:
    t = t.reshape(1, t.size)
    y = y.reshape(1, y.size)

  batch_size = y.shape[0]

  # This definition shadows the one-hot version above, so one-hot targets
  # (same shape as y) are handled here with the same formula instead of
  # being misused as index arrays.
  if t.shape == y.shape:
    return -np.sum(t * np.log(y + 1e-7)) / batch_size

  # Index labels: flatten so that exactly one probability is picked per sample.
  t = t.reshape(-1)
  return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

**4.3 numerical differentiation**

*4.3.1 differentiation*

In [None]:
# example of a wrong implementation
def numerical_diff(f, x):
  """Forward-difference estimate of df/dx, kept as a deliberately flawed example.

  The step is far below floating-point resolution for ordinary x, so
  x + step rounds back to x and the numerator collapses to 0.
  """
  tiny_step = 10e-50
  rise = f(x + tiny_step) - f(x)
  return rise / tiny_step

In [None]:
# Rounding error example: 1e-50 is too small to be represented in float32 and becomes 0.0
np.float32(1e-50)

In [None]:
# central difference -> has a smaller error than the forward difference
def numerical_diff(f, x, h=1e-4):
  """Estimate df/dx at x with the central difference.

  Args:
    f: Callable of one variable.
    x: Point at which the derivative is evaluated.
    h: Step size. The default 1e-4 is the value used throughout this
      notebook; pass another value to study its effect on the error.

  Returns:
    (f(x + h) - f(x - h)) / (2 * h)
  """
  # Symmetric step around x cancels the first-order error of a forward difference.
  return (f(x + h) - f(x - h)) / (2 * h)

*4.3.2 example of numerical differentiation*

In [None]:
# y = 0.01x**2 + 0.1x
def function_1(x):
  """Evaluate y = 0.01 * x**2 + 0.1 * x (works element-wise on NumPy arrays)."""
  quadratic_term = 0.01 * x ** 2
  linear_term = 0.1 * x
  return quadratic_term + linear_term

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Sample function_1 on [0, 20) in steps of 0.1 and plot the curve
x = np.arange(0.0, 20.0, 0.1)
y = function_1(x)
plt.plot(x, y)
plt.xlabel('x')
plt.ylabel('f(x)')
plt.show()

In [None]:
# Analytic derivative of function_1 is 0.02x + 0.1, so the result should be close to 0.2
numerical_diff(function_1, 5)

In [None]:
# Analytic derivative of function_1 is 0.02x + 0.1, so the result should be close to 0.3
numerical_diff(function_1, 10)

*4.3.3 partial derivative*

In [None]:
# f(x0, x1) = x0 ** 2 + x1 ** 2
def function_2(x):
  """Return the sum of squared elements of the array x (x0**2 + x1**2 + ...)."""
  squares = x ** 2
  return np.sum(squares)

In [None]:
# x0 = 3, x1 = 4 -> partial derivative with respect to x0
def function_tmp1(x0):
  """Evaluate x0**2 + x1**2 with x1 held fixed at 4.0, leaving x0 as the only variable."""
  fixed_x1 = 4.0
  return x0 * x0 + fixed_x1 ** 2.0

In [None]:
# Partial derivative with respect to x0 at x0 = 3 is 2 * x0, so the result should be close to 6
numerical_diff(function_tmp1, 3.0)

In [None]:
# x0 = 3, x1 = 4 -> partial derivative of x1
def function_tmp2(x1):
  """Evaluate x0**2 + x1**2 with x0 held fixed at 3.0, leaving x1 as the only variable."""
  fixed_x0 = 3.0
  return fixed_x0 ** 2.0 + x1 * x1

In [None]:
# Partial derivative with respect to x1 at x1 = 4 is 2 * x1, so the result should be close to 8
numerical_diff(function_tmp2, 4.0)

**4.4 Gradient**

The gradient is the vector that collects the partial derivatives with respect to every variable.

In [None]:
def numerical_gradient(f, x):
  """Estimate the gradient of f at x with central differences.

  Args:
    f: Callable that takes the array x and returns a scalar.
    x: NumPy array of any shape. A float array is perturbed in place, one
      element at a time, and every element is restored before returning,
      so a callable that reads the same array object sees each
      perturbation. Integer and boolean arrays cannot store x +/- h, so
      they are evaluated on a float copy and left untouched.

  Returns:
    Array with the same shape as x holding the partial derivative of f
    with respect to each element.
  """
  h = 1e-4
  if x.dtype.kind in "biu":
    # x[idx] = value + h would be truncated back to an integer.
    x = x.astype(float)
  grad = np.zeros_like(x) # Generate array like shape x

  # np.ndindex yields one index tuple per element, so 1-D vectors and
  # N-D weight matrices are handled by the same loop.
  for idx in np.ndindex(x.shape):
    tmp_val = x[idx]
    # calculate f(x + h)
    x[idx] = tmp_val + h
    fxh1 = f(x)

    # calculate f(x - h)
    x[idx] = tmp_val - h
    fxh2 = f(x)

    grad[idx] = (fxh1 - fxh2) / (2 * h)
    x[idx] = tmp_val # restore value

  return grad

In [None]:
# Gradient of function_2 is 2 * x, so the result should be close to [6., 8.]
numerical_gradient(function_2, np.array([3.0, 4.0]))

In [None]:
# Gradient of function_2 is 2 * x, so the result should be close to [0., 4.]
numerical_gradient(function_2, np.array([0.0, 2.0]))

In [None]:
# Gradient of function_2 is 2 * x, so the result should be close to [6., 0.]
numerical_gradient(function_2, np.array([3.0, 0.0]))

In [None]:
# f: function, init_x: init value, lr: learning rate, step_num: repetition number
def gradient_desent(f, init_x, lr=0.01, step_num=100):
  """Minimize f by gradient descent and return the final position.

  Args:
    f: Callable that takes an array and returns a scalar.
    init_x: Starting point (array-like). It is copied, so the caller's
      array is not modified.
    lr: Learning rate, i.e. the step multiplier applied to the gradient.
    step_num: Number of update steps.

  Returns:
    Array holding the position after step_num updates.
  """
  # np.array copies by default; without the copy, the in-place update below
  # would overwrite the caller's init_x.
  x = np.array(init_x)
  if x.dtype.kind in "biu":
    # An integer array cannot hold the fractional updates.
    x = x.astype(float)

  for _ in range(step_num):
    grad = numerical_gradient(f, x)
    x -= lr * grad
  return x


In [None]:
# Reasonable learning rate (lr = 0.1): the result ends up very close to the minimum of function_2 at (0, 0)
init_x = np.array([-3.0, 4.0])
gradient_desent(function_2, init_x=init_x, lr=0.1)

In [None]:
# example of too much learning rate: lr = 10.0
init_x = np.array([-3.0, 4.0])
gradient_desent(function_2, init_x=init_x, lr=10.0)

In [None]:
# example of too less learning rate: lr = 1e-10
init_x = np.array([-3.0, 4.0])
gradient_desent(function_2, init_x=init_x, lr=1e-10)

*4.4.2 gradient of neural network*