# Unit 10. Optimizer

In [None]:
import numpy as np

- 함수 및 도함수 정의

In [None]:
# f(x,y) = x*x + y*y + xy - 4x - 8y
def func(params):
    x, y = params
    return x*x + y*y + x*y - 4.*x - 8.*y

# Df(x,y) = (df/dx, df/dy) = (2x + y - 4, 2y + x - 8)
def deriv_f(params):
    """Return the gradient of ``func`` at ``params``, rounded to 4 decimals.

    Args:
        params: A two-element sequence that unpacks into ``(x, y)``.

    Returns:
        A NumPy array ``(df/dx, df/dy)`` where each component has been
        passed through ``np.round(..., 4)``.
    """
    x, y = params
    df_dx = np.round(2*x + y - 4., 4)
    df_dy = np.round(2*y + x - 8., 4)
    return np.array((df_dx, df_dy))

- SGD(Stochastic Gradient Descent): 확률적 경사 하강법

In [None]:
class SGD:
    """Gradient-descent optimizer: ``params <- params - lr * grads``."""

    def __init__(self, lr=0.01):
        """Store the learning rate ``lr``."""
        self.lr = lr

    def update(self, params, grads):
        """Apply one descent step to ``params`` in place.

        Args:
            params: NumPy array of parameters; modified in place. Its
                ``shape[0]`` determines how many entries are updated.
            grads: Gradient values, indexable in parallel with ``params``.
        """
        step_size = self.lr
        for idx in range(params.shape[0]):
            params[idx] -= step_size * grads[idx]

In [None]:
# Demo: run 10 SGD steps (lr=0.5) on func, starting from (0, 0).
sgd = SGD(0.5)
params = np.array((0,0), dtype=np.float32)
grads = deriv_f(params)
print(f'초기값: params={params}, grads={grads}, func={func(params):.4f}')
for i in range(10):
    # params is updated in place using the gradient from the previous position.
    sgd.update(params, grads)
    # The grads printed here is the gradient that was just applied, not the
    # gradient at the new params; it is refreshed right after the print.
    print(f'{i+1}회 시행: params={params}, grads={grads}, func={func(params):.4f}')
    grads = deriv_f(params)

- Momentum
    - Gradient Descent에 현재의 관성을 추가

In [None]:
class Momentum:
    """Gradient descent with a velocity term.

    Each step computes ``v <- momentum * v - lr * grads`` and then
    ``params <- params + v``.
    """

    def __init__(self, lr=0.01, momentum=0.9):
        """Store the learning rate and momentum coefficient.

        The velocity ``v`` is created lazily on the first ``update`` call.
        """
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        """Apply one momentum step to ``params`` in place.

        Args:
            params: NumPy array of parameters; modified in place.
            grads: Gradient values, indexable in parallel with ``params``.
        """
        if self.v is None:
            # Velocity starts at zero with the same shape/dtype as params.
            self.v = np.zeros_like(params)
        velocity = self.v
        for idx in range(len(params)):
            velocity[idx] = self.momentum * velocity[idx] - self.lr * grads[idx]
            params[idx] += velocity[idx]

In [None]:
# Demo: run 10 Momentum steps (lr=0.5, momentum=0.5) on func from (0, 0).
momentum = Momentum(lr=0.5, momentum=0.5)
params = np.zeros(2, dtype=np.float32)
grads = deriv_f(params)
print(f'초기값: params={params}, grads={grads}, func={func(params):.4f}')
for i in range(10):
    # params and momentum.v are both updated in place.
    momentum.update(params, grads)
    # grads printed here is the gradient that was just applied; it is
    # recomputed at the new position right after the print.
    print(f'{i+1}회 시행: params={params}, grads={grads}, v={momentum.v}, func={func(params):.4f}')
    grads = deriv_f(params)

- NAG(Nesterov Accelerated Gradient)
    - 현재 위치에서의 관성과 관성방향으로 움직인 후 위치에서의 gradient 반대방향을 합침

In [None]:
class NAG:
    """Nesterov-style momentum optimizer.

    Each step first moves the parameters with the old velocity,
    ``params += momentum**2 * v - (1 + momentum) * lr * grads``, and then
    refreshes the velocity, ``v <- momentum * v - lr * grads``. The
    parameter move equals ``momentum * v_new - lr * grads``.
    """

    def __init__(self, lr=0.01, momentum=0.9):
        """Store the learning rate and momentum coefficient.

        The velocity ``v`` is created lazily on the first ``update`` call.
        """
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        """Apply one step to ``params`` in place.

        Args:
            params: NumPy array of parameters; modified in place.
            grads: Gradient values, indexable in parallel with ``params``.
        """
        if self.v is None:
            self.v = np.zeros_like(params)
        velocity = self.v
        for idx in range(len(params)):
            # Move the parameters using the velocity from before this step.
            params[idx] += self.momentum * self.momentum * velocity[idx]
            params[idx] -= (1 + self.momentum) * self.lr * grads[idx]
            # Only then decay the velocity and add the new gradient step.
            velocity[idx] *= self.momentum
            velocity[idx] -= self.lr * grads[idx]

In [None]:
# Demo: run 10 NAG steps (lr=0.2, momentum=0.8) on func from (0, 0).
nag = NAG(lr=0.2, momentum=0.8)
params = np.zeros(2, dtype=np.float32)
grads = deriv_f(params)
print(f'초기값: params={params}, grads={grads}, func={func(params):.4f}')
for i in range(10):
    # params and nag.v are both updated in place.
    nag.update(params, grads)
    # Report this optimizer's own velocity (nag.v), not the state of the
    # Momentum instance created in an earlier cell. grads printed here is
    # the gradient that was just applied; it is refreshed after the print.
    print(f'{i+1}회 시행: params={params}, grads={grads}, v={nag.v}, func={func(params):.4f}')
    grads = deriv_f(params)

- AdaGrad
    - 일정한 learning rate를 사용하지 않고 변수마다 그리고 스텝마다 learning rate가 바뀜

In [None]:
class AdaGrad:
    """AdaGrad optimizer with a per-coordinate step size.

    ``h`` accumulates squared gradients and each coordinate is updated by
    ``lr * grad / (sqrt(h) + 1e-7)``. Both ``h`` and the new parameter
    value are rounded to 4 decimals on every step.
    """

    def __init__(self, lr=0.01):
        """Store the learning rate; ``h`` is created on the first update."""
        self.lr = lr
        self.h = None

    def update(self, params, grads):
        """Apply one AdaGrad step to ``params`` in place.

        Args:
            params: NumPy array of parameters; modified in place.
            grads: Gradient values, indexable in parallel with ``params``.
        """
        if self.h is None:
            self.h = np.zeros_like(params)
        for idx in range(len(params)):
            g = grads[idx]
            # Running sum of squared gradients for this coordinate.
            self.h[idx] = round(self.h[idx] + g * g, 4)
            # 1e-7 keeps the denominator away from zero.
            scale = np.sqrt(self.h[idx]) + 1e-7
            params[idx] = round(params[idx] - self.lr * g / scale, 4)

In [None]:
# Demo: run 10 AdaGrad steps (lr=10) on func, starting from (0, 0).
adg = AdaGrad(lr=10)
params = np.zeros(2, dtype=np.float32)
grads = deriv_f(params)
print(f'초기값: params={params}, grads={grads}, func={func(params):.4f}')
for i in range(10):
    # params and adg.h (accumulated squared gradients) are updated in place.
    adg.update(params, grads)
    # grads printed here is the gradient that was just applied; it is
    # recomputed at the new position right after the print.
    print(f'{i+1}회 시행: params={params}, grads={grads}, h={adg.h}, func={func(params):.4f}')
    grads = deriv_f(params)

- RMSProp
    - AdaGrad는 스텝이 많이 진행되면 h 값이 너무 커져서 학습률이 너무 작아져 학습이 거의 되지 않음
    - 이를 보완하기 위해 이전 누적치와 현재 그래디언트의 좌표별 제곱의 가중 평균(지수 이동 평균)을 반영함

In [None]:
class RMSProp:
    """RMSProp optimizer.

    ``h`` is a weighted average of the previous ``h`` and the current
    squared gradient, ``h <- gamma * h + (1 - gamma) * grad**2``, and each
    coordinate is updated by ``lr * grad / (sqrt(h) + 1e-7)``. Both ``h``
    and the new parameter value are rounded to 4 decimals on every step.
    """

    def __init__(self, lr=0.01, gamma=0.75):
        """Store the hyperparameters; ``h`` is created on the first update.

        Args:
            lr: Learning rate.
            gamma: Forgetting factor (decay rate). A larger value weights
                the past more; a smaller value weights the current
                gradient more.
        """
        self.lr = lr
        self.gamma = gamma
        self.h = None

    def update(self, params, grads):
        """Apply one RMSProp step to ``params`` in place.

        Args:
            params: NumPy array of parameters; modified in place.
            grads: Gradient values, indexable in parallel with ``params``.
        """
        if self.h is None:
            self.h = np.zeros_like(params)
        for idx in range(len(params)):
            g = grads[idx]
            # Blend the old average with the current squared gradient.
            self.h[idx] = round(self.gamma * self.h[idx] + (1 - self.gamma) * g * g, 4)
            # 1e-7 keeps the denominator away from zero.
            scale = np.sqrt(self.h[idx]) + 1e-7
            params[idx] = round(params[idx] - self.lr * g / scale, 4)

In [None]:
# Demo: run 10 RMSProp steps (lr=0.9, gamma=0.75) on func from (0, 0).
rmsp = RMSProp(lr=0.9, gamma=0.75)
params = np.zeros(2, dtype=np.float32)
grads = deriv_f(params)
print(f'초기값: params={params}, grads={grads}, func={func(params):.4f}')
for i in range(10):
    # params and rmsp.h (averaged squared gradients) are updated in place.
    rmsp.update(params, grads)
    # grads printed here is the gradient that was just applied; it is
    # recomputed at the new position right after the print.
    print(f'{i+1}회 시행: params={params}, grads={grads}, h={rmsp.h}, func={func(params):.4f}')
    grads = deriv_f(params)

- Adam
    - Momentum과 RMSProp 두 가지 방식을 혼합

In [None]:
class Adam:
    """Adam optimizer: first-moment (``m``) and second-moment (``v``) averages.

    The bias correction is folded into a per-step learning rate ``lr_t``.
    ``m``, ``v`` and the new parameter value are rounded to 4 decimals on
    every step.
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        """Store the hyperparameters; ``m`` and ``v`` are created lazily.

        Args:
            lr: Base learning rate.
            beta1: Decay factor for the gradient average ``m``.
            beta2: Decay factor for the squared-gradient average ``v``.
        """
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None

    def update(self, params, grads):
        """Apply one Adam step to ``params`` in place.

        Args:
            params: NumPy array of parameters; modified in place.
            grads: Gradient values, indexable in parallel with ``params``.
        """
        if self.m is None:
            self.m = np.zeros_like(params)
            self.v = np.zeros_like(params)

        self.iter += 1
        # Bias-corrected step size for the current iteration count.
        correction1 = 1. - self.beta1**self.iter
        correction2 = 1. - self.beta2**self.iter
        lr_t = self.lr * np.sqrt(correction2) / correction1

        for idx in range(len(params)):
            g = grads[idx]
            self.m[idx] = round(self.beta1 * self.m[idx] + (1. - self.beta1) * g, 4)
            self.v[idx] = round(self.beta2 * self.v[idx] + (1. - self.beta2) * g**2, 4)
            # Note: the 1e-7 term is added inside the square root here.
            denom = np.sqrt(self.v[idx] + 1e-7)
            params[idx] = round(params[idx] - lr_t * self.m[idx] / denom, 4)

In [None]:
# Demo: run 10 Adam steps (lr=0.9, default betas) on func from (0, 0).
adam = Adam(lr=0.9)
params = np.zeros(2, dtype=np.float32)
grads = deriv_f(params)
print(f'초기값: params={params}, grads={grads}, func={func(params):.4f}')
for i in range(10):
    # params, adam.m and adam.v are all updated in place.
    adam.update(params, grads)
    # grads printed here is the gradient that was just applied; it is
    # recomputed at the new position right after the print.
    print(f'{i+1}회 시행: params={params}, grads={grads}, m={adam.m}, v={adam.v}, func={func(params):.4f}')
    grads = deriv_f(params)

### 시각화

In [None]:
import matplotlib.pyplot as plt
from collections import OrderedDict

In [None]:
def f(x, y):
    """Return x^2 / 20 + y^2, the surface used for the visualization."""
    x_term = x*x / 20.0
    y_term = y*y
    return x_term + y_term

def df(x, y):
    """Return the gradient of ``x^2 / 20 + y^2`` as ``np.array((x / 10, 2y))``."""
    grad_x = x / 10.0
    grad_y = 2*y
    return np.array((grad_x, grad_y))

In [None]:
# Optimizers to compare, keyed by display name; insertion order is the
# order in which they are iterated later.
optimizers = OrderedDict([
    ('SGD', SGD(lr=0.95)),
    ('Momentum', Momentum(lr=0.1)),
    ('AdaGrad', AdaGrad(lr=1.5)),
    ('Adam', Adam(lr=0.3)),
])

In [None]:
# The contour grid depends only on f, not on the optimizer, so build it once
# instead of recomputing the full mesh on every loop iteration.
x = np.arange(-10, 10, 0.01)
y = np.arange(-5, 5, 0.01)
X, Y = np.meshgrid(x, y)
Z = f(X, Y)

# Simplify the outer contour lines: zero out every value above 7.
mask = Z > 7
Z[mask] = 0

idx = 1
plt.figure(figsize=(10,10))
for key in optimizers:
    optimizer = optimizers[key]
    x_history, y_history = [], []
    # Every optimizer starts from the same point (-7, 2).
    params = np.array((-7, 2), dtype=np.float32)
    for i in range(30):
        # Record the position before the step, then update params in place.
        x_history.append(params[0])
        y_history.append(params[1])
        grads = df(params[0], params[1])
        optimizer.update(params, grads)

    # Draw the trajectory on top of the contour lines in a 2x2 grid.
    plt.subplot(2, 2, idx)
    idx += 1
    plt.plot(x_history, y_history, 'ro-')
    plt.contour(X, Y, Z)
    plt.xlim(-10, 10)
    plt.ylim(-10, 10)
    plt.plot(0, 0, '+')  # marks the origin
    plt.title(key)
    plt.xlabel('X')
    plt.ylabel('Y')

plt.show()