优化算法
================

在训练模型时，我们会使用优化算法不断迭代模型参数，以降低模型损失函数的值。当迭代终止时，
模型的训练随之终止，此时的模型参数就是模型通过训练所学习到的参数。


优化算法对于深度学习十分重要：

- 一方面，训练一个复杂的深度学习模型可能需要数小时、数日，甚至数周时间，
  而优化算法的表现直接影响模型的训练效率；
- 另一方面，理解各种优化算法的原理以及其中超参数的意义，
  将有助于我们更有针对性地调参，从而使深度学习模型表现更好。


本章将详细介绍深度学习中常用的优化算法。

In [1]:
import sys
# Add the directory two levels up to the module search path
# (presumably so the project-local `kitorch` package imported later can be
# found -- confirm against the repository layout).
sys.path.append('../../')

In [2]:
#导入模块
import torch
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d.axes3d import Axes3D
from matplotlib.animation import FuncAnimation

# Sample the x-axis on [-5, 5) with a spacing of 0.01 between points.
step = 0.01
X = np.arange(-5,5,step)
# The y-coordinate is held fixed, so only a 1-D slice of the surface is drawn.
Y = -0.5
# Objective values along the slice (note the upper-case name Z).
Z = (
    (3*(1-X)**2)*np.exp(-(X**2) - (Y+1)**2)
    - 10*(X/5 - X**3 - Y**5)*np.exp(-X**2-Y**2)
    - 1/3*np.exp(-(X+1)**2 - Y**2)
)

# Draw the curve that the optimizers will later be animated on.
plt.plot(X,Z)
plt.show()

<Figure size 640x480 with 1 Axes>

In [3]:
import kitorch as kt
from kitorch import optim,no_grad,  functional as F

In [4]:
def model(X,Y=-0.5):
    """Evaluate the objective function at ``X`` for a fixed ``Y``.

    The value is the sum of three exponentially damped terms; it is the same
    expression used to compute the plotted curve ``Z``.

    Args:
        X: tensor-like input. The result of arithmetic on it must provide an
            ``.exp()`` method (e.g. a torch tensor).
        Y: second coordinate, a plain number by default (-0.5).

    Returns:
        The objective value, element-wise in ``X``.
    """
    neg_x_squared = -(X**2)
    # The three exponential envelopes of the expression.
    envelope_a = (neg_x_squared - (Y+1)**2).exp()
    envelope_b = (neg_x_squared - Y**2).exp()
    envelope_c = (-(X+1)**2 - Y**2).exp()

    first_term = (3*(X-1)**2)*envelope_a
    second_term = 10*(1/5*X - X**3 - Y**5)*envelope_b
    third_term = 1/3*envelope_c
    return first_term - second_term - third_term

In [5]:
def train(x,model,optimizer,epochs,points):
    """Apply ``epochs`` optimizer updates to ``x`` and record the trajectory.

    Args:
        x: single-element parameter being optimised; updated in place by
            ``optimizer.step()``.
        model: callable mapping ``x`` to the value to minimise; its result
            must support ``.backward()`` and ``.data.item()``.
        optimizer: object exposing ``step()`` and ``zero_grad()``.
        epochs: number of update steps to perform.
        points: list that receives one ``(x, value)`` tuple per step, taken
            after the update.

    Returns:
        None; the results are appended to ``points``.
    """
    value = model(x)
    for _ in range(epochs):
        # Back-propagate through the most recent evaluation, then update x.
        value.backward()
        optimizer.step()

        # Evaluate at the new position first, then clear the gradients
        # (same order as the forward/zero_grad sequence always used here).
        refreshed = model(x)
        optimizer.zero_grad()

        points.append((x.data.item(),refreshed.data.item()))
        value = refreshed
        

In [6]:
# Single-element random starting point that tracks gradients; the optimizer
# runs below each work on their own copy of it.
x = kt.randn(1,requires_grad=True)

In [7]:
epochs = 100

# Run every kitorch optimizer from its own copy of the shared start point `x`.
# Each trace is seeded with the initial (x, z) pair so that every entry is a
# 2-tuple, like the entries train() appends; a bare float as the first entry
# cannot be indexed as point[0] / point[1] when the trace is animated.

# Adadelta (converges slowly)
x1 = x.deepcopy()
optimizer_Adadelta = optim.Adadelta([[x1]],lr=2)
points_Adadelta = [(x1.data.item(), model(x1).data.item())]
train(x1,model,optimizer_Adadelta,epochs,
      points_Adadelta)

# Adagrad
x2 = x.deepcopy()
optimizer_Adagrad = optim.Adagrad([[x2]],lr=0.1)
points_Adagrad = [(x2.data.item(), model(x2).data.item())]
train(x2,model,optimizer_Adagrad,epochs,
      points_Adagrad)

# SGD with momentum
x3 = x.deepcopy()
optimizer_SGD = optim.SGD([[x3]],lr=0.01,momentum=0.9)
points_SGD = [(x3.data.item(), model(x3).data.item())]
train(x3,model,optimizer_SGD,epochs,points_SGD)

# Adam
x4 = x.deepcopy()
optimizer_Adam = optim.Adam([[x4]],lr=0.1)
points_Adam = [(x4.data.item(), model(x4).data.item())]
train(x4,model,optimizer_Adam,epochs,points_Adam)

# RMSprop
x5 = x.deepcopy()
optimizer_RMSprop = optim.RMSprop([[x5]],lr=0.1,beta=0.9)
points_RMSprop = [(x5.data.item(), model(x5).data.item())]
train(x5,model,optimizer_RMSprop,epochs,points_RMSprop)


In [8]:
# Repeat the same experiment with torch's optimizers, starting every run from
# a copy of the start point `x`.
# Each trace is seeded with the initial (x, z) pair so that every entry is a
# 2-tuple, like the entries train() appends; a bare float as the first entry
# cannot be indexed as point[0] / point[1] when the trace is animated.
epochs = 100

# Adadelta
x1 = torch.from_numpy(x.data.copy())
x1.requires_grad = True
optimizer_Adadelta = torch.optim.Adadelta([x1],lr=2)
torch_points_Adadelta = [(x1.data.item(), model(x1).data.item())]
train(x1,model,optimizer_Adadelta,epochs,
      torch_points_Adadelta)

# Adagrad
x2 = torch.from_numpy(x.data.copy())
x2.requires_grad = True
optimizer_Adagrad = torch.optim.Adagrad([x2],lr=0.1)
torch_points_Adagrad = [(x2.data.item(), model(x2).data.item())]
train(x2,model,optimizer_Adagrad,epochs,
      torch_points_Adagrad)

# SGD with momentum
x3 = torch.from_numpy(x.data.copy())
x3.requires_grad = True
optimizer_SGD = torch.optim.SGD([x3],lr=0.01,momentum=0.9)
torch_points_SGD = [(x3.data.item(), model(x3).data.item())]
train(x3,model,optimizer_SGD,epochs,torch_points_SGD)

# Adam
x4 = torch.from_numpy(x.data.copy())
x4.requires_grad = True
optimizer_Adam = torch.optim.Adam([x4],lr=0.1)
torch_points_Adam = [(x4.data.item(), model(x4).data.item())]
train(x4,model,optimizer_Adam,epochs,torch_points_Adam)

# RMSprop
x5 = torch.from_numpy(x.data.copy())
x5.requires_grad = True
optimizer_RMSprop = torch.optim.RMSprop([x5],lr=0.1,alpha=0.9)
torch_points_RMSprop = [(x5.data.item(), model(x5).data.item())]
train(x5,model,optimizer_RMSprop,epochs,torch_points_RMSprop)

In [9]:
# Switch matplotlib to the interactive Qt5 backend so the animation below can run in its own window
%matplotlib qt5

In [10]:
global ani
def plot(data,X,Z,title="optimizer analysis",file_name=None,
         interval=200):
    """Animate optimizer trajectories on top of the curve ``(X, Z)``.

    Args:
        data: iterable of ``(points, fmt, label)`` triples. ``points`` is a
            sequence of ``(x, z)`` pairs, one per animation frame; ``fmt`` is
            the matplotlib format string of the moving marker; ``label`` is
            its legend entry.
        X: x-coordinates of the background curve.
        Z: values of the background curve at ``X``.
        title: title of the axes.
        file_name: when truthy, the animation is also saved to this path
            using the ``imagemagick`` writer.
        interval: delay between frames in milliseconds.

    Returns:
        The ``FuncAnimation`` object. It is also stored in the module-level
        ``ani`` so that a reference to the running animation is kept.
    """
    global ani
    fig, ax = plt.subplots()
    time_text = ax.text(0.05, 0.9, '', transform=ax.transAxes)
    traces = []
    labels = []
    dot_plots = []
    for points,marker,label in data:
        dot_plots.append(ax.plot([], [],marker,markersize=10)[0])
        labels.append(label)
        traces.append(points)

    # Static artists are drawn once here rather than in init(): the init
    # function can be invoked again (e.g. when saving), which would otherwise
    # add the curve a second time.
    ax.plot(X,Z)
    ax.set_title(title)
    if dot_plots:
        # The handles never change, so the legend is built a single time
        # instead of on every frame.
        ax.legend(handles=dot_plots,labels=labels,loc='best')

    # Use the shortest trace so that no trace is indexed past its end, and
    # cope with `data` being empty.
    num_point = min((len(trace) for trace in traces), default=0)

    def init():
        time_text.set_text('')
        return ax,time_text

    def gen_dot():
        # Each frame: the i-th point of every trace followed by the index i.
        for i in range(num_point):
            new_point = [trace[i] for trace in traces]
            new_point.append(i)
            yield new_point

    def update_dot(newd):
        time_text.set_text('times=%s'%newd[-1])
        for idx,dot_plot in enumerate(dot_plots):
            # set_data expects sequences, so wrap the scalar coordinates.
            dot_plot.set_data([newd[idx][0]], [newd[idx][1]])
        return dot_plots


    ani = FuncAnimation(fig, update_dot,
                              frames = gen_dot,
                              interval = interval,
                              init_func=init,
                              repeat = False,
                              # The frame count cannot be inferred from a
                              # generator; state it so saving keeps every frame.
                              save_count = num_point
                           )

    plt.show()

    if file_name:
        ani.save(file_name,writer='imagemagick')

    return ani

In [11]:
# One entry per torch run: (trajectory, matplotlib marker format, legend label).
data = [
    (torch_points_SGD, 'ro', 'SGD'),
    (torch_points_Adam, 'kp', 'Adam'),
    (torch_points_Adagrad, 'b<', 'Adagrad'),
    (torch_points_Adadelta, 'k*', 'Adadelta'),
    (torch_points_RMSprop, 'rv', 'RMSprop'),
]

# Animate all trajectories over the curve (X, Z) with 500 ms between frames.
plot(data, X, Z, interval=500)

<matplotlib.animation.FuncAnimation at 0x7f08a2394be0>