In [4]:
def sgd_adagrad(parameters,sqrs,lr):
    eps=1e-10
    for param ,sqr in zip(parameters,sqrs):
        sqr[:]=sqr+param.grad.data**2
        div=lr/torch.sqrt(sqr+eps)*param.grad.data
        param.data=param.data-div

In [5]:
import numpy as np
import torch
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torch import nn
from torch.autograd import Variable
import time
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
def data_tf(x):
    """Convert an image into a flat float32 tensor.

    Used in this notebook as the ``transform`` of the MNIST datasets.

    Args:
        x: anything ``np.array`` can convert; values in 0..255 end up in -1..1.

    Returns:
        A 1-D ``torch.Tensor`` built from the flattened float32 array.
    """
    scaled = np.array(x, dtype="float32") / 255  # bring the data into 0..1
    centered = (scaled - 0.5) / 0.5  # standardize
    flat = centered.reshape((-1,))  # flatten
    return torch.from_numpy(flat)

# Build the training split first, then the test split; both are stored under
# ./data, downloaded on demand, and pass every image through data_tf.
train_set, test_set = (
    MNIST("./data", train=is_train, transform=data_tf, download=True)
    for is_train in (True, False)
)

In [7]:
# Cross-entropy loss; the training loop calls it on the network output and the labels.
criterion=nn.CrossEntropyLoss()

In [8]:
# Shuffled loader over the training set, one sample per batch.
train_data = DataLoader(dataset=train_set, batch_size=1, shuffle=True)

# Define the three-layer neural network with Sequential.
net = nn.Sequential(
    nn.Linear(in_features=784, out_features=200),
    nn.ReLU(),
    nn.Linear(in_features=200, out_features=10),
)

In [16]:
# Initialize the squared-gradient terms: one zero tensor per network parameter.
sqrts = [torch.zeros_like(param.data) for param in net.parameters()]

# Print the shapes of the first four accumulators, then how many there are.
for position in range(4):
    print(sqrts[position].shape)
print(len(sqrts))

torch.Size([200, 784])
torch.Size([200])
torch.Size([10, 200])
torch.Size([10])
4


In [None]:
# Train for five epochs with the hand-written Adagrad update.
losses1 = []  # loss sampled every 30 iterations
idx = 0  # iteration counter that runs across epochs
start = time.time()

for e in range(5):
    train_loss = 0
    for im, label in train_data:
        im = Variable(im)
        label = Variable(label)
        # Forward pass
        out = net(im)
        loss = criterion(out, label)
        # Backward pass
        net.zero_grad()
        loss.backward()
        # Pass the `sqrts` accumulators initialized for this network and a
        # literal 1e-2; `e-2` would be the epoch index minus two.
        sgd_adagrad(net.parameters(), sqrts, 1e-2)  # learning rate 0.01
        # Record the loss
        train_loss += loss.data.numpy()
        if idx % 30 == 0:
            losses1.append(loss.data.numpy())
        idx += 1
    # Mean loss per batch for this epoch
    print(e, train_loss / len(train_data))
end = time.time()
# Elapsed wall-clock seconds (end minus start, so the value is positive)
print(end - start)

In [17]:
# The Adagrad optimizer that ships with PyTorch, using the same 1e-2 learning rate.
optimizer=torch.optim.Adagrad(net.parameters(),lr=1e-2)

In [None]:
#(2) RMSProp 
def rmsprop(parameters, sqrs, lr, alpha):
    """Apply one RMSProp update to every parameter.

    Args:
        parameters: iterable of objects exposing ``.data`` and ``.grad.data``.
        sqrs: exponentially weighted averages of squared gradients, paired with
            ``parameters`` by position; each tensor is overwritten in place.
        lr: base learning rate.
        alpha: weight kept from the previous average; ``1 - alpha`` goes to the
            new squared gradient.
    """
    eps = 1e-10  # added under the square root so a zero average cannot divide by zero
    for weight, avg_sq in zip(parameters, sqrs):
        grad = weight.grad.data
        # Blend the old average with the new squared gradient; assigning through
        # the slice keeps the caller's tensor object.
        avg_sq[:] = alpha * avg_sq + (1 - alpha) * grad ** 2
        # Element-wise step size: the base rate divided by the root of the average.
        scale = lr / torch.sqrt(avg_sq + eps)
        weight.data = weight.data - scale * grad
# Usage: start from zero accumulators (one per parameter) and use alpha = 0.9.
sqrs = [torch.zeros_like(param.data) for param in net.parameters()]
rmsprop(net.parameters(), sqrs, 1e-3, 0.9)  # alpha is 0.9

# The RMSprop optimizer that ships with PyTorch
optimizer = torch.optim.RMSprop(net.parameters(), lr=1e-3, alpha=0.9)

In [None]:
#(3) Adadelta

def adadelta(parameters, sqrs, deltas, rho):
    """Apply one Adadelta update to every parameter.

    Args:
        parameters: iterable of objects exposing ``.data`` and ``.grad.data``.
        sqrs: exponentially weighted averages of squared gradients, paired with
            ``parameters`` by position; overwritten in place.
        deltas: exponentially weighted averages of squared updates, paired the
            same way; overwritten in place.
        rho: weight kept from each previous average.
    """
    eps = 1e-6  # added under both square roots
    for weight, avg_sq_grad, avg_sq_step in zip(parameters, sqrs, deltas):
        grad = weight.grad.data
        avg_sq_grad[:] = rho * avg_sq_grad + (1 - rho) * grad ** 2
        # The ratio of the two running roots stands in for a learning rate.
        ratio = torch.sqrt(avg_sq_step + eps) / torch.sqrt(avg_sq_grad + eps)
        step = ratio * grad
        # The update average is refreshed with the step that is about to be applied.
        avg_sq_step[:] = rho * avg_sq_step + (1 - rho) * step ** 2
        weight.data = weight.data - step
        
# Initialize the squared-gradient terms and the delta terms: one zero tensor
# of each kind per parameter.
sqrs = []
deltas = []
for param in net.parameters():
    sqrs.append(torch.zeros_like(param.data))
    deltas.append(torch.zeros_like(param.data))

adadelta(net.parameters(), sqrs, deltas, 0.9)

# The Adadelta optimizer that ships with PyTorch
optimizer = torch.optim.Adadelta(net.parameters(), rho=0.9)

In [None]:
# (4) Adam: combines momentum with RMSProp
def adam(parameters, vs, sqrts, lr, t, beta1=0.9, beta2=0.999):
    """Apply one Adam update to every parameter.

    Args:
        parameters: iterable of objects exposing ``.data`` and ``.grad.data``.
        vs: exponentially weighted averages of the gradients (momentum terms),
            paired with ``parameters`` by position; overwritten in place.
        sqrts: exponentially weighted averages of the squared gradients, paired
            the same way; overwritten in place.
        lr: base learning rate.
        t: 1-based count of the update being applied, used for bias correction.
        beta1: weight kept from the previous gradient average.
        beta2: weight kept from the previous squared-gradient average.

    Raises:
        ValueError: if ``t`` is below 1, where the bias correction would divide
            by zero.
    """
    if t < 1:
        raise ValueError("t must be >= 1 for bias correction")
    eps = 1e-8  # added under the square root
    # Iterate over the accumulators that were passed in, not a global list.
    for param, v, sqr in zip(parameters, vs, sqrts):
        grad = param.grad.data
        v[:] = beta1 * v + (1 - beta1) * grad
        # Moving average of the squared gradient: the old average enters once,
        # it is not squared again.
        sqr[:] = beta2 * sqr + (1 - beta2) * grad ** 2
        # Both averages start at zero; dividing by (1 - beta**t) undoes that bias.
        v_hat = v / (1 - beta1 ** t)
        s_hat = sqr / (1 - beta2 ** t)
        param.data = param.data - lr * v_hat / torch.sqrt(s_hat + eps)

# Initialize the squared-gradient terms and the momentum terms: one zero tensor
# of each kind per parameter.
sqrs = []
vs = []
for param in net.parameters():
    sqrs.append(torch.zeros_like(param.data))
    vs.append(torch.zeros_like(param.data))

# Step counter for the bias correction: 1 for the first update, to be
# incremented before every further call.
t = 1
# Pass the accumulators created just above.
adam(net.parameters(), vs, sqrs, 1e-3, t)

In [None]:
# The Adam optimizer that ships with PyTorch
optimizer=torch.optim.Adam(net.parameters(),lr=1e-3)