In [4]:
import torch

In [5]:
# Gradient descent
# Example 1: let autograd differentiate y = x^2 at x = 1
x = torch.tensor(1, dtype=torch.float32, requires_grad=True)
y = x ** 2
torch.autograd.grad(outputs=y, inputs=x)

(tensor(2.),)

In [6]:
# Example 2: the same scalar input, later passed through a sigmoid and a log-loss
x = torch.tensor(1, dtype=torch.float32, requires_grad=True)
z = x ** 2

In [7]:
# Ground-truth label.
# NOTE(review): the loss built from this value has the binary cross-entropy form,
# which is normally paired with labels in {0, 1}; the value 2 and requires_grad=True
# on a label look like experimentation rather than intent — confirm.
y = torch.tensor(2,requires_grad=True,dtype=torch.float32)

In [8]:
sigma = torch.sigmoid(z)  # sigmoid of z, used as the predicted probability in the loss

In [9]:
# Log-loss (binary cross-entropy form) of the sigmoid output against the label y
positive_term = y * torch.log(sigma)
negative_term = (1 - y) * torch.log(1 - sigma)
loss = -(positive_term + negative_term)

In [10]:
# Differentiates the loss with respect to the label y (not the input x).
# d(loss)/dy = log(1 - sigma) - log(sigma) = -z, and z = x**2 = 1, hence the -1 shown below.
# NOTE(review): if the gradient with respect to x was intended, the second argument
# should be x — confirm.
torch.autograd.grad(loss,y)

(tensor(-1.0000),)

In [11]:
# 反向传播

# 3分类，500个样本，20个特征，共3层
# 第一层13个神经元，第二层8个神经元

In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [13]:
# Synthetic data: 500 samples x 20 features scaled by 100, labels drawn from {0, 1, 2}
torch.manual_seed(420)
n_samples, n_features, n_classes = 500, 20, 3
X = 100 * torch.rand(n_samples, n_features, dtype=torch.float32)
y = torch.randint(0, n_classes, (n_samples,), dtype=torch.float32)

In [14]:
# Network architecture
class Net(nn.Module):
    """Three-layer fully connected classifier: in_features -> 13 -> 8 -> out_features.

    The two hidden layers are created without a bias term; the output layer has one.
    """

    def __init__(self, in_features=40, out_features=2):
        super().__init__()
        hidden1, hidden2 = 13, 8
        # Layers are created in the order linear1, linear2, output so that a seeded
        # initialisation produces the same weights as before.
        self.linear1 = nn.Linear(in_features, hidden1, bias=False)
        self.linear2 = nn.Linear(hidden1, hidden2, bias=False)
        self.output = nn.Linear(hidden2, out_features, bias=True)

    def forward(self, x):
        """Return the raw output of the last linear layer for input ``x``."""
        hidden = torch.relu(self.linear1(x))  # activation applied to the layer's linear result
        hidden = torch.sigmoid(self.linear2(hidden))
        # For three-class problems, neither log-softmax + NLLLoss nor cross-entropy needs a
        # sigmoid on the output: the last layer ends either with log-softmax or with nothing.
        # For binary problems consider BCE / BCEWithLogitsLoss.
        return self.output(hidden)

In [15]:
# Layer sizes are derived from the data rather than typed in by hand
input_ = X.shape[-1]  # number of feature columns
distinct_labels = y.unique()
output_ = len(distinct_labels)  # number of classes

In [16]:
# Re-seed so the randomly initialised weights are reproducible, then build the network
torch.manual_seed(420)
net = Net(input_, output_)

In [17]:
# Forward pass. The module is called directly instead of through .forward() so that
# nn.Module.__call__ runs any registered hooks; the result is the same when none are set.
zhat = net(X)

In [18]:
# Loss function: cross-entropy computed on the raw network outputs
from torch.nn import CrossEntropyLoss as CEL
criterion = CEL()
targets = y.long()  # cross-entropy only accepts integer-typed labels
loss = criterion(zhat, targets)

In [19]:
loss

tensor(1.1559, grad_fn=<NllLossBackward>)

In [20]:
net.linear1.weight.grad #还没有梯度

In [21]:
# Back-propagate; retain_graph=True keeps the graph so backward() can be called again later
loss.backward(retain_graph=True)

In [22]:
net.linear1.weight.grad

tensor([[-3.0294e-04, -7.4870e-05, -3.6707e-04, -3.8675e-05, -1.3070e-04,
         -5.8467e-05, -3.3313e-04, -2.5863e-04, -9.4272e-05, -4.2617e-05,
         -7.9181e-05, -1.5179e-04, -9.2079e-05, -3.2513e-04, -1.0894e-04,
         -6.5035e-05, -1.3006e-04, -4.4115e-06, -9.6740e-05, -8.5472e-05],
        [ 1.0099e-02, -9.9187e-04,  1.2005e-02,  1.0411e-03,  5.1961e-03,
          3.5567e-03,  6.0618e-03,  3.9976e-03,  1.4981e-02,  9.0948e-03,
          6.0929e-03,  7.5188e-03,  1.3888e-02,  1.1927e-03,  8.9806e-03,
          9.8215e-03,  1.7637e-02,  1.0377e-02,  1.6778e-03,  7.8359e-04],
        [-1.0467e-02,  7.9738e-03,  3.0350e-03,  7.4985e-03,  4.3230e-03,
          9.8743e-04,  9.2578e-03,  6.3660e-03, -5.8071e-03,  1.1588e-03,
         -2.6623e-03,  6.3559e-03, -2.9338e-03, -4.4924e-03, -5.0406e-03,
          8.4390e-03, -1.0344e-02, -8.5044e-03,  2.1374e-03, -1.2447e-03],
        [-1.3464e-02, -1.0530e-02, -8.3316e-03, -1.5648e-02, -1.2376e-02,
         -1.6419e-02, -9.7809e-03, 

In [23]:
net.linear1.weight.grad.shape

torch.Size([13, 20])

In [24]:
# Unless the forward pass is run again, back-propagation can normally be executed only once.
# NOTE(review): this second call succeeds only because the earlier call passed
# retain_graph=True. Autograd adds into .grad instead of overwriting it, so the
# gradients read after this cell are presumably the sum of both passes — confirm
# before using them for a weight update.
loss.backward()

In [25]:
net.linear1.weight.grad.shape

torch.Size([13, 20])

In [26]:
# backward是靠requires_grad = True来求解梯度的
# 正常梯度下降和反向传播过程中，X和y不需要计算导数，不设置requires_grad,默认其为False,以节约计算资源
# 如果自己设置w，则一定要设置requires_grad=True

In [27]:
# w(t+1) = w(t) - 步长*grad

In [28]:
# Learning rate; typical values are 0.001, 0.01 or 0.05.
# NOTE(review): 10 is far outside that range — presumably chosen so the change in w
# is visible after a single step; confirm.
lr = 10

In [31]:
# Raw weight tensor of the first layer; it is updated in place a few cells below.
# NOTE(review): relies on .data sharing storage with the parameter so that the in-place
# update also changes the network's weights — confirm.
w = net.linear1.weight.data

In [32]:
# Gradient stored on the first layer's weights by the earlier loss.backward() calls
dw = net.linear1.weight.grad

In [33]:
# One plain gradient-descent step: w(t+1) = w(t) - lr * dw
step = lr * dw
w -= step

In [34]:
w

tensor([[ 0.1426, -0.1331,  0.2201, -0.1769, -0.0656, -0.1529,  0.1791,  0.0891,
         -0.1096, -0.1721, -0.1279, -0.0401, -0.1123,  0.1694, -0.0919, -0.1450,
         -0.0664, -0.2183, -0.1067, -0.1203],
        [-0.1538,  0.2017, -0.2160, -0.1511, -0.0119, -0.1663, -0.2271, -0.1228,
         -0.4163, -0.1573,  0.0597, -0.1199, -0.1427, -0.2181, -0.3555, -0.2262,
         -0.3525, -0.0680, -0.2302,  0.0776],
        [ 0.0174, -0.1234,  0.0871, -0.1191, -0.0151,  0.1224,  0.0361, -0.2676,
          0.1894,  0.1609,  0.1804, -0.3296, -0.0964, -0.1291,  0.1998,  0.0525,
         -0.0097,  0.3489, -0.2520, -0.0024],
        [ 0.4507,  0.1754,  0.1914,  0.4759,  0.0600,  0.3850,  0.0865,  0.5287,
         -0.0435,  0.3413,  0.3482,  0.0230, -0.0789,  0.3571,  0.0130,  0.4151,
          0.2242,  0.0304,  0.4655,  0.0487],
        [ 0.5074,  0.5780,  0.3428,  0.5796,  0.2840,  0.1219,  0.1426,  0.6944,
          0.4039,  0.7053, -0.0789,  0.5690,  0.1387,  0.0774,  0.1352,  0.5968,
      

In [35]:
# 动量法momentum

In [36]:
# v(t) = gamma * v(t-1) - lr * dw
# w(t+1) = w(t) + v(t) 上一步和这一步方向的向量和

In [37]:
lr = 0.1  # step size applied to the gradient in the momentum update
gamma = 0.9  # momentum coefficient: weight given to the previous velocity v(t-1)

In [38]:
# Gradient currently stored on the first layer's weights (no new backward pass has run)
dw = net.linear1.weight.grad

In [39]:
# Raw weight tensor of the first layer; the momentum step below adds v to it in place
w = net.linear1.weight.data

In [40]:
# t = 1,走第一步，首次迭代，初始化v0

In [42]:
# Initial velocity v0 = 0. It must have the same structure as the gradient so the two
# can be combined; zeros_like copies shape, dtype and device from dw and therefore
# also works when dw is not two-dimensional.
v = torch.zeros_like(dw)

In [49]:
# One momentum iteration: v(t) = gamma * v(t-1) - lr * dw, then w(t+1) = w(t) + v(t)
momentum_term = gamma * v
gradient_term = lr * dw
v = momentum_term - gradient_term
w += v

In [50]:
w

tensor([[ 1.4317e-01, -1.3296e-01,  2.2081e-01, -1.7678e-01, -6.5368e-02,
         -1.5282e-01,  1.7972e-01,  8.9526e-02, -1.0947e-01, -1.7201e-01,
         -1.2774e-01, -3.9827e-02, -1.1212e-01,  1.7004e-01, -9.1706e-02,
         -1.4487e-01, -6.6146e-02, -2.1827e-01, -1.0648e-01, -1.2013e-01],
        [-1.7212e-01,  2.0349e-01, -2.3769e-01, -1.5302e-01, -2.1269e-02,
         -1.7277e-01, -2.3804e-01, -1.3004e-01, -4.4342e-01, -1.7377e-01,
          4.8634e-02, -1.3348e-01, -1.6785e-01, -2.2027e-01, -3.7178e-01,
         -2.4396e-01, -3.8446e-01, -8.6759e-02, -2.3325e-01,  7.6203e-02],
        [ 3.6367e-02, -1.3784e-01,  8.1585e-02, -1.3270e-01, -2.2892e-02,
          1.2063e-01,  1.9306e-02, -2.7916e-01,  1.9991e-01,  1.5882e-01,
          1.8522e-01, -3.4115e-01, -9.1099e-02, -1.2101e-01,  2.0892e-01,
          3.7182e-02,  9.0076e-03,  3.6428e-01, -2.5583e-01, -1.2852e-04],
        [ 4.7509e-01,  1.9449e-01,  2.0651e-01,  5.0426e-01,  8.2361e-02,
          4.1469e-01,  1.0421e-01, 

In [51]:
# torch.optim 优化算法模块

In [None]:
# """
# 导入库
# 确定数据、超参数（lr，gamma）
# 定义神经网络的架构 Model,类Model需要输入的参数
# 实例化神经网络的类 - 让神经网络准备好正向传播
# 定义损失函数
# 定义优化算法
# """

In [53]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F


# Data: 500 samples x 20 features scaled by 100, labels drawn from {0, 1, 2}
torch.manual_seed(420)
n_samples, n_features, n_classes = 500, 20, 3
X = 100 * torch.rand(n_samples, n_features, dtype=torch.float32)
y = torch.randint(0, n_classes, (n_samples,), dtype=torch.float32)

# Hyper-parameters: learning rate and momentum coefficient
lr, gamma = 0.1, 0.9

# Network architecture
class Net(nn.Module):
    """Three-layer fully connected classifier: in_features -> 13 -> 8 -> out_features.

    The two hidden layers are created without a bias term; the output layer has one.
    """

    def __init__(self, in_features=40, out_features=2):
        super().__init__()
        hidden1, hidden2 = 13, 8
        # Layers are created in the order linear1, linear2, output so that a seeded
        # initialisation produces the same weights as before.
        self.linear1 = nn.Linear(in_features, hidden1, bias=False)
        self.linear2 = nn.Linear(hidden1, hidden2, bias=False)
        self.output = nn.Linear(hidden2, out_features, bias=True)

    def forward(self, x):
        """Return the raw output of the last linear layer for input ``x``."""
        hidden = torch.relu(self.linear1(x))  # activation applied to the layer's linear result
        hidden = torch.sigmoid(self.linear2(hidden))
        # For three-class problems, neither log-softmax + NLLLoss nor cross-entropy needs a
        # sigmoid on the output: the last layer ends either with log-softmax or with nothing.
        # For binary problems consider BCE / BCEWithLogitsLoss.
        return self.output(hidden)

input_ = X.shape[-1]  # number of features
output_ = len(y.unique())  # number of classes

# Instantiate the network (re-seeded so the initial weights are reproducible)
torch.manual_seed(420)
net = Net(input_, output_)

# Loss function
from torch.nn import CrossEntropyLoss as CEL
criterion = CEL()

# Optimisation algorithm: SGD with momentum.
# net.parameters() exports all weights and biases of the current architecture at once,
# whereas net.linear1.weight covers a single layer only.
opt = optim.SGD(net.parameters(), lr=lr, momentum=gamma)

In [54]:
# """
# 梯度下降流程
# 向前传播
# 本轮向前传播的损失函数值
# 反向传播 -》 得到了梯度
# 更新权重（和动量）
# 清空梯度：清除上一次迭代的坐标计算的梯度
# """

'\n梯度下降流程\n向前传播\n本轮向前传播的损失函数值\n反向传播 -》 得到了梯度\n更新权重（和动量）\n清空梯度：清楚上一次迭代的坐标计算的梯度\n'

In [71]:
# One complete gradient-descent iteration.
# The module is called directly (not via .forward()) so nn.Module.__call__ can run hooks.
zhat = net(X)  # forward pass: output of the last linear layer
# reshape(-1) flattens the labels without hard-coding the number of samples
loss = criterion(zhat, y.reshape(-1).long())  # loss of this forward pass
loss.backward()  # back-propagation: fills in the gradients
opt.step()  # take one step: update the weights w and the momentum v
opt.zero_grad()  # clear the gradients before the next iteration

print(loss)
print(net.linear1.weight.data[0][:10])

tensor(1.0606, grad_fn=<NllLossBackward>)
tensor([ 0.1430, -0.1330,  0.2206, -0.1768, -0.0655, -0.1529,  0.1795,  0.0893,
        -0.1095, -0.1720])


In [72]:
# 传统梯度下降GD
# 小批量梯度下降 mini-batch SGD 每次随机同样数量的数据
# 优化算法的目标：全局最优
# 梯度下降每次数据完整，X相同，方向与w有关，小批量梯度下降方向与X和w有关，更可能跳出局部最小
# 小批量梯度下降由于方向不太明确，迭代次数可能比梯度下降更多，也可能更快

In [73]:
# epoch 全体数据一共被学习了多少次
# 每次迭代都只使用了一部分数据，1个epoch需要多次迭代，全体数据都使用了一次
# 一个epoch需要的迭代次数n：n = m个样本 / NB 小批量batch_size

In [79]:
epoch = 60  # let the network learn from the data 60 times
batch = 10  # split the full data X into 10 batches each time

In [80]:
# Gradient-descent iterations: `epoch` passes of `batch` updates each.
# The loop counters have their own names. Reusing `batch` as the inner loop variable
# overwrote the batch count, so every epoch ran one update fewer than the previous one
# (10, 9, 8, ... 0) instead of a constant 10. `epoch` and `batch` keep their values.
# NOTE(review): every update still feeds the full X; no mini-batch slicing happens here.
for _epoch in range(epoch):
    for _batch in range(batch):
        # Call the module directly (not .forward()) so nn.Module.__call__ can run hooks
        zhat = net(X)  # forward pass: output of the last linear layer
        loss = criterion(zhat, y.reshape(-1).long())  # loss of this forward pass
        loss.backward()  # back-propagation: fills in the gradients
        opt.step()  # take one step: update the weights w and the momentum v
        opt.zero_grad()  # clear the gradients before the next update

In [81]:
# 数据预处理 torch.utils
# TensorDataset合并与DataLoader分割

In [82]:
import torch
from torch.utils.data import TensorDataset

In [83]:
a = torch.randn(500,2,3) # 3-D data: 500 samples, each a 2x3 table

In [84]:
b = torch.randn(500,3,4,5) # 4-D data: 500 samples, image-like 3x4x5 each

In [85]:
c = torch.randn(500,1) # 2-D data: one label value per sample

In [86]:
# 合并abc，被合并数据第一维度上的值相等

In [87]:
# Merge a, b and c sample by sample and show the first item, which is a tuple
merged = TensorDataset(a, b, c)
x = next(iter(merged))
print(x)

(tensor([[ 0.0555,  0.0347, -0.0640],
        [-0.6151,  0.5850, -1.3424]]), tensor([[[ 1.4229,  0.3269, -0.7064,  0.4886, -0.4457],
         [-0.1819,  1.3381, -0.0515,  0.9612,  0.6173],
         [ 2.1468,  0.0329, -1.3354, -0.2216, -1.2585],
         [-0.0606, -0.7752,  1.5580,  0.8701,  2.0751]],

        [[-0.4195,  0.3641,  1.1461,  1.3315,  0.6182],
         [ 0.4945,  0.4110,  0.4114, -1.9308, -0.2237],
         [ 0.4374,  0.4338,  0.5920,  0.7556, -0.4258],
         [ 1.5789, -0.1794, -0.5889,  1.8905, -0.7718]],

        [[-0.7557, -1.2767,  1.0856,  0.7704,  2.3633],
         [ 0.0490, -0.9121, -0.0489, -1.2371, -1.2507],
         [-2.2677, -0.1536, -0.2799, -0.9272,  1.4546],
         [-0.8360, -0.3864, -0.9757, -0.5694,  0.2240]]]), tensor([-0.1178]))


In [88]:
from torch.utils.data import DataLoader

In [93]:
data = TensorDataset(b,c)  # pair the 4-D tensor b with the label tensor c, sample by sample

In [94]:
# With default settings the loader yields one sample at a time; show the first item, a list
loader = DataLoader(data)
x = next(iter(loader))
print(x)

[tensor([[[[ 1.4229,  0.3269, -0.7064,  0.4886, -0.4457],
          [-0.1819,  1.3381, -0.0515,  0.9612,  0.6173],
          [ 2.1468,  0.0329, -1.3354, -0.2216, -1.2585],
          [-0.0606, -0.7752,  1.5580,  0.8701,  2.0751]],

         [[-0.4195,  0.3641,  1.1461,  1.3315,  0.6182],
          [ 0.4945,  0.4110,  0.4114, -1.9308, -0.2237],
          [ 0.4374,  0.4338,  0.5920,  0.7556, -0.4258],
          [ 1.5789, -0.1794, -0.5889,  1.8905, -0.7718]],

         [[-0.7557, -1.2767,  1.0856,  0.7704,  2.3633],
          [ 0.0490, -0.9121, -0.0489, -1.2371, -1.2507],
          [-2.2677, -0.1536, -0.2799, -0.9272,  1.4546],
          [-0.8360, -0.3864, -0.9757, -0.5694,  0.2240]]]]), tensor([[-0.1178]])]


In [95]:
bs = 120
dataset = DataLoader(
    data,
    batch_size=bs,
    shuffle=True,     # shuffle the data before splitting it into mini-batches
    drop_last=False,  # keep the final short batch (500 samples in batches of 120 leaves 20)
)

In [107]:
dataset

<torch.utils.data.dataloader.DataLoader at 0x7febe15501c0>

In [102]:
# Take the first batch only and inspect it
i = next(iter(dataset))
print(i)  # [tensor of one batch of features, tensor of one batch of labels]
print(i[0].shape)  # features of the batch
print(i[1].shape)  # labels of the batch

[tensor([[[[-1.3346e+00, -9.5659e-01, -4.8822e-01,  3.3623e+00, -9.4634e-01],
          [ 5.5837e-01,  9.0281e-01,  1.1138e+00, -6.3412e-01,  6.5767e-01],
          [-4.6821e-01,  9.1996e-01,  1.7677e-01, -2.1329e+00, -1.0322e+00],
          [-1.0133e+00,  3.7822e-01,  6.0810e-01, -1.7115e-02, -2.6411e-03]],

         [[-1.1860e+00,  1.2023e-01, -2.3184e+00, -3.5003e-01, -8.0200e-01],
          [ 4.0105e-01, -1.4007e+00, -1.5301e+00,  8.1988e-01, -1.3349e-02],
          [ 7.3771e-01,  1.0310e-01,  6.1815e-01,  9.1976e-01, -1.1814e+00],
          [ 8.6148e-01, -1.8494e+00,  1.3333e+00,  5.3396e-01,  1.1866e+00]],

         [[ 2.4922e+00,  7.6012e-01, -9.8579e-01, -3.1428e-01,  2.4000e-02],
          [-4.0303e-01,  4.9993e-01,  1.7167e+00,  5.3382e-01,  2.5988e-02],
          [ 2.6928e-01, -4.9147e-01, -1.0665e+00,  1.5698e+00, -3.1296e+00],
          [ 5.1460e-01, -1.2841e+00,  3.8885e-01, -2.5729e-01,  9.3518e-01]]],


        [[[ 1.5414e+00,  2.4092e-01, -1.5182e-01,  5.2668e-01,  2.5

In [98]:
len(dataset) # 一共有多少个batch

5

In [103]:
dataset.dataset #展示里面全部的数据

<torch.utils.data.dataset.TensorDataset at 0x7febe1550610>

In [104]:
len(dataset.dataset) # 全部样本数

500

In [99]:
dataset.dataset[0] # 单个样本

(tensor([[[ 1.4229,  0.3269, -0.7064,  0.4886, -0.4457],
          [-0.1819,  1.3381, -0.0515,  0.9612,  0.6173],
          [ 2.1468,  0.0329, -1.3354, -0.2216, -1.2585],
          [-0.0606, -0.7752,  1.5580,  0.8701,  2.0751]],
 
         [[-0.4195,  0.3641,  1.1461,  1.3315,  0.6182],
          [ 0.4945,  0.4110,  0.4114, -1.9308, -0.2237],
          [ 0.4374,  0.4338,  0.5920,  0.7556, -0.4258],
          [ 1.5789, -0.1794, -0.5889,  1.8905, -0.7718]],
 
         [[-0.7557, -1.2767,  1.0856,  0.7704,  2.3633],
          [ 0.0490, -0.9121, -0.0489, -1.2371, -1.2507],
          [-2.2677, -0.1536, -0.2799, -0.9272,  1.4546],
          [-0.8360, -0.3864, -0.9757, -0.5694,  0.2240]]]),
 tensor([-0.1178]))

In [105]:
dataset.dataset[0][1] # 单个样本标签

tensor([-0.1178])

In [106]:
dataset.batch_size # 查看现有的batch_size

120