# 如何写一个训练的代码
一个训练部分的代码其实说简单也很简单，我觉得就可以分为以下几步：

1. 选择损失函数和优化器。
2. 把数据和模型加载到设备上。
3. 输入神经网络获得输出，计算损失。
4. 梯度清零，反向传播，更新参数。

### 这里先把前面的代码给复制过来哈

In [35]:
import torch
from torch.utils.checkpoint import checkpoint
from torch.utils.data import DataLoader, Dataset
# 1000 random 2-D sample points produced by torch.rand.
datas=torch.rand(1000,2)
# Labelling function for the toy data set.
def get_label(data):
    """Return the binary class label of a 2-D point.

    ``data`` is indexed as ``(x, y)``. The label is 0 when
    ``x*x - y >= 0`` and 1 otherwise.
    """
    margin = data[0] * data[0] - data[1]
    return 0 if margin >= 0 else 1

# One integer label (0 or 1) per sample, in the same order as datas.
labels=[get_label(data)for data in datas]

class XY_Dataset(Dataset):
    """Map-style dataset that pairs each sample with its label.

    ``datas`` and ``labels`` are stored as given and indexed in parallel.
    The label is converted to a float32 tensor each time an item is read.
    """

    def __init__(self, datas, labels):
        self.datas, self.labels = datas, labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, item):
        sample = self.datas[item]
        target = torch.tensor(self.labels[item], dtype=torch.float32)
        return sample, target

# Wrap the samples and their labels in a Dataset.
xy_dataset=XY_Dataset(datas,labels)
# Split the dataset: a fraction p goes to training, the remainder to testing.
p=0.9
train_size=int(len(xy_dataset)*p)
test_size=len(xy_dataset)-train_size
train_dataset,test_dataset=torch.utils.data.random_split(xy_dataset,[train_size,test_size])
print(f"train size: {len(train_dataset) }")
print(f"test size: {len(test_dataset)}")

# DataLoader arguments: the first is the dataset, batch_size sets the number of
# samples per batch, shuffle controls whether the order is shuffled, and
# num_workers sets the number of loader worker processes (more workers can
# speed up training).
train_loader=DataLoader(train_dataset,batch_size=16,shuffle=True,num_workers=0)
test_loader=DataLoader(test_dataset,batch_size=16,shuffle=False,num_workers=0)

train size: 900
test size: 100


In [36]:
import torch
import torch.nn as nn
class Net(nn.Module):
    """Two-layer fully connected network with a sigmoid output.

    Computes ``sigmoid(layer2(relu(layer1(x))))``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        # The parent class must be initialised first; this call cannot be omitted.
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        hidden = self.layer1(x)
        hidden = self.relu(hidden)
        logits = self.layer2(hidden)
        return self.sigmoid(logits)

## 训练部分
查看设备，并且把模型放到设备上(我的电脑不能用cuda..麻了...)

In [127]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
# Build the network (2 inputs, 15 hidden units, 1 output) and move it to the device.
model = Net(2,15,1)
model = model.to(device)

cpu


### 设置损失函数和优化器

In [128]:
import torch.nn as nn

# Binary cross-entropy on probabilities; pairs with the sigmoid at the end of Net.
loss_fn=nn.BCELoss()
# Adam optimizer over all model parameters with learning rate 0.001.
optimizer=torch.optim.Adam(model.parameters(),lr=0.001)

### 损失函数
关于损失函数有个注意事项:
### 最后一层加上了sigmoid函数
二分类用`nn.BCELoss`（输入必须已经是sigmoid之后的概率）。
### 最后一层没有sigmoid/softmax激活
多分类用`nn.CrossEntropyLoss`（多分类交叉熵内部已经包含了softmax，不用再在神经网络中添加）；二分类用`nn.BCEWithLogitsLoss()`（内部已经包含了sigmoid）。

### 将数据加载到设备/计算损失/反向传播
这里可以注意的点可太多了:
1. 首先在每一轮开始前需要把model设置为训练模式，这一步是非常有必要的！！因为只有设置了这一步，dropout层和batchnorm层才会按训练时的行为工作（dropout随机失活，batchnorm使用当前batch的统计量并更新滑动平均）。
2. 数据和标签都要放到设备上
3. 每次反向传播前要先将梯度清零，否则梯度会在多个batch之间累加
4. 定期打印loss

### 刚刚犯错误了
1. 我在设计神经网络时把输出层设置为了2，因为我想的总共要分为两类，但是当我用了BCE_Loss时，对应的输出应该只有一个。

BCELOSS:`−[ylog(p)+(1−y)log(1−p)]`,默认p是正类的概率。

当我的输出设置为2时，实际上应该用多分类交叉熵，这样会分别计算是负类和正类的概率。

2. 我的output是[16, 1]，labels是[16]，需要统一维度，这里通过`squeeze(1)`去掉output中大小为1的第二个维度（dim=1）

3. 我的label处理时写的是labels=torch.tensor([get_label(data)for data in datas])，但是标准写法应该是在getitem中转为tensor，BCE用float32，CE用long类型

### 用tensorboard绘制损失

1. `SummaryWriter("")`里面指定保存路径
2. `writer.add_scalar(name,记录的值，global_step)` 记录损失，根据name部分来画图
3. 要手动维护global_step（每个batch自己加一）。
4. 查看数据：tensorboard --logdir=绝对路径
5. 注意，要想用全局变量，并且在多次调用时接着上次的值续写的话，就不能把它当参数传入，不然只会修改函数内的局部副本；在函数内修改前要先用`global`声明

In [119]:

# Module-level step counter; train() declares it global, increments it once
# per batch and passes it to writer.add_scalar as the step value.
global_step=0

In [120]:
# Number of passes over the training set.
epochs=30
# Sanity check: show one (sample, label) pair from the training split.
print(train_dataset[0])
from torch.utils.tensorboard import SummaryWriter

def train(model,train_loader,device,epochs,loss_fn=nn.BCELoss(),optimizer=None):
    """Train ``model`` and log the loss to TensorBoard.

    For every batch the data and labels are moved to ``device``, the model
    output is squeezed along dim 1 to match the labels, the loss is
    back-propagated and the optimizer is stepped. Every 10th batch the loss is
    printed and written to ``./train_loss_log`` under the tag ``"Loss"``.
    Every 10th epoch (starting with epoch 0) a checkpoint containing the
    epoch, model state, optimizer state and last batch loss is written to
    ``./checkpoint``.

    Requires a module-level ``global_step`` integer to exist before the call;
    it is incremented once per batch so that repeated calls continue the same
    TensorBoard curve.

    Args:
        model: the network to train.
        train_loader: iterable yielding ``(datas, labels)`` batches.
        device: device the batches are moved to.
        epochs: number of passes over ``train_loader``.
        loss_fn: loss applied to ``(output, labels)``.
        optimizer: optimizer to use; defaults to Adam with ``lr=0.001``.
    """
    import os

    global global_step
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # torch.save does not create missing directories.
    os.makedirs("./checkpoint", exist_ok=True)

    writer = SummaryWriter("./train_loss_log")
    try:
        for epoch in range(epochs):
            model.train()
            # Stays None when the loader yields no batch in this epoch.
            loss = None
            for i, (datas, labels) in enumerate(train_loader):
                datas = datas.to(device)
                labels = labels.to(device)
                output = model(datas).squeeze(1)
                loss = loss_fn(output, labels)
                # Clear gradients left over from the previous step.
                optimizer.zero_grad()
                # Back-propagate.
                loss.backward()
                # Update the weights.
                optimizer.step()
                global_step += 1
                if i % 10 == 0:
                    loss_value = loss.item()
                    print(f"epoch:{epoch}_{i},loss:{loss_value}")
                    writer.add_scalar("Loss", loss_value, global_step)

            # Skip the periodic checkpoint when no batch was processed.
            if epoch % 10 == 0 and loss is not None:
                torch.save({
                    "epoch":epoch,
                    "model_state_dict":model.state_dict(),
                    "optimizer_state_dict":optimizer.state_dict(),
                    "loss":loss.item(),
                },f"./checkpoint/checkpoint_epoch{epoch}_loss{loss.item()}.pth")
    finally:
        # Flush pending events and release the log file even on failure.
        writer.close()



(tensor([0.7741, 0.5030]), tensor(0.))


In [125]:
# Run the training loop; no optimizer is passed, so train() creates its own Adam (lr=0.001).
train(model,train_loader,device, epochs,loss_fn)

epoch:0_0,loss:0.08425977826118469
epoch:0_10,loss:0.10224828124046326
epoch:0_20,loss:0.16538560390472412
epoch:0_30,loss:0.1344987452030182
epoch:0_40,loss:0.17102794349193573
epoch:0_50,loss:0.3085537850856781
epoch:1_0,loss:0.11063604056835175
epoch:1_10,loss:0.08869641274213791
epoch:1_20,loss:0.06605039536952972
epoch:1_30,loss:0.07933147996664047
epoch:1_40,loss:0.21494369208812714
epoch:1_50,loss:0.23778650164604187
epoch:2_0,loss:0.06790468096733093
epoch:2_10,loss:0.15987886488437653
epoch:2_20,loss:0.16492272913455963
epoch:2_30,loss:0.14934711158275604
epoch:2_40,loss:0.08998394757509232
epoch:2_50,loss:0.1396142989397049
epoch:3_0,loss:0.22788046300411224
epoch:3_10,loss:0.08856367319822311
epoch:3_20,loss:0.13846871256828308
epoch:3_30,loss:0.09153831005096436
epoch:3_40,loss:0.11660245805978775
epoch:3_50,loss:0.20029005408287048
epoch:4_0,loss:0.16817080974578857
epoch:4_10,loss:0.14793434739112854
epoch:4_20,loss:0.10069829225540161
epoch:4_30,loss:0.18366087973117828


# 接下来是测试代码
其实和训练代码大差不差，只有几个需要注意的地方：
1. 计算损失时统计全部的损失再算平均
2. 切换model.eval()模式

In [126]:
def validation(model,test_loader,device,loss_fn=nn.BCELoss()):
    """Evaluate a binary classifier on ``test_loader`` without tracking gradients.

    The model is switched to eval mode and moved to ``device``. For each batch
    the output is squeezed along dim 1, thresholded at 0.5 to obtain the
    predicted class, and compared with the labels.

    Args:
        model: the network to evaluate.
        test_loader: iterable yielding ``(datas, labels)`` batches.
        device: device the model and batches are moved to.
        loss_fn: loss applied to ``(output, labels)``.

    Returns:
        A tuple ``(average_loss, accuracy)``. ``average_loss`` is the mean of
        the per-batch loss values; ``accuracy`` is the fraction of correctly
        classified samples.

    Raises:
        ValueError: if ``test_loader`` yields no samples, because neither
            metric is defined in that case.
    """
    model.eval()
    model = model.to(device)
    loss_sum = 0.0
    batch_count = 0
    correct = 0
    sample_count = 0
    # No computation graph is needed during evaluation.
    with torch.no_grad():
        for datas, labels in test_loader:
            datas = datas.to(device)
            labels = labels.to(device)
            output = model(datas).squeeze(1)
            predictions = output > 0.5
            correct += (predictions == labels).sum().item()
            loss_sum += loss_fn(output, labels).item()
            batch_count += 1
            sample_count += len(labels)

    if sample_count == 0:
        raise ValueError(
            "test_loader yielded no samples; cannot compute validation metrics"
        )
    # Counting batches during iteration avoids requiring len(test_loader).
    return loss_sum / batch_count, correct / sample_count


### 注意
1. 天啊，第一次写的时候没写with torch.no_grad(),在这个阶段是不需要计算图的！

In [114]:
# Evaluate on the test split; the result is an (average loss, accuracy) pair.
validation(model,test_loader,device)

(tensor(0.1113, grad_fn=<DivBackward0>), 0.95)

可以看到训练效果是不错的

### 保存模型
保存模型是很重要的，用`torch.save(保存啥,".pth")`

```python
# Build the checkpoint to save
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
    # Any other information worth saving can be added here
}
```
推荐按照这个来，不止保存model的权重，还保存优化器的状态，方便以后接着训练

### 早停
如果损失连续几轮没有下降，就退出训练，并保存模型

In [136]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
# Start from a freshly initialised network for the early-stopping run.
model = Net(2,15,1)
model = model.to(device)
import torch.nn as nn

# Binary cross-entropy on probabilities; pairs with the sigmoid at the end of Net.
loss_fn=nn.BCELoss()

cpu


In [138]:
# Upper bound on the number of passes over the training set.
epochs=30
# Sanity check: show one (sample, label) pair from the training split.
print(train_dataset[0])
from torch.utils.tensorboard import SummaryWriter


def train_with_earlystop(model,train_loader,device,epochs,loss_fn=nn.BCELoss(),optimizer=None,patience=5,val_loader=None):
    """Train ``model`` with early stopping on the validation loss.

    After every epoch the model is evaluated with ``validation``. When the
    validation loss improves, a snapshot of the model and optimizer state is
    kept in memory and the patience counter is reset. Otherwise the counter is
    decremented, and training stops once it reaches zero. The best snapshot,
    if any, is written to ``./checkpoint`` when training ends.

    Requires a module-level ``global_step`` integer to exist before the call;
    it is incremented once per batch and used as the step of the
    ``"trainLoss"`` TensorBoard scalar.

    Args:
        model: the network to train.
        train_loader: iterable yielding ``(datas, labels)`` training batches.
        device: device the batches are moved to.
        epochs: maximum number of passes over ``train_loader``.
        loss_fn: loss applied to ``(output, labels)``.
        optimizer: optimizer to use; defaults to Adam with ``lr=0.001``.
        patience: number of consecutive epochs without improvement of the
            validation loss that are tolerated before stopping.
        val_loader: loader used for validation; when ``None`` the
            module-level ``test_loader`` is used.
    """
    import copy
    import os

    global global_step
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    if val_loader is None:
        # Backward-compatible fallback to the module-level loader.
        val_loader = test_loader

    remaining_patience = patience
    min_loss = float("inf")
    # Stays None if the validation loss never improves on +inf (e.g. NaN).
    best_checkpoint = None

    writer = SummaryWriter("./train_loss_log")
    try:
        for epoch in range(epochs):
            model.train()

            for i, (datas, labels) in enumerate(train_loader):
                datas = datas.to(device)
                labels = labels.to(device)
                output = model(datas).squeeze(1)
                loss = loss_fn(output, labels)
                # Clear gradients left over from the previous step.
                optimizer.zero_grad()
                # Back-propagate.
                loss.backward()
                # Update the weights.
                optimizer.step()
                global_step += 1
                if i % 10 == 0:
                    loss_value = loss.item()
                    print(f"epoch:{epoch}_{i},loss:{loss_value}")
                    writer.add_scalar("trainLoss", loss_value, global_step)

            test_loss, acc = validation(model, val_loader, device, loss_fn)
            writer.add_scalar("Test/Loss", test_loss, epoch)
            writer.add_scalar("Test/acc", acc, epoch)
            if test_loss < min_loss:
                min_loss = test_loss
                remaining_patience = patience
                # state_dict() returns references to the live tensors, so the
                # snapshot must be deep-copied; otherwise later optimizer
                # steps would overwrite the "best" weights in place.
                best_checkpoint = {
                    "epoch":epoch,
                    "model_state_dict":copy.deepcopy(model.state_dict()),
                    "optimizer_state_dict":copy.deepcopy(optimizer.state_dict()),
                    "loss":min_loss,
                }
            else:
                remaining_patience -= 1
            if remaining_patience <= 0:
                print("Early stopping")
                break
    finally:
        # Flush pending events and release the log file even on failure.
        writer.close()

    if best_checkpoint is not None:
        # torch.save does not create missing directories.
        os.makedirs("./checkpoint", exist_ok=True)
        torch.save(best_checkpoint,f"./checkpoint/checkpoint_loss{min_loss}.pth")



(tensor([0.7741, 0.5030]), tensor(0.))


In [139]:
# Train with early stopping; no optimizer is passed, so the function creates its own Adam (lr=0.001).
train_with_earlystop(model, train_loader, device, epochs, loss_fn)

epoch:0_0,loss:0.6809892654418945
epoch:0_10,loss:0.7293615937232971
epoch:0_20,loss:0.6373035907745361
epoch:0_30,loss:0.5946373343467712
epoch:0_40,loss:0.6164590716362
epoch:0_50,loss:0.6775141358375549
epoch:1_0,loss:0.6904236078262329
epoch:1_10,loss:0.6591238975524902
epoch:1_20,loss:0.5954392552375793
epoch:1_30,loss:0.6516152620315552
epoch:1_40,loss:0.6285984516143799
epoch:1_50,loss:0.596759021282196
epoch:2_0,loss:0.6630439162254333
epoch:2_10,loss:0.7140003442764282
epoch:2_20,loss:0.576139509677887
epoch:2_30,loss:0.634880542755127
epoch:2_40,loss:0.48055362701416016
epoch:2_50,loss:0.6837923526763916
epoch:3_0,loss:0.6279343962669373
epoch:3_10,loss:0.6583846807479858
epoch:3_20,loss:0.6579659581184387
epoch:3_30,loss:0.48801344633102417
epoch:3_40,loss:0.5295525789260864
epoch:3_50,loss:0.5071989893913269
epoch:4_0,loss:0.5638716816902161
epoch:4_10,loss:0.5995433926582336
epoch:4_20,loss:0.6190423965454102
epoch:4_30,loss:0.48967796564102173
epoch:4_40,loss:0.6711631417

### 注意
1. 早停应该针对测试集的平均loss，而不是单个训练batch的
2. 不用每隔十轮保存一次了，太浪费，应该只保存测试集loss最小的那个模型
3. 极端情况可能一轮都不下降，所以开始把checkpoint设为None，只有检查点被记录过时才保存