### **尝试多GPU训练**

In [1]:
import torch
from torch import nn
from d2l import torch as d2l
from net_frame import *
from tqdm import tqdm

In [2]:
# Load the train/test data iterators and the vocabulary (batch size 768)
from data_preprocess import get_data
train_iter, test_iter, vocab = get_data(batch_size = 768)

read pos:   0%|          | 0/12500 [00:00<?, ?it/s]

read pos: 100%|██████████| 12500/12500 [00:00<00:00, 68934.11it/s]
read neg: 100%|██████████| 12500/12500 [00:00<00:00, 73088.71it/s]


Load raw data use time:0.3845533225685358s


read pos: 100%|██████████| 12500/12500 [00:00<00:00, 73559.92it/s]
read neg: 100%|██████████| 12500/12500 [00:00<00:00, 75763.18it/s]


Load raw data use time:0.35823698714375496s
Build vocab....
Finish!
Build data-iter...
Finish!


**使用rnn测试**

In [3]:
class BiRNN(nn.Module):
    """Bidirectional LSTM classifier that emits two logits per sequence.

    Token indices are embedded, run through a multi-layer bidirectional LSTM,
    and the LSTM outputs at the first and the last time step are concatenated
    and projected by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Dimension of each token embedding.
        num_hiddens: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        **kwargs: Forwarded unchanged to ``nn.Module.__init__``.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens,
                 num_layers, **kwargs):
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # bidirectional=True: every time step yields 2 * num_hiddens features.
        self.encoder = nn.LSTM(
            input_size=embed_size,
            hidden_size=num_hiddens,
            num_layers=num_layers,
            bidirectional=True,
        )
        # Two time steps are concatenated in forward(), hence 4 * num_hiddens inputs.
        self.decoder = nn.Linear(in_features=4 * num_hiddens, out_features=2)

    def forward(self, inputs):
        """Compute class logits for a batch of token-index sequences.

        Args:
            inputs: Tensor of shape (batch size, number of time steps).

        Returns:
            Tensor of shape (batch size, 2).
        """
        # The LSTM is not batch-first, so the time axis is moved to the front
        # before the lookup: (batch, steps) -> (steps, batch, embed_size).
        time_major_tokens = inputs.T
        embedded = self.embedding(time_major_tokens)
        # Re-flatten the LSTM weights right before the call; presumably needed
        # because the model is replicated by nn.DataParallel in this notebook.
        self.encoder.flatten_parameters()
        # per_step_states: (steps, batch, 2 * num_hiddens); final (h, c) is ignored.
        per_step_states, _ = self.encoder(embedded)
        first_step = per_step_states[0]
        last_step = per_step_states[-1]
        # Join both ends of the sequence -> (batch, 4 * num_hiddens).
        sequence_features = torch.cat((first_step, last_step), dim=1)
        return self.decoder(sequence_features)

# Weight initialisation hook (intended for ``net.apply``)
def init_weights(m):
    """Xavier-uniform initialise the weights of ``nn.Linear`` and ``nn.LSTM`` modules.

    Only modules whose exact type is ``nn.Linear`` or ``nn.LSTM`` are touched;
    biases and every other module type are left as they are.

    Args:
        m: A module, as passed by ``nn.Module.apply``.
    """
    module_type = type(m)
    if module_type is nn.Linear:
        nn.init.xavier_uniform_(m.weight)
    if module_type is nn.LSTM:
        # LSTM parameters are named weight_* / bias_*; only the weights are re-initialised.
        weight_names = [name for name in m._flat_weights_names if "weight" in name]
        for name in weight_names:
            nn.init.xavier_uniform_(m._parameters[name])

# Build the network
embed_size = 100
num_hiddens = 100
num_layers = 2
devices = d2l.try_all_gpus()
net = BiRNN(
    vocab_size=len(vocab),
    embed_size=embed_size,
    num_hiddens=num_hiddens,
    num_layers=num_layers,
)
# Kept as the last expression so the cell still displays the module summary.
net.apply(init_weights)

BiRNN(
  (embedding): Embedding(76835, 100)
  (encoder): LSTM(100, 100, num_layers=2, bidirectional=True)
  (decoder): Linear(in_features=400, out_features=2, bias=True)
)

In [4]:
# Use a pretrained embedding layer
from embed_layer import TokenEmbedding
glove_embedding = TokenEmbedding('glove.6b.100d') # 100-dimensional vectors, matching embed_size

In [5]:
# Look up the pretrained vector for every token in the vocabulary and check the shape
embeds = glove_embedding[vocab.idx_to_token]
print(embeds.shape)

# Copy the vectors into the model's embedding table and exclude it from gradient computation
net.embedding.weight.data.copy_(embeds)
net.embedding.weight.requires_grad = False

torch.Size([76835, 100])


In [6]:
# Compute classification accuracy
def accurancy(net,data_iter,devices):
    """Return the fraction of samples in ``data_iter`` that ``net`` classifies correctly.

    The model is switched to evaluation mode (and left in it), and all forward
    passes run under ``torch.no_grad()`` so no autograd graph is built.

    Args:
        net: Model called as ``net(X)``; the prediction is the arg-max over dim 1.
        data_iter: Iterable of batches where ``batch[0]`` is the input tensor
            and ``batch[1]`` the label tensor.
        devices: Sequence of devices; inputs and labels are moved to ``devices[0]``.

    Returns:
        Number of correct predictions divided by the number of samples.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no samples.
    """
    net.eval()
    total_nums = 0
    correct_nums = 0
    # Evaluation never calls backward(), so skip gradient tracking to save
    # memory and time on every batch.
    with torch.no_grad():
        for batch in data_iter:
            X,Y = batch[0],batch[1]
            X = X.to(devices[0])
            Y = Y.to(devices[0])
            y_pred = net(X).argmax(dim = 1)
            correct_nums += (y_pred == Y).sum().item()
            total_nums += X.shape[0]
    return correct_nums / total_nums

# Training routine
def train(net,trainer:torch.optim.Adam,train_iter,test_iter,loss_fn,lr,num_epochs,devices_idx = None):
    """Train the sentiment-analysis model, optionally on several GPUs.

    The model is wrapped in ``nn.DataParallel`` over the selected devices and
    trained for ``num_epochs`` epochs. After every epoch the accuracy on both
    ``train_iter`` and ``test_iter`` is computed and printed.

    Args:
        net: Model to train.
        trainer: Optimizer already constructed over the model's parameters.
        train_iter: Training batches, each unpackable as ``(X, Y)``; must support ``len()``.
        test_iter: Evaluation batches in the same format.
        loss_fn: Callable ``loss_fn(y_pred, Y)``; its result is summed before backward.
        lr: Not used inside this function; the learning rate shown in the
            progress bar is read from ``trainer.param_groups[0]['lr']``.
        num_epochs: Number of passes over ``train_iter``.
        devices_idx: CUDA device indices (list/tuple of int) to train on, or
            ``None`` to use the single device returned by ``try_gpu(i = 0)``.

    Returns:
        Tuple ``(loss_plt, train_accs, test_accs)`` of per-epoch lists: summed
        loss divided by the number of training samples, training accuracy and
        test accuracy.

    Raises:
        AssertionError: If ``devices_idx`` is given but is not a non-empty
            list/tuple of ints.
    """
    # Resolve the devices to train on
    if devices_idx is None:
        # Default to one device from the project helper; presumably try_gpu
        # returns a torch.device — verify against net_frame.
        devices = [try_gpu(i = 0)]
    else:
        assert (isinstance(devices_idx, (list, tuple)) and len(devices_idx) > 0 and
                all(type(i) is int for i in devices_idx)),"devices_idx must be list of int"
        devices = [torch.device(f"cuda:{i}")
                   for i in devices_idx]
    print(f"Training on{devices}")

    # Replicate the network across the devices; parameters live on devices[0]
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])

    # Training loop
    loss_plt = []
    train_accs = []
    test_accs = []
    for epoch in range(num_epochs):
        net.train() # evaluation below switches to eval mode, so re-enable training mode each epoch
        loop = tqdm(train_iter,desc = f"Epoch:[{epoch + 1}/{num_epochs}]",
                    total = len(train_iter))
        loss_temp = 0
        total_nums = 0
        for batch in loop:
            # Clear gradients left over from the previous step
            trainer.zero_grad()

            # Forward pass; inputs only need to be on devices[0]
            X,Y = batch
            X = X.to(devices[0])
            Y = Y.to(devices[0])
            y_pred = net(X)
            total_nums += X.shape[0]

            # Sum the loss over the batch, back-propagate and update
            loss = loss_fn(y_pred,Y)
            batch_loss = loss.sum()
            batch_loss.backward()
            trainer.step()

            # Accumulate the running loss for the progress bar
            loss_temp += batch_loss.item()
            loop.set_postfix({"LOSS" : loss_temp / total_nums,"lr" : "{:e}".format(trainer.param_groups[0]['lr'])})
        print("Counting accurancy...")
        # Evaluation needs no gradients; disable autograd for both passes
        with torch.no_grad():
            train_acc = accurancy(net,train_iter,devices)
            test_acc = accurancy(net,test_iter,devices)
        print(f"train acc:{train_acc}")
        print(f"test acc:{test_acc}")
        loss_plt.append(loss_temp / total_nums)
        train_accs.append(train_acc)
        test_accs.append(test_acc)
    return loss_plt,train_accs,test_accs

In [7]:
# Training hyperparameters
lr = 0.01
num_epochs = 20
# reduction='none' keeps one loss value per sample
loss = nn.CrossEntropyLoss(reduction='none')
trainer = torch.optim.Adam(params=net.parameters(), lr=lr)
plt_collections = train(
    net,
    trainer,
    train_iter,
    test_iter,
    loss_fn=loss,
    lr=lr,
    num_epochs=num_epochs,
    devices_idx=[1],
)

Training on[device(type='cuda', index=1)]


Epoch:[1/20]: 100%|██████████| 33/33 [00:03<00:00,  9.64it/s, LOSS=0.703, lr=1.000000e-02]


Counting accurancy...
train acc:0.62564
test acc:0.61696


Epoch:[2/20]: 100%|██████████| 33/33 [00:03<00:00, 10.16it/s, LOSS=0.644, lr=1.000000e-02]


Counting accurancy...


KeyboardInterrupt: 

结论：本组实验的 batch_size = 792，三卡训练平均每个 epoch 约 1 s，而单卡约需 3 s。