In [1]:
import torch
import numpy as np
UNK, PAD, BOS, EOS = '<unk>', '<pad>', '<bos>', '<eos>'
def build_vocab(processed_text,processed_news,min_freq=3):
    """构建词典"""
    all_content = []
    all_content.extend(processed_text)
    all_content.extend(processed_news)
    
    # 统计词频
    tokens_dict = {}
    for content in all_content:
        for token in content:
            tokens_dict[token] = tokens_dict.get(token,0)+1

    vocab = {}
    extend_vocab = {}
    idx = 4
    # 映射 token and id
    for k,v in tokens_dict.items():
        if v>=min_freq:
            vocab[k] = idx
            idx+=1
        elif v>1:
            extend_vocab[k] = -1
    for k in extend_vocab.keys():
        extend_vocab[k] = idx
        idx+=1
    vocab.update({PAD:0, UNK:1, BOS:2, EOS:3})
    return vocab, extend_vocab

def build_dataset(vocab, extend_vocab, processed_content, max_len, is_extend_vocab=False, sentence_type=None):
    """pad token and to id"""
    content = []
    extend_content = []
    
    for sent in processed_content:
        if sentence_type == "summary":
            # 为摘要加上结尾符
            if len(sent)<max_len:
                sent.extend([EOS]+[PAD]*(max_len-len(sent)))
            else:
                sent[:] = sent[:max_len] + [EOS]
        else:
            sent[:] = sent[:max_len] + [PAD]*np.maximum(max_len-len(sent),0)
        
        if sentence_type == "summary":
            sent_id = [extend_vocab[token] if token in extend_vocab else vocab.get(token,vocab[UNK]) for token in sent]
        else:
            sent_id = [vocab.get(token,vocab[UNK]) for token in sent]
        if is_extend_vocab:
            extend_id = [extend_vocab[token] if token in extend_vocab else vocab.get(token,vocab[UNK]) for token in sent]
                
        content.append(sent_id)
        if is_extend_vocab: extend_content.append(extend_id)
    return torch.tensor(content), torch.tensor(extend_content)

def get_pretrained_embedding(vocab,pretrain_embedding_path,vector_dim=300):
    """加载词向量，vocab的token_id与embedding的index对齐，用作torch.nn.Embedding.from_pretrained(Embedding)"""
    with open(pretrain_embedding_path, 'r+', encoding='utf-8') as f:
        embeddings = torch.rand(len(vocab),vector_dim)
        for _,line in enumerate(f.readlines()):
            if _==0: continue
            line = line.strip().split(' ')
            if line[0] in vocab:
                idx = vocab[line[0]]
                embeddings[idx] = torch.tensor([float(i) for i in line[1:]], dtype=torch.float32)
        for i in range(4):
            embeddings[idx] = torch.ones(vector_dim, dtype=torch.float32)*float(i/100)
    return embeddings

In [2]:
import torch
from torch import nn
from torch.nn import functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Encoder(nn.Module):
    """单层双向GRU
    ！！优化：n_layers增加
    """
    def __init__(self, vocab_size, embed_size, hidden_size, n_layers=1,
                 dropout=0.0, use_pretrained_embeddings=True, pre_embeddings=None):
        super(Encoder,self).__init__()
        
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        if use_pretrained_embeddings:
            self.embedding = nn.Embedding.from_pretrained(pre_embeddings)
        else:
            self.embedding = nn.Embedding(self.vocab_size, self.embed_size)
        self.embedding.weight.requires_grad = True
        
        self.gru = nn.GRU(self.embed_size, self.hidden_size, self.n_layers, bidirectional=True)
        
        self.ln = nn.LayerNorm((self.hidden_size,), eps=1e-5, elementwise_affine=True)
    
    def forward(self, inputs, init_hidden):
        """
        :parma inputs: [batch, seq_len]
        :parma init_hidden: None
        """
        # [batch, seq_len, embed_size] -> [seq_len, batch, embed_size], 适应GRU的输入
        inputs = self.ln(self.embedding(inputs).permute(1,0,2))
        # outputs: [seq_len, batch, hidden_size*num_direction], hidden_states: [n_layers*num_direction, batch, hidden_size]
        outputs,hidden_states = self.gru(inputs,init_hidden)
        outputs = outputs[:,:,:self.hidden_size]+outputs[:,:,self.hidden_size:]
        outputs = outputs.permute(1,0,2) # [batch, seq_len, embed_size]
        hidden_states = hidden_states[:1,:,:] + hidden_states[1:,:,:]
        return outputs, hidden_states
    
    def get_init_hidden(self):
        return None

class Decoder(nn.Module):
    """双层单项GRU
    !!优化：双线性层输出；copy机制；目标函数优化
    """
    def __init__(self, vocab_size, embed_size, hidden_size, n_layers=1, dropout=0.1,
                 use_pretrained_embeddings=True, pre_embeddings=None, use_pointer_gen=True, is_coverage=True):
        super(Decoder,self).__init__()
        
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.use_pointer_gen = use_pointer_gen
        
        self.attention = Attention(config.max_len_text, self.hidden_size, is_coverage=is_coverage)
        
        if use_pretrained_embeddings:
            self.embedding = nn.Embedding.from_pretrained(pre_embeddings)
        else:
            self.embedding = nn.Embedding(self.vocab_size, self.embed_size)
        self.embedding.weight.requires_grad = True
        
        # 衡量一个词是生成的还是复制的
        if self.use_pointer_gen:
            self.p_gen_sig = nn.Sequential(
                nn.Linear(self.hidden_size*2+self.embed_size, 1),
                nn.Sigmoid()
            )
            
        # 解码器的输入时间步拼接上下文向量，过一个线性映射
        self.x_context = nn.Linear(self.hidden_size+self.embed_size, self.embed_size)
        # 对输入向量进行解码
        self.gru = nn.GRU(self.embed_size, self.hidden_size, self.n_layers)
        # 将解码状态St和上下文向量ht*拼接后经过两层线性层得到单词表分布P_vocab
        self.out = nn.Sequential(
            nn.Linear(self.hidden_size*2, self.hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(self.hidden_size, self.vocab_size)
        )
        
        self.input_ln = nn.LayerNorm((hidden_size,), eps=1e-5, elementwise_affine=True)
        self.hidden_ln = nn.LayerNorm((hidden_size,), eps=1e-5, elementwise_affine=True)
        
    def forward(self, inputs, dec_hidden, enc_outputs, enc_padding_mask, context_vector_t1,
                enc_batch_extend_vocab, extra_zeros, coverage):
        """
        :parma inputs: 时间步t的输入（训练时是y_true的输入，预测时是上一时间步的输出） [batch, 1]
        :parma dec_hidden: 当前时间步的隐藏状态 [n_layers, batch, hidden_size]
        :parma enc_outputs: encoder的所有时间步的隐藏状态 [batch, seq_len, hidden_size]
        :parma enc_padding_mask: 对应pad的位置为-inf，其余为0 [batch, seq_len]
        :parma context_vector_t1: 时间步t-1的上下文向量 [batch, hidden_size]
        :parma enc_batch_extend_vocab: 原文章的编码，oov词汇用超过词汇表的编码 [batch, seq_len]
        :parma extra_zeros: 文章oov词汇数量 [batch, extend_vocab_size]
        :parma coverage: 用先前的注意力权重影响当前注意力权重的决策 [batch, seq_len]
        """
        x = self.input_ln(self.embedding(inputs).squeeze(1)) # batch * embed_size
        x = self.x_context(torch.cat((x,context_vector_t1),dim=1)) # batch * embed_size
        dec_hidden = self.hidden_ln(dec_hidden)
        # dec_output: [1, batch, hidden_size], s_t: [n_layers, batch, hidden_size]
        dec_output,s_t = self.gru(x.unsqueeze(0), dec_hidden)
        dec_output, s_t = dec_output[0], s_t[-1]
        
        # h_t: batch * hidden_size, atten_dist: batch * vocab_size, coverage_next: batch * vocab_size
        h_t,atten_dist,coverage_next = self.attention(enc_outputs, s_t, enc_padding_mask, coverage)
        
        if self.training and config.is_coverage:
            coverage = coverage_next
        
        p_gen = None
        if self.use_pointer_gen:
            p_gen_input = torch.cat((h_t, s_t, x),dim=1) # batch * hidden_size*3
            p_gen = self.p_gen_sig(p_gen_input).clamp(min=1e-8)
        
        s_t_h_t = torch.cat((dec_output,h_t), dim=1) # batch * hidden_size*2
        vocab_dist = F.softmax(self.out(s_t_h_t), dim=1)
        
        if self.use_pointer_gen:
            vocab_dist_ = p_gen * vocab_dist # batch * vocab_size
            atten_dist_ = (1-p_gen) * atten_dist # batch * seq_len

            if extra_zeros is not None:
                vocab_dist_ = torch.cat((vocab_dist_,extra_zeros), dim=1) # batch * vocab_size+seq_len
                
            final_dist = vocab_dist_.scatter_add(1, enc_batch_extend_vocab, atten_dist_).clamp(min=1e-8)
        else:
            final_dist = vocab_dist
        
        if config.is_coverage:
            coverage_loss = torch.sum(torch.min(atten_dist, coverage), 1)
        else:
            coverage_loss=None
        return torch.log(final_dist), s_t.unsqueeze(0), h_t, coverage, coverage_loss
    
    def get_init_hidden(self, enc_hidden):
        # 直接使用encoder端输出的隐含向量作为decoder端的初始化
        # enc_hidden: [n_layers*num_direction, batch, hidden_size]
        return enc_hidden
    
class Attention(nn.Module):
    def __init__(self, enc_seq_len, hidden_size, is_coverage=True):
        super(Attention,self).__init__()
        
        self.enc_seq_len = enc_seq_len
        self.hidden_size = hidden_size
        self.is_coverage = is_coverage
        
        if is_coverage:
            self.W_c = nn.Linear(1, self.hidden_size, bias=False)
        self.W_h = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.W_s = nn.Linear(self.hidden_size, self.hidden_size)
        self.V = nn.Linear(self.hidden_size, 1, bias=False)
        
        self.ln = nn.LayerNorm((hidden_size,), eps=1e-5, elementwise_affine=True)
        
    def forward(self, enc_output, dec_hidden, enc_padding_mask, coverage):
        """
        :parma enc_outputs: [batch, seq_len, hidden_size]
        :parma dec_hidden: [batch, hidden_size]
        :parma enc_padding_mask: [batch, seq_len]
        :parma coverage: [batch, seq_len]
        """
        batch_size,seq_len,enc_hidden_size = enc_output.size()
        
        # [batch*seq_len, hidden_size]
        enc_outputs = enc_output.contiguous().view(-1,enc_hidden_size)
        enc_feature = self.W_h(enc_outputs)
        dec_feature = self.W_s(dec_hidden) # [batch, hidden_size]
         # [batch*seq_len, hidden_size]
        dec_feature = dec_feature.unsqueeze(1).expand(batch_size, seq_len, enc_hidden_size).contiguous()
        dec_feature = dec_feature.view(-1,enc_hidden_size)
        
        atten_feature = enc_feature + dec_feature
        if self.is_coverage:
            coverage_input = coverage.view(-1, 1) # [batch*seq_len, 1]
            coverage_feature = self.W_c(coverage_input)
            atten_feature = atten_feature + coverage_feature
        
        # 注意力分数计算
        e_t = self.V(torch.tanh(atten_feature)).view(batch_size,-1) # [batch, seq_len]
        atten_dist = F.softmax(e_t + enc_padding_mask, dim=1)

        # 时间步的atten分数乘以其隐向量，相加得到上下文向量
        context_vector = self.ln(torch.bmm(atten_dist.unsqueeze(1),enc_output).squeeze(1)) # batch * hidden_size
        atten_dist = atten_dist.squeeze(1)
        
        if self.is_coverage:
            coverage = coverage + atten_dist

        return context_vector, atten_dist, coverage
    
class PointerGenerator(nn.Module):
    def __init__(self, encoder, decoder, extend_vocab_size, hidden_size):
        super(PointerGenerator, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.extend_vocab_size = extend_vocab_size
        
    def forward(self, inputs, target, text_extend):
        """
        :parma inputs: [batch, seq_len]
        :parma target: [batch, target_seq_len]
        :parma extend_token_id: [batch, seq_len]
        """
        batch_size,seq_len = inputs.size()
        null_enc_state = self.encoder.get_init_hidden()
        enc_output,enc_hidden  = self.encoder(inputs, null_enc_state)
        
        dec_hidden = self.decoder.get_init_hidden(enc_hidden)
        dec_input = torch.tensor([config.spc_token[BOS]]*batch_size, dtype=torch.long, device=device)
        
        enc_padding_mask = []
        for item in inputs:
            enc_padding_mask.append([float('-inf') if i==0 else 0 for i in item]) # 0为pad id
        enc_padding_mask = torch.tensor(enc_padding_mask, dtype=torch.float32, device=device)
        
        context_vector_t1, enc_batch_extend_vocab, extra_zeros, coverage = self.get_init_input_batch(text_extend, batch_size, enc_hidden.size(-1), seq_len)
        
        batch_output = torch.Tensor().to(device) # [batch, target_seq_len, vocab_size]
        for y in target.permute(1,0): # y: seq_len * batch
            dec_output, dec_hidden, context_vector_t1, coverage, coverage_loss = self.decoder(
                dec_input, dec_hidden, enc_output, enc_padding_mask,
                context_vector_t1, enc_batch_extend_vocab, extra_zeros, coverage
            )
            batch_output = torch.cat((batch_output,dec_output.unsqueeze(1)), dim=1)
            
            if random.uniform(0, 1) > 0.5 and self.training:
                dec_input = y
            else:
                dec_input = dec_output.argmax(dim=1)
        return batch_output, coverage_loss
    
    def get_init_input_batch(self, text_extend, batch_size, hidden_size, seq_len):
        coverage = None
        extra_zeros = None
        enc_batch_extend_vocab = None
        
        context_vector_t1 = torch.zeros(batch_size, hidden_size, device=device)
        
        if config.is_coverage:
            coverage = torch.zeros(batch_size, seq_len, device=device)
        
        if config.use_pointer_gen:
            enc_batch_extend_vocab = text_extend
            extra_zeros = torch.zeros(batch_size, self.extend_vocab_size, device=device)
        return context_vector_t1, enc_batch_extend_vocab, extra_zeros, coverage

In [3]:
import os
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
# from transformers import (
#     AdamW,
#     get_cosine_schedule_with_warmup,
#     get_linear_schedule_with_warmup
# )
from accelerate import Accelerator
import math
import random
from tqdm import tqdm
import time
import datetime

In [4]:
PAD, BOS, EOS = "<pad>", "<bos>", "<eos>"
accelerator = Accelerator(fp16=True, cpu=False)
device = accelerator.device
print('device:', device)

def get_optim_shedu(named_parameters, total_steps, Hyparameters_config, use_scheduler=True):
    ignored_params = ["bias", "ln.weight", "embedding.weight"]
    optimizer_parameters = [
        {
            "params": [p for n,p in named_parameters if not any(i in n for i in ignored_params)],
        },
        {
            "params": [p for n,p in named_parameters if any(i in n for i in ignored_params[:-1])],
            "weight_decay": 0.0
        },
        {
            "params": [p for n,p in named_parameters if ignored_params[-1] in n],
            "lr": Hyparameters_config['lr']*0.1,
            "weight_decay": 0.0
        }
    ]
    
    #optimizer = AdamW(model_parameters, lr=Hyparameters_config['lr'], weight_decay=Hyparameters_config['weight_decay'])
    optimizer = optim.Adam(optimizer_parameters, lr=Hyparameters_config['lr'], weight_decay=Hyparameters_config['weight_decay'])
    if use_scheduler:
        scheduler = get_linear_schedule_with_warmup(
            optimizer = optimizer,
            num_warmup_steps=0.1*total_steps,
            num_training_steps=total_steps
        )
    #     scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    #         optimizer,
    #         mode='max',
    #         factor=Hyparameters_config['lr_gamma'],
    #         patience=Hyparameters_config['patience']
    #     )
        return optimizer,scheduler
    else:
        return optimizer
    
def trainer(model, train_dataset, valid_dataset, num_epochs, Hyparameters_config, cov_loss_wt=1):
    train_iter = DataLoader(train_dataset, Hyparameters_config['batch_size'], shuffle=True)
    valid_iter = DataLoader(valid_dataset, Hyparameters_config['batch_size'], shuffle=True)
    total_steps = len(train_iter)*num_epochs
    
    optimizer = get_optim_shedu(model.named_parameters(), total_steps, Hyparameters_config, use_scheduler=False)
    
    criterion = nn.NLLLoss(reduction='none')
    model,optimizer = accelerator.prepare(model, optimizer)
    
    model.train()
    accumulation_steps = 4 # 梯度累积
    for epoch in range(num_epochs):
        epoch_loss_train, epoch_loss_eval, epoch_acc_train, epoch_acc_eval = 0, 0, 0, 0
        start_time = time.time()
        # count = 0
        for text,news,text_extend in tqdm(train_iter, desc="training for epoch {}: ".format(epoch)):
            text = text.long().to(device)
            news = news.long().to(device)
            text_extend = text_extend.long().to(device)
            
            optimizer.zero_grad()
            output,batch_coverage_loss = model(text, news, text_extend) # [batch, seq_len, vocab_size + extend_vocab_size]
            
            batch_loss_train,batch_acc_train = torch.tensor([0.0], device=device),torch.tensor([0.0])
            # [batch, seq_len, vocab_size + extend_vocab_size]
            for predicted, target in zip(output, news):
                dec_mask_padding = (target!=0).to(dtype=torch.float32)
                target_len = int(sum(dec_mask_padding).item()-1)
                
                batch_loss_train = batch_loss_train + torch.sum(dec_mask_padding * criterion(predicted, target)) / target_len
                batch_acc_train += (target == predicted.argmax(dim=1)).sum().item() / target.size(0)
            if config.is_coverage:
                print(batch_loss_train, cov_loss_wt * torch.sum(batch_coverage_loss))
                batch_loss_train = batch_loss_train/Hyparameters_config['batch_size'] + cov_loss_wt*torch.sum(batch_coverage_loss)
                
            Before = list(model.parameters())[1].clone() # 获取更新前模型的第0层权重
            batch_loss_train = batch_loss_train #/ accumulation_steps
            accelerator.backward(batch_loss_train)
            
            # 梯度截断
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=20, norm_type=2)

            # 参数更新
            optimizer.step()
            # if((count+1)%accumulation_steps)==0:
            #     optimizer.step()
            #     optimizer.zero_grad()
            # count+=1
            After = list(model.parameters())[1].clone()
            print('encoder的第0层更新幅度：',torch.sum(After-Before))
            epoch_loss_train += batch_loss_train.item()
            epoch_acc_train += batch_acc_train.item() / Hyparameters_config['batch_size']
            print(epoch_loss_train,epoch_acc_train)
        
            del text, news, text_extend, output, batch_coverage_loss, batch_loss_train, batch_acc_train
        
        # 参数保存
        if epoch >= Hyparameters_config['save_state_epoch']:
            path = './save_models/'
            if not os.path.exists(path):
                os.makedirs(path)
            accelerator.save(
                {
                    'model_state_dict': model.state_dict(),
                },
                path + f'epoch_{epoch+1}.pkl'
            )
        
        # eval
        with torch.no_grad():
            for text,news,extend_token_id in tqdm(valid_iter, desc="evaluating for epoch {}: ".format(epoch)):
                text = text.long()
                news = news.long()
                extend_token_id = extend_token_id.long()
                
                output,batch_coverage_loss = model(text.to(device), news.to(device), extend_token_id.to(device))
                output,batch_coverage_loss = output.cpu(),batch_coverage_loss.cpu()
                
                batch_loss_eval,batch_acc_eval = torch.tensor([0.0]),torch.tensor([0.0])
                for predicted,target in zip(output,news):
                    dec_mask_padding = (target!=0).to(dtype=torch.float32)
                    target_len = int(sum(dec_mask_padding).item()-1)
                    
                    batch_loss_eval = batch_loss_eval + torch.sum(dec_mask_padding * criterion(predicted, target)) / target_len
                    batch_acc_eval += (target == predicted.argmax(dim=1)).sum().item() / target.size(0)
                if config.is_coverage:
                    batch_loss_eval = batch_loss_eval + cov_loss_wt * torch.sum(batch_coverage_loss)
                
                epoch_loss_eval += batch_loss_eval.item() / Hyparameters_config['batch_size']
                epoch_acc_eval += batch_acc_eval.item() / Hyparameters_config['batch_size']
        
        del text, news, extend_token_id, output, batch_coverage_loss, batch_loss_eval, batch_acc_eval
        torch.cuda.empty_cache()
        
        # 参数打印
        duration = str(datetime.timedelta(seconds=time.time() - start_time))[:7]
        print("Time: {} | Epoch: {}/{} | train_loss: {:.3} | train_acc: {:.3} | eval_loss: {:.3} | eval_acc: {:.3}".format(
            duration, epoch, num_epochs, epoch_loss_train/len(train_iter), epoch_acc_train/len(train_iter),
            epoch_loss_eval/len(valid_iter), epoch_acc_eval/len(valid_iter)))

device: cuda


In [5]:
import torch
from torch.utils.data import TensorDataset
from config import config
from utils.data_preprocess import preprocess_text
import pickle
config = config()

# 清洗文本
train_text, train_news = preprocess_text(config.valid_path, config.stopwords_path)
valid_text, valid_news = preprocess_text(config.test_path, config.stopwords_path)

# 建立词典
vocab, extend_vocab = build_vocab(train_text+valid_text, train_news+valid_news, min_freq=3)
vocab_size,extend_vocab_size = len(vocab), len(extend_vocab)
with open(config.vocab_path, 'wb') as f:
    pickle.dump(vocab, f)
with open(config.extend_vocab_path, 'wb') as f:
    pickle.dump(extend_vocab, f)

# 构建dataset，tensor格式
train_text_dataset,train_extend_dataset = build_dataset(vocab, extend_vocab, train_text, config.max_len_text, is_extend_vocab=True)
train_news_dataset,_ = build_dataset(vocab, extend_vocab, train_news, config.max_len_news, sentence_type="summary")
train_dataset = TensorDataset(train_text_dataset, train_news_dataset, train_extend_dataset)

valid_text_dataset,valid_extend_dataset = build_dataset(vocab, extend_vocab, valid_text, config.max_len_text, is_extend_vocab=True)
valid_news_dataset,_ = build_dataset(vocab, extend_vocab, valid_news, config.max_len_news, sentence_type="summary")
valid_dataset = TensorDataset(valid_text_dataset, valid_news_dataset, valid_extend_dataset)

# 加载预训练的word2vec模型（使用搜狗新闻训练得到的word2vec），维度是300
vocab.update(extend_vocab)
pre_embeddings = get_pretrained_embedding(vocab, config.pretrained_vector_path, vector_dim=300)

num_epochs = 5
Hyparameters_config = {
    'lr': 5e-3,
    'weight_decay': 1e-4,
    'batch_size': 4,
    'lr_gamma': 0.1,
    'patience': 2,
    'save_state_epoch': 0
}

cuda:0 num_cuda:  1


In [6]:
# encoder 使用的是单层双向GRU
encoder = Encoder(vocab_size, 300, 300, n_layers=1, use_pretrained_embeddings=True, pre_embeddings=pre_embeddings)
# decoder 使用的是双向单层GRU
decoder = Decoder(vocab_size, 300, 300, n_layers=1, use_pretrained_embeddings=True, pre_embeddings=pre_embeddings)

model = PointerGenerator(encoder, decoder, extend_vocab_size, 300)
trainer(model, train_dataset, valid_dataset, num_epochs, Hyparameters_config)

training for epoch 0:   0%|          | 0/125 [00:00<?, ?it/s]

tensor([30.9879], device='cuda:0', grad_fn=<AddBackward0>) tensor(4., device='cuda:0', grad_fn=<MulBackward0>)


training for epoch 0:   1%|          | 1/125 [00:07<15:07,  7.32s/it]

encoder的第0层更新幅度： tensor(-5.1714, device='cuda:0', grad_fn=<SumBackward0>)
8.746971130371094 0.10891088843345642
tensor([32.9647], device='cuda:0', grad_fn=<AddBackward0>) tensor(4., device='cuda:0', grad_fn=<MulBackward0>)


training for epoch 0:   2%|▏         | 2/125 [00:14<14:52,  7.26s/it]

encoder的第0层更新幅度： tensor(-3.3966, device='cuda:0', grad_fn=<SumBackward0>)
17.988140106201172 0.2599009871482849
tensor([31.9084], device='cuda:0', grad_fn=<AddBackward0>) tensor(4., device='cuda:0', grad_fn=<MulBackward0>)


training for epoch 0:   2%|▏         | 3/125 [00:21<14:37,  7.19s/it]

encoder的第0层更新幅度： tensor(-2.9929, device='cuda:0', grad_fn=<SumBackward0>)
26.965240478515625 0.38861386477947235
tensor([33.2419], device='cuda:0', grad_fn=<AddBackward0>) tensor(4., device='cuda:0', grad_fn=<MulBackward0>)


training for epoch 0:   3%|▎         | 4/125 [00:28<14:25,  7.15s/it]

encoder的第0层更新幅度： tensor(-2.9362, device='cuda:0', grad_fn=<SumBackward0>)
36.275726318359375 0.5222772359848022
tensor([32.7924], device='cuda:0', grad_fn=<AddBackward0>) tensor(4., device='cuda:0', grad_fn=<MulBackward0>)


training for epoch 0:   4%|▍         | 5/125 [00:35<14:13,  7.12s/it]

encoder的第0层更新幅度： tensor(-2.6300, device='cuda:0', grad_fn=<SumBackward0>)
45.47383213043213 0.6485148668289185
tensor([30.9194], device='cuda:0', grad_fn=<AddBackward0>) tensor(4., device='cuda:0', grad_fn=<MulBackward0>)


training for epoch 0:   5%|▍         | 6/125 [00:42<14:11,  7.16s/it]

encoder的第0层更新幅度： tensor(-1.8939, device='cuda:0', grad_fn=<SumBackward0>)
54.203675270080566 0.769801989197731
tensor([30.2970], device='cuda:0', grad_fn=<AddBackward0>) tensor(4., device='cuda:0', grad_fn=<MulBackward0>)


training for epoch 0:   6%|▌         | 7/125 [00:49<14:04,  7.16s/it]

encoder的第0层更新幅度： tensor(-1.1493, device='cuda:0', grad_fn=<SumBackward0>)
62.777931213378906 0.8910891190171242
tensor([31.9416], device='cuda:0', grad_fn=<AddBackward0>) tensor(4., device='cuda:0', grad_fn=<MulBackward0>)


training for epoch 0:   6%|▋         | 8/125 [00:57<13:56,  7.15s/it]

encoder的第0层更新幅度： tensor(-0.5823, device='cuda:0', grad_fn=<SumBackward0>)
71.76332950592041 0.9975247606635094


training for epoch 0:   6%|▋         | 8/125 [01:00<14:38,  7.51s/it]


KeyboardInterrupt: 