In [3]:
import torch
import torch.nn as nn
# import torch.nn.functional as F
import time
from net.utils import get_model_memory_nolog
tic = time.time()

#------------------------------参数设置
cudadevice = 'cuda:0'
device = torch.device(cudadevice if torch.cuda.is_available() else "cpu")
tokenlength = 25000
hiddendim = 576
input_matrix = torch.randn(1, hiddendim, tokenlength).to(device)  # batchsize channel 长

#------------------------------模型初始化
transformer_model = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=hiddendim, nhead=4, dim_feedforward=256),num_layers=6).to(device)
get_model_memory_nolog(transformer_model)

#------------------------------数据流正文
print(input_matrix.shape, hiddendim*tokenlength)
print(f'初始化耗时{time.time() - tic:.4f}s')
tic = time.time()

x = input_matrix.reshape(tokenlength,1,-1) # Reshape to (seq_len, batch_size, input_channel)
x = transformer_model(x)
print(x.shape, x.shape[0] * x.shape[1] * x.shape[2])
print(f'耗时{time.time() - tic:.4f}s')


模型占用0.0364GB
torch.Size([1, 576, 25000]) 14400000
初始化耗时0.2574s
torch.Size([25000, 1, 576]) 14400000
耗时0.0152s


In [11]:
import torch
import torch.nn as nn
# import torch.nn.functional as F
import time
from net.utils import get_model_memory_nolog
tic = time.time()

#------------------------------参数设置
cudadevice = 'cuda:0'
device = torch.device(cudadevice if torch.cuda.is_available() else "cpu")
tokenlength = 2500
hiddendim = 576
input_matrix = torch.randn(1, tokenlength, hiddendim).to(device)  # batchsize length dim
# input_matrix = torch.randn(1, hiddendim, tokenlength).to(device)  # batchsize dim length

#------------------------------模型初始化
# transformer_model = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=hiddendim, nhead=4, dim_feedforward=256, batch_first=True),num_layers=6).to(device)
transformer_model = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=hiddendim, nhead=4, dim_feedforward=256,activation='silu'),num_layers=6).to(device)
get_model_memory_nolog(transformer_model)

#------------------------------数据流正文
print(input_matrix.shape, hiddendim*tokenlength)
print(f'初始化耗时{time.time() - tic:.4f}s')
tic = time.time()

# x = input_matrix.reshape(1,tokenlength,-1) # Reshape to ( batch_size, seq_len, input_channel) (1,2500,576) batch length dim
x = input_matrix.reshape(tokenlength,1,-1) # Reshape to (seq_len, batch_size, input_channel) (2500,1,576) length batch dim
print(x.shape, x.shape[0] * x.shape[1] * x.shape[2])

x = transformer_model(x) #真正使用的时候
print(x.shape, x.shape[0] * x.shape[1] * x.shape[2])
print(f'耗时{time.time() - tic:.4f}s')


模型占用0.0364GB
torch.Size([1, 2500, 576]) 1440000
初始化耗时0.0927s
torch.Size([2500, 1, 576]) 1440000
torch.Size([2500, 1, 576]) 1440000
耗时0.0088s


加入Positional Encoding

In [4]:
import torch
import torch.nn as nn
# import torch.nn.functional as F
import time
from net.utils import get_model_memory_nolog
tic = time.time()

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=22500): 
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
    def forward(self, x):
        return x + self.pe[:x.size(0), :]

#------------------------------参数设置
cudadevice = 'cuda:1'
device = torch.device(cudadevice if torch.cuda.is_available() else "cpu")
tokenlength = 22500
hiddendim = 576
input_matrix = torch.randn(1, tokenlength, hiddendim).to(device)  # batchsize length dim
# input_matrix = torch.randn(1, hiddendim, tokenlength).to(device)  # batchsize dim length

#------------------------------模型初始化
# transformer_model = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=hiddendim, nhead=4, dim_feedforward=256, batch_first=True),num_layers=6).to(device)
transformer_model = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=hiddendim, nhead=4, dim_feedforward=256,activation='silu'),num_layers=6).to(device)
pe = PositionalEncoding(d_model=hiddendim).to(device)
get_model_memory_nolog(transformer_model)

#------------------------------数据流正文
print(input_matrix.shape, hiddendim*tokenlength)
print(f'初始化耗时{time.time() - tic:.4f}s')
tic = time.time()

# x = input_matrix.reshape(1,tokenlength,-1) # Reshape to ( batch_size, seq_len, input_channel) (1,2500,576) batch length dim
x = input_matrix.reshape(tokenlength,1,-1) # Reshape to (seq_len, batch_size, input_channel) (2500,1,576) length batch dim
print(x.shape, x.shape[0] * x.shape[1] * x.shape[2])

x = pe(x)
x = transformer_model(x) #真正使用的时候
print(x.shape, x.shape[0] * x.shape[1] * x.shape[2])
print(f'耗时{time.time() - tic:.4f}s')


模型占用0.0364GB
torch.Size([1, 22500, 576]) 12960000
初始化耗时0.2944s
torch.Size([22500, 1, 576]) 12960000
torch.Size([22500, 1, 576]) 12960000
耗时0.0196s


尝试Transformer pooling 和query vector动手脚

transformer pooling

In [2]:
import torch
# import torch.nn as nn
from mytransformer import TransformerEncoder,TransformerEncoderLayer,PositionalEncoding,TransformerWithPooling
# import torch.nn.functional as F
import time
from net.utils import get_model_memory_nolog
tic = time.time()

#------------------------------参数设置
cudadevice = 'cuda:1'
device = torch.device(cudadevice if torch.cuda.is_available() else "cpu")
tokenlength = 25000
hiddendim = 576
input_matrix = torch.randn(1, tokenlength, hiddendim).to(device)  # batchsize length dim

#------------------------------模型初始化
num_layers = 6
pool_size = 2  # 每次减少一半的序列长度

transformer_model = TransformerWithPooling(d_model=hiddendim, nhead=4, dim_feedforward=256, num_layers=num_layers, pool_size=pool_size, activation='silu').to(device)
pe = PositionalEncoding(d_model=hiddendim).to(device)
get_model_memory_nolog(transformer_model)

#------------------------------数据流正文
tic = time.time()
print(input_matrix.shape, hiddendim * tokenlength)
print(f'初始化耗时{time.time() - tic:.4f}s')
tic = time.time()

x = input_matrix.reshape(tokenlength, 1, -1)  # Reshape to (seq_len, batch_size, input_channel)
print(x.shape, x.shape[0] * x.shape[1] * x.shape[2])

x = pe(x)
x = transformer_model(x)  # 传入自定义的 Transformer 模型
print(x.shape, x.shape[0] * x.shape[1] * x.shape[2])
print(f'耗时{time.time() - tic:.4f}s')


模型占用0.0364GB
torch.Size([1, 25000, 576]) 14400000
初始化耗时0.0001s
torch.Size([25000, 1, 576]) 14400000
torch.Size([25000, 1, 576]) 14400000
1
torch.Size([12500, 1, 576]) 7200000
torch.Size([12500, 1, 576]) 7200000
1
torch.Size([6250, 1, 576]) 3600000
torch.Size([6250, 1, 576]) 3600000
1
torch.Size([3125, 1, 576]) 1800000
torch.Size([3125, 1, 576]) 1800000
1
torch.Size([1562, 1, 576]) 899712
torch.Size([1562, 1, 576]) 899712
1
torch.Size([781, 1, 576]) 449856
torch.Size([781, 1, 576]) 449856
1
torch.Size([390, 1, 576]) 224640
torch.Size([390, 1, 576]) 224640
耗时0.0083s
