Understanding Python Classes and nn.Module
Python Classes:


In [1]:

class Dog:
    """A minimal example class: a dog that has a name and a breed."""

    def __init__(self, name, breed):
        """Constructor: runs when ``Dog(...)`` is called and stores both arguments on the instance."""
        self.name, self.breed = name, breed

    def bark(self):
        """Print a one-line message that includes this dog's name."""
        print(f"{self.name} says Woof!")

# Instantiate Dog; both arguments are handed to __init__.
my_dog = Dog(name="Buddy", breed="Golden Retriever")

# Read back the attributes stored on the instance.
print(f"My dog's name is {my_dog.name}")
print(f"My dog's breed is {my_dog.breed}")

# Invoke an instance method.
my_dog.bark()

My dog's name is Buddy
My dog's breed is Golden Retriever
Buddy says Woof!


In [2]:
# 了解nn.Module这个类的使用

import torch.nn as nn
import torch
class SimpleNN(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the three sub-modules.

        Args:
            input_size: number of input features consumed by ``fc1``.
            hidden_size: number of features between ``fc1`` and ``fc2``.
            output_size: number of features produced by ``fc2``.
        """
        # Initialise the nn.Module parent before assigning any sub-module.
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Forward pass: apply ``fc1``, then ReLU, then ``fc2``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Example usage: layer widths for the demo network.
input_size, hidden_size, output_size = 10, 5, 2

# Build a SimpleNN instance with those widths.
model = SimpleNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters whose ``requires_grad`` flag is False are skipped.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report the trainable-parameter count, then the module tree.
print(count_parameters(model))
print("SimpleNN model structure:")
print(model)

# One random sample (batch size 1) is enough to exercise the forward pass.
dummy_input = torch.randn((1, input_size))

# Calling the module runs its forward().
output = model(dummy_input)

print("\nDummy input shape:", dummy_input.size())
print("Output shape:", output.size())

67
SimpleNN model structure:
SimpleNN(
  (fc1): Linear(in_features=10, out_features=5, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=5, out_features=2, bias=True)
)

Dummy input shape: torch.Size([1, 10])
Output shape: torch.Size([1, 2])


In [None]:
# nn.Sequential —— 顺序执行的模块容器
# import torch.nn as nn
from collections import OrderedDict

# nn.Sequential chains its children: each module's output feeds the next one.
model = nn.Sequential(*(
    nn.Linear(10, 20),
    nn.ReLU(),
    nn.Linear(20, 1),
))

def forward(x):
    """Spell out by hand what calling the three-stage ``nn.Sequential`` ``model`` does.

    Each stage is fetched from the module-level ``model`` by position, so the
    function is runnable and returns the same result as ``model(x)``.
    ``model`` is looked up at call time and must hold at least three stages.

    Args:
        x: input tensor accepted by ``model[0]``.

    Returns:
        The output of ``model[2]`` after the three stages ran in order.
    """
    x = model[0](x)  # first Linear
    x = model[1](x)  # ReLU
    x = model[2](x)  # second Linear
    return x

# Alternatively, name every stage explicitly by passing an OrderedDict:
model = nn.Sequential(
    OrderedDict(
        fc1=nn.Linear(10, 20),
        relu=nn.ReLU(),
        fc2=nn.Linear(20, 1),
    )
)

# nn.Sequential calls each child's forward() in order automatically,
# so there is no need to write forward() yourself.
# It is a poor fit for complex topologies (skip connections, branches, merges).

In [None]:
# nn.ModuleList —— 子模块列表
# 用于保存一组子层，但不会自动执行它们。
# 你需要手动在 forward() 中定义执行逻辑。
import torch.nn as nn
class MyModel(nn.Module):
    """Three Linear layers held in an nn.ModuleList and applied one after another."""

    def __init__(self):
        super().__init__()
        # The ModuleList only stores the layers; forward() decides how they run.
        widths = (10, 20, 30, 40)
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, x):
        """Feed ``x`` through every stored layer in list order and return the result."""
        out = x
        for idx in range(len(self.layers)):  # explicit manual loop
            out = self.layers[idx](out)
        return out

In [None]:
# nn.ModuleDict —— 带名字的模块字典
# 用于保存带名称的模块集合（键值对形式），方便按名字访问。
# 不自动执行，需要自己在 forward() 中指定调用哪些模块。
# 常用于多分支网络、不同任务的共享骨干网络中。
import torch.nn as nn
class MyModel(nn.Module):
    """Two named Linear blocks held in an nn.ModuleDict; the caller picks one per call."""

    def __init__(self):
        super().__init__()
        named_blocks = {}
        named_blocks['block1'] = nn.Linear(10, 20)
        named_blocks['block2'] = nn.Linear(20, 30)
        self.blocks = nn.ModuleDict(named_blocks)

    def forward(self, x, use_block='block1'):
        """Run ``x`` through the single block stored under the key ``use_block``."""
        selected = self.blocks[use_block]
        return selected(x)

In [None]:

def count_parameters(model):
    """Sum ``numel()`` over every parameter of ``model`` that has ``requires_grad`` set."""
    trainable = filter(lambda param: param.requires_grad, model.parameters())
    return sum(param.numel() for param in trainable)


In [7]:
model = nn.Linear(in_features=10, out_features=3)
# Tensor.numel() returns the total number of elements in a tensor.
sum(param.numel() for param in model.parameters() if param.requires_grad)

33

In [11]:
# Fully connected (Linear) layer

m = nn.Linear(20, 30)
print(sum(p.numel() for p in m.parameters() if p.requires_grad))

# The example batches are named batch_2d / batch_3d rather than `input`,
# so the Python builtin input() is not shadowed for the rest of the session.
batch_2d = torch.randn(128, 20)
output = m(batch_2d)
print(output.size())  # torch.Size([128, 30])

# Only the last dimension is transformed; leading dimensions are preserved.
batch_3d = torch.randn(10, 128, 20)
output = m(batch_3d)
print(output.size())  # torch.Size([10, 128, 30])


630
torch.Size([128, 30])
torch.Size([10, 128, 30])


In [6]:
# A 3x3 tensor holds 9 elements in total.
p = torch.rand((3, 3))
p.numel()

9

In [None]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for sequence-first input.

    A table of shape ``[max_len, 1, d_model]`` is precomputed: even feature
    columns hold sines and odd feature columns hold cosines of
    ``position * div_term``.

    Args:
        d_model: embedding dimension (even or odd).
        max_len: number of positions stored in the table.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even column, so
        # trim div_term to the number of odd columns to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(1)  # [max_len, 1, d_model]; broadcasts over the batch dimension
        # Registered as a buffer: stored on the module without being a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the positional table to ``x``.

        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: if ``seq_len`` exceeds the ``max_len`` of the table.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len, :]
        return x

# Example usage:
# embedding dimension, maximum sequence length, actual sequence length, batch size
d_model, max_len, seq_len, batch_size = 512, 100, 10, 32

# Dummy input tensor standing in for embeddings.
# Random data is a quick way to check that the forward pass runs (module
# structure, dimension mismatches) without preparing a real dataset.
input_embeddings = torch.randn((seq_len, batch_size, d_model))

# Build the PositionalEncoding layer.
pos_encoder = PositionalEncoding(d_model, max_len=max_len)

# Add the positional encoding to the input embeddings.
output_with_pos = pos_encoder(input_embeddings)

print("Input embeddings shape:", input_embeddings.size())
print("Output with positional encoding shape:", output_with_pos.size())

In [None]:
import torch.nn as nn
# LayerNorm: the normalized_shape argument gives the feature dimension to normalize.
class PreNormAttentionBlock(nn.Module):
    """Runnable Pre-Norm example: ``x + Attention(LayerNorm(x))``.

    Args:
        hidden_size: feature dimension; passed to both LayerNorm and the attention layer.
        num_heads: number of attention heads passed to ``nn.MultiheadAttention``.
    """

    def __init__(self, hidden_size, num_heads):
        super().__init__()
        self.ln = nn.LayerNorm(hidden_size)  # the argument is the feature dimension
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads)

    def forward(self, x):
        """Normalize first, run self-attention, then add the residual.

        Args:
            x: tensor whose last dimension is ``hidden_size``; laid out
                (seq_len, batch, hidden_size), as ``nn.MultiheadAttention``
                expects with its default ``batch_first=False``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        normed = self.ln(x)
        # nn.MultiheadAttention returns (output, weights); only the output is used.
        attn_out, _ = self.attention(normed, normed, normed)
        return x + attn_out  # residual connection around attention(LayerNorm(x))

In [None]:
import torch.nn as nn

import torch

# With the default batch_first=False, inputs are laid out (seq_len, batch, embed_dim).
attn = nn.MultiheadAttention(embed_dim=512, num_heads=8)

# Dummy tensors so the cell runs on a fresh kernel.
query = torch.randn(6, 2, 512)   # (target_len, batch, embed_dim)
key = torch.randn(10, 2, 512)    # (source_len, batch, embed_dim)
value = torch.randn(10, 2, 512)  # same layout as key

# Returns the attended output and the attention weights.
output, weights = attn(query, key, value)

In [None]:
attn = nn.MultiheadAttention(embed_dim=512, num_heads=8)
# NOTE(review): the original comment said "query is antibody", but the code
# builds the query from ag_seq and the key/value from ab_seq. Confirm which
# direction is intended.
# NOTE(review): self.pos, ab_seq and ag_seq are not defined in this notebook;
# this snippet presumably comes from inside a model's forward(). Verify.
k = self.pos(ab_seq)
q = self.pos(ag_seq)
# Call the layer created above; the name `attn2` was never defined.
seq_out, attention = attn(q, k, value=k)

In [12]:
import torch
import torch.nn as nn

# Assumed sizes for the demo
embed_dim, num_heads = 512, 8
batch_size = 2
src_len = 10    # length of the encoder output sequence
tgt_len = 6     # length of the current decoder sequence

# Create the attention layer; batch_first=False means (seq_len, batch, embed_dim) inputs.
attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=False)

# Stand-in for the encoder output (used as both K and V).
encoder_output = torch.randn((src_len, batch_size, embed_dim))

# Stand-in for the decoder layer's current input (used as Q).
decoder_hidden = torch.randn((tgt_len, batch_size, embed_dim))

# Cross-attention: Q from the decoder, K and V from the encoder.
attn_output, attn_weights = attn(decoder_hidden, encoder_output, encoder_output)

print("attn_output shape:", attn_output.shape)
print("attn_weights shape:", attn_weights.shape)

attn_output shape: torch.Size([6, 2, 512])
attn_weights shape: torch.Size([2, 6, 10])


In [None]:
# Feed Forward层（前馈神经网络层）
import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedForward(nn.Module):
    """Feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        embed_dim: size of the input and output features.
        hidden_dim: size of the intermediate features.
        dropout: probability passed to ``nn.Dropout``.
    """

    def __init__(self, embed_dim=512, hidden_dim=2048, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(in_features=embed_dim, out_features=hidden_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.linear2 = nn.Linear(in_features=hidden_dim, out_features=embed_dim)

    def forward(self, x):
        """Expand to ``hidden_dim``, apply ReLU and dropout, then project back."""
        hidden = F.relu(self.linear1(x))  # F.gelu is a common alternative
        return self.linear2(self.dropout(hidden))

# Example: feed a random (seq_len, batch_size, embed_dim) batch through the block.
ffn = FeedForward()
x = torch.randn((10, 32, 512))
y = ffn(x)
print(y.size())  # torch.Size([10, 32, 512])

In [None]:
# Python实现
import numpy as np

def softmax(x, axis=None):
    """Numerically stable softmax.

    Args:
        x: array-like of scores.
        axis: axis along which to normalize. ``None`` (the default)
            normalizes over all elements together, matching the original
            single-vector behaviour; pass e.g. ``axis=-1`` to normalize each
            row of a batch independently.

    Returns:
        Array with the same shape as ``x`` whose entries sum to 1 along ``axis``.
    """
    x = np.asarray(x)
    # Subtract the maximum before exponentiating so large scores cannot overflow.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

# Three raw scores turned into a probability distribution.
x = np.asarray([2.0, 1.0, 0.1])
print(softmax(x))  # [0.65900114 0.24243297 0.09856589]

# Pytorch实现
import torch
import torch.nn.functional as F

# Same scores as the NumPy example, normalized along dimension 0.
x = torch.tensor([2.0, 1.0, 0.1])
output = x.softmax(dim=0)
print(output)  # tensor([0.6590, 0.2424, 0.0986])

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for batch-first input.

    A table of shape ``[1, max_len, d_model]`` is precomputed: even feature
    columns hold sines and odd feature columns hold cosines of
    ``position * div_term``.

    Args:
        d_model: embedding dimension (even or odd).
        max_len: number of positions stored in the table.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even column, so
        # trim div_term to the number of odd columns to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Leading singleton dimension lets the table broadcast over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the positional table to ``x``.

        Args:
            x: tensor of shape [batch_size, seq_len, d_model].

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: if ``seq_len`` exceeds the ``max_len`` of the table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]

class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention over ``nhead`` heads (batch-first tensors).

    Args:
        d_model: feature dimension of the inputs and the output.
        nhead: number of heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``nhead``.
    """

    def __init__(self, d_model, nhead):
        super().__init__()
        # Without this check the .view() calls in forward() can fail, or silently
        # regroup features across tokens, whenever nhead * d_k != d_model.
        if d_model % nhead != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by nhead ({nhead})"
            )
        self.d_model = d_model
        self.nhead = nhead
        self.d_k = d_model // nhead
        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)
        self.W_O = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape [batch, len, d_model] into [batch, nhead, len, d_k]."""
        return projected.view(batch_size, -1, self.nhead, self.d_k).transpose(1, 2)

    def forward(self, Q, K, V, mask=None):
        """Attend from ``Q`` over ``K``/``V``.

        Args:
            Q: [batch, q_len, d_model] queries.
            K: [batch, k_len, d_model] keys.
            V: [batch, k_len, d_model] values.
            mask: optional tensor broadcastable to [batch, nhead, q_len, k_len];
                positions where ``mask == 0`` are excluded from attention.

        Returns:
            Tuple ``(output, attn)``: output is [batch, q_len, d_model] and
            attn is [batch, nhead, q_len, k_len].
        """
        batch_size = Q.size(0)
        Q = self._split_heads(self.W_Q(Q), batch_size)
        K = self._split_heads(self.W_K(K), batch_size)
        V = self._split_heads(self.W_V(V), batch_size)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # Use the dtype's most negative finite value: a hard-coded -1e9 is
            # not representable in half precision.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        attn = F.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        # Merge the heads back: [batch, nhead, q_len, d_k] -> [batch, q_len, d_model].
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_O(context), attn

class FeedForward(nn.Module):
    """Feed-forward block: Linear(d_model -> d_ff), ReLU, Linear(d_ff -> d_model)."""

    def __init__(self, d_model, d_ff=2048):
        super().__init__()
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU, then project back to ``d_model``."""
        expanded = self.linear1(x)
        activated = F.relu(expanded)
        return self.linear2(activated)

class EncoderLayer(nn.Module):
    """One encoder layer: self-attention and feed-forward sub-layers, each
    followed by a residual addition and LayerNorm.

    Args:
        d_model: feature dimension of the input and output.
        nhead: number of attention heads.
    """

    def __init__(self, d_model, nhead):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, nhead)
        self.ffn = FeedForward(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, src_mask=None):
        """Run the layer.

        Args:
            x: input tensor; used as query, key and value of the self-attention.
            src_mask: optional mask handed to the self-attention (for example to
                hide padding). Defaults to ``None``, which passes no mask, so
                existing ``layer(x)`` calls behave as before.

        Returns:
            Tensor produced by the second LayerNorm.
        """
        # Self-attention + residual
        attn_out, _ = self.self_attn(x, x, x, src_mask)
        x = self.norm1(x + attn_out)
        # Feed-forward + residual
        ffn_out = self.ffn(x)
        x = self.norm2(x + ffn_out)
        return x

class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, encoder-decoder attention and a
    feed-forward block, each followed by a residual addition and LayerNorm."""

    def __init__(self, d_model, nhead):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, nhead)
        self.enc_dec_attn = MultiHeadAttention(d_model, nhead)
        self.ffn = FeedForward(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, enc_out, src_mask=None, tgt_mask=None):
        """Run the three sub-layers in order.

        Args:
            x: decoder input.
            enc_out: encoder output, used as key and value of the second attention.
            src_mask: mask handed to the encoder-decoder attention.
            tgt_mask: mask handed to the self-attention.
        """
        # Sub-layer 1: masked self-attention, then residual + norm.
        self_ctx = self.self_attn(x, x, x, tgt_mask)[0]
        x = self.norm1(x + self_ctx)

        # Sub-layer 2: queries from the decoder, keys/values from the encoder.
        cross_ctx = self.enc_dec_attn(x, enc_out, enc_out, src_mask)[0]
        x = self.norm2(x + cross_ctx)

        # Sub-layer 3: feed-forward, then residual + norm.
        return self.norm3(x + self.ffn(x))

class Transformer(nn.Module):
    """Encoder-decoder Transformer built from the layers defined in this notebook.

    Args:
        src_vocab_size: number of rows in the source embedding table.
        tgt_vocab_size: number of rows in the target embedding table; also the
            size of the output projection.
        d_model: embedding / model dimension.
        nhead: number of attention heads in every layer.
        num_layers: number of encoder layers and of decoder layers.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, nhead=8, num_layers=6):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, nhead) for _ in range(num_layers)
        ])
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, nhead) for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Encode ``src``, decode ``tgt`` against it, and project to the target vocabulary.

        Args:
            src: integer token ids for the source, fed to ``src_embedding``.
            tgt: integer token ids for the target, fed to ``tgt_embedding``.
            src_mask: forwarded to every decoder layer. The encoder layers are
                called without a mask here.
            tgt_mask: forwarded to every decoder layer. No mask is constructed
                in this method, so pass one to hide future target positions.

        Returns:
            Output of ``fc_out`` applied to the final decoder output.
        """
        # Encoder.
        # Scale the token embeddings by sqrt(d_model) BEFORE adding the positional
        # encoding, so that the positional encoding itself is not scaled.
        src_scale = math.sqrt(self.src_embedding.embedding_dim)
        enc_out = self.pos_enc(self.src_embedding(src) * src_scale)
        for layer in self.encoder_layers:
            enc_out = layer(enc_out)

        # Decoder: same embedding treatment, then attend over the encoder output.
        tgt_scale = math.sqrt(self.tgt_embedding.embedding_dim)
        dec_out = self.pos_enc(self.tgt_embedding(tgt) * tgt_scale)
        for layer in self.decoder_layers:
            dec_out = layer(dec_out, enc_out, src_mask, tgt_mask)

        return self.fc_out(dec_out)

# Usage example
model = Transformer(src_vocab_size=10000, tgt_vocab_size=10000)
src = torch.randint(low=0, high=10000, size=(32, 10))  # batch=32, src_len=10
tgt = torch.randint(low=0, high=10000, size=(32, 12))  # tgt_len=12
output = model(src, tgt)
print(output.size())  # [32, 12, 10000]