<a href="https://colab.research.google.com/github/elangbijak4/LLM-SLM-Examples/blob/main/Rev3_Hitung_Bobot_WQ_WK_WV_k_Layer_Enkoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import torch
import torch.nn as nn
import torch.optim as optim

class SimpleMultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values, split into
    ``num_heads`` heads of width ``d_model // num_heads``, attended
    independently per head, concatenated and passed through a final linear
    projection.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model`` evenly.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Check positivity first so the modulo below can never divide by zero.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_head = d_model // num_heads

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        self.fc_out = nn.Linear(d_model, d_model)

    def _split_heads(self, t):
        """Reshape ``(..., seq, d_model)`` to ``(..., heads, seq, d_head)``."""
        t = t.reshape(*t.shape[:-1], self.num_heads, self.d_head)
        return t.transpose(-3, -2)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor whose last two dimensions are ``(seq_len, d_model)``;
                any leading dimensions are treated as batch dimensions.

        Returns:
            Tensor with the same shape as ``x``.
        """
        Q = self._split_heads(self.wq(x))
        K = self._split_heads(self.wk(x))
        V = self._split_heads(self.wv(x))

        # Scale by the per-head width so score variance is independent of it.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_head ** 0.5)
        # Normalise over the key axis: each query's weights sum to 1.
        weights = torch.softmax(scores, dim=-1)
        context = torch.matmul(weights, V)

        # Merge heads back: (..., heads, seq, d_head) -> (..., seq, d_model).
        context = context.transpose(-3, -2)
        context = context.reshape(*context.shape[:-2], self.d_model)
        return self.fc_out(context)

class SimpleEncoderLayer(nn.Module):
    """One encoder block: self-attention, then a position-wise feed-forward net.

    Each sublayer is wrapped in a residual connection followed by layer
    normalisation. Note that a single ``nn.LayerNorm`` instance is applied
    after both sublayers, so its affine parameters are shared.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Forwarded to ``SimpleMultiHeadAttention``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        hidden_dim = d_model * 4  # feed-forward inner width

        self.mha = SimpleMultiHeadAttention(d_model, num_heads)
        self.layernorm = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, d_model),
        )

    def forward(self, x):
        """Return the block output for ``x``; the shape is unchanged."""
        # Residual + norm around the attention sublayer.
        attended = self.layernorm(x + self.mha(x))
        # Residual + norm (same LayerNorm module) around the feed-forward sublayer.
        return self.layernorm(attended + self.ffn(attended))

class SimpleEncoder(nn.Module):
    """A stack of ``num_layers`` ``SimpleEncoderLayer`` blocks applied in order.

    Args:
        num_layers: How many encoder layers to stack.
        d_model: Feature size passed to every layer.
        num_heads: Head count passed to every layer.
    """

    def __init__(self, num_layers, d_model, num_heads):
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(SimpleEncoderLayer(d_model, num_heads))

    def forward(self, x):
        """Feed ``x`` through every layer sequentially and return the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return hidden

# Instantiate model
num_layers = 6
d_model = 512
num_heads = 8
encoder = SimpleEncoder(num_layers, d_model, num_heads)

# Example input
batch_size = 32
seq_len = 10
x = torch.randn(batch_size, seq_len, d_model)  # batch_size, seq_len, d_model

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(encoder.parameters(), lr=0.001)

# Dummy target for loss calculation: random class indices in [0, d_model),
# one per token. Flattened once here (it never changes) so it lines up with
# the (batch_size * seq_len, d_model) logits built inside the loop.
target = torch.randint(
    0, d_model, (batch_size, seq_len), dtype=torch.long
).view(-1)

num_epochs = 100
# Dump the projection weights every `log_every` epochs. 1 reproduces the
# per-epoch dump; raise it to cut output volume on long runs.
log_every = 1

# Training loop (dummy: the same batch and random targets every epoch)
encoder.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    output = encoder(x)
    # Treat the d_model features of each token as class logits.
    output = output.reshape(-1, d_model)

    loss = criterion(output, target)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

    if (epoch + 1) % log_every != 0:
        continue

    # Print weights for each encoder layer's W_Q, W_K, W_V
    for i, layer in enumerate(encoder.layers):
        print(f"Layer {i+1} Weights after epoch {epoch+1}")
        projections = (
            ("W_Q", layer.mha.wq),
            ("W_K", layer.mha.wk),
            ("W_V", layer.mha.wv),
        )
        for label, proj in projections:
            # detach() gives a grad-free view without bypassing autograd's
            # in-place-modification checks the way `.data` does.
            weight = proj.weight.detach()
            print(f"{label}:", weight.shape)
            print(weight[:2])  # Print first 2 rows for brevity

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
tensor([[ 0.0253,  0.0308, -0.0155,  ..., -0.0167,  0.0080, -0.0211],
        [-0.0382, -0.0353, -0.0442,  ..., -0.0043,  0.0258, -0.0386]])
W_V: torch.Size([512, 512])
tensor([[ 0.0250, -0.0151, -0.0400,  ...,  0.0007, -0.0248,  0.0152],
        [-0.0147,  0.0517, -0.0295,  ..., -0.0085, -0.0343,  0.0377]])
Layer 3 Weights after epoch 21
W_Q: torch.Size([512, 512])
tensor([[ 0.0096, -0.0130,  0.0014,  ..., -0.0213,  0.0148, -0.0317],
        [-0.0087, -0.0176, -0.0235,  ...,  0.0262, -0.0271, -0.0174]])
W_K: torch.Size([512, 512])
tensor([[-0.0008,  0.0024, -0.0305,  ...,  0.0375,  0.0239,  0.0203],
        [-0.0324,  0.0155, -0.0271,  ..., -0.0144,  0.0073, -0.0169]])
W_V: torch.Size([512, 512])
tensor([[-0.0070,  0.0348,  0.0259,  ...,  0.0454, -0.0007,  0.0106],
        [ 0.0034,  0.0087,  0.0043,  ...,  0.0078,  0.0175,  0.0359]])
Layer 4 Weights after epoch 21
W_Q: torch.Size([512, 512])
tensor([[-0.0318, -0