In [1]:
import torch
import torch.nn as nn


In [2]:
class SimpleSelfAttention(nn.Module):
    """Scaled dot-product attention with learned Q/K/V and output projections.

    Args:
        embed_size: Size of the last dimension of every input and of the output.
        heads: Stored as ``self.heads``; the computation in this class does not
            read it, so attention is always computed as a single head.
    """

    def __init__(self, embed_size, heads=1):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        # Bias-free input projections, then the output projection (with bias).
        # Creation order is kept as values -> keys -> queries -> fc_out so that
        # seeded initialisation and state_dict key order stay the same.
        self.values = nn.Linear(embed_size, embed_size, bias=False)
        self.keys = nn.Linear(embed_size, embed_size, bias=False)
        self.queries = nn.Linear(embed_size, embed_size, bias=False)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, value, key, query):
        """Attend from ``query`` over ``key``/``value`` and project the result.

        All three inputs must be 3-D (``torch.bmm`` rejects anything else) with
        a last dimension of ``embed_size``.

        Returns:
            A tensor with the same leading two dimensions as ``query`` and a
            last dimension of ``embed_size``.
        """
        q_proj = self.queries(query)
        k_proj = self.keys(key)
        v_proj = self.values(value)

        # Pairwise query/key dot products, scaled by sqrt(embed_size) before
        # being normalised over the key axis.
        scale = self.embed_size ** 0.5
        scores = q_proj.bmm(k_proj.transpose(1, 2)) / scale
        weights = scores.softmax(dim=-1)

        # Weighted sum of the value vectors, then the output projection.
        context = weights.bmm(v_proj)
        return self.fc_out(context)

In [3]:
class SimpleTransformerBlock(nn.Module):
    """Attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer is wrapped as ``LayerNorm(sublayer(x) + x)``: the residual
    is added first and the normalisation is applied to the sum.

    Args:
        embed_size: Feature size of the inputs; the feed-forward network
            expands it to ``4 * embed_size`` internally and projects it back.
    """

    def __init__(self, embed_size):
        super().__init__()
        self.attention = SimpleSelfAttention(embed_size)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        hidden_size = embed_size * 4
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )

    def forward(self, value, key, query):
        """Run the block; ``query`` is the tensor used for the first residual."""
        # Attention sub-layer: residual with the query, then LayerNorm.
        attended = self.attention(value, key, query)
        normed = self.norm1(attended + query)

        # Feed-forward sub-layer: residual with its own input, then LayerNorm.
        return self.norm2(self.feed_forward(normed) + normed)

In [4]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to its input.

    For position ``pos`` and even feature index ``i`` the table holds::

        encoding[0, pos, i]     = sin(pos / 10000 ** (i / embed_size))
        encoding[0, pos, i + 1] = cos(pos / 10000 ** (i / embed_size))

    Args:
        embed_size: Size of the feature (last) dimension. Odd sizes are
            supported: the final column is a sine column with no cosine pair.
        max_len: Number of positions in the table. ``forward`` slices the
            table to the input's length, so inputs with more than ``max_len``
            positions will not line up with it.
    """

    def __init__(self, embed_size, max_len=100):
        super().__init__()
        # Column vector of positions and row vector of even feature indices;
        # broadcasting their quotient yields every angle in a single step
        # instead of a Python loop over each (pos, i) pair.
        positions = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        even_dims = torch.arange(0, embed_size, 2, dtype=torch.float32)
        div_term = torch.pow(10000.0, even_dims / embed_size)
        angles = positions / div_term

        encoding = torch.zeros(max_len, embed_size)
        encoding[:, 0::2] = torch.sin(angles)
        # With an odd embed_size there is one fewer odd column than even
        # columns, so drop the unpaired trailing angle for the cosine half.
        encoding[:, 1::2] = torch.cos(angles[:, : embed_size // 2])

        # Non-persistent buffer: it follows module.to(...) / .cuda() calls but
        # is left out of state_dict, so checkpoint keys are unchanged.
        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the table rows for its first ``x.size(1)`` positions.

        The table has a leading dimension of 1, so it broadcasts over the
        first dimension of ``x``.
        """
        # .to() is a no-op when devices already match; it is kept so inputs on
        # a different device than the module still work.
        return x + self.encoding[:, : x.size(1), :].to(x.device)

In [5]:
class SimpleTransformer(nn.Module):
    """Embedding -> positional encoding -> one transformer block -> linear head.

    Args:
        embed_size: Feature size used by the embedding, the positional
            encoding and the transformer block.
        max_len: Number of positions passed to the positional encoder.
        output_size: Used both as the number of rows in the embedding table
            and as the size of the final linear layer's output.
    """

    def __init__(self, embed_size, max_len, output_size):
        super().__init__()
        self.embed = nn.Embedding(output_size, embed_size)
        self.pos_encoder = PositionalEncoding(embed_size, max_len)
        self.transformer_block = SimpleTransformerBlock(embed_size)
        self.fc_out = nn.Linear(embed_size, output_size)

    def forward(self, x):
        """Map integer indices ``x`` to one ``output_size``-wide score vector each.

        ``x`` is looked up in ``self.embed``, so it must hold integer indices
        valid for that table. The output has one extra trailing dimension of
        size ``output_size``.
        """
        embedding = self.embed(x)
        # The positional encoder already returns `input + encoding`, so its
        # result replaces the embedding. Accumulating it with `+=` would
        # produce `2 * embedding + encoding`.
        embedding = self.pos_encoder(embedding)
        # Self-attention: the same tensor serves as value, key and query.
        transformer_out = self.transformer_block(embedding, embedding, embedding)
        return self.fc_out(transformer_out)

In [None]:
# Seed PyTorch's RNG before building the model: the weights are randomly
# initialised, so without a fixed seed the prediction printed further down
# changes on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# Keyword arguments make the three sizes self-describing.
model = SimpleTransformer(embed_size=2, max_len=3, output_size=10)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\e1003118\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\ipykernel_laun

In [8]:
# Toy input: a single sequence of three indices.
sample_sequence = [1, 2, 3]
# Nesting the list one level deeper gives the tensor a leading dimension of
# size 1, i.e. shape (1, 3).
sample_tensor = torch.tensor([sample_sequence], dtype=torch.long)

In [9]:
# Put the model in evaluation mode and run one forward pass with gradient
# tracking disabled.
model.eval()
with torch.no_grad():
    predictions = model(sample_tensor)
    # Index of the highest raw score along the last dimension, computed for
    # every position in the sequence (not only the last one).
    predicted_index = torch.argmax(predictions, dim=-1)

# Take the index chosen at the final position of the first (only) sequence and
# report it as the next number.
# NOTE(review): no training step is visible in this part of the notebook, so
# this reflects whatever weights `model` currently holds.
predicted_number = predicted_index[0][-1].item()
print(f"Input Sequence: {sample_sequence}")
print(f"Predicted Next Number: {predicted_number}")


Input Sequence: [1, 2, 3]
Predicted Next Number: 1
