In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# What version of Python do you have?
import sys
import platform
import torch
import pandas as pd
import sklearn as sk

has_gpu = torch.cuda.is_available()
has_mps = getattr(torch,'has_mps',False)
device = "mps" if getattr(torch,'has_mps',False) \
    else "gpu" if torch.cuda.is_available() else "cpu"

print(f"Python Platform: {platform.platform()}")
print(f"PyTorch Version: {torch.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print("NVIDIA/CUDA GPU is", "available" if has_gpu else "NOT AVAILABLE")
print("MPS (Apple Metal) is", "AVAILABLE" if has_mps else "NOT AVAILABLE")
print(f"Target device is {device}")


Python Platform: macOS-14.1.2-arm64-arm-64bit
PyTorch Version: 2.1.2

Python 3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 16:35:41) 
[Clang 16.0.6 ]
Pandas 2.1.4
Scikit-Learn 1.3.2
NVIDIA/CUDA GPU is NOT AVAILABLE
MPS (Apple Metal) is AVAILABLE
Target device is mps


  has_mps = getattr(torch,'has_mps',False)
  device = "mps" if getattr(torch,'has_mps',False) \


In [5]:
import torch
from torch.testing import make_tensor

from model import InputEmbedding

d_model = 12
vocab_size = 3
batch_size = 3
seq_len = 5
# since this is the tensor for embedding, we need to use int type
t = make_tensor((batch_size,seq_len), device=device, dtype=torch.int, low=0, high=vocab_size-1)

input_embedding = InputEmbedding(d_model, vocab_size).to(device)
ret = input_embedding(t)
assert ret.shape == (batch_size, seq_len, d_model)
assert ret.dtype == torch.float
assert ret.device.type == device, f"device {ret.device}"
# torch.testing.assert_close(ret, expected, check_layout=True, check_device=True, check_dtype=True)

In [7]:
import torch
from torch.testing import make_tensor

from model import PositionEmbedding

batch_size = 1
d_model = 2
max_seq_len = 5
seq_len = 2
dropout = 0
pe = PositionEmbedding(d_model, max_seq_len, dropout).to(device)

t = make_tensor((batch_size, seq_len, d_model), device=device, dtype=torch.float32, low=-1, high=1)
ret = pe(t)
expected_diff = torch.tensor([[[0.0000, 1.0000], [0.8415, 0.5403]]], dtype=torch.float32, device=device)
torch.testing.assert_close(ret-t, expected_diff, rtol=0.001, atol=0.001)


In [8]:
import torch
from model import LayerNormalization

ln = LayerNormalization().to(device)
t = torch.tensor([[1.0, 3.0], [2.0, 4.0]], dtype=torch.float32, device=device)
ret = ln(t)
torch.testing.assert_close(ret, 
    torch.tensor([[-0.7071, 0.7071], [-0.7071, 0.7071]], device=device), 
    rtol=0.001, atol=0.001)

In [9]:
import torch
from model import FeedForwardLayer

d_model= 2
d_ff=4
ff = FeedForwardLayer(d_model, d_ff, 0.0).to(device)
t = torch.tensor([[1.0, 3.0], [2.0, 4.0]], dtype=torch.float32, device=device)
ret = ff(t)
assert ret.shape == t.shape

In [10]:
import torch
from model import MultiHeadAttentionBlock

d_model = 6
n_heads = 2
mh = MultiHeadAttentionBlock(d_model, n_heads, 0.0).to(device)
# (batch:1, seq:2, d_model:6)
t = torch.tensor([[1.0, 3.0, 5.0, 7.0, 9.0, 11.0], [2.0, 4.0, 6.0, 8.0, 10.0, 12.0]], dtype=torch.float32, device=device)
t = t.unsqueeze(0)
mask = torch.tensor([[1, 0], [1,1]], dtype=torch.float32, device=device)
ret = mh(t, t, t, mask)
print(ret)
assert ret.shape == t.shape

tensor([[[-0.4387,  1.0943,  1.6182, -0.1563,  0.9320, -0.1748],
         [-0.4572,  1.1914,  1.6225, -0.1268,  0.9637, -0.2005]]],
       device='mps:0', grad_fn=<LinearBackward0>)


In [11]:
import torch
import torch.nn as nn

from model import ResidualConnection

sublayer = nn.Linear(2, 2).to(device)
rc = ResidualConnection(0.0).to(device)
# (batch:1, seq:2, d_model:2)
t = torch.tensor([[1.0, 3.0,], [2.0, 4.0]], dtype=torch.float32, device=device)
ret = rc(t, sublayer)
print(ret)
assert ret.shape == t.shape

tensor([[1.3933, 2.2991],
        [2.3933, 3.2991]], device='mps:0', grad_fn=<AddBackward0>)


In [12]:
import torch
import torch.nn as nn

from model import EncoderBlock, MultiHeadAttentionBlock, FeedForwardLayer

d_model = 2
n_heads = 2
d_ff = 4
dropout = 0.0
mh = MultiHeadAttentionBlock(d_model, n_heads, dropout).to(device)
ff = FeedForwardLayer(d_model, d_ff, dropout).to(device)

eb = EncoderBlock(mh, ff, dropout).to(device)
# (batch:1, seq:2, d_model:2)
t = torch.tensor([[1.0, 3.0,], [2.0, 4.0]], dtype=torch.float32, device=device)
t = t.unsqueeze(0)
mask = torch.tensor([[1, 0], [1,1]], dtype=torch.float32, device=device)

ret = eb(t, mask)
print(ret)
assert ret.shape == t.shape, f"ret.shape: {ret.shape} vs t.shape: {t.shape}"

tensor([[[1.1108, 1.5394],
         [2.1108, 2.5394]]], device='mps:0', grad_fn=<AddBackward0>)


In [14]:
import torch
import torch.nn as nn

from model import DecoderBlock, MultiHeadAttentionBlock, FeedForwardLayer

d_model = 2
n_heads = 2
d_ff = 4
dropout = 0.0
mh = MultiHeadAttentionBlock(d_model, n_heads, dropout).to(device)
mh2 = MultiHeadAttentionBlock(d_model, n_heads, dropout).to(device)

ff = FeedForwardLayer(d_model, d_ff, dropout).to(device)

db = DecoderBlock(mh, mh2, ff, dropout).to(device)
# (batch:1, seq:2, d_model:2)
t = torch.tensor([[1.0, 3.0,], [2.0, 4.0]], dtype=torch.float32, device=device)
t = t.unsqueeze(0)
mask = torch.tensor([[1, 0], [1,1]], dtype=torch.float32, device=device)
ret = db(t, t, mask, mask)
print(ret)
assert ret.shape == t.shape, f"ret.shape: {ret.shape} vs t.shape: {t.shape}"

tensor([[[3.2758, 4.4797],
         [4.4520, 5.5926]]], device='mps:0', grad_fn=<AddBackward0>)


In [20]:
import torch
import torch.nn as nn

from model import ProjectionLayer

d_model = 2
vocab_size = 4
pj = ProjectionLayer(d_model, vocab_size).to(device)
t = torch.tensor([[1.0, 3.0,], [2.0, 4.0]], dtype=torch.float32, device=device)
ret = pj(t)
print(ret)
assert ret.shape == ( 2, 4)

tensor([[-0.9028, -1.8150, -2.0259, -1.2044],
        [-0.5085, -2.2451, -2.7488, -1.4754]], device='mps:0',
       grad_fn=<LogSoftmaxBackward0>)


In [25]:
from model import build_transformer

transformer = build_transformer(2, 2, 2, 2).to(device)
print(transformer)

Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (self_attention): MultiHeadAttentionBlock(
          (dropout): Dropout(p=0.1, inplace=False)
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (w_o): Linear(in_features=512, out_features=512, bias=True)
        )
        (feed_forward): FeedForwardLayer(
          (dropout): Dropout(p=0.1, inplace=False)
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (residual1): ResidualConnection(
          (dropout): Dropout(p=0.1, inplace=False)
          (norm): LayerNormalization()
        )
        (residual2): ResidualConnection(
          (dropout): Dropout(p=0.1, inplace=False)
          (norm): LayerNormali