In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the local `model` module
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from model import *

In [3]:
# Shared configuration for every smoke test in this notebook.

# Reproducibility: seed PyTorch's RNG so that weight initialisation and the
# random inputs drawn in later cells are the same on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# Input dimensions: the leading two axes of every test input are (batch_size, c).
batch_size = 27
c = 11

# Model parameters.
# NOTE(review): the sizes look deliberately irregular (e.g. n_heads * d_head
# != d_model) -- presumably so that mixed-up dimensions surface as shape
# errors instead of passing by coincidence; confirm intent.
d_model = 64
d_head = 17
n_heads = 5
d_inner = 123
vocab_size = 26
n_blocks = 6

# Device: use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Attention Head
# Smoke test: push a random (batch_size, c, d_model) tensor through a single
# attention head and check that only the last axis changes, to d_head.

attn_head = AttentionHead(d_model=d_model, d_head=d_head).to(device)

# Allocate the input directly on the target device rather than creating it on
# the CPU and copying it over afterwards.
x = torch.randn(batch_size, c, d_model, device=device)
o = attn_head(x)

# Enforce the expected shape instead of relying on someone reading the printout.
assert o.shape == (batch_size, c, d_head), f"unexpected output shape {tuple(o.shape)}"

print(f"{x.shape} -> {o.shape}")

torch.Size([27, 11, 64]) -> torch.Size([27, 11, 17])


In [5]:
# Multi-Head Attention
# Smoke test: the multi-head layer should hand back a tensor with exactly the
# same shape as its (batch_size, c, d_model) input.

mha = MultiHeadAttention(d_model=d_model, n_heads=n_heads, d_head=d_head).to(device)

# Allocate the input directly on the target device rather than creating it on
# the CPU and copying it over afterwards.
x = torch.randn(batch_size, c, d_model, device=device)
x_new = mha(x)

# Enforce the expected shape instead of relying on someone reading the printout.
assert x_new.shape == x.shape, f"unexpected output shape {tuple(x_new.shape)}"

print(f"{x.shape} -> {x_new.shape}")

torch.Size([27, 11, 64]) -> torch.Size([27, 11, 64])


In [6]:
# Feed Forward
# Smoke test: the feed-forward layer should return a tensor with exactly the
# same shape as its (batch_size, c, d_model) input.

ff = FeedForward(d_model=d_model, d_inner=d_inner).to(device)

# Allocate the input directly on the target device rather than creating it on
# the CPU and copying it over afterwards.
x = torch.randn(batch_size, c, d_model, device=device)
x_new = ff(x)

# Enforce the expected shape instead of relying on someone reading the printout.
assert x_new.shape == x.shape, f"unexpected output shape {tuple(x_new.shape)}"

print(f"{x.shape} -> {x_new.shape}")

torch.Size([27, 11, 64]) -> torch.Size([27, 11, 64])


In [7]:
# Basic Building Block
# Smoke test: a full block should be shape-preserving on a
# (batch_size, c, d_model) input.

b = Block(d_model=d_model, n_heads=n_heads, d_head=d_head, d_inner=d_inner).to(device)

# Allocate the input directly on the target device rather than creating it on
# the CPU and copying it over afterwards.
x = torch.randn(batch_size, c, d_model, device=device)
x_new = b(x)

# Enforce the expected shape instead of relying on someone reading the printout.
assert x_new.shape == x.shape, f"unexpected output shape {tuple(x_new.shape)}"

print(f"{x.shape} -> {x_new.shape}")

torch.Size([27, 11, 64]) -> torch.Size([27, 11, 64])


In [8]:
# Language Model
# Smoke test: feed random integer ids of shape (batch_size, c) through the
# full model and check that the first output has shape
# (batch_size, c, vocab_size).

# NOTE(review): d_head and d_inner from the config are not passed here, so
# LanguageModel presumably falls back to its own defaults for them -- confirm
# that this is intended.
lm = LanguageModel(vocab_size=vocab_size, d_model=d_model, n_heads=n_heads, n_blocks=n_blocks).to(device)

# Draw ids in [0, vocab_size) directly on the target device rather than
# creating them on the CPU and copying them over afterwards.
x = torch.randint(0, vocab_size, (batch_size, c), device=device)
# The model returns a pair; only the first element is inspected here.
# NOTE(review): the discarded second element is presumably a loss -- verify
# against the model definition.
x_new, _ = lm(x)

# Enforce the expected shape instead of relying on someone reading the printout.
assert x_new.shape == (batch_size, c, vocab_size), f"unexpected output shape {tuple(x_new.shape)}"

print(f"{x.shape} -> {x_new.shape}")

torch.Size([27, 11]) -> torch.Size([27, 11, 26])
