In [2]:
import torch as t
from torch import nn
import plotly.express as px
from IPython.display import display
import pandas as pd
import numpy as np
import utils

In [3]:
import transformers

# Load the tokenizer registered under the "gpt2" name; it is reused by the cells below.
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")

In [4]:
# Sample passage used to exercise the tokenizer. The trailing spaces, newlines and
# four-space indents are inside the literal, so they are part of what gets tokenized.
test_str = """A day will come, one day in the unending succession of days, 
    when beings, beings who are now latent in our thoughts and hidden 
    in our loins, shall stand upon this earth as one stands upon a 
    footstool, and shall laugh and reach out their hands amidst the 
    stars."""

In [5]:
# Convert the sample passage into token ids.
tkns = tokenizer.encode(test_str)

In [6]:
# Round-trip check: decoding the ids should give back the original text, whitespace included.
tokenizer.decode(tkns)

'A day will come, one day in the unending succession of days, \n    when beings, beings who are now latent in our thoughts and hidden \n    in our loins, shall stand upon this earth as one stands upon a \n    footstool, and shall laugh and reach out their hands amidst the \n    stars.'

In [7]:
# Calling the tokenizer directly returns a mapping with `input_ids` and an `attention_mask`.
tokenizer(test_str)

{'input_ids': [32, 1110, 481, 1282, 11, 530, 1110, 287, 262, 555, 1571, 22435, 286, 1528, 11, 220, 198, 220, 220, 220, 618, 9791, 11, 9791, 508, 389, 783, 41270, 287, 674, 6066, 290, 7104, 220, 198, 220, 220, 220, 287, 674, 2376, 1040, 11, 2236, 1302, 2402, 428, 4534, 355, 530, 6296, 2402, 257, 220, 198, 220, 220, 220, 2366, 301, 970, 11, 290, 2236, 6487, 290, 3151, 503, 511, 2832, 31095, 262, 220, 198, 220, 220, 220, 5788, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
class Embedding(nn.Module):
    '''Learned lookup table: one `embedding_dim`-sized row per integer id.

    The table is stored in `weight` with shape (num_embeddings, embedding_dim)
    and is initialised uniformly in [-1, 1).
    '''

    def __init__(self, num_embeddings: int, embedding_dim: int):
        super().__init__()
        self.num_embed = num_embeddings
        self.embed_dim = embedding_dim

        # uniform_ overwrites every element, so the starting contents are irrelevant.
        table = t.empty(num_embeddings, embedding_dim)
        table.uniform_(-1, 1)
        self.weight = nn.Parameter(table)

    def forward(self, x: t.LongTensor) -> t.Tensor:
        '''Look up the table row for every index in `x`.

        The result has the shape of `x` with one extra trailing axis of
        size `embedding_dim`.
        '''
        rows = self.weight[x]
        return rows

    def extra_repr(self) -> str:
        '''Render as "num_embeddings, embedding_dim" inside the module repr.'''
        return ", ".join(str(size) for size in (self.num_embed, self.embed_dim))

# Sanity check: the module's printed form must match torch's built-in Embedding.
assert repr(Embedding(10, 20)) == repr(t.nn.Embedding(10, 20))
# The utils embedding test below is commented out, so it does not run.
#utils.test_embedding(Embedding)

In [9]:
class PositionalEncoding(nn.Module):
    '''Fixed sinusoidal positional encoding added to a batch of embeddings.

    For position `pos` and pair index `i`, the table holds
    sin(pos / n ** (2i / embedding_dim)) in column 2i and the matching cosine
    in column 2i + 1, with n = 10000. The table is registered as a buffer, so
    it follows the module across devices/dtypes and is saved in the state
    dict, but it is not a trainable parameter.
    '''

    def __init__(self, max_seq_len: int, embedding_dim: int):
        '''
        max_seq_len: number of positions to precompute
        embedding_dim: size of each embedding vector (odd sizes are supported)
        '''
        super().__init__()
        self.max_seq_len = max_seq_len
        self.embed_dim = embedding_dim
        self.n = 10000

        # Even columns hold sines and odd columns hold cosines. When
        # embedding_dim is odd there is one more sine column than cosine
        # columns, so build ceil(dim / 2) frequencies and trim for the cosines.
        n_sin = (embedding_dim + 1) // 2
        n_cos = embedding_dim // 2
        freqs = np.outer(
            np.arange(max_seq_len),
            1 / self.n ** (2 * np.arange(n_sin) / embedding_dim)
        )
        enc_2d = np.zeros((max_seq_len, embedding_dim))
        enc_2d[:, ::2] = np.sin(freqs)
        enc_2d[:, 1::2] = np.cos(freqs[:, :n_cos])

        # Register the buffer directly: assigning `self.pos_enc` first makes
        # register_buffer raise KeyError because the attribute already exists.
        # NumPy produces float64; cast to float32 so float32 activations are
        # not silently upcast when the encoding is added.
        self.register_buffer("pos_enc", t.from_numpy(enc_2d).to(t.float32))

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''Add the encoding for positions 0..seq_len-1 to every batch element.

        x: shape (batch, seq_len, embedding_dim)

        Return: shape (batch, seq_len, embedding_dim)

        Raises ValueError if seq_len exceeds max_seq_len.
        '''
        seq_len = x.shape[1]
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len={self.max_seq_len}"
            )
        return x + self.pos_enc[:seq_len, :]

    def extra_repr(self) -> str:
        '''Summarise the configuration inside the module repr.'''
        return f"max_freq={self.n}, max_seq_len={self.max_seq_len}, embedding_dim={self.embed_dim}"

In [10]:
# Reference run: normalise a random (2, 3, 4) tensor over its last axis with torch's built-in LayerNorm.
T = t.randn(2, 3, 4)
lnorm = nn.LayerNorm(T.shape[2])
out = lnorm(T)

In [11]:
# Display the raw input tensor.
T

tensor([[[ 0.9630,  0.5554, -0.8954,  0.9941],
         [ 0.0050, -1.0346,  2.0186,  0.2238],
         [-1.8682, -0.0536,  2.1194,  1.4801]],

        [[-2.7236, -1.3115,  0.3106,  0.0503],
         [ 0.0109,  0.1053,  0.8121, -0.0904],
         [ 0.8659, -0.3655, -0.6905, -1.3613]]])

In [12]:
# Display the normalised result for comparison with the input above.
out

tensor([[[ 0.7256,  0.1963, -1.6877,  0.7659],
         [-0.2714, -1.2178,  1.5615, -0.0723],
         [-1.4866, -0.3074,  1.1047,  0.6893]],

        [[-1.4910, -0.3246,  1.0153,  0.8003],
         [-0.5597, -0.2936,  1.6987, -0.8453],
         [ 1.5516,  0.0277, -0.3746, -1.2047]]],
       grad_fn=<NativeLayerNormBackward0>)

In [13]:
class LayerNorm(nn.Module):
    '''Layer normalisation over the trailing `normalized_shape` axes.

    Each slice spanning those axes is shifted to zero mean and scaled to unit
    (biased) variance. With `elementwise_affine` the result is then multiplied
    by a learned `weight` and shifted by a learned `bias`. The statistics of
    the most recent forward pass are kept on `self.mean` and `self.var`.
    '''

    def __init__(
        self,
        normalized_shape,
        eps: float = 1e-05,
        elementwise_affine: bool = True
        ):
        '''
        normalized_shape: an int, or a sequence of ints, giving the trailing axes' sizes
        eps: added to the variance before the square root, for numerical stability
        elementwise_affine: whether to create the learnable `weight` and `bias`
        '''
        super().__init__()
        if isinstance(normalized_shape, int):
            # A bare int is shorthand for a single trailing axis.
            self.norm_shape = (normalized_shape, )
        else:
            self.norm_shape = normalized_shape
        self.eps = eps
        self.elementwise_affine = elementwise_affine

        if elementwise_affine:
            # Identity transform at initialisation: scale of one, shift of zero.
            self.weight = nn.Parameter(t.ones(normalized_shape))
            self.bias = nn.Parameter(t.zeros(normalized_shape))

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''Normalise `x` over its last len(normalized_shape) axes.

        Returns a tensor with the same shape as `x`.
        '''
        total_dims = x.dim()
        reduce_dims = tuple(range(total_dims - len(self.norm_shape), total_dims))

        # keepdim=True keeps the statistics broadcastable against x.
        self.mean = x.mean(dim=reduce_dims, keepdim=True)
        self.var = x.var(dim=reduce_dims, unbiased=False, keepdim=True)

        normed = (x - self.mean) / (self.var + self.eps).sqrt()

        if not self.elementwise_affine:
            return normed
        return normed * self.weight + self.bias

    def extra_repr(self) -> str:
        '''Summarise the configuration inside the module repr.'''
        return f"normalized_shape={self.norm_shape}, eps={self.eps}, elementwise_affine={self.elementwise_affine}"

# Run the LayerNorm checks from the local utils module against the class defined above.
utils.test_layernorm_mean_1d(LayerNorm)
utils.test_layernorm_mean_2d(LayerNorm)
utils.test_layernorm_std(LayerNorm)
utils.test_layernorm_exact(LayerNorm)
utils.test_layernorm_backward(LayerNorm)

All tests in `test_layernorm_mean_1d` passed.
All tests in `test_layernorm_mean_2d` passed.
All tests in `test_layernorm_std` passed.
All tests in `test_layernorm_exact` passed.
All tests in `test_layernorm_backward` passed.


In [21]:
class Dropout(nn.Module):
    '''Inverted dropout.

    In training mode each element is zeroed independently with probability
    `p`, and the survivors are scaled by 1 / (1 - p) so the expected value of
    the output matches the input. In eval mode the input is returned unchanged.
    '''

    def __init__(self, p: float):
        '''
        p: probability of zeroing an element; must lie in [0, 1].

        Raises ValueError if p is outside [0, 1].
        '''
        super().__init__()
        if not 0 <= p <= 1:
            raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
        self.p = p

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''Apply dropout to `x`; the result has the same shape as `x`.'''
        if not self.training:
            return x
        if self.p == 1:
            # Everything is dropped. Returning zeros directly avoids the
            # 0 / (1 - p) = 0 / 0 division that would fill the output with NaN.
            return t.zeros_like(x)

        # Sample on the input's device so the mask can be combined with x
        # wherever x lives.
        samples = t.rand(x.shape, device=x.device)
        # Keep an element when its sample is >= p; casting to x's dtype
        # prevents the mask from promoting lower-precision inputs.
        keep_mask = (samples >= self.p).to(x.dtype)
        out = x * keep_mask
        out = out / (1 - self.p)
        return out

    def extra_repr(self) -> str:
        '''Show the drop probability inside the module repr.'''
        return f"p={self.p}"

# Run the Dropout checks from the local utils module (eval mode first, then training mode).
utils.test_dropout_eval(Dropout)
utils.test_dropout_training(Dropout)

All tests in `test_dropout_eval` passed.
All tests in `test_dropout_training` passed.


In [27]:
class GELU(nn.Module):
    '''GELU activation using the tanh approximation.'''

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''Return 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))), elementwise.'''
        scale = t.sqrt(t.tensor(2 / t.pi))
        inner = scale * (x + 0.044715 * x ** 3)
        return x * 0.5 * (1 + t.tanh(inner))

# Hand the activation class to the utils helper (presumably plots the curve; see utils for details).
utils.plot_gelu(GELU)

In [None]:
def single_head_attention(Q: t.Tensor, K: t.Tensor, V: t.Tensor) -> t.Tensor:
    '''
    Return the results of self-attention (see the "Self-Attention in Detail" section of the Illustrated Transformer).

    Computes softmax(Q K^T / sqrt(head_size)) V. No masking is applied.

    Q: shape (batch, seq_q, head_size)
    K: shape (batch, seq_k, head_size)
    V: shape (batch, seq_k, value_size)

    Return: shape (batch, seq_q, value_size)
    '''
    head_size = Q.shape[-1]
    # Dividing by sqrt(head_size) keeps the logits' scale independent of the
    # head size, so the softmax does not saturate as the head gets wider.
    scores = Q @ K.transpose(-2, -1) / head_size ** 0.5
    # Normalise over the key axis: each query's weights sum to one.
    weights = scores.softmax(dim=-1)
    return weights @ V