In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

import lightning as L

In [3]:
# Vocabulary: every token the model can read or emit, mapped to an integer id.
# Ids are assigned in list order, so "<EOS>" (end of sequence) gets the last id.
token_to_id = {
    token: index
    for index, token in enumerate(
        ["who", "is", "charley", "an", "awesome", "guy", "<EOS>"]
    )
}

# Inverse lookup (id -> token), used to turn predicted ids back into text.
id_to_token = {index: token for token, index in token_to_id.items()}

In [None]:
# Training prompts: each row is a question, "<EOS>", then the answer.
inputs = torch.tensor([
    [token_to_id[token]
     for token in ("who", "is", "charley", "<EOS>", "an", "awesome", "guy")],
    [token_to_id[token]
     for token in ("charley", "is", "who", "<EOS>", "an", "awesome", "guy")],
])

# Targets: the same rows shifted left by one token and closed with "<EOS>",
# so position i of a label row is the token that follows position i of the
# matching input row.
labels = torch.tensor([
    [token_to_id[token]
     for token in ("is", "charley", "<EOS>", "an", "awesome", "guy", "<EOS>")],
    [token_to_id[token]
     for token in ("is", "who", "<EOS>", "an", "awesome", "guy", "<EOS>")],
])

# Pair every input row with its label row and expose the pairs through a
# DataLoader (default settings: one pair per batch, original order).
dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset)

In [5]:
# word embedding: one learnable vector per vocabulary entry.
# nn.Embedding requires both sizes; calling it with no arguments raises
# TypeError. The table needs one row per token id, and embedding_dim=2 matches
# the default d_model used by the model classes in this notebook.
embedding = nn.Embedding(num_embeddings=len(token_to_id), embedding_dim=2)

In [6]:
# position encoding
class PositionEncoding(nn.Module):
    """Add fixed sinusoidal position information to word embeddings.

    A ``(max_len, d_model)`` table is computed once at construction time.
    For every even column index ``i`` the table stores
    ``sin(position / 10000 ** (i / d_model))`` in column ``i`` and the
    matching cosine in column ``i + 1`` (when that column exists). The table
    is registered as a buffer named ``pe``.
    """

    def __init__(self, d_model=2, max_len=6):
        """Precompute the position-encoding table.

        Args:
            d_model: Number of values per token embedding. Odd sizes are
                supported; the last column then only has a sine term.
            max_len: Number of positions stored in the table, i.e. the
                longest sequence ``forward`` can encode.
        """

        super().__init__()

        pe = torch.zeros(max_len, d_model)

        # Positions 0 .. max_len-1 as a column vector so they broadcast
        # against the row of per-column frequencies below.
        position = torch.arange(start=0, end=max_len, step=1).float().unsqueeze(1)
        # Even column indices 0, 2, 4, ...: one frequency per sin/cos pair.
        embedding_index = torch.arange(start=0, end=d_model, step=2).float()

        div_term = 1 / torch.tensor(10000.0) ** (embedding_index / d_model)

        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the unpaired last frequency is dropped for the cosine half.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        self.register_buffer('pe', pe)

    def forward(self, word_embeddings):
        """Return ``word_embeddings`` with the position encodings added.

        Args:
            word_embeddings: Tensor whose first dimension indexes token
                positions; the first ``word_embeddings.size(0)`` rows of the
                table are added to it.

        Returns:
            The sum of ``word_embeddings`` and the matching table rows.

        Raises:
            ValueError: If the input has more positions than ``max_len``.
        """

        seq_len = word_embeddings.size(0)
        table_len = self.pe.size(0)

        # Slicing past the end of the table would silently return fewer rows
        # and fail (or broadcast wrongly) in the addition; fail clearly here.
        if seq_len > table_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {table_len} "
                "of the position encoding table"
            )

        return word_embeddings + self.pe[:seq_len, :]

In [10]:
# single-head scaled dot-product attention
class Attention(nn.Module):
    """Single-head scaled dot-product attention with optional masking.

    Queries, keys and values are produced by three bias-free linear layers of
    size ``d_model x d_model``. The output for each query position is a
    weighted sum of the values, the weights being a softmax over the scaled
    query/key similarities.
    """

    def __init__(self, d_model=2):
        """Create the query, key and value projections.

        Args:
            d_model: Size of each token encoding (input and output width of
                every projection).
        """

        super().__init__()

        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

        # Dimensions are counted from the end so that they name the same
        # axes as 0 and 1 for a 2-D (tokens, d_model) input while also
        # working when leading batch dimensions are present.
        self.row_dim = -2
        self.col_dim = -1

    def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask=None):
        """Compute attention outputs.

        Args:
            encodings_for_q: Encodings used to build the queries; the last
                dimension must have size ``d_model``.
            encodings_for_k: Encodings used to build the keys.
            encodings_for_v: Encodings used to build the values.
            mask: Optional boolean tensor broadcastable to the similarity
                matrix. Positions where it is ``True`` are excluded from
                attention.

        Returns:
            Tensor with one ``d_model``-sized row per query position.
        """

        q = self.W_q(encodings_for_q)
        k = self.W_k(encodings_for_k)
        v = self.W_v(encodings_for_v)

        # Similarity of every query with every key.
        sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))

        # Scale by sqrt(d_k) to keep the softmax inputs in a moderate range.
        scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

        if mask is not None:
            # A very negative score becomes a weight of ~0 after the softmax.
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)

        # Normalise over the key axis so each query's weights sum to 1.
        attention_percents = F.softmax(scaled_sims, dim=self.col_dim)

        attention_scores = torch.matmul(attention_percents, v)

        return attention_scores



In [None]:
# now create transformer
class DecoderOnlyTransformer(L.LightningModule):
    """Lightning module that wires together the layers of a small transformer.

    The constructor creates a word-embedding table, a position encoder, one
    self-attention layer, a linear layer mapping back to vocabulary size and
    a cross-entropy loss.
    """

    def __init__(self, num_tokens=4, d_model=2, max_len=6):
        """Create the sub-modules.

        Args:
            num_tokens: Number of rows in the embedding table and number of
                outputs of the final linear layer.
            d_model: Size of each token embedding; shared by every layer.
            max_len: Number of positions passed to PositionEncoding.

        NOTE(review): the defaults (num_tokens=4, max_len=6) are smaller than
        the 7-entry vocabulary and the 7-token sequences defined earlier in
        this notebook; pass explicit values when using that data.
        """

        super().__init__()

        # Token id -> d_model-sized vector.
        self.we = nn.Embedding(num_embeddings=num_tokens,
                               embedding_dim=d_model)
        
        # Adds position information to the word embeddings.
        self.pe = PositionEncoding(d_model=d_model,
                                   max_len=max_len)
        
        self.self_attention = Attention(d_model=d_model)

        # Maps each d_model-sized vector to one score per token.
        self.fc_layer = nn.Linear(in_features=d_model, out_features=num_tokens)

        self.loss = nn.CrossEntropyLoss()