In [2]:
# # download the complete works of shakespeare as a text file and save it in the home directory
# !wget https://www.gutenberg.org/files/100/100-0.txt -O ./shakespeare.txt

![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/derpyplops/arena/blob/main/shakespeare.ipynb)

In [3]:
import torch as t
import torch.nn as nn
from einops import rearrange, repeat
from fancy_einsum import einsum
from torch import optim
from impl.transformer_modules import DecoderTransformer, TransformerConfig
from torch.utils.data import DataLoader, Dataset, random_split

In [4]:
import re

# read the file
with open('./shakespeare.txt', 'r') as f:
    text = f.read()

print(re.split(r"\b", text)[:100])

['\ufeff', 'The', ' ', 'Project', ' ', 'Gutenberg', ' ', 'eBook', ' ', 'of', ' ', 'The', ' ', 'Complete', ' ', 'Works', ' ', 'of', ' ', 'William', ' ', 'Shakespeare', ', ', 'by', ' ', 'William', ' ', 'Shakespeare', '\n\n', 'This', ' ', 'eBook', ' ', 'is', ' ', 'for', ' ', 'the', ' ', 'use', ' ', 'of', ' ', 'anyone', ' ', 'anywhere', ' ', 'in', ' ', 'the', ' ', 'United', ' ', 'States', ' ', 'and', '\n', 'most', ' ', 'other', ' ', 'parts', ' ', 'of', ' ', 'the', ' ', 'world', ' ', 'at', ' ', 'no', ' ', 'cost', ' ', 'and', ' ', 'with', ' ', 'almost', ' ', 'no', ' ', 'restrictions', '\n', 'whatsoever', '. ', 'You', ' ', 'may', ' ', 'copy', ' ', 'it', ', ', 'give', ' ', 'it', ' ', 'away']


In [5]:
# dataset containing shakespeare
from torch.utils.data import Dataset

import re

# read the file
with open('./shakespeare.txt', 'r') as f:
    text = f.read()

class ShakespeareDataset(Dataset):
    def __init__(self, text, seq_size):
        super().__init__()
        self.text = text
        self.vocab = sorted(set(text))
        self.vocab_size = len(self.vocab)
        self.char_to_idx = {c: i for i, c in enumerate(self.vocab)}
        self.idx_to_char = {i: c for i, c in enumerate(self.vocab)}
        self.text_as_int = t.tensor([self.char_to_idx[c] for c in self.text])

        self.seq_size = seq_size

        self.num_batches = int(len(text) / (seq_size))

    def __len__(self):
        return self.num_batches

    def __getitem__(self, idx):
        text = self.text_as_int[idx * self.seq_size:(idx + 1) * self.seq_size]
        label = self.text_as_int[idx * self.seq_size + 1:(idx + 1) * self.seq_size + 1]
        return (text, label)

    def to_text(self, idxs):
        return ''.join([self.idx_to_char[idx] for idx in idxs])

    def to_int(self, text):
        return [self.char_to_idx[c] for c in text]

    def to_one_hot(self, idxs):
        return t.eye(self.vocab_size)[idxs]

    def to_text_from_one_hot(self, one_hot):
        return self.to_text(t.argmax(one_hot, dim=-1))

# create the dataset
shakespeare_dataset = ShakespeareDataset(re.split(r"\b", text), 100)

In [6]:
# print(shakespeare_dataset.text[0:52])
# print(shakespeare_dataset.vocab)

for x, y in shakespeare_dataset:
    print(x)
    print(y)
    break


tensor([34542,  9992,   113,  8237,   113,  5523,   113, 17830,   113, 24979,
          113,  9992,   113,  3477,   113, 10995,   113, 24979,   113, 10916,
          113,  9165,   480, 14228,   113, 10916,   113,  9165,     1, 10039,
          113, 17830,   113, 22293,   113, 19582,   113, 31392,   113, 33037,
          113, 24979,   113, 12315,   113, 12317,   113, 21768,   113, 31392,
          113, 10403,   113,  9566,   113, 12244,     0, 24317,   113, 25174,
          113, 25577,   113, 24979,   113, 31392,   113, 34221,   113, 12640,
          113, 24717,   113, 15883,   113, 12244,   113, 34099,   113, 12140,
          113, 24717,   113, 28036,     0, 33837,   786, 11076,   113, 23721,
          113, 15820,   113, 22310,   480, 20265,   113, 22310,   113, 12779])
tensor([ 9992,   113,  8237,   113,  5523,   113, 17830,   113, 24979,   113,
         9992,   113,  3477,   113, 10995,   113, 24979,   113, 10916,   113,
         9165,   480, 14228,   113, 10916,   113,  9165,     1,

In [7]:
def train(config: TransformerConfig, model, train_dataloader: DataLoader, optimizer, criterion):
    criterion = nn.CrossEntropyLoss()
    model = DecoderTransformer(config)
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    accuracy_list = []

    for epoch in range(3):  # loop over the dataset multiple times
        accuracy = 0
        total = 0

        running_loss = 0.0
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(
                rearrange(outputs, 'batch seq vocab -> batch vocab seq'),
                labels
            )
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            print(i)
            # if i % 20 == 19:
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss:.5f}')
            running_loss = 0.0

            
            
        # for (x, y) in train_dataloader:

        #     x = x.to(config.device)
        #     y = y.to(config.device)

        #     y_hat = model(x)
        #     y_predictions = y_hat.argmax(2)
        #     accuracy += (y_predictions == y).sum().item()
        #     total += y.size(0) * 6

        #     accuracy_list.append(accuracy/total)
        # print(f'accuracy: {accuracy/total:.3f}')

    print('Finished Training')

    return accuracy_list

In [8]:
shakespeare_dataloader = DataLoader(shakespeare_dataset, batch_size=128, shuffle=False)

config = TransformerConfig(
    vocab_size=shakespeare_dataset.vocab_size,
    hidden_size=256,
    num_heads=4,
    num_layers=2
)

model = DecoderTransformer(config)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

accuracy_list = train(config, model, shakespeare_dataloader, optimizer, criterion)

0
[1,     1] loss: 199.93967
1
[1,     2] loss: 191.31787
2
[1,     3] loss: 181.74298
3
[1,     4] loss: 174.21786
4
[1,     5] loss: 164.65164
5
[1,     6] loss: 153.55565
6
[1,     7] loss: 143.33014
7
[1,     8] loss: 132.33330
8
[1,     9] loss: 123.01982
9
[1,    10] loss: 112.78036
10
[1,    11] loss: 102.35070
11
[1,    12] loss: 92.07424
12
[1,    13] loss: 82.79670
13
[1,    14] loss: 74.98795
14
[1,    15] loss: 67.26454
15
[1,    16] loss: 59.73179
16
[1,    17] loss: 55.01676
17
[1,    18] loss: 52.07555
18
[1,    19] loss: 49.65736
19
[1,    20] loss: 48.31279
20
[1,    21] loss: 46.10990
21
[1,    22] loss: 44.47622
22
[1,    23] loss: 43.51808
23
[1,    24] loss: 42.71238
24
[1,    25] loss: 41.77654
25
[1,    26] loss: 41.26542
26
[1,    27] loss: 40.75078
27
[1,    28] loss: 39.27156
28
[1,    29] loss: 39.28583
29
[1,    30] loss: 38.65879
30
[1,    31] loss: 37.96714
31
[1,    32] loss: 38.37193
32
[1,    33] loss: 36.63941
33
[1,    34] loss: 37.01942
34
[1,    35]

In [9]:
model.eval()

DecoderTransformer(
  (decoderlayer): Sequential(
    (decoder0): DecoderBlock(
      (attention): MultiheadMaskedAttention(
        (W_QKV): Linear(in_features=256, out_features=768, bias=True)
        (W_O): Linear(in_features=256, out_features=256, bias=True)
      )
      (layernorm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (layernorm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (mlp): MultiLayerPerceptron(
        (model): Sequential(
          (linear1): Linear(in_features=256, out_features=1024, bias=True)
          (GELU): GELU(approximate=none)
          (linear2): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (decoder1): DecoderBlock(
      (attention): MultiheadMaskedAttention(
        (W_QKV): Linear(in_features=256, out_features=768, bias=True)
        (W_O): Linear(in_features=256, out_features=256, bias=True)
      )
      (layernorm1): LayerNorm(