In [None]:
import compyute as cp

In [None]:
# Pick the compute device once: use the GPU when compyute reports one is
# available, otherwise stay on the CPU.
if cp.engine.gpu_available():
    device = "cuda"
else:
    device = "cpu"
device

# Example 5.3

### Language Model: LSTM

A dense neural network is not able to capture the sequential and time-dependent character of text. An alternative is the LSTM, which is able to memorize past tokens.

### Step 1: Prepare data
Again, the tinyshakespeare dataset is used. (https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt)

In [None]:
# Read the whole corpus into memory and lowercase it (this shrinks the
# character vocabulary). The encoding is given explicitly so the result does
# not depend on the platform's default locale encoding.
with open("../data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    data = f.read().lower()

### Step 2: Tokenization

In [None]:
# Character-level vocabulary: every distinct character of the corpus gets an
# integer id, assigned in sorted order.
itos = dict(enumerate(sorted(set(data))))  # id -> character
stoi = {char: idx for idx, char in itos.items()}  # character -> id

In [None]:
# Encode the full corpus as a 1-D tensor of token ids (one id per character).
data_enc = cp.tensor(list(map(stoi.__getitem__, data)))
len(data_enc)

### Step 3: Build dataset

In [None]:
# Context window length: every sample is `block_size` consecutive tokens, and
# the same value is passed to the model as `sequence_length`.
block_size = 256

In [None]:
# Slide a window of `block_size` tokens over the encoded corpus.
# X[i] is the window starting at token i; y[i] is the same window shifted one
# token to the right, i.e. the next-token target for every position of X[i].
# Both are built with `cp.stack` (the original mixed `cp.stack` and
# `cp.tensor` on a list of tensor slices) and cast to int once, instead of
# calling `.int()` on the full tensors separately for each split.
n_windows = len(data_enc) - block_size
X = cp.stack([data_enc[i : i + block_size] for i in range(n_windows)]).int()
y = cp.stack([data_enc[i + 1 : i + 1 + block_size] for i in range(n_windows)]).int()

# Positional split: first 99 % of the windows for training, the rest for validation.
n = int(len(X) * 0.99)

X_train = X[:n]
y_train = y[:n]
X_val = X[n:]
y_val = y[n:]

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_val.shape=}")
print(f"{y_val.shape=}")

### Step 4: Build the neural network structure

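Note: despite the LSTM heading, the model built below is a `Transformer` (6 layers, 6 attention heads per layer) imported from `transformer.transformer`, not a stack of `LSTM` layers followed by a dense layer.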

In [None]:
import compyute.nn as nn
from transformer.transformer import Transformer

embed_dims = 384

# Square (block_size x block_size) matrix filled with -inf, reduced by
# `cp.triu(..., d=1)`. Presumably this leaves -inf strictly above the main
# diagonal so a position cannot attend to later positions - TODO confirm
# against the Transformer implementation.
neg_inf = cp.full(shape=(block_size, block_size), value=float("-inf"))
mask = cp.triu(neg_inf, d=1)

# Model hyperparameters: one output class per vocabulary entry, and a
# feed-forward width of four times the embedding width.
model = Transformer(
    vocab_size=len(itos),
    emb_dim=embed_dims,
    ffd_dim=4 * embed_dims,
    n_heads=6,
    n_layers=6,
    sequence_length=block_size,
    mask=mask,
    dropout=0.2,
)

model.to_device(device)

In [None]:
# Ask the model for a summary, describing one input sample as `block_size`
# int32 token ids, and print it.
summary = model.get_summary(
    input_shape=(block_size,),
    input_dtype="int32",
)
print(summary)

### Step 5: Train the model

In [None]:
batch_size = 16

# Batched loaders over the two splits. `device=device` is passed so batches
# presumably arrive on the same device as the model - TODO confirm.
train_dl = nn.DataLoader(X_train, y_train, batch_size, device=device)
# NOTE(review): val_dl is built but never consumed by the loop below.
val_dl = nn.DataLoader(X_val, y_val, batch_size, device=device)
loss_func = nn.CrossEntropy()
optim = nn.optimizers.AdamW(model.parameters, lr=3e-4)

# Single pass over the training loader, one optimizer update per batch.
# The batch variables are named x_batch / y_batch so the loop no longer
# overwrites the dataset tensor `y`, and `enumerate` replaces the manual
# step counter.
for step, (x_batch, y_batch) in enumerate(train_dl(), start=1):
    # training
    with model.training():
        # forward pass
        y_pred = model(x_batch)
        loss = loss_func(y_pred, y_batch).item()
        print(f"step {step}: {loss=:.4f}")

        # backward pass
        optim.reset_grads()  # reset all gradients
        model.backward(loss_func.backward())  # compute new gradients
        optim.step()  # update parameters

### Step 6: Generate text

In [None]:
# The original cell used a `tokenizer` object that this notebook never
# defines, so it raised NameError on a fresh kernel. It now uses the
# character vocabulary (`stoi` / `itos`) built in the tokenization step.
# The corpus was lowercased when loaded, so the prompt is lowercased too;
# uppercase letters are not in the vocabulary.
prompt = "KING HENRY".lower()
print(prompt, end="")

context = cp.tensor([stoi[char] for char in prompt])  # encode context
context = cp.pad(context, padding=(block_size - len(context), 0))  # fill with zeros to match context window
context = cp.reshape(context, shape=(1, -1)).int()  # batch of one; int ids like the training inputs
# NOTE(review): the return value is discarded, as in the original - confirm
# that Tensor.to_device moves the tensor in place in the compyute version used.
context.to_device(model.device)

for _ in range(300):
    # Only the first element of the softmax result is used; `pred[0, -1]` is
    # presumably the distribution for the last position - TODO confirm shape.
    pred, _ = cp.nn.functional.softmax(model(context))
    index = cp.random.multinomial(x=len(itos), p=pred[0, -1], shape=(1,))
    char = itos[int(index.item())]  # decode the sampled token id
    print(char, end="")
    # Append the sampled token and drop the oldest one so the context keeps
    # its block_size length.
    context = cp.append(context, values=cp.reshape(index, shape=(1, 1)), axis=1).int()
    context = context[:, 1:]