In [None]:
import compyute as cp

In [None]:
# Run on the GPU when compyute reports one as available; otherwise use the CPU.
if cp.backend.gpu_available():
    device = cp.cuda
else:
    device = cp.cpu

device

# Example 5.3

### Language Model: LSTM

A dense neural network cannot capture the sequential, order-dependent nature of text. An alternative is the LSTM (long short-term memory) network, a recurrent architecture that carries information about previous tokens forward through the sequence.

### Step 1: Prepare data
Again, the [tinyshakespeare dataset](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) is used. The cell below expects it to be saved as `../data/tinyshakespeare.txt`.

In [None]:
# Read the whole corpus into memory as a single string.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale encoding.
with open("../data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    data = f.read()

### Step 2: Tokenization

In [None]:
from compyute.preprocessing.text import BPETokenizer

# Create the tokenizer instance; the following cells either fit it on the
# text or load a previously saved state into it.
tokenizer = BPETokenizer()

In [None]:
# train new tokenizer
# Optional: uncomment the two lines below to fit the tokenizer on `data`
# and save its state to "tokenizer.cp".
# tokenizer.fit(data, vocab_size=1024)
# cp.save(tokenizer.get_state_dict(), "tokenizer.cp")

In [None]:
# load tokenizer
tokenizer_state = cp.load("tokenizer.cp")
tokenizer.load_state_dict(tokenizer_state)

In [None]:
# Display the tokenizer's vocabulary size; the same value is later passed to
# the Embedding layer and to the final Linear layer of the model.
tokenizer.vocab_size

In [None]:
# Encode the full text with the tokenizer and display the length of the result.
data_enc = tokenizer.encode(data)
len(data_enc)

### Step 3: Build dataset

In [None]:
# Length (in tokens) of each training sequence; also the size of the context
# window that is kept while generating text.
block_size = 16

In [None]:
# Convert the encoded text to an int32 tensor and cut it into consecutive,
# non-overlapping windows of `block_size` tokens. The targets are the same
# windows shifted one token to the right.
data_enc = cp.tensor(data_enc, dtype=cp.int32)

# One block fewer than fits into the data, so the shifted target slice of the
# last used block stays in range.
n_blocks = len(data_enc) // block_size - 1
block_starts = [b * block_size for b in range(n_blocks)]

X = cp.stack([data_enc[s : s + block_size] for s in block_starts])
y = cp.stack([data_enc[s + 1 : s + block_size + 1] for s in block_starts])

# Shuffle the inputs and reorder the targets with the returned indices.
X, idx = cp.random.shuffle(X)
y = y[idx]

# 90 % of the sequences are used for training, the rest for validation.
n = int(len(X) * 0.90)

X_train, y_train = X.to_int()[:n], y.to_int()[:n]
X_val, y_val = X.to_int()[n:], y.to_int()[n:]

print(
    f"{X_train.shape=}",
    f"{y_train.shape=}",
    f"{X_val.shape=}",
    f"{y_val.shape=}",
    sep="\n",
)

### Step 4: Build the neural network structure

Now, two `LSTM` layers (each followed by dropout) are stacked on top of an embedding layer, followed by a dense output layer.

In [None]:
from compyute import nn

embed_dims = 64
hidden_dims = 128
dropout_p = 0.3

# Embedding -> 2 x (LSTM + Dropout) -> Linear layer with one output per vocabulary entry.
layers = [
    nn.Embedding(tokenizer.vocab_size, embed_dims),
    nn.LSTM(embed_dims, hidden_dims),
    nn.Dropout(dropout_p),
    nn.LSTM(hidden_dims, hidden_dims),
    nn.Dropout(dropout_p),
    nn.Linear(hidden_dims, tokenizer.vocab_size),
]
model = nn.Sequential(*layers)

# Move the model to the device selected at the top of the notebook.
model.to_device(device)

In [None]:
# Build and print a summary of the model for inputs of shape (block_size,)
# with dtype int32.
summary = cp.nn.utils.get_module_summary(model, input_shape=(block_size,), input_dtype=cp.int32)
print(summary)

### Step 5: Train the model

In [None]:
epochs = 50  # number of epochs passed to trainer.train
batch_size = 4096  # batch size passed to trainer.train; also used to compute steps_per_epoch

In [None]:
import os

from compyute.nn.trainer import Trainer
from compyute.nn.trainer.callbacks import Tensorboard
from compyute.nn.trainer.callbacks.lr_schedulers import CosineLrScheduler


# Directory for the Tensorboard logs; exist_ok makes the call safe to re-run.
logdir = "./runs/lstm"
os.makedirs(logdir, exist_ok=True)

tb = Tensorboard(logdir)
optim = nn.optimizers.AdamW(lr=1e-3)

# Number of batches per epoch, rounded up so that a partial last batch counts
# as a step. (`// batch_size + 1` counted one step too many whenever the number
# of samples was an exact multiple of batch_size.)
# NOTE(review): assumes the trainer keeps the partial last batch - confirm.
steps_per_epoch = (X_train.shape[0] + batch_size - 1) // batch_size

# Derive the schedule from `epochs` instead of hard-coding 25 warmup and 225
# decay epochs, which did not match the configured number of epochs.
# The 10 % / 90 % ratio is kept: epochs=250 yields exactly 25 and 225.
warmup_epochs = max(1, epochs // 10)  # warmup over first few epochs
decay_epochs = max(1, epochs - warmup_epochs)  # decay over following epochs
warmup_steps = warmup_epochs * steps_per_epoch
decay_steps = decay_epochs * steps_per_epoch
lr_scheduler = CosineLrScheduler(optimizer=optim, target_lr=1e-4, warmup_steps=warmup_steps, decay_steps=decay_steps)

# Trainer with cross-entropy loss and accuracy metric; the callbacks log to
# Tensorboard and update the learning rate.
trainer = Trainer(
    model=model,
    optimizer=optim,
    loss="cross_entropy",
    metric="accuracy",
    callbacks=[tb, lr_scheduler]
)

In [None]:
# Train on the training split, passing the validation split as val_data.
trainer.train(X_train, y_train, epochs=epochs, batch_size=batch_size, val_data=(X_val, y_val))

### Step 6: Generate text

In [None]:
prompt = "KING HENRY"
n_new_tokens = 300
print(prompt, end="")

# Encode the prompt and shape it as a batch containing one sequence,
# placed on the same device as the model.
prompt_ids = tokenizer.encode(prompt)
context = cp.tensor(prompt_ids)
context = cp.reshape(context, shape=(1, -1))
context = context.to_device(model.device)

for _ in range(n_new_tokens):
    # Turn the model output into probabilities and sample the next token
    # from the distribution at the last sequence position.
    logits = model(context)
    pred = cp.nn.functional.softmax(logits)
    next_probs = pred[0, -1]
    index = cp.random.multinomial(x=tokenizer.vocab_size, p=next_probs, shape=(1,))

    # Decode and print the sampled token immediately.
    char = tokenizer.decode(index.to_list())
    print(char, end="")

    # Append the sampled token and keep only the most recent `block_size` tokens.
    index_2d = cp.reshape(index, shape=(1, 1))
    context = cp.append(context, values=index_2d, axis=1).to_int()
    context = context[:, -block_size:]