In [None]:
import compyute as cp

In [None]:
# Run on the GPU when the engine reports one, otherwise fall back to the CPU.
if cp.engine.gpu_available():
    device = "cuda"
else:
    device = "cpu"
device

In [None]:
# Switch on the engine's CUDA TF32 option.
# NOTE(review): presumably this trades a little float32 precision for faster GPU
# matrix multiplications — confirm against the compyute documentation.
cp.engine.set_cuda_tf32(True)

# Example 5.3

### Language Model: Transformer

A dense neural network is not able to capture the sequential, order-dependent character of text. An alternative is the transformer, which uses masked self-attention so that every token can take the tokens preceding it into account.

### Step 1: Prepare data
Again, the tinyshakespeare dataset is used. (https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt)

In [None]:
# Read the whole corpus into memory as a single string. The encoding is given
# explicitly because open() otherwise uses a platform-dependent default, which
# would make the decoded text (and everything derived from it) machine-specific.
with open("../data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    data = f.read()

### Step 2: Tokenization

In [None]:
import os

from compyute.preprocessing.text import BPETokenizer, save_tokenizer, load_tokenizer

tokenizer_path = "nn_tokenizer.cp"

# Reuse the cached tokenizer when it exists. Otherwise fit a new BPE tokenizer on
# the corpus and cache it, so the notebook also runs from a clean checkout
# instead of failing on the missing file.
if os.path.exists(tokenizer_path):
    tokenizer = load_tokenizer(tokenizer_path)
else:
    tokenizer = BPETokenizer()
    tokenizer.fit(data, vocab_size=512)
    save_tokenizer(tokenizer, tokenizer_path)

tokenizer.vocab_size

In [None]:
# Encode the full corpus with the tokenizer; the cell output is the length of
# the encoded sequence.
data_enc = tokenizer.encode(data)
len(data_enc)

### Step 3: Build dataset

In [None]:
# Number of tokens per training sequence. The same value sizes the attention
# mask and is passed to the model as `sequence_length`.
block_size = 128

In [None]:
# Cut the token stream into non-overlapping windows of `block_size` tokens. The
# targets are the same windows shifted one token to the right, so one extra token
# must exist after the last input window; hence `len(data_enc) - 1`. Without it, a
# corpus whose length is an exact multiple of `block_size` yields a final target
# window that is one token short.
n_blocks = (len(data_enc) - 1) // block_size
X = cp.stack([data_enc[i * block_size : (i + 1) * block_size] for i in range(n_blocks)])
y = cp.stack([data_enc[i * block_size + 1 : (i + 1) * block_size + 1] for i in range(n_blocks)])

# Shuffle the inputs and reorder the targets with the returned indices so that
# every input window stays paired with its own target window.
X, idx = cp.random.shuffle(X)
y = y[idx]

# 90 % of the windows are used for training, the remainder for validation.
n = int(len(X) * 0.90)

# Convert once, then slice both splits from the converted tensors.
X_int, y_int = X.to_int(), y.to_int()
X_train, X_val = X_int[:n], X_int[n:]
y_train, y_val = y_int[:n], y_int[n:]

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_val.shape=}")
print(f"{y_val.shape=}")

### Step 4: Build the neural network structure

In [None]:
import compyute.nn as nn
from transformer import Transformer

In [None]:
embed_dims = 384

# Square (block_size x block_size) mask filled with -inf, reduced by `triu(d=1)`.
# NOTE(review): presumably this keeps -inf strictly above the diagonal and zeros
# elsewhere, i.e. a causal mask that hides future tokens — confirm how
# `Transformer` applies it.
neg_inf = cp.full(shape=(block_size, block_size), value=float("-inf"))
mask = cp.triu(neg_inf, d=1)

# Collect the hyperparameters in one place, then build the model from them.
model_config = dict(
    n_embeddings=tokenizer.vocab_size,
    embedding_dim=embed_dims,
    feedforward_channels=4 * embed_dims,
    n_heads=6,
    n_layers=6,
    sequence_length=block_size,
    mask=mask,
    activation="gelu",
)
model = Transformer(**model_config)

model.to_device(device)

In [None]:
# Ask the model for a summary given an int32 input of shape (block_size,) and
# print it.
summary = model.get_summary(input_shape=(block_size,), input_dtype=cp.int32)
print(summary)

### Step 5: Train the model

In [None]:
# Optional: uncomment to replace the freshly built model with a module that was
# previously saved to disk, and move it to the selected device.
# model = cp.nn.load_module(f"transformer4_2000.cp")
# model.to_device(device)

In [None]:
# --- training hyperparameters -------------------------------------------------
batch_size = 64
max_iter = 5000  # upper bound on the step counter used by the training loop
val_interval = 200  # validation loss is computed every `val_interval` steps
checkpoint_interal = 1000  # (sic) the training cell reads this exact name
step = 1  # step counter; defined here so the training cell does not reset it

# --- data pipelines, loss and optimizer ---------------------------------------
train_dl = nn.Dataloader(X_train, y_train, batch_size, device=device)
val_dl = nn.Dataloader(X_val, y_val, batch_size, device=device)
loss_func = nn.CrossEntropy()
optim = nn.optimizers.AdamW(model.parameters, lr=6e-4)

In [None]:
from datetime import datetime
import os
import time

from compyute.utils.tensorboard import SummaryWriter

# Create a fresh TensorBoard log directory for this run.
label = "transformer5"
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logdir = f"./runs/{label}_{timestamp}/"
os.makedirs(logdir, exist_ok=True)

with SummaryWriter(log_dir=logdir) as writer:
    # Each pass over `train_dl` is one epoch; epochs repeat until `max_iter`
    # optimisation steps have been performed.
    while step <= max_iter:
        # The batch names are deliberately distinct from the notebook-level
        # `y` / `y_val` tensors so the loop does not overwrite the dataset.
        for x_batch, y_batch in train_dl():
            # Checked before doing any work so that exactly `max_iter` steps
            # run, no matter where in an epoch the limit is reached.
            if step > max_iter:
                break

            start = time.time()

            # training
            with model.training():
                # forward pass
                y_pred = model(x_batch)
                loss = loss_func(y_pred, y_batch).item()
                writer.add_scalar("train/loss", loss, step)

                # backward pass
                optim.reset_grads()  # reset all gradients
                model.backward(loss_func.backward())  # compute new gradients
                optim.step()  # update parameters

            # duration of one training step in milliseconds
            dt = (time.time() - start) * 1000
            writer.add_scalar("train/dt", dt, step)

            # validation: mean loss over all validation batches, computed
            # outside the `model.training()` context
            if step % val_interval == 0:
                val_loss = 0
                for x_val_batch, y_val_batch in val_dl():
                    y_pred = model(x_val_batch)
                    val_loss += loss_func(y_pred, y_val_batch).item()
                val_loss /= len(val_dl)
                writer.add_scalar("val/loss", val_loss, step)

            # save checkpoints
            if step > 1 and step % checkpoint_interal == 0:
                cp.nn.save_module(model, f"{label}_{step}.cp")

            step += 1

### Step 6: Generate text

In [None]:
prompt = "First citizen:"
n_new_tokens = 300  # number of tokens to generate after the prompt
top_k = 50  # sample only among the `top_k` most probable next tokens

print(prompt, end="")

# Encode the prompt and reshape it to (1, n_tokens) before moving it to the
# model's device.
context = tokenizer.encode(prompt)
context = cp.reshape(context, shape=(1, -1)).to_device(model.device)

for _ in range(n_new_tokens):
    logits = model(context)[0, -1].to_cpu()  # logits at the last position
    probs, _ = cp.nn.functional.softmax(logits)  # second return value is unused here
    topk_probs, topk_indices = cp.topk(probs, top_k)  # keep the top_k candidates
    topk_probs /= cp.sum(topk_probs)  # renormalise so they sum to one
    # `top_k` is shared with the `topk` call above so the sampled position always
    # refers to an entry of `topk_indices`.
    index = cp.random.multinomial(x=top_k, p=topk_probs, shape=(1,))  # sample
    index = topk_indices[index]  # map the sampled position to its token id
    char = tokenizer.decode(index)
    print(char, end="")
    context = cp.append(context, values=cp.reshape(index, shape=(1, 1)), axis=1).to_int()  # append to context
    # Keep only the most recent `block_size` tokens and stay on the model's device.
    context = context[:, -block_size:].to_device(model.device)