In [1]:
import sys
sys.path.append("..") # for sibling import

import compyute as cp

Compyute: available devices ['cpu', 'cuda']


In [2]:
# Run on the GPU when compyute reports one as available, otherwise on the CPU.
if cp.engine.gpu_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

# Example 5.3

### Language Model: LSTM

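A dense neural network cannot capture the sequential, order-dependent character of text. An alternative is the LSTM (long short-term memory) network, a recurrent architecture that carries a state from one token to the next and can therefore retain information about earlier tokens.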

### Step 1: Prepare data
Again, the [tinyshakespeare dataset](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) is used.

In [3]:
# Read the whole corpus into memory as one string.
# The encoding is set explicitly so decoding does not depend on the platform's
# default locale (e.g. cp1252 on Windows).
with open("../data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    data = f.read()

### Step 2: Tokenization

In [4]:
from pathlib import Path

from compyute.preprocessing.text import BPETokenizer, save_tokenizer, load_tokenizer

tokenizer_path = "nn_tokenizer.cp"

# Fitting the BPE tokenizer is expensive, so the fitted tokenizer is cached on disk.
# On a fresh checkout (no cache file yet) it is fitted on the corpus and saved;
# afterwards the cell just loads the saved tokenizer.
if Path(tokenizer_path).exists():
    tokenizer = load_tokenizer(tokenizer_path)
else:
    tokenizer = BPETokenizer()
    tokenizer.fit(data, vocab_size=400)
    save_tokenizer(tokenizer, tokenizer_path)

tokenizer.vocab_size

400

In [5]:
# Encode the full corpus into a sequence of token ids and show how many tokens it has.
data_enc = tokenizer.encode(data)
len(data_enc)

617172

### Step 3: Build dataset

In [6]:
# Number of tokens per training window (the sequence length the model sees).
block_size = 8

In [7]:
# Number of window start positions that leave room for a full input window
# and for its target window, which is shifted right by one token.
n_windows = len(data_enc) - block_size

# Inputs are sliding windows of `block_size` tokens. Targets are the same windows
# shifted by one token, so position t of a target row is the token that follows
# position t of the matching input row.
# Both are built the same way (cp.stack) and converted to integers once.
X = cp.stack([data_enc[i : i + block_size] for i in range(n_windows)]).int()
y = cp.stack([data_enc[i + 1 : i + 1 + block_size] for i in range(n_windows)]).int()

# Ordered split without shuffling: first 90% for training, last 10% for validation.
n = int(len(X) * 0.9)

X_train, X_val = X[:n], X[n:]
y_train, y_val = y[:n], y[n:]

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_val.shape=}")
print(f"{y_val.shape=}")

X_train.shape=(555447, 8)
y_train.shape=(555447, 8)
X_val.shape=(61717, 8)
y_val.shape=(61717, 8)


### Step 4: Build the neural network structure

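Now, an `Embedding` layer feeds an `LSTM` layer, which is followed by a dense (`Linear`) layer that maps each position to scores over the vocabulary. A `Layernorm` is applied before the `LSTM` layer and before the dense layer.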

In [8]:
import compyute.nn as nn

vocab_size = tokenizer.vocab_size
embed_dims = 32
hidden_dims = 256  # LSTM output width, also the input width of the final Linear layer

# Token ids -> embeddings -> LSTM -> per-position scores over the vocabulary.
# Both Layernorm layers are given a shape that includes `block_size`, so the
# model is built for inputs of exactly `block_size` tokens.
layers = [
    nn.Embedding(vocab_size, embed_dims),
    nn.Layernorm((block_size, embed_dims)),
    nn.LSTM(embed_dims, hidden_dims),
    nn.Layernorm((block_size, hidden_dims)),
    nn.Linear(hidden_dims, vocab_size),
]
model = nn.Sequential(*layers)

model.to_device(device)

In [9]:
# Print the per-layer output shapes and parameter counts for one input window of int32 token ids.
model.summary(input_shape=(block_size,), input_dtype="int32")

Sequential
---------------------------------------------------------------
Layer                     Output Shape            # Parameters
Sequential                (-1, 8, 400)                  417168
 Embedding                (-1, 8, 32)                    12800
 Layernorm                (-1, 8, 32)                      512
 LSTM                     (-1, 8, 256)                  296960
 Layernorm                (-1, 8, 256)                    4096
 Linear                   (-1, 8, 400)                  102800

Total parameters: 417168


### Step 5: Train the model

In [10]:
from compyute.nn.trainer import optimizers, Trainer
from compyute.nn.trainer.callbacks import AdaptiveLR, EarlyStopping, History, ProgressBar

# Kept as a named object because the plotting cell reads the recorded traces from it.
history = History()

# Callbacks passed to the trainer. EarlyStopping and AdaptiveLR both watch "val_loss";
# the exact semantics of `patience` / `epoch_range` are defined by compyute
# (NOTE(review): confirm against the library docs).
callbacks = [
    history,
    EarlyStopping(target="val_loss", patience=5),
    AdaptiveLR(target="val_loss", epoch_range=3),
    ProgressBar(),
]

optimizer = optimizers.Adam(lr=1e-3)

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    loss="cross_entropy",
    metric="accuracy",
    callbacks=callbacks,
)

In [11]:
# Training budget: upper bound on epochs and the number of windows per batch.
epochs = 50
batch_size = 4096

# Fit on the training split and evaluate on the validation split.
trainer.train(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    val_data=(X_val, y_val),
)

Epoch 1/50:   0%|          | 0/136 [00:00<?, ? steps/s]

Epoch 2/50:   0%|          | 0/136 [00:00<?, ? steps/s]

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

def plot_history(t1, t2):
    """Plot two traces recorded by the notebook-level `history` object in one figure.

    Args:
        t1: Key of the first trace in `history` (also used as its legend label).
        t2: Key of the second trace in `history` (also used as its legend label).

    Both traces are drawn on the same axes, with x-values counting from 1.
    """
    # Look up both traces first so an unknown key fails before a figure is created.
    traces = [history[key] for key in (t1, t2)]

    plt.figure(figsize=(10, 3))
    for values in traces:
        positions = range(1, len(values) + 1)
        plt.plot(positions, values, linewidth=1)
    plt.legend([t1, t2])
    plt.grid(color="gray", linestyle="--", linewidth=0.5)

# Show validation loss and validation accuracy together; both curves share one y-axis.
plot_history("val_loss", "val_accuracy_score")

### Step 6: Generate text

In [None]:
prompt = "You shall "
print(prompt, end="")

# Keep at most the last `block_size` prompt tokens. The model is built for windows
# of exactly `block_size` tokens, and a longer prompt would otherwise produce a
# negative pad width below.
tokens = tokenizer.encode(prompt)[-block_size:]

# Left-pad shorter prompts up to `block_size` and add a leading batch dimension of 1.
context = tokens.pad((block_size - len(tokens), 0)).reshape((1, -1))
# NOTE(review): this assumes Tensor.to_device moves the tensor in place, like
# model.to_device above — confirm for the installed compyute version.
context.to_device(model.device)

for _ in range(250):
    # Scores for every position of the window; only the last position is used
    # to predict the next token.
    scores = model(context).squeeze()
    # Greedy decoding: always pick the highest-scoring token.
    index = scores[-1].argmax(-1)
    print(tokenizer.decode([index.item()]), end="")
    # Slide the window: append the new token, then drop the oldest one.
    context = context.append(index.reshape((1, 1)), axis=1).int()
    context = context[:, 1:]