In [None]:
import compyute as cp

# Example 5.2

### Language Model: Neural network

The bigram model predicts the next character by looking only at the previous one. Predictions improve when more than one preceding character is taken into account. In this example, a neural network is used that bases each prediction on multiple preceding characters.

### Step 1: Prepare data
Like in the bigram model, the tinyshakespeare dataset is used. (https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt)

In [None]:
# Only the first 100,000 characters of the corpus are used (presumably to keep
# tokenization and training fast). The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open("../data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    # In text mode read(n) returns at most n characters, so the whole file
    # does not have to be loaded before truncating.
    data = f.read(100000)

### Step 2: Tokenization

This time, a Byte-Pair-Encoding tokenizer is used to allow for more information to be passed into the neural net without increasing the context size.

In [None]:
from compyute.preprocessing.text import BPETokenizer

# Instantiate the BPE tokenizer with default arguments; its state is loaded
# from a saved state dict in a later cell.
tokenizer = BPETokenizer()

In [None]:
# train new tokenizer
# tokenizer.fit(data, vocab_size=1024)
# cp.save(tokenizer.get_state_dict(), "small_tokenizer.cp")

In [None]:
# load tokenizer
from pathlib import Path

tokenizer_path = Path("small_tokenizer.cp")

if not tokenizer_path.exists():
    # No saved tokenizer state yet (e.g. on a fresh checkout): fit it once and
    # store the state so that the notebook still runs from top to bottom.
    tokenizer.fit(data, vocab_size=1024)
    cp.save(tokenizer.get_state_dict(), str(tokenizer_path))

# Always go through the saved state so both paths end up with the same tokenizer.
tokenizer_state = cp.load(str(tokenizer_path))
tokenizer.load_state_dict(tokenizer_state)

In [None]:
# Display the vocabulary size of the loaded tokenizer (reused as the model's
# vocab_size in Step 4).
tokenizer.vocab_size

In [None]:
# Encode the raw text with the tokenizer and display how many tokens it yields.
data_enc = tokenizer.encode(data)
len(data_enc)

### Step 3: Build dataset
In this example a larger `block_size` is now used.

In [None]:
# Number of consecutive tokens that form one model input (context length).
block_size = 32

In [None]:
data_enc = cp.tensor(data_enc, dtype=cp.int32)

# Cut the token sequence into non-overlapping blocks. One block fewer than
# would fit is used so that the target index right after each block exists.
n_samples = len(data_enc) // block_size - 1
starts = [i * block_size for i in range(n_samples)]

# X holds block_size tokens per sample, y the single token following each block.
X = cp.stack([data_enc[s : s + block_size] for s in starts])
y = cp.stack([data_enc[s + block_size] for s in starts])

# Shuffle the inputs and apply the same permutation to the targets.
X, idx = cp.random.shuffle(X)
y = y[idx]

# The full dataset is used for training; no validation split is made here.
X_train, y_train = X, y

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")

### Step 4: Build the neural network structure

As our first layer, again, an `Embedding` layer is used. It is followed by a stack of linear layers.

In [None]:
from compyute import nn

vocab_size = tokenizer.vocab_size
embed_dims = 10
n_hidden = 128

weight_init = "kaiming_normal"

# The embedding and output layers are created up front so they stay
# accessible by name after the model is assembled.
emb = nn.Embedding(vocab_size, embed_dims)
lin = nn.Linear(n_hidden, vocab_size)

# Apply the same named initializer (looked up with "tanh" as second argument)
# to the output layer's weights.
init_output_weights = nn.utils.initializers.get_initializer(weight_init, "tanh")
init_output_weights(lin.w)

# Input widths of the five tanh blocks: the flattened embeddings feed the
# first block, every further block maps n_hidden -> n_hidden.
hidden_in_dims = [block_size*embed_dims] + [n_hidden] * 4

model = nn.Sequential(
    emb,
    nn.Flatten(),
    *[
        nn.DenseBlock(in_dim, n_hidden, activation="tanh", weight_init=weight_init)
        for in_dim in hidden_in_dims
    ],
    lin,
)

In [None]:
# Build and print a summary of the model for an int32 input of block_size tokens.
summary = cp.nn.utils.get_module_summary(model, input_shape=(block_size,), input_dtype=cp.int32)
print(summary)

### Step 5: Train the model

To avoid overfitting the model, the `EarlyStopping`-Callback can be used.

In [None]:
epochs = 1
batch_size = 32

# Feed the dataloader from the training tensors. The batch variables below get
# their own names so that the loop does not overwrite the dataset tensor `y`;
# otherwise re-running this cell would pair `X` with a single leftover batch.
dl = nn.utils.Dataloader((X_train, y_train), batch_size=batch_size)
optim = nn.optimizers.Adam(model.get_parameters())
loss_fn = nn.CrossEntropy()

# The analysis cells in Step 6 read values stored on the modules (m.y and
# gradients), so the model is asked to retain them.
model.retain_values = True

for epoch in range(epochs):
    # training
    model.training()
    for x_batch, y_batch in dl():
        # forward pass (the loss value itself is not used here)
        y_pred = model(x_batch)
        loss_fn(y_pred, y_batch)

        # backward pass
        model.compute_grads(loss_fn.compute_grads())  # compute new gradients
        optim.step()  # update parameters

        # Stop after a single batch: only one forward/backward pass is run
        # before the model is analyzed below.
        break
    break

### Step 6: Analyze the model
Using different plots, the model's performance and training behaviour can be analyzed (e.g. checking for overfitting).

In [None]:
# !pip install matplotlib

In [None]:
import matplotlib.pyplot as plt

def plot_distrbution(ys):
    """Print mean/std and draw a density curve for each labelled tensor.

    Args:
        ys: Iterable of ``(label, tensor)`` pairs. Each tensor is histogrammed
            with ``n_hidden`` bins and drawn as one line in a shared figure.
    """
    plt.figure(figsize=(20, 4))
    names = []
    for name, values in ys:
        avg = cp.mean(values).item()
        spread = cp.std(values).item()
        print(f"{name:10s} | mean {avg:9.4f} | std {spread:9.4f}")

        # The last entry of the second return value is dropped before plotting
        # (presumably bin edges, one more than there are counts - TODO confirm).
        density, edges = cp.histogram(values, bins=n_hidden, density=True)
        plt.plot(edges[:-1], density, linewidth=1)
        names.append(f"{name:s}")
    plt.legend(names)

##### Weights

In [None]:
# Collect every parameter with more than one dimension from the modules
# labelled "Linear", tagged with the module's position in the model.
lin_weights = [
    (module.label + str(pos), param.to_cpu())
    for pos, module in enumerate(model.get_modules())
    for param in module.get_parameters(False)
    if param.ndim > 1 and module.label == "Linear"
]
plot_distrbution(lin_weights)

##### Activations

In [None]:
# Collect the `y` value stored on every module labelled "Tanh" (presumably its
# retained output), tagged with the module's position in the model.
tanh_activations = [
    (module.label + str(pos), module.y.to_cpu())
    for pos, module in enumerate(model.get_modules())
    if module.label == "Tanh"
]
plot_distrbution(tanh_activations)

In [None]:
# One boolean mask per "Tanh" module: True where the absolute stored value
# exceeds 0.99.
tanh_saturations = [
    (module.label + str(pos), cp.abs(module.y.to_cpu()) > 0.99)
    for pos, module in enumerate(model.get_modules())
    if module.label == "Tanh"
]

# Show each mask as its own grayscale image, labelled with the module name.
for name, mask in tanh_saturations:
    plt.imshow(mask, cmap="gray")
    plt.xlabel(name)
    plt.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)
    plt.show()

As mentioned, the gradients of saturated neurons get very close to zero. If that happens for all batches, the neuron is not learning and is considered dead (white pixels in the plot).

##### Gradients

In [None]:
# Same selection as for the weights, but collecting each parameter's gradient.
lin_weight_grads = [
    (module.label + str(pos), param.grad.to_cpu())
    for pos, module in enumerate(model.get_modules())
    for param in module.get_parameters(False)
    if param.ndim > 1 and module.label == "Linear"
]
plot_distrbution(lin_weight_grads)

In [None]:
# Collect the gradient attached to the stored `y` value of every "Tanh" module.
activation_gradients = [
    (module.label + str(pos), module.y.grad.to_cpu())
    for pos, module in enumerate(model.get_modules())
    if module.label == "Tanh"
]
plot_distrbution(activation_gradients)