In [None]:
import sys
sys.path.append("..") # for sibling import

import walnut
import walnut.tensor_utils as tu

# Example 5.3

### Language Model: Recurrent Neural network

The neural network is able to predict the next character by looking at multiple previous ones. It does not, however, take their position and context into account. The next step is therefore to use a recurrent neural network, which allows the individual sequence elements to communicate.

### Step 1: Prepare data
Like in the bigram model, the tinyshakespeare dataset is used. (https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt)

In [None]:
# Read the whole corpus into memory as one string.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (e.g. cp1252 on Windows).
with open("../data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    data = f.read()

### Step 2: Tokenization

In [None]:
from walnut.preprocessing.text import CharacterTokenizer

# Create walnut's CharacterTokenizer and fit it on the full text.
# NOTE(review): presumably `fit` builds the vocabulary from `data` — confirm
# against the tokenizer implementation.
tknzr = CharacterTokenizer()
tknzr.fit(data)
tknzr.vocab_size  # last expression of the cell, shown as its output

In [None]:
# Encode the complete text with the fitted tokenizer.
data_enc = tknzr.encode(data)
data_enc[:100]  # preview the first 100 encoded elements as the cell output

### Step 3: Build dataset
In this example a larger `block_size` is now used.

In [None]:
# Size of the sampled dataset: how many windows are drawn from the encoded
# text and how many tokens each window contains.
num_samples = 100_000
block_size = 8

In [None]:
import numpy as np

# Pre-allocate one row per sample; every row holds `block_size` token ids.
X = walnut.zeros((num_samples, block_size))
Y = walnut.zeros((num_samples, block_size))

# Draw a random window start for every sample.
# NOTE(review): no seed is set in this cell, so the sampled windows differ
# between runs.
rand_indices = np.random.randint(0, len(data) - block_size - 1, (num_samples,))

for row, start in enumerate(rand_indices):
    stop = start + block_size
    # Inputs are the window itself; targets are the same window shifted
    # one position to the right.
    X[row] = data_enc[start:stop]
    Y[row] = data_enc[start + 1 : stop + 1]

# Cast both tensors to integer type after filling.
X = X.astype("int")
Y = Y.astype("int")

In [None]:
# Keep the first 99% of the samples for training and the rest for testing.
train_fraction = 0.99
n1 = int(train_fraction * X.len)

x_train, x_test = X[:n1], X[n1:]
y_train, y_test = Y[:n1], Y[n1:]

# Report the resulting shapes.
print(f"{x_train.shape=}")
print(f"{y_train.shape=}")
print(f"{x_test.shape=}")
print(f"{y_test.shape=}")

### Step 4: Build the neural network structure

As our first layer, again, an `Embedding` layer is used. It is followed by a stack of recurrent layers.

In [None]:
import walnut.nn as nn
from walnut.nn.layers import *
from walnut.nn.blocks import *

# Hyperparameters of the network.
vocab_size = tknzr.vocab_size  # taken from the fitted tokenizer
embed_dims = 10
n_hidden = 256
num_rec_layers = 3

# Sequential stack: embedding -> recurrent block -> linear layer.
# NOTE(review): Embedding, Recurrent and Linear are pulled in by the wildcard
# imports above, so which module defines each of them is not visible here.
# The positional arguments are presumably (input size, output size) — confirm.
model = nn.Sequential([
    Embedding(vocab_size, embed_dims),
    Recurrent(embed_dims, n_hidden, num_layers=num_rec_layers),
    Linear(n_hidden, vocab_size)
])

In [None]:
# Attach optimizer, loss function and metric to the model.
# NOTE(review): 3e-4 is presumably the learning rate — confirm against the
# AdamW signature.
model.compile(
    optimizer=nn.optimizers.AdamW(3e-4),
    loss_fn=nn.losses.Crossentropy(),
    metric=nn.metrics.get_accuracy
)

In [None]:
from walnut.nn.analysis import model_summary
# Print a summary of the model.
# NOTE(review): the extra arguments are presumably the shape of one input
# sample and its dtype — confirm against model_summary's signature.
model_summary(model, (block_size,), "int")

### Step 5: Train the model

In [None]:
# Training configuration.
epochs = 2
batch_size = 32

# Train on the training split and keep the two histories returned by `train`.
# NOTE(review): no validation data is passed here, so what `val_loss_hist`
# contains after this call should be checked against `model.train`.
train_loss_hist, val_loss_hist = model.train(x_train, y_train, epochs=epochs, batch_size=batch_size)

### Step 6: Evaluate the model

In [None]:
# Evaluate on the held-out test split; only the first returned value (the
# loss) is used, the second one is discarded.
loss, _ = model.evaluate(x_test, y_test)
print(f'loss {loss:.4f}')

# Reference result: loss 1.9249
# (2 epochs, block size 8, batch size 32, 10 embedding dims, 3 recurrent layers, n_hidden 256)

### Step 7: Generate text

In [None]:
from walnut.nn.funcional import softmax

# Start from a (1, block_size) context in which every token id is 1.
context = walnut.ones((1, block_size,)).astype("int")

for _ in range(1000):
    output = model(context)
    # Pick the next token from the softmax over the output at the last position.
    index = walnut.choice(softmax(output[:, -1]))
    print(tknzr.decode(walnut.expand_dims(index, 0)), end="")
    # Append the new token along axis 1, cast back to int and drop the oldest
    # token so the context keeps its length.
    context = context.append(tu.expand_dims(index, 0), axis=1).astype("int")[:, 1:]