In [None]:
import sys
sys.path.append("..") # for sibling import

import walnut
import walnut.tensor_utils as tu

# Example 5.1

### Language Model: Bigram model

The goal of this model is to be able to generate text that is similar to the training data using a single character to predict the next one.

### Step 1: Prepare data
The dataset can be downloaded from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt. Afterwards, it needs to be placed into the */data* directory and saved as *tinyshakespeare.txt*, which is the file name the next cell opens.

In [None]:
# Read the whole corpus into memory as a single string. The encoding is passed
# explicitly so the decoded text does not depend on the platform's default locale.
with open("../data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    data = f.read()

### Step 2: Tokenization
To train a neural network on text, it needs to be represented by numerical values. For this reason a tokenizer is used. To build a vocabulary of tokens, here the `CharacterTokenizer` is used. In this step `fit()` is used to extract tokens from the previously imported data. Every character is assigned an integer token.

In [None]:
from walnut.preprocessing.text import CharacterTokenizer

# Create the tokenizer and let it extract its vocabulary from the raw corpus text.
tknzr = CharacterTokenizer()
tknzr.fit(data)
# Last expression of the cell, so the vocabulary size is displayed as the cell output.
tknzr.vocab_size

Now that the tokenizer has built a vocabulary, it can be used to encode and decode data. Here, the string "Hello World!" is encoded and afterwards represented by the respective tokens.

In [None]:
# Encode a short sample string with the fitted tokenizer.
string = "Hello World!"
encoded_string = tknzr.encode(string)
# Last expression of the cell, so the encoded tokens are displayed.
encoded_string

A tensor of tokens can also be decoded again to get the original string.

In [None]:
# Round trip: turn the tokens from the previous cell back into text.
tknzr.decode(encoded_string)

To use the data to train a model, it needs to be encoded.

In [None]:
# Encode the full corpus; the dataset below is sliced from data_enc.
data_enc = tknzr.encode(data)
# Display only the first 20 tokens instead of dumping the whole corpus.
data_enc[:20]

### Step 3: Build dataset
Next up, we need to build the training dataset. `num_samples` represents the number of samples for the neural network. `block_size` defines how many characters are considered when trying to predict the following one. Since this is a bigram model, only one character is considered.

In [None]:
# Dataset hyperparameters
num_samples = 100_000  # number of (input, target) pairs drawn from the corpus
block_size = 1  # context length in characters; 1 makes this a bigram model

To create a dataset, samples are taken from the data by randomly selecting a character as an input and the following character as the target.

In [None]:
import numpy as np

# Pre-allocate the input and target tensors; they are filled row by row below.
X = walnut.zeros((num_samples, block_size))
Y = walnut.zeros((num_samples, block_size))

# Randomly choose start indices of blocks in the original data. The upper bound
# of np.random.randint is exclusive, so `len(data) - block_size` makes the last
# start index whose shifted target slice still fits into the data eligible.
# NOTE(review): assumes data_enc holds exactly one token per character of
# `data` (as the original bound did) - confirm against the tokenizer.
rand_indices = np.random.randint(0, len(data) - block_size, (num_samples,))

for i, index in enumerate(rand_indices):
    # the context is a block of tokens, the label is the same block shifted by one
    context = data_enc[index : index + block_size]
    label = data_enc[index + 1 : index + block_size + 1]

    # store the raw token ids in the tensors (no one-hot encoding happens here)
    X[i] = context
    Y[i] = label

# cast the filled tensors to integers so they can be used as token ids
X = X.astype("int")
Y = Y.astype("int")

In [None]:
# Split position: the first 99% of the samples are used for training,
# the remainder is held out for testing.
n1 = int(0.99 * X.len)

x_train, x_test = X[:n1], X[n1:]
y_train, y_test = Y[:n1], Y[n1:]

# Show the shapes of all four splits as a quick sanity check.
print(f"{x_train.shape=}")
print(f"{y_train.shape=}")
print(f"{x_test.shape=}")
print(f"{y_test.shape=}")

### Step 4: Build the neural network structure

An `Embedding` layer is used to assign each token an n-dimensional vector. The vector's components are then learned and updated during the training process.

In [None]:
import walnut.nn as nn
# Import only the layer that is used, instead of `*`, so it stays visible
# where the name comes from and the notebook namespace is not polluted.
from walnut.nn.layers import Embedding

vocab_size = tknzr.vocab_size

# The whole model is a single embedding layer.
# NOTE(review): the two arguments are presumably (number of tokens, embedding
# dimension); with both set to vocab_size each token's vector would have one
# entry per possible next token - confirm against the Embedding signature.
model = nn.Sequential([
    Embedding(vocab_size, vocab_size)
])

In [None]:
# Assemble the training components first, then hand them to the model.
optimizer = nn.optimizers.AdamW(3e-4)  # presumably the learning rate - TODO confirm
loss_fn = nn.losses.Crossentropy()

model.compile(optimizer=optimizer, loss_fn=loss_fn, metric=nn.metrics.get_accuracy)

In [None]:
from walnut.nn.analysis import model_summary
# Summarize the model for an input of shape (block_size,) with "int" values.
model_summary(model, (block_size,), "int")

### Step 5: Train the model

Since there are usually quite a large number of classes (=tokens) in language models, the training process can be slower.

In [None]:
# Training hyperparameters
epochs = 100
batch_size = 32

# Train on the training split; the call returns two loss histories.
train_loss_hist, val_loss_hist = model.train(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
)

### Step 6: Evaluate the model

In [None]:
# Evaluate on the held-out test split; only the first returned value (the loss) is used.
loss, _ = model.evaluate(x_test, y_test)
print(f'loss {loss:.4f}')

### Step 7: Generate text
To see the model in action, it is given a starting character (here token 1). Then it is used to generate $n$ characters, using the previously generated characters as input. The better the model is trained, the more sensible the output will be. This can take quite a lot of training though.

In [None]:
# NOTE(review): the module name is spelled `funcional`; it is kept as is -
# confirm that it matches the installed walnut package.
from walnut.nn.funcional import softmax

n = 1000  # number of characters to generate

# use 1 as starting context
context = walnut.ones((1, block_size,)).astype("int")

for _ in range(n):
    # predict the following character using the current context
    prediction = model(context).squeeze()

    # choose a character from the prediction
    next_token = walnut.random_choice_indices(softmax(prediction))
    print(tknzr.decode(walnut.expand_dims(next_token, 0)), end="")

    # append the chosen character to the context, then drop the oldest entry
    # so that the context keeps its length
    extended = context.append(tu.expand_dims(next_token, 0), axis=1).astype("int")
    context = extended[:, 1:]