In [None]:
import sys
sys.path.append("..") # for sibling import

import compyute as cp

# Example 5.1

### Language Model: Bigram model

The goal of this model is to be able to generate text that is similar to the training data using a single character to predict the next one.

### Step 1: Prepare data
The dataset can be downloaded from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt. Afterwards, it needs to be saved as *tinyshakespeare.txt* in the */data* directory, which is where the next cell reads it from.

In [None]:
# Read the whole corpus into memory as a single string. The encoding is given
# explicitly so the result does not depend on the platform's default locale.
with open("../data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    data = f.read()

### Step 2: Tokenization
To train a neural network on text, it needs to be represented by numerical values. For this reason a tokenizer is used. To build a vocabulary of tokens, here the `CharacterTokenizer` is used.

In [None]:
from compyute.preprocessing.text import CharacterTokenizer

# instantiate the tokenizer with its default arguments
tknzr = CharacterTokenizer()
# last expression of the cell: display the vocabulary size
tknzr.vocab_size

Every character is assigned an integer token.

In [None]:
# display the tokenizer's vocabulary as the cell output
tknzr.vocab

Tokenizers can be used to encode text and decode token ids. Here, the string "Hello World!" is encoded and afterwards represented by the respective token ids.

In [None]:
string = "Hello World!"
# encode the example text with the tokenizer
encoded_string = tknzr.encode(string)
# last expression of the cell: display the encoded result
encoded_string

A tensor of token ids can also be decoded again to get the original text.

In [None]:
# decode every entry of the encoded string on its own to show the per-token mapping
[tknzr.decode([i]) for i in encoded_string]

In [None]:
# decode the whole encoded sequence in a single call
tknzr.decode(encoded_string)

To use the data to train a model, it needs to be encoded.

In [None]:
# encode the full training text
data_enc = tknzr.encode(data)
# display only the first 20 entries instead of the entire sequence
data_enc[:20]

### Step 3: Build dataset
Next up, we need to build the training dataset. `num_samples` represents the number of samples for the neural network. `block_size` defines how many characters are considered when trying to predict the following one. Since this is a bigram model, only one character is considered.

In [None]:
# number of (input, target) pairs drawn from the data
num_samples = 100_000
# number of tokens used as context; a bigram model only looks at one
block_size = 1

To create a dataset, samples are taken from the data by randomly selecting a character as an input and the following character as the target.

In [None]:
# Pre-allocate the sample tensors; every row is filled in by the loop below.
X = cp.zeros((num_samples, block_size))
y = cp.zeros((num_samples, block_size))

# Randomly choose start positions of blocks in the encoded data. The bound is
# derived from the token sequence that is actually indexed below (data_enc)
# rather than from the raw text, and leaves room for the shifted target block.
rand_indices = cp.random.uniform_int((num_samples,), 0, data_enc.shape[0] - block_size - 1)

for i, index in enumerate(rand_indices):
    # input tokens, and the same window shifted by one position as the targets
    context = data_enc[index : index + block_size]
    label = data_enc[index + 1 : index + block_size + 1]

    # store the raw token ids (no one-hot encoding is applied here)
    X[i] = context
    y[i] = label

# cast the samples to int32; of each target block only the last position is kept
X_train = X.astype("int32")
y_train = y.astype("int32")[:, -1]

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")

### Step 4: Build the neural network structure

An `Embedding` layer is used to assign each token an n-dimensional vector. The vector's components are then learned and updated during the training process.

In [None]:
import compyute.nn as nn
# import only the layers that are used instead of a wildcard import, so it is
# clear where each name comes from and the notebook namespace stays clean
from compyute.nn.layers import Embedding, Flatten

vocab_size = tknzr.vocab_size

# an embedding table of size (vocab_size, vocab_size) followed by a Flatten layer
model = nn.SequentialModel([
    Embedding(vocab_size, vocab_size),
    Flatten()
])

In [None]:
# summarize the model; the extra arguments are presumably the input shape and
# input dtype — confirm against model_summary's signature
nn.models.model_summary(model, (block_size,), "int32")

### Step 5: Train the model

Since there are usually quite a large number of classes (=tokens) in language models, the training process can be slower.

In [None]:
from compyute.nn.trainer import Trainer
from compyute.nn.trainer.optimizers import AdamW
from compyute.nn.trainer.losses import Crossentropy
from compyute.nn.trainer.metrics import Accuracy

# bundle the model with its optimizer, loss and metric for training
# NOTE(review): the keyword `loss_functon` looks like a misspelling of
# `loss_function` — confirm against the Trainer signature before changing it
trainer = Trainer(
    model=model,
    optimizer=AdamW(lr=1e-2),
    loss_functon=Crossentropy(),
    metric_function=Accuracy(),
)

In [None]:
# training configuration
epochs, batch_size = 1, 32

# run the training; the return value is not needed here, so it is discarded
_ = trainer.train(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
)

### Step 6: Analyze Results

In [None]:
# Uncomment the following line to install the dependencies of the analysis below:
# %pip install scikit-learn matplotlib

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# get numpy array of the embedding table (weights `w` of the innermost first child module)
embs = model.child_modules[0].child_modules[0].w.cpu().data

# reduce the embedding vectors to 2 dimensions to make them plottable
tsne = TSNE(random_state=0).fit_transform(embs)

# plot the projected embeddings and label every point with its decoded character
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(x=tsne[:, 0], y=tsne[:, 1], alpha=0.5, s=100)
ax.axis("off")
for token_id, (x_pos, y_pos) in enumerate(tsne):
    char = tknzr.decode(cp.Tensor([token_id]))
    # small offset so the label sits on top of its marker
    ax.text(x=x_pos - 0.1, y=y_pos - 0.15, s=char)

### Step 7: Generate text
To see the model in action, it is given a starting character (here token 1). It is then used to generate $n$ characters, each time using the previously generated character as input. The better the model is trained, the more sensible the output will be. This can take quite a lot of training though.

In [None]:
# NOTE(review): `funcional` looks like a misspelling of `functional` — confirm
# the module name against the installed compyute package before changing it
from compyute.nn.funcional import softmax

context = cp.ones((1, block_size,)).int() # use token 1 as starting context
n = 1000

for _ in range(n):
    pred = model(context).squeeze() # predict following character using the current context
    # sample one index, using the softmax of the prediction as probabilities
    index = cp.random.multinomial(
        x=pred.shape[-1],
        p=softmax(pred),
        shape=(1,)
    ) # choose a character from prediction
    # print the decoded character without a newline so the text forms one stream
    print(tknzr.decode([index.item()]), end="")
    context = context.append(index[None, :], axis=1).int() # append predicted character to context
    context = context[:, 1:] # drop the oldest entry so the context keeps its length