### Lab 8.3 Text generation

In this lab you will finish building your RNN text generator.  I found that this code actually runs pretty quickly on my MacBook without GPU acceleration.

In [1]:
# Notebook-wide hyperparameters; every later cell reads these names.
device = 'cpu'  # device used for the dataset tensors, the model and the metric
seq_len = 20  # number of characters in each input/target window
hidden_size = 100  # width of the embedding and of the RNN hidden state
batch_size = 32  # number of windows per DataLoader batch
lr = 3e-4  # learning rate handed to the Adam optimizer
epochs = 10  # number of passes over the dataset in the training loop

In [2]:
import numpy as np

from tqdm import tqdm, trange

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchmetrics

  from .autonotebook import tqdm as notebook_tqdm


Here's the code to download and prepare the sonnet dataset.

In [3]:
!wget --no-clobber "https://www.dropbox.com/scl/fi/7r68l64ijemidyb9lf80q/sonnets.txt?rlkey=udb47coatr2zbrk31hsfbr22y&dl=1" -O sonnets.txt
text = (open("sonnets.txt").read())
text = text.lower().strip()

File ‘sonnets.txt’ already there; not retrieving.


In [4]:
# Preview the first 1000 characters of the lower-cased, stripped text.
print(text[:1000])

i

 from fairest creatures we desire increase,
 that thereby beauty's rose might never die,
 but as the riper should by time decease,
 his tender heir might bear his memory:
 but thou, contracted to thine own bright eyes,
 feed'st thy light's flame with self-substantial fuel,
 making a famine where abundance lies,
 thy self thy foe, to thy sweet self too cruel:
 thou that art now the world's fresh ornament,
 and only herald to the gaudy spring,
 within thine own bud buriest thy content,
 and tender churl mak'st waste in niggarding:
   pity the world, or else this glutton be,
   to eat the world's due, by the grave and thee.

 ii

 when forty winters shall besiege thy brow,
 and dig deep trenches in thy beauty's field,
 thy youth's proud livery so gazed on now,
 will be a tatter'd weed of small worth held:
 then being asked, where all thy beauty lies,
 where all the treasure of thy lusty days;
 to say, within thine own deep sunken eyes,
 were an all-eating shame, and thriftless praise.


Here's my solution for the `CharacterDataset` class.

Note that it returns an entire sequence of tokens for the target (unlike what we did on Monday where it only output a single token for the target.)

In [6]:
class CharacterDataset(Dataset):
  """
  Character-level dataset of sliding windows over a text.

  Item `idx` is a pair (input, target) of integer token tensors, both of
  length `seq_len`: input holds characters idx .. idx+seq_len-1 and target
  holds the same window shifted one character to the right.
  """

  def __init__(self,text,seq_len=100,device='cpu'):
    """
    Initialize a dataset using character tokenization.
    Arguments:
      text: a string containing the dataset
      seq_len: sequence length provided by __getitem__
      device: device for PyTorch tensors
    """
    self.text = text
    self.seq_len = seq_len
    # Sorting makes the character -> index mapping reproducible for a given text.
    self.vocabulary = ''.join(sorted(set(text)))
    self.index_to_char = dict(enumerate(self.vocabulary))
    self.char_to_index = {char:n for n, char in self.index_to_char.items()}
    self.device = device

  def __len__(self):
    """ Return the number of (input, target) windows in the dataset. """
    # A window starting at idx consumes seq_len+1 characters, so the valid
    # starts are 0 .. len(text)-seq_len-1.  Clamp at 0 so that a text that is
    # too short yields an empty dataset instead of a negative length.
    return max(0, len(self.text)-self.seq_len)

  def __getitem__(self,idx):
    """
    Return the input and target sequences starting at given index.

    Negative indices count from the end.  An out-of-range index raises
    IndexError, which also lets a plain `for x, y in dataset` loop terminate.
    """
    n_windows = len(self)
    if idx < 0:
      idx += n_windows
    if not 0 <= idx < n_windows:
      raise IndexError(f"index out of range for dataset with {n_windows} windows")

    tokens = self.encode(self.text[idx:idx+self.seq_len+1])

    # Input drops the last token, target drops the first: target[i] is the
    # character that follows input[i].
    return torch.tensor(tokens[:-1],device=self.device),torch.tensor(tokens[1:],device=self.device)

  def encode(self,text):
    """
    Encode a string to a list of integer tokens.

    Raises KeyError if `text` contains a character that is not in the vocabulary.
    """
    return [self.char_to_index[char] for char in text]

  def decode(self,tokens):
    """
    Decode a sequence of token integers into a string.

    `tokens` may be a list of ints or a 1-D integer tensor.
    Raises KeyError for a token outside the vocabulary.
    """
    # int() turns 0-d tensor elements into plain ints so they hit the dict keys.
    return ''.join(self.index_to_char[int(token)] for token in tokens)

In [7]:
# Build the character-level dataset over the sonnet text with the configured window length and device.
ds = CharacterDataset(text,seq_len=seq_len,device=device)

In [8]:
# Show the integer tokens for the first 100 characters of the text.
ds.encode(text[:100])

[38,
 20,
 0,
 0,
 1,
 17,
 29,
 26,
 24,
 1,
 17,
 12,
 20,
 29,
 16,
 30,
 31,
 1,
 14,
 29,
 16,
 12,
 31,
 32,
 29,
 16,
 30,
 1,
 34,
 16,
 1,
 15,
 16,
 30,
 20,
 29,
 16,
 1,
 20,
 25,
 14,
 29,
 16,
 12,
 30,
 16,
 6,
 0,
 1,
 31,
 19,
 12,
 31,
 1,
 31,
 19,
 16,
 29,
 16,
 13,
 36,
 1,
 13,
 16,
 12,
 32,
 31,
 36,
 3,
 30,
 1,
 29,
 26,
 30,
 16,
 1,
 24,
 20,
 18,
 19,
 31,
 1,
 25,
 16,
 33,
 16,
 29,
 1,
 15,
 20,
 16,
 6,
 0,
 1,
 13,
 32,
 31,
 1,
 12,
 30]

In [26]:
# Round-trip check: decoding the encoded prefix should print the original characters.
print(ds.decode(ds.encode(text[:100])))

i

 from fairest creatures we desire increase,
 that thereby beauty's rose might never die,
 but as


In [10]:
# Fetch the first (input, target) pair and display the shapes of both tensors.
x, y = ds[0]
x.shape, y.shape

(torch.Size([20]), torch.Size([20]))

In [11]:
# Wrap the dataset in a shuffling DataLoader that yields batches of `batch_size` windows.
dl = DataLoader(ds,shuffle=True,batch_size=batch_size)

Here's my solution for the recurrent neural network (RNN) implementation.

In [12]:
class CharacterRNN(nn.Module):
  """
  Simple single-layer recurrent network over character tokens.

  With e_t the embedding of the t-th input token, the hidden state is
    h_t = SiLU(U(e_t) + W(h_{t-1})),   h_0 = 0
  and V(h_t) gives the logits over the vocabulary for the character that
  follows position t.
  """

  def __init__(self,vocabulary_size,hidden_size):
    """
    Arguments:
      vocabulary_size: number of distinct tokens (embedding rows and output logits)
      hidden_size: width of the embedding and of the hidden state
    """
    super().__init__()
    self.embedding = nn.Embedding(vocabulary_size,hidden_size)
    self.hidden_size = hidden_size
    self.U = nn.Linear(hidden_size,hidden_size)  # input -> hidden
    self.W = nn.Linear(hidden_size,hidden_size)  # hidden -> hidden
    self.act = nn.SiLU()
    self.V = nn.Linear(hidden_size,vocabulary_size)  # hidden -> logits

  def forward(self,x):
    """
    Arguments:
      x: integer token tensor of shape [B,N]
    Returns:
      logits of shape [B,N,vocabulary_size]; the hidden state starts from
      zeros on every call.
    """
    x = self.embedding(x)
    B,N = x.shape[:2]
    # new_zeros inherits both device and dtype from the embeddings, so the
    # recurrence also works after model.double() / model.half().
    h = x.new_zeros(B,self.hidden_size)
    # The input projection does not depend on h, so compute it for all steps at once.
    Ux = self.U(x)
    y = []
    for i in range(N):
      h = self.act(Ux[:,i] + self.W(h))
      y.append(self.V(h))
    if not y:
      # torch.stack rejects an empty list; return an empty [B,0,C] tensor instead.
      return x.new_zeros(B,0,self.V.out_features)
    return torch.stack(y,dim=1)

In [13]:
# Instantiate the RNN with one output class per vocabulary character and move it to the configured device.
model = CharacterRNN(len(ds.vocabulary),hidden_size).to(device)

In [14]:
# Pull a single batch from the loader and display the input/target shapes.
x_batch, y_batch = next(iter(dl))
x_batch.shape, y_batch.shape

(torch.Size([32, 20]), torch.Size([32, 20]))

In [15]:
# Run the model on that batch and display the shape of its output.
model(x_batch).shape

torch.Size([32, 20, 39])

Finally here is my code to train the model.

Note that I needed to use `.view()` to reshape the model output and target, because the loss and metric functions want the predictions to have shape [B*N,C] (one row per predicted character) and the targets to have shape [B*N], rather than [B,N,C] and [B,N].

In [16]:
# Adam optimizer over all model parameters, using the configured learning rate.
opt = torch.optim.Adam(model.parameters(),lr=lr)
# Cross-entropy loss between the predicted logits and the target token indices.
loss_fn = nn.CrossEntropyLoss()

# Multiclass accuracy with one class per vocabulary character.
metric = torchmetrics.classification.Accuracy(task="multiclass", num_classes=len(ds.vocabulary))
# Move the metric to the configured device; as the cell's last expression this also displays the metric's repr.
metric.to(device)

MulticlassAccuracy()

In [17]:
# `epochs` is taken from the configuration cell at the top of the notebook;
# redefining it here would silently override that setting.
num_classes = len(ds.vocabulary)

for epoch in range(epochs):
  # ---- training pass ----
  model.train()
  for x_batch, y_batch in tqdm(dl, desc=f'epoch {epoch} train'):
    opt.zero_grad()

    y_pred = model(x_batch)
    # Flatten [B,N,C] -> [B*N,C] and [B,N] -> [B*N] so every character is one sample.
    loss = loss_fn(y_pred.view(-1,num_classes),y_batch.view(-1))

    loss.backward()
    opt.step()

  # ---- evaluation pass ----
  # NOTE: this iterates over the same DataLoader used for training, so the
  # printed value is accuracy on the training data, not on held-out data.
  model.eval()
  metric.reset()
  # No gradients are needed to compute accuracy; skipping autograd saves time and memory.
  with torch.no_grad():
    for x_batch, y_batch in tqdm(dl, desc=f'epoch {epoch} eval'):
      y_pred = model(x_batch)
      metric(y_pred.view(-1,num_classes),y_batch.view(-1))

  acc = metric.compute().item()

  print(f'epoch {epoch}: {acc}')

100%|██████████| 3060/3060 [00:17<00:00, 179.96it/s]
100%|██████████| 3060/3060 [00:05<00:00, 543.82it/s]


epoch 0: 0.46870550513267517


100%|██████████| 3060/3060 [00:14<00:00, 210.84it/s]
100%|██████████| 3060/3060 [00:05<00:00, 557.40it/s]


epoch 1: 0.4899917244911194


100%|██████████| 3060/3060 [00:12<00:00, 239.54it/s]
100%|██████████| 3060/3060 [00:05<00:00, 584.60it/s]


epoch 2: 0.5006649494171143


100%|██████████| 3060/3060 [00:13<00:00, 229.49it/s]
100%|██████████| 3060/3060 [00:05<00:00, 535.39it/s]


epoch 3: 0.508805513381958


100%|██████████| 3060/3060 [00:14<00:00, 212.09it/s]
100%|██████████| 3060/3060 [00:06<00:00, 467.93it/s]


epoch 4: 0.5122350454330444


100%|██████████| 3060/3060 [00:14<00:00, 209.98it/s]
100%|██████████| 3060/3060 [00:06<00:00, 446.35it/s]


epoch 5: 0.5169149041175842


100%|██████████| 3060/3060 [00:14<00:00, 209.55it/s]
100%|██████████| 3060/3060 [00:05<00:00, 566.59it/s]


epoch 6: 0.5203076601028442


100%|██████████| 3060/3060 [00:12<00:00, 245.97it/s]
100%|██████████| 3060/3060 [00:06<00:00, 487.12it/s]


epoch 7: 0.5229532718658447


100%|██████████| 3060/3060 [00:12<00:00, 236.07it/s]
100%|██████████| 3060/3060 [00:05<00:00, 570.31it/s]


epoch 8: 0.5253639221191406


100%|██████████| 3060/3060 [00:12<00:00, 241.14it/s]
100%|██████████| 3060/3060 [00:05<00:00, 564.95it/s]

epoch 9: 0.5271657705307007





### Exercises

1. Write a deterministic function to generate text given some starter text.  The function should iteratively add characters to the prompt using the trained model.  This version should be deterministic, in that it always takes the most likely next character according to the model.

Test the function by prompting it with the first 10 characters in the dataset.

In [None]:
def generate_text_deterministic(model,prompt,num_to_generate=1000,dataset=None,context_len=None):
    """
    Greedily extend `prompt` one character at a time.

    At every step the model is run on the most recent `context_len` characters
    and the character with the highest logit at the last position is appended.

    Arguments:
      model: module mapping a [1,N] integer token tensor to [1,N,C] logits
      prompt: non-empty starter string; every character must be in the vocabulary
      num_to_generate: number of characters to append
      dataset: object providing `char_to_index` / `index_to_char` mappings;
        defaults to the notebook-level `ds`
      context_len: maximum number of trailing characters fed to the model;
        defaults to the notebook-level `seq_len`
    Returns:
      the prompt followed by the generated characters
    Raises:
      ValueError: if `prompt` is empty or `context_len` is less than 1
      KeyError: if `prompt` contains a character outside the vocabulary
    """
    dataset = ds if dataset is None else dataset
    context_len = seq_len if context_len is None else context_len
    if not prompt:
        raise ValueError("prompt must contain at least one character")
    if context_len < 1:
        raise ValueError("context_len must be at least 1")

    # Put the inputs wherever the model's weights live rather than trusting a global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    tokens = [dataset.char_to_index[c] for c in prompt]

    with torch.no_grad():
        for _ in range(num_to_generate):
            # Sliding window: only the last context_len tokens are fed to the
            # model, on the first step as well as on every later one.
            window = torch.tensor([tokens[-context_len:]], device=model_device)  # shape [1, <=context_len]
            logits = model(window)[0, -1]  # logits for the character after the window
            tokens.append(torch.argmax(logits).item())

    # Only the newly generated tokens are decoded; the prompt is returned verbatim.
    return prompt + ''.join(dataset.index_to_char[t] for t in tokens[len(prompt):])

3. Write a stochastic version of the text generation function.  This one should use `torch.multinomial` to sample the next character.  Note that you will need to apply `torch.softmax` to convert the model output to probabilities.  (In my experience if you don't do this you end up with a CUDA error and you end up needing to restart your kernel, so be careful!)

Test the function by prompting it with the first 10 characters in the dataset, and run the generation multiple times to verify the stochastic behavior.

In [None]:
def generate_text_stochastic(model,prompt,num_to_generate=1000, temperature=0.1, dataset=None, context_len=None):
    """
    Extend `prompt` one character at a time by sampling from the model.

    At every step the model is run on the most recent `context_len` characters,
    the logits at the last position are divided by `temperature` and turned
    into probabilities with softmax, and the next character is drawn with
    `torch.multinomial`.  Call `torch.manual_seed` beforehand for repeatable output.

    Arguments:
      model: module mapping a [1,N] integer token tensor to [1,N,C] logits
      prompt: non-empty starter string; every character must be in the vocabulary
      num_to_generate: number of characters to append
      temperature: positive scaling of the logits; higher values flatten the
        distribution (more random), lower values sharpen it (closer to greedy)
      dataset: object providing `char_to_index` / `index_to_char` mappings;
        defaults to the notebook-level `ds`
      context_len: maximum number of trailing characters fed to the model;
        defaults to the notebook-level `seq_len`
    Returns:
      the prompt followed by the sampled characters
    Raises:
      ValueError: if `prompt` is empty, `context_len` is less than 1, or
        `temperature` is not positive
      KeyError: if `prompt` contains a character outside the vocabulary
    """
    dataset = ds if dataset is None else dataset
    context_len = seq_len if context_len is None else context_len
    if not prompt:
        raise ValueError("prompt must contain at least one character")
    if context_len < 1:
        raise ValueError("context_len must be at least 1")
    if temperature <= 0:
        # Dividing by zero (or a negative value) gives inf/NaN or inverted
        # probabilities, which torch.multinomial rejects or samples wrongly.
        raise ValueError("temperature must be positive")

    # Put the inputs wherever the model's weights live rather than trusting a global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    tokens = [dataset.char_to_index[c] for c in prompt]

    with torch.no_grad():
        for _ in range(num_to_generate):
            # Sliding window: only the last context_len tokens are fed to the model.
            window = torch.tensor([tokens[-context_len:]], device=model_device)  # shape [1, <=context_len]
            logits = model(window)[0, -1]  # shape [C]: logits for the next character
            probabilities = torch.softmax(logits / temperature, dim=-1)
            tokens.append(torch.multinomial(probabilities, 1).item())

    # Only the newly generated tokens are decoded; the prompt is returned verbatim.
    return prompt + ''.join(dataset.index_to_char[t] for t in tokens[len(prompt):])

In [32]:
# Same encode/decode round-trip as earlier in the notebook: prints the first 100 characters of the text.
print(ds.decode(ds.encode(text[:100])))

i

 from fairest creatures we desire increase,
 that thereby beauty's rose might never die,
 but as


In [34]:

# Prompt both generators with the first 10 characters of the dataset.
prompt_text = text[:10]
print("prompt_text: ", prompt_text)
# Greedy generation: 200 characters, always taking the most likely next character.
print("Generate Text Deterministic: ", generate_text_deterministic(model, prompt_text, num_to_generate=200))
# Sampled generation: 200 characters drawn with temperature=2.
print("Generate Text Stochastic: ", generate_text_stochastic(model, prompt_text, num_to_generate=200, temperature=2))


prompt_text:  ﻿i

 from 
Generate Text Deterministic:  ﻿i

 from the summer's shall the worth the worth the worth the worth the worth the worth the worth the worth the worth the worth the worth the worth the worth the worth the worth the worth the worth the worth t
Generate Text Stochastic:  ﻿i

 from a-crow:
 'tis forgworn impwams turow:
 a gifey
 up
 repumace,
 shadisw; retrupowss coudsts,
 calth, hom if thou will,--righethel,' whicvel's plence bo.'inh touch
, of i loveirad
 sheelg, mysish';
 ino
