In [4]:
import os
if 'A306709' in os.environ['USERNAME']:
    print("Running on Christophs computer: update proxy settings.")
    os.environ["http_proxy"] = "http://sia-lb.telekom.de:8080"
    os.environ["https_proxy"] = "http://sia-lb.telekom.de:8080"
else:
    print("Running on any computer but not Christophs: don't update any proxy settings.")

Running on Christophs computer: update proxy settings.


In [5]:
from __future__ import annotations

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from helper import (
    start_time,
    time_since,
    ShakespeareDataset,
    TokenMapping,
    build_model,
    next_token,
    # Character-based helpers
    encode_text,
    # Subword-based helpers
    encode_text_from_tokenizer,
    tokenize_text_from_tokenizer,
)

In [6]:
# Deterministic training
torch.manual_seed(0)

# Attempt GPU; if not, stay on CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


# Load Data

In [10]:
# Reduced data to make it manageable for smaller systems
DATA_FILE: str = 'data/shakespeare_small.txt'

with open(DATA_FILE, 'r') as data_file:
    raw_text = data_file.read()

print(f'Number of characters in text file: {len(raw_text):,}')

Number of characters in text file: 50,085


# Character-Based Text Generation

The first model you'll build for text generation will use character-based
tokens.

Each token will be a single character from the text and the model will learn
to predict the next character (a token).

To generate text, the model will take in a new string,
character-by-character, and then generate a new likely character based on the
past input. Then the model will take into account that new character and
generate the following character and so on and so on until the model has
produced a set number of characters.

## Encode Text into Integer Tokens

### Normalization

In [11]:
def normalize_text(text: str) -> str:
    # TODO: Normalize incoming text; can be multiple actions
    return text.lower()

In [12]:
# TEST: Is your text normalized the way you expected?
# Only the first 500 characters of the original text
normalized_text = normalize_text(raw_text[:500])
print(normalized_text)

first citizen:
before we proceed any further, hear me speak.

all:
speak, speak.

first citizen:
you are all resolved rather to die than to famish?

all:
resolved. resolved.

first citizen:
first, you know caius marcius is chief enemy to the people.

all:
we know't, we know't.

first citizen:
let us kill him, and we'll have corn at our own price.
is't a verdict?

all:
no more talking on't; let it be done: away, away!

second citizen:
one word, good citizens.

first citizen:
we are accounted poor


### Pretokenization

In [13]:
def pretokenize_text(text: str) -> str | list[str]:
    # TODO: Pretokenize normalized text into character strings
    return [c for c in text]

In [14]:
# TEST: Is your (normalized) text pretokenized the way you expected?
# Only the first 500 characters of the original text
pretokenized_text = pretokenize_text(normalized_text)
print(pretokenized_text)

['f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'b', 'e', 'f', 'o', 'r', 'e', ' ', 'w', 'e', ' ', 'p', 'r', 'o', 'c', 'e', 'e', 'd', ' ', 'a', 'n', 'y', ' ', 'f', 'u', 'r', 't', 'h', 'e', 'r', ',', ' ', 'h', 'e', 'a', 'r', ' ', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'a', 'l', 'l', ':', '\n', 's', 'p', 'e', 'a', 'k', ',', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'y', 'o', 'u', ' ', 'a', 'r', 'e', ' ', 'a', 'l', 'l', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', ' ', 'r', 'a', 't', 'h', 'e', 'r', ' ', 't', 'o', ' ', 'd', 'i', 'e', ' ', 't', 'h', 'a', 'n', ' ', 't', 'o', ' ', 'f', 'a', 'm', 'i', 's', 'h', '?', '\n', '\n', 'a', 'l', 'l', ':', '\n', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', '.', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', '.', '\n', '\n', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'f', 'i', 'r', 's', 't', ',', ' ', '

### Tokenize

In [15]:
# Combine normalization and pretokenization steps
def tokenize_text(text: str) -> str | list[str]:
    normalized_text: str = normalize_text(text)
    pretokenized_text: str | list[str] = pretokenize_text(normalized_text)
    # Characters are already tokens so pretokenized text is already tokenized
    tokenized_text = pretokenized_text
    return tokenized_text

In [16]:
# TEST: Is your tokenized text the way you expected?
tokenized_text = tokenize_text(raw_text[:500])
print(tokenized_text)

['f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'b', 'e', 'f', 'o', 'r', 'e', ' ', 'w', 'e', ' ', 'p', 'r', 'o', 'c', 'e', 'e', 'd', ' ', 'a', 'n', 'y', ' ', 'f', 'u', 'r', 't', 'h', 'e', 'r', ',', ' ', 'h', 'e', 'a', 'r', ' ', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'a', 'l', 'l', ':', '\n', 's', 'p', 'e', 'a', 'k', ',', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'y', 'o', 'u', ' ', 'a', 'r', 'e', ' ', 'a', 'l', 'l', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', ' ', 'r', 'a', 't', 'h', 'e', 'r', ' ', 't', 'o', ' ', 'd', 'i', 'e', ' ', 't', 'h', 'a', 'n', ' ', 't', 'o', ' ', 'f', 'a', 'm', 'i', 's', 'h', '?', '\n', '\n', 'a', 'l', 'l', ':', '\n', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', '.', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', '.', '\n', '\n', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'f', 'i', 'r', 's', 't', ',', ' ', '

### Postprocessing

We'll skip postprocessing since we don't have any special tokens we want to
consider for our task here.

### Encode (Tokens → Integer IDs)

We have `encode_text()` from our helper module that can encode our text based on
our tokenization process from our created `tokenize_text()` function.

This will also provide us with `character_mapping`, an object that we can use to
map our tokens back and forth from strings to integer IDs.

In [17]:
encoded_text, character_mapping = encode_text(raw_text, tokenize_text)

## Prepare Dataset

In [18]:
n_tokens = character_mapping.n_tokens
dataset_size = len(encoded_text)
print(f'Size of dataset: {dataset_size:,} characters')

Size of dataset: 50,086 characters


In [19]:
# Defining sequence length that will be taken in at a time by our model
sequence_length = 32 # Number of characters
batch_size = 32

train_dataset = ShakespeareDataset(encoded_text, sequence_length)
train_loader = DataLoader(
    train_dataset,
    shuffle=False, # Ensure deterministic training
    batch_size=batch_size,
)

## Define Model

We'll provide a defined model today, but this could be a step that you would
modify and experiment in other NLP projects you'll do.

In [20]:
# Defining the model to be trained and generate text with
model = build_model(n_tokens)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Generation

The `generate_text_by_char()` function will use your tokenizer and NLP model to
generate new text token-by-token (character-by-character in this case) by taking
in the input text and token sampling parameters.

We can use temperature and top-k sampling to adjust the "creativeness" of the
generated text.

We also pass in the `num_chars` parameter to tell the function how many tokens
(characters in this case) to generate.

In [21]:
def generate_text_by_char(
    input_str: str,
    model,
    token_mapping: TokenMapping = character_mapping,
    num_chars: int = 100,
    temperature: float = 1.0,
    topk: int | None = None,
) -> str:
    # Uses your character-based tokenizer
    tokenized_text: list[str] = tokenize_text(input_str)
    # Generates token-by-token and creates a list of those tokens
    generated_tokens = []
    for _ in range(num_chars):
        # Uses the input text and generated text (so far) to get next token
        new_char = next_token(
            tokenized_text=(tokenized_text + generated_tokens),
            model=model,
            token_mapping=token_mapping,
            # Temperature & top-k sampling used in determining the next token
            temperature=temperature,
            topk=topk,
            device=device,
        )
        generated_tokens.append(new_char)
    # Returns input string plus the full generated string (of generated tokens)
    full_text = ''.join(tokenized_text + generated_tokens)
    return full_text

## Train Model

At this point, the model has not been trained so the code below will train the
NLP model that will be used to generate new text.

The model will take in the text data (broken by tokens by our character-based
tokenizer) and attempt to predict the next token. Over time, the model should
hopefully get better in predicting the next token (given the previous text).

To help us visualize how the model is training, at the end of every epoch, we
generate text using the `TEST_PHRASE` with the improving model.

In [22]:
TEST_PHRASE = 'To be or not to be'
# Use more epochs if not CPU device
epochs = 5 if device == 'cpu' else 25

start = start_time()
for epoch in range(epochs):
    # Set model into "training mode"
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print(f'[{time_since(start)} ({epoch} {epoch / epochs * 100}) {loss:.4f}]')
    print('-'*72)
    gen_output = generate_text_by_char(
        input_str=TEST_PHRASE,
        model=model,
        num_chars=100,
    )
    print(gen_output)

Epoch 1/25, Loss: 2.5301671769291447
[00m 15.9s (0 0.0) 2.1864]
------------------------------------------------------------------------
to be or not to bedy doy ethinus,
none onth hea fhind kinguc's thers, dher there weral, the mawerte.

vier:
wrant, apk
Epoch 2/25, Loss: 2.1812583895917896
[00m 31.5s (1 4.0) 1.9870]
------------------------------------------------------------------------
to be or not to be weve hame stem the i crourcius,
arcis,
cir.
ste fhakn wor
huce theel whey al moire come
at boment p
Epoch 3/25, Loss: 2.078537766087931
[00m 46.7s (2 8.0) 1.8850]
------------------------------------------------------------------------
to be or not to berth brut,
fot heverh serseng
leend. and ofpor nais iais and of voitn maning of frenead,
whingy whron
Epoch 4/25, Loss: 2.019213487317387
[01m 2.5s (3 12.0) 1.8197]
------------------------------------------------------------------------
to be or not to beatlive theer the the ceplinenus:
with, thy comnogh arous,
uly sutar:
hinsud,
ou

## Generate Text

Now that the model has been trained, go ahead and observe how it performs!

Try adjusting the different sampling methods using the `temperature` and `topk`
parameters on the same input string to see the differences.

You might also try different phrases as well as how many tokens (`num_chars`) to
generate and observe how it does.

In [23]:
output = generate_text_by_char(
    input_str='To be or not to be',
    model=model,
    num_chars=100,
    temperature=1.0,
    topk=None,
)
print(output)

to be or not to betold marcius
come.

unth thecius:
what, done geak the mens afrumus,
to done.

fill onk
say scolding 


# Subword Text Generation

The next model you'll build will use subword-tokenization instead of 
characters-based token to train a model and ultimately generate new text
token-by-token.

Although this could be done by creating your own tokenizer, you'll use
Hugging Face to use a pretrained tokenizer to tokenize the data.

After training the model with subword tokens, 
the model will take in a new string, token-by-token, and then generate a new
token (subword).
The model will continue producing new subword tokens based on the input text
and already produced tokens until a set number of tokens have been generated.

## Encode Text into Integer Tokens

### Choosing a Tokenizer

> NOTE:
> 
> You can load another model outside of these choices but the model
> will have to be downloaded and may or may not be effective.
>
> If you'd like to explore more, here's a link to you might want to start with
> of different available pretrained models on Hugging Face:
> https://huggingface.co/models?pipeline_tag=text-generation

In [25]:
# TODO: Choose a pretrained tokenizer to use:

# Docs: https://huggingface.co/xlm-roberta-base
# model_name = 'xlm-roberta-base'
# DOCS: https://huggingface.co/bert-base-cased
# model_name = 'bert-base-cased'
# DOCS: https://huggingface.co/bert-base-uncased 
# model_name = 'bert-base-uncased'

my_tokenizer = AutoTokenizer.from_pretrained(
    'xlm-roberta-base',
)

### Encode (Tokens → Integer IDs)

We have `encode_text_from_tokenizer()` from our helper module that can encode
our text based on our tokenization process from our tokenizer `my_tokenizer`.

This will also provide us with `token_mapping`, an object that we can use to
map our tokens back and forth from strings to integer IDs.

In [26]:
encoded_text, token_mapping = encode_text_from_tokenizer(
    text=raw_text,
    tokenizer=my_tokenizer,
)

## Prepare Dataset

In [27]:
n_tokens = token_mapping.n_tokens
dataset_size = len(encoded_text)
print(f'Size of dataset: {dataset_size:,} tokens')

Size of dataset: 14,374 tokens


In [28]:
# Defining sequence length that will be taken in at a time by our model
sequence_length = 32 # Number of tokens
batch_size = 32

train_dataset = ShakespeareDataset(encoded_text, sequence_length)
train_loader = DataLoader(
    train_dataset,
    shuffle=False, # Ensure deterministic training
    batch_size=batch_size,
)

## Define Model

We'll provide a defined model today, but this could be a step that you would
modify and experiment in other NLP projects you'll do.

In [29]:
# Defining the model to be trained and generate text with
model = build_model(n_tokens)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Generation

The `generate_text_by_subword()` function will use your chosen tokenizer and the
NLP model to generate new text token-by-token (subwords in this case) by taking
in the input text and token sampling parameters.

We can use temperature and top-k sampling to adjust the "creativeness" of the
generated text.

We also pass in the `num_tokens` parameter to tell the function how many
(subword)tokens to generate.

In [30]:
def generate_text_by_subword(
    input_str: str,
    model,
    token_mapping: TokenMapping = token_mapping,
    tokenizer = my_tokenizer,
    num_tokens: int = 100,
    temperature: float = 1.0,
    topk: int | None = None,
) -> str:
    # Use your chosen subword-tokenizer
    tokenized_text = tokenize_text_from_tokenizer(
        tokenizer=tokenizer,
        text=input_str,
    )
    # Generates token-by-token and creates a list of those tokens
    generated_tokens = []
    for _ in range(num_tokens):
        # Uses the input text and generated text (so far) to get next token
        new_token = next_token(
            tokenized_text=(tokenized_text + generated_tokens),
            model=model,
            token_mapping=token_mapping,
            # Temperature & top-k sampling used in determining the next token
            temperature=temperature,
            topk=topk,
            device=device,
        )
        generated_tokens.append(new_token)
    # List of all token IDs (input text and generated text)
    output_ids = tokenizer.convert_tokens_to_ids(
        tokenized_text + generated_tokens
    )
    # Returns input string plus the full generated string from list of token IDs
    full_text = tokenizer.decode(output_ids)
    return full_text

## Train Model

At this point, the model has not been trained so the code below will train the
NLP model that will be used to generate new text.

The model will take in the text data (broken by tokens by our subword tokenizer)
and attempt to predict the next token. Over time, the model should hopefully
get better in predicting the next token (given the previous text).

To help us visualize how the model is training, at the end of every epoch, we
generate text using the `TEST_PHRASE` with the improving model.

In [31]:
TEST_PHRASE = 'To be or not to be'
# Use more epochs if not CPU device
epochs = 5 if device == 'cpu' else 25

start = start_time()
for epoch in range(epochs):
    # Set model into "training mode"
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print(f'[{time_since(start)} ({epoch} {epoch / epochs * 100}) {loss:.4f}]')
    print('-'*72)
    output = generate_text_by_subword(
        input_str=TEST_PHRASE,
        model=model,
        token_mapping=token_mapping,
        tokenizer=my_tokenizer,
        num_tokens=30,
        temperature=1.0,
    )
    print(output)

Epoch 1/25, Loss: 6.739584188949823
[00m 11.5s (0 0.0) 5.8395]
------------------------------------------------------------------------
To be or not to be drum elementses know clubs No Well.le he broughtwordCIN BRU has mouth e buy MAR. come no but That and be conteed with done
Epoch 2/25, Loss: 6.093517935355681
[00m 22.4s (1 4.0) 5.2964]
------------------------------------------------------------------------
To be or not to be., for,mos ofted-- flatmb where, The this pair be liry that he will putlye I strong abundaTUS
Epoch 3/25, Loss: 5.770365263677652
[00m 33.1s (2 8.0) 4.8581]
------------------------------------------------------------------------
To be or not to be in my war shallthink nowders him loved,, andture is enough be Wednesdayrry true, We nor ear, say TI isius thought may
Epoch 4/25, Loss: 5.478529153794117
[00m 44.0s (3 12.0) 4.5978]
------------------------------------------------------------------------
To be or not to beCor VAL Tu Citizen: fier rob And, grim's! fear

## Generate Text

Now that the model has been trained, go ahead and observe how it performs!

Try adjusting the different sampling methods using the `temperature` and `topk`
parameters on the same input string to see the differences.

You might also try different phrases as well as how many tokens (`num_tokens`)
to generate and observe how it does.

------------

Consider how this model differs from the results from the text generation using
the character-based tokenization.

In [33]:
output = generate_text_by_subword(
        input_str='To be or not to be',
        model=model,
        token_mapping=token_mapping,
        tokenizer=my_tokenizer,
        num_tokens=30,
        temperature=0.5,
        topk=100,
    )
print(output)

To be or not to be content and pettye: and they are almost thorough; and they are set down o' the people, and did Ret noble ladies,
