In [1]:
from __future__ import annotations

import string
import re
from transformers import AutoTokenizer

# Intro

# Tokenization Steps

### Normalization

In [2]:
def normalize_text(text: str) -> str:
    # Can be multiple actions to normalize text
    # Only keep ASCII letters, numbers, punctuation, and whitespace characters
    # Evquivalent to string.printable
    acceptable_characters = (
        string.ascii_letters
        + string.digits
        + string.punctuation
        + string.whitespace
    )
    normalized_text = ''.join(
        filter(lambda letter: letter in acceptable_characters, text)
    )
    # Make text lower-case
    normalized_text = normalized_text.lower()
    return normalized_text

### Pretokenization

In [3]:
def pretokenize_text(text: str) -> list[str]:
    # Character-based
    smaller_pieces = [char for char in text]
    return smaller_pieces

In [4]:
def pretokenize_text(text: str) -> list[str]:
    # Split based on spaces
    smaller_pieces = text.split()
    return smaller_pieces

### Tokenization

In [5]:
# Combine normalization and pretokenization steps before breaking things further
def tokenize_text(text: str) -> list[str]:
    # Apply created steps 
    normalized_text: str = normalize_text(text)
    pretokenized_text: list[str] = pretokenize_text(normalized_text)
    tokens = []
    # Go through small pieces to make full tokens
    for word in pretokenized_text:
        tokens.extend(
            re.findall(
                f'[\w]+|[{string.punctuation}]', # Split word at punctuations 
                word,
            )
        )
    return tokens

### Postprocessing

In [6]:
# Useful for some tasks
def postprocess_tokens(tokens: list[str]) -> list[str]:
    # Add beginning and end of sequence tokens
    bos_token = '##BOS##'
    eos_token = '##EOS##'
    updated_tokens = (
        [bos_token]
        + tokens
        + [eos_token]
    )
    return updated_tokens

### Encoding: Putting It All Together

We can now try out our full tokenization process! Let's use this sample text to
see how our tokenization pipeline handles it!

In [7]:
sample_text = '''Mr. Louis continued to say, "Penguins are important, 
but we mustn't forget the nuumber 1 priority: the READER!"
'''

print(sample_text)
sample_text

Mr. Louis continued to say, "Penguins are important, 
but we mustn't forget the nuumber 1 priority: the READER!"



'Mr. Louis continued to say, "Penguins are important, \nbut we mustn\'t forget the nuumber 1 priority: the READER!"\n'

In [8]:
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)

print(tokens)

['##BOS##', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '##EOS##']


We now need to encode the tokens to IDs we'll give to a model. But first we need
to define how to map each token to a unique ID. An easy method can be to
arbitrarily count the unique tokens from our corpus.

We'll use the following as our sample corpus.

In [9]:
# Normally this would be much bigger
sample_corpus = (
    '''Mr. Louis continued to say, "Penguins are important, \nbut we mustn't forget the nuumber 1 priority: the READER!"''',
    '''BRUTUS:\nHe's a lamb indeed, that baes like a bear.''',
    '''Both by myself and many other friends:\mBut he, his own affections' counsellor,\nIs to himself--I will not say how true--\nBut to himself so secret and so close,'''
)

In [10]:
# Retrieve unique tokens (from the pipeline defined above) in a set
unique_tokens = set()
for text in sample_corpus:
    tokens_from_text = tokenize_text(text)
    tokens_from_text = postprocess_tokens(tokens_from_text)
    unique_tokens.update(tokens_from_text)

In [11]:
# Create mapping (dictionary) for unique tokens using arbitrary & unique IDs
token2id = {
    token: idx
    for idx, token in enumerate(unique_tokens)
}

For good measure, create a mapping for IDs to convert back to token

In [12]:
id2token = {idx: token for token, idx in token2id.items()}

Let's create our encoder and decoder to transform our tokens to IDS and back

In [13]:
def encode(tokens: list[str]) -> list[int]:
    # Note this doesn't handle tokens not mapped
    encoded_tokens = [
        token2id[token]
        for token in tokens
    ]
    return encoded_tokens


In [14]:
def decode(ids: list[int]) -> list[str]:
    token_strings = [
        id2token[idx]
        for idx in ids
    ]
    return token_strings

In [15]:
# Testing out encoding and decoding 
sample_text = '''Mr. Louis continued to say, "Penguins are important, 
but we mustn't forget the nuumber 1 priority: the READER!"
'''
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)

print(f'Tokens:\n{tokens}\n')

encoded_tokens = encode(tokens)
print(f'Encoded Tokens:\n{encoded_tokens}\n')

decoded_tokens = decode(encoded_tokens)
print(f'Decoded Tokens:\n{decoded_tokens}\n')

Tokens:
['##BOS##', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '##EOS##']

Encoded Tokens:
[33, 43, 57, 45, 31, 41, 32, 4, 20, 14, 21, 9, 4, 46, 35, 47, 5, 39, 49, 58, 38, 51, 53, 26, 58, 6, 22, 20, 8]

Decoded Tokens:
['##BOS##', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '##EOS##']



## Tokenization Reflection

# Using Hugging Face Tokenizers

Tokenization is an important step in most NLP tasks. Hugging Face has been an
invaluable resource in training, using, and sharing different tokenizers!

The API is flexible where you can use a tokenizer off the shelf, fine-tune a
tokenizer with your own data, or even train your own completely from scratch!

### Loading Tokenizer

In this notebook, we'll explore Hugging Face's tokenizers by using a pretrained
model. Hugging Face has many tokenizers available that have already been trained
for specific models and tasks!

In [16]:
# Choose a pretrained tokenizer to use
my_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

### Encoding: Text to Tokens

There's a couple ways to get the tokens from text. The simplest is calling
`.tokenize()` on the text.

In [17]:
raw_text = '''Rory\'s shoes are magenta and so are Corey\'s but they aren\'t nearly as dark!'''
tokens = my_tokenizer.tokenize(raw_text)

print(tokens)

['Rory', "'", 's', 'shoes', 'are', 'mage', '##nta', 'and', 'so', 'are', 'Corey', "'", 's', 'but', 'they', 'aren', "'", 't', 'nearly', 'as', 'dark', '!']


Another is calling the tokenizer with the text and then calling the `.tokens()`
method.

This will also return some special tokens depending on the pretrained tokenizer
used.

In [18]:
detailed_tokens = my_tokenizer(raw_text).tokens()

print(detailed_tokens)

['[CLS]', 'Rory', "'", 's', 'shoes', 'are', 'mage', '##nta', 'and', 'so', 'are', 'Corey', "'", 's', 'but', 'they', 'aren', "'", 't', 'nearly', 'as', 'dark', '!', '[SEP]']


To get the tokens as integer IDs, there are again a few methods.

The first is using the tokenizers `.encode()` method on the text.

In [19]:
print(my_tokenizer.encode(raw_text))

[101, 14845, 112, 188, 5743, 1132, 27595, 13130, 1105, 1177, 1132, 19521, 112, 188, 1133, 1152, 4597, 112, 189, 2212, 1112, 1843, 106, 102]


We can also the `.convert_tokens_to_ids()` tokenizer method if we already have
the tokens (as strings) to get the IDs

In [20]:
print(detailed_tokens)
detailed_ids = my_tokenizer.convert_tokens_to_ids(detailed_tokens)
print(detailed_ids)

['[CLS]', 'Rory', "'", 's', 'shoes', 'are', 'mage', '##nta', 'and', 'so', 'are', 'Corey', "'", 's', 'but', 'they', 'aren', "'", 't', 'nearly', 'as', 'dark', '!', '[SEP]']
[101, 14845, 112, 188, 5743, 1132, 27595, 13130, 1105, 1177, 1132, 19521, 112, 188, 1133, 1152, 4597, 112, 189, 2212, 1112, 1843, 106, 102]


Another way can look a little complex but can be useful when working with
tokenizers for certain tasks.

We first call the tokenizer on the text like we did last time but with no extra 
method.

This returns an object that has a few different keys available.

In [21]:
my_tokenizer(raw_text)

{'input_ids': [101, 14845, 112, 188, 5743, 1132, 27595, 13130, 1105, 1177, 1132, 19521, 112, 188, 1133, 1152, 4597, 112, 189, 2212, 1112, 1843, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

We'll then focus on `input_ids` which are the IDs associated with the
tokenizations.

In [22]:
print(my_tokenizer(raw_text).input_ids)

[101, 14845, 112, 188, 5743, 1132, 27595, 13130, 1105, 1177, 1132, 19521, 112, 188, 1133, 1152, 4597, 112, 189, 2212, 1112, 1843, 106, 102]


### Decoding: Tokens to Text

We of course can use the tokenizer to go from token IDs to tokens and back to text!

We'll start with the inverse of the `.enocde()` method: `.decode()`!

This will get us a full string using the tokens (from the IDs).

In [23]:
ids = my_tokenizer.encode(raw_text)
my_tokenizer.decode(ids)

"[CLS] Rory's shoes are magenta and so are Corey's but they aren't nearly as dark! [SEP]"

You might've noticed that you didn't go back to the original text and instead
have some special tokens. This is because the `.encode()` will return IDs for
special tokens (assuming the pretrained tokenizer used special tokens).

To use `.decode()` but not the special tokens, we can use the parameter
`skip_special_tokens` and set it to `True`.

In [24]:
my_tokenizer.decode(ids, skip_special_tokens=True)

"Rory's shoes are magenta and so are Corey's but they aren't nearly as dark!"

If we instead wanted a list of the tokens decoded from the IDs, we could instead
use the `.convert_ids_to_tokens()` method.

In [25]:
print(my_tokenizer.convert_ids_to_tokens(ids))

['[CLS]', 'Rory', "'", 's', 'shoes', 'are', 'mage', '##nta', 'and', 'so', 'are', 'Corey', "'", 's', 'but', 'they', 'aren', "'", 't', 'nearly', 'as', 'dark', '!', '[SEP]']


#### A Note on the Unknown

One thing to consider is if a string is outside of the tokenizer's vocabulary,
also know as an "unkown" token. They are typically represented with `[UNK]` or
some other similar variant.


If the tokenizer encoded the text so each character was a token (which is
actually not as easy as it sounds), then it would be impossible to have an
"unknown" token. Word-based tokenization will always be in danger of having 
"unknown" tokens since it's virtually impossible to have every possible word (
and "non-word") in its vocabulary!

And so you might think that subword tokenization wouldn't have an issue with
"unknown" tokens. And although there are fewer than word-based tokenization, it
does happen!

Tokenizers are specific so it's important to use a tokenizer that will recognize
most of the text you're working with! For example, a lot of tokenizers might not
consider emoji as tokens but could be really important if emoji are especially
numerous in your data (like a corpus of chat messages)!

In [26]:
phrase = '🥱 the dog next door kept barking all night!!'
ids = my_tokenizer.encode(phrase)
print(phrase)
print(my_tokenizer.convert_ids_to_tokens(ids))
print(my_tokenizer.decode(ids))

🥱 the dog next door kept barking all night!!
['[CLS]', '[UNK]', 'the', 'dog', 'next', 'door', 'kept', 'barking', 'all', 'night', '!', '!', '[SEP]']
[CLS] [UNK] the dog next door kept barking all night!! [SEP]


In [27]:
phrase = '''wow my dad thought mcdonalds sold tacos \N{SKULL AND CROSSBONES}'''
ids = my_tokenizer.encode(phrase)
print(phrase)
print(my_tokenizer.convert_ids_to_tokens(ids))
print(my_tokenizer.decode(ids))

wow my dad thought mcdonalds sold tacos ☠
['[CLS]', 'w', '##ow', 'my', 'dad', 'thought', 'm', '##c', '##don', '##ald', '##s', 'sold', 'ta', '##cos', '[UNK]', '[SEP]']
[CLS] wow my dad thought mcdonalds sold tacos [UNK] [SEP]


If you're seeing a lot of "unknown" tokens with the text you're working with,
might consider using a different tokenizer appropiate for the task. Or it's also
possible to fine-tune a pretrained model or train one from scratch!

### More Properties of Hugging Face's Tokenizers

There are a lot of great features when using tokenizers in Hugging Face that can make it very simple to try out and use different models. Here we'll breifly discuss some properties that can be useful.

We'll load a couple different models:

* `bert-base-cased` ([doc](https://huggingface.co/docs/transformers/model_doc/bert))
* `xlm-roberta-base` ([doc](https://huggingface.co/docs/transformers/model_doc/xlm-roberta))
* `google/pegasus-xsum` ([doc](https://huggingface.co/docs/transformers/model_doc/pegasus))
* `allenai/longformer-base-4096` ([doc](https://huggingface.co/docs/transformers/model_doc/longformer))

In [28]:
model_names = (
    'bert-base-cased',
    'xlm-roberta-base',
    'google/pegasus-xsum',
    'allenai/longformer-base-4096',
)

model_tokenizers = {
    model_name: AutoTokenizer.from_pretrained(model_name)
    for model_name in model_names
}

#### `model_max_length`

Many models that tokenizers are associated with can only take in a maximum number of tokens and so the tokenizer might not be equipped to encode a very long sequence. It might not always be relevant, but you can find this length with `.model_max_length`.

In [29]:
for model_name, temp_tokenizer in model_tokenizers.items():
    max_length = temp_tokenizer.model_max_length
    print(f'{model_name}\n\tmax length: {max_length}')
    print('\n')

bert-base-cased
	max length: 512


xlm-roberta-base
	max length: 512


google/pegasus-xsum
	max length: 512


allenai/longformer-base-4096
	max length: 4096




#### Special Tokens

We've already mentioned special tokens like the "unknown" token. Different models use different ways to distinguish special tokens and not all models cover all the special tokens since it's dependent on the model's task it was trained for.

In [30]:
for model_name, temp_tokenizer in model_tokenizers.items():
    special_tokens = temp_tokenizer.all_special_tokens
    print(f'{model_name}\n\tspecial tokens: {special_tokens}')
    print('\n')

bert-base-cased
	special tokens: ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


xlm-roberta-base
	special tokens: ['<s>', '</s>', '<unk>', '<pad>', '<mask>']


google/pegasus-xsum
	special tokens: ['</s>', '<unk>', '<pad>', '<mask_2>', '<mask_1>', '<unk_2>', '<unk_3>', '<unk_4>', '<unk_5>', '<unk_6>', '<unk_7>', '<unk_8>', '<unk_9>', '<unk_10>', '<unk_11>', '<unk_12>', '<unk_13>', '<unk_14>', '<unk_15>', '<unk_16>', '<unk_17>', '<unk_18>', '<unk_19>', '<unk_20>', '<unk_21>', '<unk_22>', '<unk_23>', '<unk_24>', '<unk_25>', '<unk_26>', '<unk_27>', '<unk_28>', '<unk_29>', '<unk_30>', '<unk_31>', '<unk_32>', '<unk_33>', '<unk_34>', '<unk_35>', '<unk_36>', '<unk_37>', '<unk_38>', '<unk_39>', '<unk_40>', '<unk_41>', '<unk_42>', '<unk_43>', '<unk_44>', '<unk_45>', '<unk_46>', '<unk_47>', '<unk_48>', '<unk_49>', '<unk_50>', '<unk_51>', '<unk_52>', '<unk_53>', '<unk_54>', '<unk_55>', '<unk_56>', '<unk_57>', '<unk_58>', '<unk_59>', '<unk_60>', '<unk_61>', '<unk_62>', '<unk_63>', '<unk_64>', '<

Yout can also call the specific token you're interested in to see its representation.

In [31]:
model_tokenizers['bert-base-cased'].unk_token

'[UNK]'

In [32]:
for model_name, temp_tokenizer in model_tokenizers.items():
    print(f'{model_name}')
    print(f'\tUnknown: \n\t\t{temp_tokenizer.unk_token=}')
    print(f'\tBeginning of Sequence: \n\t\t{temp_tokenizer.bos_token=}')
    print(f'\tEnd of Sequence: \n\t\t{temp_tokenizer.eos_token=}')
    print(f'\tMask: \n\t\t{temp_tokenizer.mask_token=}')
    print(f'\tSentence Separator: \n\t\t{temp_tokenizer.sep_token=}')
    print(f'\tClass of Input: \n\t\t{temp_tokenizer.cls_token=}')
    print('\n')

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.


bert-base-cased
	Unknown: 
		temp_tokenizer.unk_token='[UNK]'
	Beginning of Sequence: 
		temp_tokenizer.bos_token=None
	End of Sequence: 
		temp_tokenizer.eos_token=None
	Mask: 
		temp_tokenizer.mask_token='[MASK]'
	Sentence Separator: 
		temp_tokenizer.sep_token='[SEP]'
	Class of Input: 
		temp_tokenizer.cls_token='[CLS]'


xlm-roberta-base
	Unknown: 
		temp_tokenizer.unk_token='<unk>'
	Beginning of Sequence: 
		temp_tokenizer.bos_token='<s>'
	End of Sequence: 
		temp_tokenizer.eos_token='</s>'
	Mask: 
		temp_tokenizer.mask_token='<mask>'
	Sentence Separator: 
		temp_tokenizer.sep_token='</s>'
	Class of Input: 
		temp_tokenizer.cls_token='<s>'


google/pegasus-xsum
	Unknown: 
		temp_tokenizer.unk_token='<unk>'
	Beginning of Sequence: 
		temp_tokenizer.bos_token=None
	End of Sequence: 
		temp_tokenizer.eos_token='</s>'
	Mask: 
		temp_tokenizer.mask_token='<mask_2>'
	Sentence Separator: 
		temp_tokenizer.sep_token=None
	Class of Input: 
		temp_tokenizer.cls_token=None


allenai/longform

### Considerations


- Tokenizers differ
- Long sequences
- Adjusting to your use case (fine-tuning)


## Hugging Face Tokenizers Reflection

# Overall Reflection