<a href="https://colab.research.google.com/github/dagyeomJung/deeplearning_master/blob/main/hf_tokenizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Training SOTA tokenizer models using HuggingFace `tokenizers` package

1. Word Level
2. BPE - Byte Pair Encoding
3. Unigram
4. WordPiece

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install tokenizers




## Importing packages

In [None]:
## importing the tokenizer, the four models, their trainers and the whitespace pre-tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer
from tokenizers.pre_tokenizers import Whitespace


#### Download the data to train the model.

In [None]:
# -nc (no-clobber) skips the download when pg16457.txt is already present, so
# re-running this cell does not create pg16457.txt.1, pg16457.txt.2, ...
# The https URL is the one the http address redirects to.
!wget -nc https://www.gutenberg.org/cache/epub/16457/pg16457.txt

--2021-10-17 10:16:15--  http://www.gutenberg.org/cache/epub/16457/pg16457.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.gutenberg.org/cache/epub/16457/pg16457.txt [following]
--2021-10-17 10:16:15--  https://www.gutenberg.org/cache/epub/16457/pg16457.txt
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 617622 (603K) [text/plain]
Saving to: ‘pg16457.txt.2’


2021-10-17 10:16:16 (5.45 MB/s) - ‘pg16457.txt.2’ saved [617622/617622]



In [None]:
# -nc keeps a single copy of the archive across re-runs, and unzip -n never
# overwrites already-extracted files, so the cell no longer stops at an
# interactive "replace?" prompt.
!wget -nc https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
!unzip -n wikitext-103-raw-v1.zip

--2021-10-17 10:16:59--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.88.229
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.88.229|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191984949 (183M) [application/zip]
Saving to: ‘wikitext-103-raw-v1.zip.3’

Archive:  wikitext-103-raw-v1.zip
replace wikitext-103-raw/wiki.test.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: wikitext-103-raw/wiki.test.raw  
replace wikitext-103-raw/wiki.valid.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: wikitext-103-raw/wiki.valid.raw  
replace wikitext-103-raw/wiki.train.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: wikitext-103-raw/wiki.train.raw  y
y
y






## Define the 3-step process

In [None]:
# Placeholder emitted for anything outside the learned vocabulary.
unk_token = "<UNK>"
# Special tokens handed to every trainer; the unknown token comes first.
spl_tokens = [unk_token, "<SEP>", "<MASK>", "<CLS>"]

def prepare_tokenizer_trainer(alg):
    """
    Build an untrained tokenizer and its matching trainer for ``alg``.

    ``alg`` selects the model: 'BPE', 'UNI' (Unigram) or 'WPC' (WordPiece);
    any other value falls back to the WordLevel model. Every trainer gets
    the shared special tokens, and the tokenizer uses the Whitespace
    pre-tokenizer.

    Returns a ``(tokenizer, trainer)`` tuple.
    """
    if alg == 'BPE':
        model = BPE(unk_token=unk_token)
        trainer = BpeTrainer(special_tokens=spl_tokens)
    elif alg == 'UNI':
        # For Unigram the unknown token is passed to the trainer, not the model.
        model = Unigram()
        trainer = UnigramTrainer(unk_token=unk_token, special_tokens=spl_tokens)
    elif alg == 'WPC':
        model = WordPiece(unk_token=unk_token)
        trainer = WordPieceTrainer(special_tokens=spl_tokens)
    else:
        model = WordLevel(unk_token=unk_token)
        trainer = WordLevelTrainer(special_tokens=spl_tokens)

    tokenizer = Tokenizer(model)
    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer


def train_tokenizer(files, alg='WLV', save_path="./tokenizer-trained.json"):
    """
    Train a tokenizer on ``files`` and return it reloaded from disk.

    Parameters:
        files: list of paths to the text files used for training.
        alg: algorithm key passed to ``prepare_tokenizer_trainer``
            ('WLV', 'BPE', 'UNI' or 'WPC').
        save_path: where the trained tokenizer is written as JSON. The
            default keeps the original fixed location, so successive calls
            overwrite each other unless a distinct path is given.

    Returns:
        The tokenizer loaded back from ``save_path``.
    """
    tokenizer, trainer = prepare_tokenizer_trainer(alg)
    tokenizer.train(files, trainer)  # training the tokenizer
    tokenizer.save(save_path)
    # Return the instance read back from the saved file.
    return Tokenizer.from_file(save_path)

def tokenize(input_string, tokenizer):
    """
    Encode ``input_string`` with ``tokenizer`` and return whatever
    ``tokenizer.encode`` produces.
    """
    return tokenizer.encode(input_string)


## Training each model on the small as well as the large dataset

In [None]:
# Two training corpora: a single small text file and the three WikiText raw splits.
small_file = ['pg16457.txt']
large_files = [
    f"./wikitext-103-raw/wiki.{split}.raw"
    for split in ["test", "train", "valid"]
]

# The same sentence is encoded by every tokenizer so the outputs are comparable.
input_string = "This is a deep learning tokenization tutorial. Tokenization is the first step in a deep learning NLP pipeline. We will be comparing the tokens generated by each tokenization model. Excited much?!😍"

tokens_dict = {}

for files in (small_file, large_files):
    print(f"========Using vocabulary from {files}=======")
    for alg in ('WLV', 'BPE', 'UNI', 'WPC'):
        trained_tokenizer = train_tokenizer(files, alg)
        output = tokenize(input_string, trained_tokenizer)
        # Keyed by algorithm only, so the second corpus overwrites the first.
        tokens_dict[alg] = output.tokens
        print("----", alg, "----")
        print(output.tokens, "->", len(output.tokens))


---- WLV ----
['This', 'is', 'a', 'deep', 'learning', 'tokenization', 'tutorial', '.', 'Tokenization', 'is', 'the', 'first', 'step', 'in', 'a', 'deep', 'learning', 'NLP', 'pipeline', '.', 'We', 'will', 'be', 'comparing', 'the', 'tokens', 'generated', 'by', 'each', 'tokenization', 'model', '.', 'Excited', 'much', '?!😍'] -> 35
---- BPE ----
['This', 'is', 'a', 'deep', 'learning', 'to', 'ken', 'ization', 't', 'ut', 'or', 'ial', '.', 'T', 'ok', 'en', 'ization', 'is', 'the', 'first', 'step', 'in', 'a', 'deep', 'learning', 'N', 'L', 'P', 'pi', 'pe', 'line', '.', 'We', 'will', 'be', 'comparing', 'the', 'to', 'k', 'ens', 'generated', 'by', 'each', 'to', 'ken', 'ization', 'model', '.', 'Ex', 'c', 'ited', 'much', '?', '!', '<UNK>'] -> 55
---- UNI ----
['Thi', 's', 'is', 'a', 'deep', 'learn', 'ing', 'to', 'ken', 'iz', 'ation', 't', 'u', 'to', 'rial', '.', 'To', 'ken', 'iz', 'ation', 'is', 'the', 'fir', 's', 't', 'step', 'in', 'a', 'deep', 'learn', 'ing', 'N', 'L', 'P', 'pi', 'pe', 'line', '.', 'W

## Comparing the BPE, Unigram and WordPiece tokens

In [None]:

# Sentence encoded by each sub-word tokenizer trained on the large corpus.
input_string = "This is a deep learning tokenization tutorial. Tokenization is the first step in a deep learning NLP pipeline. We will be comparing the tokens generated by each tokenization model. Excited much?!😍"

# Start from an empty mapping: algorithm key -> list of tokens.
tokens_dict = dict()

for alg in ('BPE', 'UNI', 'WPC'):
    trained_tokenizer = train_tokenizer(large_files, alg)
    output = tokenize(input_string, trained_tokenizer)
    tokens_dict[alg] = output.tokens

In [None]:
import pandas as pd

# Build one column per sub-word algorithm. The word-level entry is left out
# if an earlier cell put one in the dict (it is simply skipped when absent,
# instead of raising KeyError), and every remaining token list, Unigram
# included, is padded with '<PAD>' to the longest one so the columns have
# equal length. The dict is rebuilt rather than edited in place, so
# re-running this cell gives the same result.
max_len = max(
    len(tokens) for alg, tokens in tokens_dict.items() if alg != 'WLV'
)
tokens_dict = {
    alg: tokens + ['<PAD>'] * (max_len - len(tokens))
    for alg, tokens in tokens_dict.items()
    if alg != 'WLV'
}

df = pd.DataFrame(tokens_dict)

68
68
68


Unnamed: 0,BPE,UNI,WPC
0,This,This,This
1,is,i,is
2,a,s,a
3,deep,a,deep
4,learning,deep,learning
...,...,...,...
63,<PAD>,cited,<PAD>
64,<PAD>,much,<PAD>
65,<PAD>,?,<PAD>
66,<PAD>,!,<PAD>


In [None]:
# First ten rows: one token position per row, one column per algorithm.
df.head(n=10)


Unnamed: 0,BPE,UNI,WPC
0,This,This,This
1,is,i,is
2,a,s,a
3,deep,a,deep
4,learning,deep,learning
5,to,learn,to
6,ken,ing,##ken
7,ization,t,##ization
8,tut,o,tut
9,orial,ken,##orial


In [None]:
# Per-column count, number of unique tokens, most frequent token and its frequency.
df.describe(include='all')

Unnamed: 0,BPE,UNI,WPC
count,68,68,68
unique,37,41,37
top,<PAD>,o,<PAD>
freq,21,5,20


In [None]:
# Tokens in the Unigram column that never appear in the BPE column.
set(df['UNI']).difference(df['BPE'])

{'L',
 'N',
 'T',
 'W',
 'com',
 'd',
 'e',
 'generate',
 'i',
 'ing',
 'learn',
 'line',
 'o',
 'p',
 'par',
 'rial',
 's',
 't',
 'u',
 '😍'}

In [None]:
# Tokens in the Unigram column that never appear in the WordPiece column.
set(df['UNI']).difference(df['WPC'])

{'!',
 '?',
 'Ex',
 'L',
 'N',
 'P',
 'T',
 'W',
 'cited',
 'com',
 'd',
 'e',
 'generate',
 'i',
 'ing',
 'ization',
 'ken',
 'learn',
 'line',
 'o',
 'p',
 'par',
 'rial',
 's',
 't',
 'u',
 '😍'}

In [None]:
# Tokens in the WordPiece column that never appear in the Unigram column
# (the '<PAD>' filler can show up here when only WordPiece was padded).
set(df['WPC']).difference(df['UNI'])


{'##P',
 '##eni',
 '##ited',
 '##ization',
 '##ken',
 '##on',
 '##orial',
 '##s',
 '##ti',
 '##za',
 '<PAD>',
 '<UNK>',
 'Exc',
 'NL',
 'Tok',
 'We',
 'comparing',
 'generated',
 'is',
 'learning',
 'pipeline',
 'to',
 'tut'}