# SentencePiece Demo
This notebook walks through some examples of how to set up and use the SentencePiece subword tokenization library.

## Import our libraries

In [4]:
import sentencepiece as spm
import re
import os
import pandas as pd

## Inspect the dataset
Let's take a look at the blogger data we have downloaded and see what we can use.

In [18]:
# List the blog directory and preview up to the first 20 file names.
# Slicing (instead of indexing 0..19) avoids an IndexError when the
# directory holds fewer than 20 files.
files = os.listdir('/floyd/input/blog_dirs/blogs')
for file_name in files[:20]:
    print(file_name)

4162441.male.16.Student.Sagittarius.xml
3489929.female.25.Student.Cancer.xml
3954575.female.23.BusinessServices.Gemini.xml
3364931.male.16.Student.Virgo.xml
3162067.female.24.Education.Cancer.xml
813360.female.23.BusinessServices.Capricorn.xml
4028373.female.17.indUnk.Leo.xml
3630901.male.34.Technology.Leo.xml
2467122.female.23.Student.Taurus.xml
3732850.female.45.Technology.Taurus.xml
3846432.male.16.Student.Leo.xml
3600967.female.33.Arts.Scorpio.xml
3753301.female.14.Non-Profit.Aries.xml
4157968.male.16.Student.Pisces.xml
3699514.male.34.InvestmentBanking.Capricorn.xml
2727849.female.26.Arts.Libra.xml
3791552.female.24.indUnk.Virgo.xml
4278694.female.24.Technology.Virgo.xml
1618178.male.14.Arts.Scorpio.xml
669719.male.26.Science.Taurus.xml


### Example blog text
Below we can see the output of an example blog. It looks good: we have misspellings, names, and a variety of words and phrases, so it should generate some interesting results. The only thing we need to do is remove the lines containing blog tags ("<...>") so we are not including them in the training data.

In [56]:
text = []
# Note you can grab as many files as you like.
# I am just using 10 here as an example; slicing keeps this safe when
# fewer than 10 files are available.
for blog_file in files[:10]:
    blog_path = "/floyd/input/blog_dirs/blogs/{}".format(blog_file)
    # A context manager guarantees each file handle is closed, instead of
    # leaving it to the garbage collector.
    with open(blog_path, encoding="latin-1") as blog:
        for line in blog:
            # Skip markup lines (anything containing "<") and lines shorter
            # than 5 characters (newline included).
            if "<" in line or len(line) < 5:
                continue
            text.append(line)

### Write the data to a file
This will make it easy to pass into SentencePiece

In [59]:
# Persist the collected lines as the training corpus. Lines are written
# as-is (no newline is added here).
# The encoding is set explicitly to UTF-8 because SentencePiece expects
# UTF-8 input and the platform default encoding is not guaranteed to be UTF-8.
with open('blog_test.txt', 'w', encoding='utf-8') as fw:
    fw.writelines(text)

## Train a BPE model

In [5]:
# Train a BPE SentencePiece model on the blog corpus. The flags are passed
# as a single command-line-style string, split here one flag per line.
spm.SentencePieceTrainer.train(
    '--model_type=bpe'
    ' --input=blog_test.txt'
    ' --model_prefix=bpe'
    ' --vocab_size=500'
    ' --normalization_rule_tsv=normalization_rule.tsv'
)

True

## Train a Unigram model

In [6]:
# Train a Unigram SentencePiece model on the blog corpus. The flags are
# passed as a single command-line-style string, split here one flag per line.
spm.SentencePieceTrainer.train(
    '--model_type=unigram'
    ' --input=blog_test.txt'
    ' --model_prefix=uni'
    ' --vocab_size=500'
    ' --normalization_rule_tsv=normalization_rule.tsv'
)

True

## Load your newly trained models

In [7]:
# Create a segmenter instance and load the trained BPE model file (bpe.model).
# `load` is the last expression in the cell, so its return value is what the
# notebook displays as the cell output.
sp_bpe = spm.SentencePieceProcessor()
sp_bpe.load('bpe.model')

True

In [8]:
# Create a segmenter instance and load the trained Unigram model file (uni.model).
# `load` is the last expression in the cell, so its return value is what the
# notebook displays as the cell output.
sp_uni = spm.SentencePieceProcessor()
sp_uni.load('uni.model')

True

## Look at some example tokens

In [9]:
# Segment the same sentence with both models so the pieces can be compared.
sample = 'This is a test'
for template, model in (("BPE: {}", sp_bpe), ("UNI: {}", sp_uni)):
    print(template.format(model.encode_as_pieces(sample)))

BPE: ['▁This', '▁is', '▁a', '▁t', 'est']
UNI: ['▁Thi', 's', '▁is', '▁a', '▁t', 'est']


In [10]:
# Same sentence as before, but with a leading space in the input.
sample = ' This is a test'
for template, model in (("BPE: {}", sp_bpe), ("UNI: {}", sp_uni)):
    print(template.format(model.encode_as_pieces(sample)))

BPE: ['▁This', '▁is', '▁a', '▁t', 'est']
UNI: ['▁Thi', 's', '▁is', '▁a', '▁t', 'est']


In [12]:
# A longer sentence, segmented by both models.
sample = 'I think this is a test'
for template, model in (("BPE: {}", sp_bpe), ("UNI: {}", sp_uni)):
    print(template.format(model.encode_as_pieces(sample)))

BPE: ['▁I', '▁think', '▁this', '▁is', '▁a', '▁t', 'est']
UNI: ['▁I', '▁think', '▁this', '▁is', '▁a', '▁t', 'est']


In [13]:
# A phrase segmented by both models, for comparison with the sentences above.
sample = 'Carbon dioxide'
for template, model in (("BPE: {}", sp_bpe), ("UNI: {}", sp_uni)):
    print(template.format(model.encode_as_pieces(sample)))

BPE: ['▁C', 'ar', 'b', 'on', '▁d', 'i', 'o', 'x', 'ide']
UNI: ['▁C', 'ar', 'b', 'on', '▁d', 'i', 'o', 'x', 'id', 'e']


## Get a list of all of the BPE tokens

In [18]:
# Collect every piece in the BPE vocabulary (ids 0 .. piece_size - 1) and
# order them longest first; the sort is stable, so ties keep id order.
vocabs = list(map(sp_bpe.id_to_piece, range(sp_bpe.get_piece_size())))
bpe_tokens = sorted(vocabs, key=len, reverse=True)
bpe_tokens

['▁something',
 '▁because',
 '▁thought',
 '▁really',
 '▁people',
 '▁little',
 '▁things',
 '▁friend',
 '▁should',
 '▁think',
 '▁about',
 '▁would',
 '▁there',
 '▁thing',
 '▁going',
 '▁every',
 '▁other',
 '▁somet',
 '▁could',
 '▁essay',
 '▁their',
 '▁being',
 '▁again',
 '<unk>',
 '▁that',
 '▁have',
 '▁with',
 '▁this',
 '▁like',
 '▁just',
 '▁what',
 '▁when',
 '▁they',
 '▁some',
 '▁want',
 '▁feel',
 '▁will',
 '▁time',
 '▁know',
 '▁work',
 '▁more',
 '▁them',
 '▁from',
 '▁good',
 'ation',
 '▁thou',
 '▁back',
 '▁life',
 '▁then',
 '▁been',
 '▁were',
 'other',
 '▁your',
 '▁year',
 'ittle',
 '▁much',
 'riend',
 '▁need',
 'essay',
 '▁comp',
 '▁home',
 '▁make',
 '▁over',
 '▁than',
 '▁even',
 'thing',
 '▁very',
 '▁take',
 '▁also',
 '▁This',
 '▁into',
 '▁only',
 '</s>',
 '▁the',
 '▁and',
 '▁for',
 'nbsp',
 '▁was',
 '▁not',
 '▁you',
 'ould',
 '▁but',
 'ally',
 '▁wor',
 '▁her',
 '▁get',
 '▁out',
 '▁all',
 '▁can',
 '▁she',
 '▁The',
 '▁are',
 'very',
 'king',
 '▁tim',
 '▁had',
 'ight',
 '▁one',
 '▁day',


## Get a list of all of the Unigram tokens

In [19]:
# Collect every piece in the Unigram vocabulary (ids 0 .. piece_size - 1) and
# order them longest first; the sort is stable, so ties keep id order.
# The result is stored as `uni_tokens` so the BPE list held in `bpe_tokens`
# is not overwritten with Unigram pieces under a misleading name.
vocabs = [sp_uni.id_to_piece(piece_id) for piece_id in range(sp_uni.get_piece_size())]
uni_tokens = sorted(vocabs, key=len, reverse=True)
uni_tokens

['▁everything',
 '▁something',
 '▁different',
 '▁actually',
 '▁remember',
 '▁anything',
 '▁children',
 '▁interest',
 '▁because',
 '▁thought',
 '▁getting',
 '▁another',
 '▁someone',
 '▁urlLink',
 '▁believe',
 '▁husband',
 '▁through',
 '▁morning',
 '▁problem',
 '▁really',
 '▁people',
 '▁little',
 '▁things',
 '▁should',
 '▁better',
 '▁myself',
 '▁though',
 '▁around',
 '▁before',
 '▁person',
 '▁author',
 '▁trying',
 '▁school',
 '▁family',
 '▁wonder',
 '▁enough',
 '▁happen',
 '▁having',
 '▁change',
 '▁think',
 '▁about',
 '▁would',
 '▁there',
 '▁going',
 '▁other',
 '▁could',
 '▁essay',
 '▁their',
 '▁again',
 '▁start',
 '▁after',
 '▁night',
 '▁being',
 '▁today',
 '▁first',
 '▁never',
 '▁right',
 '▁which',
 '▁where',
 '▁still',
 '▁great',
 '▁thing',
 '▁house',
 '▁place',
 '▁every',
 '▁while',
 '▁those',
 '▁point',
 '▁story',
 '▁class',
 '▁found',
 '▁month',
 '▁watch',
 '▁enjoy',
 '▁guess',
 '▁money',
 '▁least',
 '▁write',
 '▁world',
 '▁happy',
 '▁since',
 '▁phone',
 '<unk>',
 '▁that',
 '▁have'

## Reversing the process

In [25]:
# encode: text => pieces and text => ids, for each model in turn
sentence = 'This is a test'
for template, model in (("BPE {}", sp_bpe), ("UNI {}", sp_uni)):
    print(template.format(model.encode_as_pieces(sentence)))
    print(template.format(model.encode_as_ids(sentence)))

BPE ['▁This', '▁is', '▁a', '▁t', 'est']
BPE [400, 61, 4, 3, 231]
UNI ['▁Thi', 's', '▁is', '▁a', '▁t', 'est']
UNI [284, 3, 37, 15, 78, 338]


In [27]:
# decode: pieces => text and ids => text, for each model in turn.
# The pieces and ids below are the ones printed by the encode cell above.
round_trips = (
    ("BPE {}", sp_bpe, ['▁This', '▁is', '▁a', '▁t', 'est'], [400, 61, 4, 3, 231]),
    ("UNI {}", sp_uni, ['▁Thi', 's', '▁is', '▁a', '▁t', 'est'], [284, 3, 37, 15, 78, 338]),
)
for template, model, pieces, ids in round_trips:
    print(template.format(model.decode_pieces(pieces)))
    print(template.format(model.decode_ids(ids)))

BPE This is a test
BPE This is a test
UNI This is a test
UNI This is a test


## Get a list of tokens

In [24]:
# Full BPE vocabulary, longest pieces first (stable sort keeps id order on ties).
vocabs = [sp_bpe.id_to_piece(piece_id) for piece_id in range(sp_bpe.get_piece_size())]
bpe_list = sorted(vocabs, key=len, reverse=True)

In [25]:
# Full Unigram vocabulary, longest pieces first (stable sort keeps id order on ties).
vocabs = [sp_uni.id_to_piece(piece_id) for piece_id in range(sp_uni.get_piece_size())]
uni_list = sorted(vocabs, key=len, reverse=True)

### Unigram tokens not in BPE

In [33]:
# Unigram pieces that do not appear in the BPE vocabulary, in `uni_list` order.
# The BPE vocabulary is turned into a set once so each membership test is a
# hash lookup instead of a scan over the whole list.
bpe_vocab = set(bpe_list)
uni_tok_diff = [u for u in uni_list if u not in bpe_vocab]
print(len(uni_tok_diff))

171


### BPE tokens not in Unigram

In [38]:
# BPE pieces that do not appear in the Unigram vocabulary, in `bpe_list` order.
# The Unigram vocabulary is turned into a set once so each membership test is
# a hash lookup instead of a scan over the whole list.
uni_vocab = set(uni_list)
bpe_tok_diff = [b for b in bpe_list if b not in uni_vocab]
print(len(bpe_tok_diff))

171


In [44]:
# Put the two difference lists side by side, one row per position.
# zip stops at the shorter list, so rows exist only where both lists have an entry.
diff_columns = ["Unigram tokens not in BPE", "BPE tokens not in Unigram"]
diff_pairs = [pair for pair in zip(uni_tok_diff, bpe_tok_diff)]
diff_df = pd.DataFrame(diff_pairs, columns=diff_columns)
diff_df.head()

Unnamed: 0,Unigram tokens not in BPE,BPE tokens not in Unigram
0,▁everything,▁friend
1,▁different,▁somet
2,▁actually,▁thou
3,▁remember,other
4,▁anything,ittle


## Compare the tokens
How do the different tokenizers deal with missing tokens?

### Unigram tokens
Let's look first at the tokens which are in the BPE vocabulary but not in the Unigram vocabulary.

In [83]:
# Show the whole column of BPE-only tokens as a plain Python list.
bpe_only_column = diff_df['BPE tokens not in Unigram']
bpe_only_column.tolist()

['▁friend',
 '▁somet',
 '▁thou',
 'other',
 'ittle',
 'riend',
 'essay',
 'thing',
 '▁This',
 'nbsp',
 'ould',
 '▁wor',
 'very',
 'king',
 '▁tim',
 '▁com',
 'hing',
 '▁bec',
 '▁any',
 '▁int',
 'ause',
 '▁man',
 'ople',
 'omet',
 'ving',
 '▁wee',
 'reat',
 '▁too',
 'fter',
 'self',
 'here',
 'ning',
 '▁bet',
 '▁exp',
 '▁hel',
 '▁try',
 'hat',
 '▁ha',
 '▁th',
 '▁li',
 'ith',
 'all',
 'ght',
 'ome',
 'ust',
 'her',
 'ill',
 '▁wh',
 '▁ne',
 'ink',
 'out',
 'hen',
 'ess',
 '▁mo',
 '▁ab',
 'one',
 'ake',
 'ack',
 'art',
 'ind',
 'eel',
 'ast',
 'ain',
 '▁kn',
 'rom',
 'use',
 'ear',
 'rou',
 'ell',
 'itt',
 'han',
 'ers',
 'are',
 'ven',
 '▁pl',
 '▁le',
 '▁sp',
 'way',
 'ite',
 'ong',
 '▁pe',
 'ure',
 'ore',
 'ide',
 '▁ch',
 '▁ag',
 'ort',
 'our',
 'ame',
 'ook',
 'ist',
 'own',
 '▁Th',
 '▁wr',
 '▁sa',
 'uch',
 'ard',
 '▁cl',
 'ass',
 'ice',
 'ond',
 'ous',
 'ich',
 '▁en',
 'ace',
 'ool',
 '▁gu',
 'red',
 '▁qu',
 '▁fe',
 '▁bl',
 '▁tw',
 'alk',
 'ild',
 'ble',
 'ile',
 '▁im',
 '▁ri',
 'ose',


In [108]:
# Segment a few probe words with both models and print the results together.
test_list = ["friend", "sometimes", "thousand", ".other", ".little"]
report = "Unigram token {} \nBPE token {}\n"
for word in test_list:
    uni_pieces = sp_uni.encode_as_pieces(word)
    bpe_pieces = sp_bpe.encode_as_pieces(word)
    print(report.format(uni_pieces, bpe_pieces))

Unigram token ['▁f', 'ri', 'end'] 
BPE token ['▁friend']

Unigram token ['▁some', 't', 'im', 'es'] 
BPE token ['▁somet', 'im', 'es']

Unigram token ['▁', 'th', 'o', 'us', 'and'] 
BPE token ['▁thou', 's', 'and']

Unigram token ['▁', '.', 'o', 'ther'] 
BPE token ['▁', '.', 'other']

Unigram token ['▁', '.', 'li', 't', 't', 'le'] 
BPE token ['▁', '.', 'l', 'ittle']



## Unigram sampling

In [109]:
# Encode the same text ten times with the plain (non-sampling) encoder; the
# recorded output shows the same segmentation on every call.
repeats = 10
for _ in range(repeats):
    print(sp_uni.encode_as_pieces('hello world'))

['▁he', 'll', 'o', '▁world']
['▁he', 'll', 'o', '▁world']
['▁he', 'll', 'o', '▁world']
['▁he', 'll', 'o', '▁world']
['▁he', 'll', 'o', '▁world']
['▁he', 'll', 'o', '▁world']
['▁he', 'll', 'o', '▁world']
['▁he', 'll', 'o', '▁world']
['▁he', 'll', 'o', '▁world']
['▁he', 'll', 'o', '▁world']


In [110]:
# Sampling can return a different segmentation on each request.
# There are two hyperparameters for sampling (nbest_size and inverse
# temperature); see the paper [kudo18] for details.
nbest_size = -1
inverse_temperature = 0.1
for _ in range(10):
    print(sp_uni.sample_encode_as_pieces('remembers', nbest_size, inverse_temperature))

['▁re', 'me', 'm', 'b', 'er', 's']
['▁', 're', 'm', 'e', 'm', 'b', 'e', 'r', 's']
['▁remember', 's']
['▁remember', 's']
['▁remember', 's']
['▁', 're', 'me', 'm', 'b', 'er', 's']
['▁', 'r', 'e', 'me', 'm', 'b', 'er', 's']
['▁re', 'me', 'm', 'b', 'e', 'r', 's']
['▁', 'r', 'e', 'me', 'm', 'b', 'er', 's']
['▁remember', 's']


In [111]:
# Ask the Unigram model for its 10 best segmentations and print one per line.
top_k = 10
best_seg = sp_uni.nbest_encode_as_pieces('remembers', top_k)
for segmentation in best_seg:
    print(segmentation)

['▁remember', 's']
['▁re', 'me', 'm', 'b', 'er', 's']
['▁re', 'm', 'e', 'm', 'b', 'er', 's']
['▁', 're', 'me', 'm', 'b', 'er', 's']
['▁re', 'me', 'm', 'b', 'e', 'r', 's']
['▁', 're', 'm', 'e', 'm', 'b', 'er', 's']
['▁re', 'm', 'e', 'm', 'b', 'e', 'r', 's']
['▁', 'r', 'e', 'me', 'm', 'b', 'er', 's']
['▁', 're', 'me', 'm', 'b', 'e', 'r', 's']
['▁', 'r', 'e', 'm', 'e', 'm', 'b', 'er', 's']


## HuggingFace Tokenizers

In [3]:
from tokenizers import (ByteLevelBPETokenizer,
                            BPETokenizer,
                            SentencePieceBPETokenizer,
                            BertWordPieceTokenizer)

In [30]:
# Train a HuggingFace SentencePiece-style BPE tokenizer with the same
# vocabulary size as the SentencePiece models.
# The corpus path matches the file this notebook writes and trains
# SentencePiece on ('blog_test.txt'); the previous '../blog_test.txt' pointed
# at the parent directory, which a fresh top-to-bottom run never creates.
tokenizer = SentencePieceBPETokenizer()
tokenizer.train(["blog_test.txt"], vocab_size=500, min_frequency=2)

In [28]:
# Encode a sample sentence and show the resulting subword tokens.
encoded = tokenizer.encode("This is a test")
print(encoded.tokens)

['▁Th', 'is', '▁is', '▁a', '▁t', 'est']
