In [2]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 10.2/991.5 kB ? eta -:--:--
   ---------------------------------------- 10.2/991.5 kB ? eta -:--:--
   - ------------------------------------- 41.0/991.5 kB 245.8 kB/s eta 0:00:04
   -- ------------------------------------ 61.4/991.5 kB 363.1 kB/s eta 0:00:03
   -- ------------------------------------ 61.4/991.5 kB 363.1 kB/s eta 0:00:03
   ----- -------------------------------- 143.4/991.5 kB 532.5 kB/s eta 0:00:02
   ------ ------------------------------- 163.8/991.5 kB 544.7 kB/s eta 0:00:02
   ------ ------------------------------- 163.8/991.5 kB 544.7 kB/s eta 0:00:02
   -------- ----------------------------- 225.3/991.5 kB 528.4 kB/s eta 0:00:02
   ---------- --------------------------- 286.7/991.5 k

In [3]:
from tokenizers import Tokenizer, models, \
    trainers, normalizers, pre_tokenizers, processors, decoders
import sentencepiece as spm

# Prepare corpus for training tokenizer

In [6]:
with open('E:\Python\DL\semantic-search\msmarco-data\collection.tsv', 'r') as rf, open('E:\Python\DL\semantic-search\msmarco-data\collection_corpus.txt', 'w') as wf:
    for line in rf.readlines():
        wf.write(line.split('\t')[1])

In [7]:
with open('E:\Python\DL\semantic-search\msmarco-data\queries.train.tsv', 'r') as rf, open('E:\Python\DL\semantic-search\msmarco-data\queries.train_corpus.txt', 'w') as wf:
    for line in rf.readlines():
        wf.write(line.split('\t')[1])

In [13]:
with open('E:\Python\DL\semantic-search\queries\queries.dev.tsv', 'r') as rf, open('E:/Python/DL/semantic-search/queries/raw_texts/queries.dev_corpus.txt', 'w') as wf:
    for line in rf.readlines():
        wf.write(line.split('\t')[1])

In [14]:
with open('E:\Python\DL\semantic-search\queries\queries.eval.tsv', 'r') as rf, open('E:/Python/DL/semantic-search/queries/raw_texts/queries.eval_corpus.txt', 'w') as wf:
    for line in rf.readlines():
        wf.write(line.split('\t')[1])

# Tokenizer training

## Huggingface tokenizer

### BPE tokenizer

In [15]:
bpe_tokenizer = Tokenizer(models.BPE())
bpe_tokenizer.normalizer = normalizers.NFC()
bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

bpe_tokenizer.decoder = decoders.WordPiece('@@')
trainer = trainers.BpeTrainer(special_tokens=[
    '[UNK]',
    '[CLS]',
    '[SEP]',
    '[PAD]'],
    unknow_token='[UNK]',
    vocab_size=32000,
    min_frequency=0,
    continuing_subword_prefix='@@')

In [17]:
files = [f'E:/Python/DL/semantic-search/queries/raw_texts/{file}.txt' for file in ['collection_corpus', 'queries.train_corpus', 'queries.dev_corpus', 'queries.eval_corpus']]
bpe_tokenizer.train(files=files, trainer=trainer)

In [20]:
print(bpe_tokenizer.encode('Hi i am a student. My name is Hai.').tokens)
print(bpe_tokenizer.encode('Hi i am a studen. My name is Hai.').ids)

['Hi', 'i', 'am', 'a', 'student', '.', 'My', 'name', 'is', 'H', '@@ai', '.']
[9776, 75, 884, 67, 1278, 582, 16, 2512, 874, 589, 42, 2220, 16]


In [22]:
bpe_tokenizer.decode([9776, 75, 884, 67, 3483, 16, 2512, 874, 589, 42, 2220, 16])

'Hi i am a student. My name is Hai.'

### WordPiece tokenizer

In [23]:
# WP tokenizer inspired by BERT tokenizer
wp_tokenizer = Tokenizer(models.WordPiece())
wp_tokenizer.normalizer = normalizers.Sequence([normalizers.NFC(),
                                                normalizers.Lowercase(),
                                                normalizers.StripAccents()])
wp_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
wp_tokenizer.decoder = decoders.WordPiece()
wp_tokenizer.post_processor = processors.TemplateProcessing(
    single='[CLS] $A [SEP]',
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ('[CLS]', 1),
        ('[SEP]', 2),
    ],
)
trainer = trainers.WordPieceTrainer(special_tokens=[
    '[UNK]',
    '[CLS]',
    '[SEP]',
    '[PAD]'],
    unknow_token='[UNK]',
    vocab_size=32000,
    min_frequency=0,)

In [24]:
files = [f'E:/Python/DL/semantic-search/queries/raw_texts/{file}.txt' for file in ['collection_corpus', 'queries.train_corpus', 'queries.dev_corpus', 'queries.eval_corpus']]
wp_tokenizer.train(files=files, trainer=trainer)

In [25]:
wp_tokenizer.get_vocab()

{'instrumental': 18441,
 '##ass': 704,
 'coldest': 10873,
 'poe': 22084,
 'subunits': 24732,
 'kary': 19885,
 '##gae': 23172,
 'accounting': 4593,
 'amusement': 21288,
 'treaty': 6775,
 '##ced': 1150,
 '##odys': 31880,
 'amphib': 12863,
 'marit': 18086,
 'prayer': 19281,
 '##atorship': 29121,
 'firef': 8506,
 '##ulsion': 12434,
 'seems': 6248,
 'vibrations': 18539,
 'daylight': 6026,
 'verbal': 12688,
 'official': 3292,
 'prevented': 19463,
 'premier': 8878,
 'reflecting': 18528,
 'marquez': 31921,
 'palpitations': 19767,
 'covering': 8317,
 '##erman': 13420,
 'shoreline': 22842,
 'thatch': 29878,
 'practice': 3389,
 '##etal': 31170,
 '##worts': 28733,
 'stabilization': 29754,
 'replenish': 31421,
 'hernia': 8195,
 'ensures': 15830,
 '##horn': 17334,
 '(%': 23465,
 '##nels': 27279,
 'paternal': 24722,
 'radius': 12424,
 'ranked': 7367,
 'whip': 12489,
 'monouns': 23669,
 'interference': 14934,
 'burg': 16351,
 '##md': 9480,
 'delinquency': 24067,
 'qld': 25692,
 'emphyse': 16134,
 'ovu

In [26]:
wp_tokenizer.encode('Hi i am a student. My name is Hai.').ids

[1, 6000, 49, 680, 41, 3160, 16, 826, 757, 475, 28219, 16, 2]

In [None]:
wp_tokenizer.decode([1, 6000, 49, 680, 41, 3160, 16, 826, 757, 475, 28219, 16, 2])

'hi i am a student. my name is hai.'

In [30]:
wp_tokenizer.save('E:/Python/DL/semantic-search/save/tokenizer/wp_tokenizer_32k.json', True)

### Unigram tokenizer

In [31]:
unigram_tokenizer = Tokenizer(models.Unigram())

trainer = trainers.UnigramTrainer(special_tokens=[
    '[UNK]',
    '[CLS]',
    '[SEP]',
    '[PAD]'],
    unk_token='[UNK]',
    vocab_size=32000,
    min_frequency=0)
unigram_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

In [33]:
files = [f'E:/Python/DL/semantic-search/queries/raw_texts/{file}.txt' for file in ['collection_corpus', 'queries.train_corpus', 'queries.dev_corpus', 'queries.eval_corpus']]
unigram_tokenizer.train(files=files, trainer=trainer)

In [34]:
unigram_tokenizer.__dir__()

['__new__',
 '__repr__',
 '__str__',
 '__getstate__',
 '__setstate__',
 '__getnewargs__',
 'from_str',
 'from_file',
 'from_buffer',
 'from_pretrained',
 'to_str',
 'save',
 'num_special_tokens_to_add',
 'get_vocab',
 'get_added_tokens_decoder',
 'get_vocab_size',
 'enable_truncation',
 'no_truncation',
 'enable_padding',
 'no_padding',
 'encode',
 'encode_batch',
 'encode_batch_fast',
 'decode',
 'decode_batch',
 'token_to_id',
 'id_to_token',
 'add_tokens',
 'add_special_tokens',
 'train',
 'train_from_iterator',
 'post_process',
 'pre_tokenizer',
 'truncation',
 'post_processor',
 'model',
 'encode_special_tokens',
 'normalizer',
 'decoder',
 'padding',
 '__dict__',
 '__doc__',
 '__module__',
 '__hash__',
 '__getattribute__',
 '__setattr__',
 '__delattr__',
 '__lt__',
 '__le__',
 '__eq__',
 '__ne__',
 '__gt__',
 '__ge__',
 '__init__',
 '__reduce_ex__',
 '__reduce__',
 '__subclasshook__',
 '__init_subclass__',
 '__format__',
 '__sizeof__',
 '__dir__',
 '__class__']

In [37]:
unigram_tokenizer.encode('Hi, i am a student. My name is Hai')

Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [36]:
from transformers.tokenization_utils import PreTrainedTokenizer

## SentencePiece tokenizer

### BPE tokenizer

In [None]:
files = [f'E:/Python/DL/semantic-search/queries/raw_texts/{file}.txt' for file in ['collection_corpus', 'queries.train_corpus', 'queries.dev_corpus', 'queries.eval_corpus']]
spm.SentencePieceTrainer.Train(
    input=files,
    model_prefix='E:/Python/DL/semantic-search/save/tokenizer/spModel_bpe_8k',
    vocab_size=32000,
    pad_id=0,                
    unk_id=1,
    bos_id=2,
    eos_id=3,
    pad_piece='[PAD]',
    unk_piece='[UNK]',
    bos_piece='[CLS]',
    eos_piece='[SEP]',
    user_defined_symbols='[MASK]',
    model_type='bpe'
)
sp = spm.SentencePieceProcessor()


In [41]:
sp.load('E:/Python/DL/semantic-search/save/tokenizer/spModel_bpe_8k.model')

True

In [42]:
sp.encode_as_pieces('Hi i am a student, my name is Hai')

['▁Hi', '▁i', '▁am', '▁a', '▁student', ',', '▁my', '▁name', '▁is', '▁H', 'ai']

### Unigram tokenizer

In [45]:
files = [f'E:/Python/DL/semantic-search/queries/raw_texts/{file}.txt' for file in ['collection_corpus', 'queries.train_corpus', 'queries.dev_corpus', 'queries.eval_corpus']]
spm.SentencePieceTrainer.Train(
    input=files,
    model_prefix='E:/Python/DL/semantic-search/save/tokenizer/spModel_uni_8k',
    vocab_size=32000,
    pad_id=0,                
    unk_id=1,
    bos_id=2,
    eos_id=3,
    pad_piece='[PAD]',
    unk_piece='[UNK]',
    bos_piece='[CLS]',
    eos_piece='[SEP]',
    user_defined_symbols='[MASK]',
    model_type='unigram'
)
sp = spm.SentencePieceProcessor()

In [46]:
sp.load('E:/Python/DL/semantic-search/save/tokenizer/spModel_uni_8k.model')
sp.encode_as_pieces('Hi i am a student, my name is Hai')

['▁Hi', '▁i', '▁am', '▁a', '▁student', ',', '▁my', '▁name', '▁is', '▁Hai']