-------------------------
#### SentencePiece
----------------------

In [6]:
#!pip install sentencepiece

In [8]:
#!wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt

In [13]:
# Local path of the training corpus (botchan.txt). This is a raw string, so
# each path separator is written as a single backslash; doubling them inside
# r'...' would embed two literal backslashes per separator.
location = r'D:\AI-DATASETS\01-MISC\botchan.txt'

In [14]:
import sentencepiece as spm

In [18]:
# Train a SentencePiece model on `botchan.txt`; this produces `m.model` and
# `m.vocab`. `m.vocab` is only a reference and is not used in segmentation.
# The flag string is assembled from adjacent literals, one flag per line.
spm.SentencePieceTrainer.train(
    '--input=D:\\AI-DATASETS\\01-MISC\\botchan.txt'
    ' --model_prefix=m'
    ' --vocab_size=2000'
)

In [19]:
# Create a segmenter (SentencePieceProcessor) instance and load the trained
# model file `m.model` into it.
sp = spm.SentencePieceProcessor()
sp.load('m.model')  # last expression of the cell, so its return value is displayed

True

In [20]:
# Encode: text => subword pieces, and text => piece ids.
sample_text = 'This is a test'
pieces = sp.encode_as_pieces(sample_text)
ids = sp.encode_as_ids(sample_text)
print(pieces)
print(ids)

['▁This', '▁is', '▁a', '▁t', 'est']
[208, 31, 9, 434, 601]


In [22]:
# Decode: pieces => text, and piece ids => text.
pieces = ['▁This', '▁is', '▁a', '▁t', 'est']
piece_ids = [208, 31, 9, 434, 601]
print(sp.decode_pieces(pieces))
print(sp.decode_ids(piece_ids))

This is a test
This is a test


In [23]:
# Print the vocabulary size of the loaded model (number of pieces).
print(sp.get_piece_size())

2000


In [24]:
# id <=> piece conversion: look up the piece for an id, then the id for a piece.
for converted in (sp.id_to_piece(209), sp.piece_to_id('▁This')):
    print(converted)

il
208


In [25]:
# Returns 0 for unknown tokens (the id used for UNK can be changed).
# '__MUST_BE_UNKNOWN__' is not a piece of the vocabulary, so the lookup
# yields the id of <unk>.
print(sp.piece_to_id('__MUST_BE_UNKNOWN__'))

0


In [26]:
# <unk>, <s>, </s> are defined by default. Their ids are (0, 1, 2).
# <s> and </s> are defined as 'control' symbols; <unk> is not.
# The loop variable is called `piece_id` so that the built-in `id()` is not
# shadowed for the remaining cells of the notebook.
for piece_id in range(3):
    print(sp.id_to_piece(piece_id), sp.is_control(piece_id))

<unk> False
<s> True
</s> True


#### Loads model from byte stream
Sentencepiece's model file is just a serialized protocol buffer. 

We can instantiate a SentencePiece processor from a bytes object with the `load_from_serialized_proto` method.

In [27]:
import tensorflow as tf




In [28]:
# Assumes that m.model is stored in a non-POSIX file system.
# Read the model as raw bytes through TensorFlow's file API; the `with`
# block closes the file handle as soon as the read has finished.
with tf.io.gfile.GFile('m.model', 'rb') as model_file:
    serialized_model_proto = model_file.read()

In [29]:
# Build a fresh processor and initialise it from the in-memory serialized
# model bytes rather than from a file path. Note: this rebinds `sp`.
sp = spm.SentencePieceProcessor()
sp.load_from_serialized_proto(serialized_model_proto)  # return value is displayed as the cell output

True

In [30]:
# Segment a lower-case sentence with the processor loaded from the byte stream.
sp.encode_as_pieces('this is a test')

['▁this', '▁is', '▁a', '▁t', 'est']

In [32]:
# Segment a longer sentence; the printed result shows 'Bhupen', 'Data' and
# 'Science' being broken into several smaller pieces.
print(sp.encode_as_pieces('my name is Bhupen and I like to teach Data Science'))

['▁my', '▁name', '▁is', '▁B', 'h', 'up', 'en', '▁and', '▁I', '▁like', '▁to', '▁teach', '▁D', 'at', 'a', '▁S', 'c', 'i', 'ence']


#### BPE (Byte pair encoding) model
- `Sentencepiece` supports BPE (byte-pair-encoding) for subword segmentation with `--model_type=bpe` flag. 
- We do not find empirical differences in translation quality between the `BPE` and `unigram` models, but the unigram model can perform sampling and n-best segmentation. See the subword regularization paper [kudo18] for more detail.

In [35]:
# Train a BPE model on the same corpus; the output files use the `m_bpe` prefix.
# The flag string is assembled from adjacent literals, one flag per line.
spm.SentencePieceTrainer.train(
    '--input=D:\\AI-DATASETS\\01-MISC\\botchan.txt'
    ' --model_prefix=m_bpe'
    ' --model_type=bpe'
    ' --vocab_size=2000'
)

In [36]:
# Separate processor instance for the BPE model (the model is loaded in the next cell).
sp_bpe = spm.SentencePieceProcessor()

In [37]:
# Load the trained BPE model file; the return value is displayed as the cell output.
sp_bpe.load('m_bpe.model')

True

In [38]:
# Segment text with the BPE model, then request an n-best list from it.
print('*** BPE ***')
bpe_pieces = sp_bpe.encode_as_pieces('thisisatesthelloworld')
print(bpe_pieces)
# With the BPE model the n-best call returns an empty list.
bpe_nbest = sp_bpe.nbest_encode_as_pieces('hello world', 5)
print(bpe_nbest)

*** BPE ***
['▁this', 'is', 'at', 'est', 'he', 'llow', 'or', 'ld']
[]


**using unigram**

In [41]:
# Train a unigram model on the same corpus; the output files use the
# `m_unigram` prefix.
# The flag string is assembled from adjacent literals, one flag per line.
spm.SentencePieceTrainer.train(
    '--input=D:\\AI-DATASETS\\01-MISC\\botchan.txt'
    ' --model_prefix=m_unigram'
    ' --model_type=unigram'
    ' --vocab_size=2000'
)

In [45]:
# Load the unigram model, then show its best segmentation and its n-best
# segmentations for two inputs.
sp_unigram = spm.SentencePieceProcessor()
sp_unigram.load('m_unigram.model')

glued_text = 'thisisatesthelloworld'
n_best = 5

print('*** Unigram ***')
# end='\n\n' emits the same blank separator line a bare print() would.
print(sp_unigram.encode_as_pieces(glued_text), end='\n\n')
print(sp_unigram.nbest_encode_as_pieces(glued_text, n_best), end='\n\n')
# Bare last expression: displayed as the cell's output value.
sp_unigram.nbest_encode_as_pieces('hello world', n_best)

*** Unigram ***
['▁this', 'is', 'ate', 's', 'the', 'll', 'ow', 'or', 'l', 'd']

[['▁this', 'is', 'ate', 's', 'the', 'll', 'ow', 'or', 'l', 'd'], ['▁this', 'i', 's', 'ate', 's', 'the', 'll', 'ow', 'or', 'l', 'd'], ['▁this', 'is', 'at', 'es', 'the', 'll', 'ow', 'or', 'l', 'd'], ['▁this', 'is', 'ate', 'st', 'he', 'll', 'ow', 'or', 'l', 'd'], ['▁this', 'is', 'at', 'est', 'he', 'll', 'ow', 'or', 'l', 'd']]



[['▁he', 'll', 'o', '▁world'],
 ['▁he', 'l', 'l', 'o', '▁world'],
 ['▁', 'he', 'll', 'o', '▁world'],
 ['▁', 'h', 'e', 'll', 'o', '▁world'],
 ['▁he', 'll', 'o', '▁wor', 'l', 'd']]

In [1]:
def get_substrings(word: str, min_length: int = 1, max_length: int = 3) -> set:
    """
    Get all unique substrings of `word` whose length lies between
    `min_length` and `max_length` (both inclusive).

    Parameters
    ----------
    word : str
        The string to take substrings from.
    min_length : int, default 1
        Shortest substring length to collect. Negative values are treated
        as 0; a length of 0 contributes the empty string.
    max_length : int, default 3
        Longest substring length to collect. Lengths greater than
        len(word) are ignored because no substring can be that long.

    Returns
    -------
    set
        The distinct substrings; empty when no length qualifies.
    """
    # A negative length would turn word[start:start + length] into a
    # negative-index slice and yield pieces longer than max_length, so the
    # lower bound is clamped at 0.
    shortest = max(min_length, 0)
    longest = min(max_length, len(word))
    return {
        word[start:start + length]
        for length in range(shortest, longest + 1)
        for start in range(len(word) - length + 1)
    }

In [2]:
# Given list of words: the sample vocabulary whose substrings are collected below.
word_list = ["hug", "pug", "pun", "bun", "hugs"]

In [3]:
# Merge the substrings (lengths 1 to 3) of every word in word_list into one set.
all_substrings = set()
for word in word_list:
    all_substrings |= get_substrings(word)

In [4]:
# Print the result. `all_substrings` is a set, so the order in which the
# elements are shown is not meaningful.
print("All Unique Substrings (Lengths 1 to 3):", all_substrings)

All Unique Substrings (Lengths 1 to 3): {'s', 'bu', 'pun', 'pug', 'p', 'g', 'ugs', 'pu', 'bun', 'h', 'u', 'gs', 'un', 'n', 'hug', 'b', 'hu', 'ug'}
