In [1]:
import sentencepiece as sp

In [2]:
N_TRAIN_SENTENCES = 20000
TRAINING_FILE = 'cnn/summary_bpe_train.txt'
VOCAB_SIZE = 10000
MODEL_NAME = 'cnn'

# make training data

In [3]:
import glob
import random
import tqdm
files = glob.glob('cnn/summary/*')

In [4]:
with open(TRAINING_FILE, 'w') as f_train:
    for _ in tqdm.tqdm(range(N_TRAIN_SENTENCES)):
        sample_file = random.choice(files)
        with open(sample_file) as f_sample:
            sentence = random.choice(f_sample.read().split('\n'))
        f_train.write(f'{sentence}\n')

100%|██████████| 20000/20000 [00:03<00:00, 5287.94it/s]


# Fit Encoder

In [5]:
import numpy as np
import sentencepiece as spm

class BytePairEncoder:
    def __init__(self, vocab_size, model_name, train_files):
        self.vocab_size = vocab_size
        self.model_name = model_name
        self.model_file = f'{self.model_name}.model'
        self.vocab_file = f'{self.model_name}.vocab'
        self.processor = self._fit(train_files)

    def _fit(self, train_files):
        spm.SentencePieceTrainer.Train(' '.join((
            f'--input={train_files}',
            f'--model_prefix={self.model_name}',
            f'--vocab_size={self.vocab_size}'
            '--model_type=bpe',
        )))
        processor = spm.SentencePieceProcessor()
        processor.Load(self.model_file)
        return processor
        
    def encode(self, text):
        return np.array(self.processor.EncodeAsIds(text))
    
    def encode_as_pieces(self, text):
        return self.processor.EncodeAsPieces(text)
    
    def decode(self, ids):
        return self.processor.DecodeIds(ids.tolist())
    
    def decode_pieces(self, pieces):
        return self.processor.DecodePieces(pieces)

In [6]:
%time
bpe = BytePairEncoder(vocab_size=VOCAB_SIZE, model_name=MODEL_NAME, train_files=TRAINING_FILE)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs


In [7]:
sample_text = ' '.join('''
While the firelight's aglow
strange shadows in the flames will grow
till things we've never seen
will seem familiar
'''.strip().split('\n'))
sample_text

"While the firelight's aglow strange shadows in the flames will grow till things we've never seen will seem familiar"

In [8]:
ids = bpe.encode(sample_text)
ids

array([   3,    0,  340, 1909,    6,  303, 3007,  105,    7, 2759, 1027,
       2627, 5778,    7,   12,    6, 2807,    7,   53, 1326,  138, 2639,
       1026,    7,   78,  105,  239,  631,  199,   66,   53,  983, 2426])

In [9]:
bpe.decode(ids)

" ⁇ hile the firelight's aglow strange shadows in the flames will grow till things we've never seen will seem familiar"

In [10]:
pieces = bpe.encode_as_pieces(sample_text)
pieces

[b'\xe2\x96\x81',
 b'W',
 b'h',
 b'ile',
 b'\xe2\x96\x81the',
 b'\xe2\x96\x81fire',
 b'light',
 b"'",
 b's',
 b'\xe2\x96\x81ag',
 b'low',
 b'\xe2\x96\x81strange',
 b'\xe2\x96\x81shadow',
 b's',
 b'\xe2\x96\x81in',
 b'\xe2\x96\x81the',
 b'\xe2\x96\x81flame',
 b's',
 b'\xe2\x96\x81will',
 b'\xe2\x96\x81grow',
 b'\xe2\x96\x81t',
 b'ill',
 b'\xe2\x96\x81thing',
 b's',
 b'\xe2\x96\x81we',
 b"'",
 b've',
 b'\xe2\x96\x81never',
 b'\xe2\x96\x81see',
 b'n',
 b'\xe2\x96\x81will',
 b'\xe2\x96\x81seem',
 b'\xe2\x96\x81familiar']

In [11]:
bpe.decode_pieces(pieces)

"While the firelight's aglow strange shadows in the flames will grow till things we've never seen will seem familiar"

In [14]:
''.join(p.decode('utf-8').replace('▁', ' ') for p in pieces)

" While the firelight's aglow strange shadows in the flames will grow till things we've never seen will seem familiar"

In [15]:
!more cnn.vocab | head -100

<unk>	0
<s>	0
</s>	0
▁	-2.70518
entity	-2.76039
▁@	-2.76088
▁the	-3.20587
s	-3.43503
,	-3.48933
▁.	-3.61551
▁a	-4.07749
▁to	-4.16409
▁in	-4.21967
ed	-4.22016
▁"	-4.24646
▁of	-4.32205
and	-4.46666
ing	-4.52517
▁'	-4.61865
d	-4.92585
y	-4.96069
▁-	-5.0486
▁on	-5.07137
1	-5.07385
▁said	-5.18567
▁be	-5.1917
▁for	-5.22642
▁was	-5.27119
▁with	-5.38845
▁is	-5.41068
0	-5.4161
2	-5.46806
▁that	-5.48568
▁he	-5.58142
▁--	-5.60136
ly	-5.61833
▁at	-5.62728
3	-5.65259
4	-5.6679
▁an	-5.71241
▁it	-5.74109
▁his	-5.74819
)	-5.77492
(	-5.79208
m	-5.81885
t	-5.83233
5	-5.8337
▁has	-5.93128
e	-5.9549
▁as	-5.96678
6	-5.9735
7	-5.97439
▁by	-6.00798
▁will	-6.07061
en	-6.07086
▁have	-6.10553
9	-6.11204
8	-6.14442
▁i	-6.17625
▁after	-6.18213
▁who	-6.18394
▁were	-6.24241
▁but	-6.24864
▁fro	-6.30637
▁not	-6.33654
10	-6.34638
n	-6.36058
▁are	-6.41811
▁year	-6.4566
11	-6.49037
▁had	-6.51729
▁her	-6.52986
▁two	-6.53288
▁f	-6.54719
.	-6.55464
