In [1]:
import pandas as pd
from tokenizers import ByteLevelBPETokenizer
from tokenizers import Tokenizer, models, trainers

In [2]:
# Load the review dataset and preview it; the preview shows a 'Review' text
# column and a 'Sentiment' label column.
# NOTE(review): the preview also shows an 'Unnamed: 0' column, which suggests the
# CSV was written with its index -- consider `index_col=0` if it is not needed.
reviews_path = 'Reviews.csv'
samples = pd.read_csv(reviews_path)
samples.head(n=5)

Unnamed: 0,Review,Sentiment
0,Working with one of the best Shakespeare sourc...,0
1,"Well...tremors I, the original started off in ...",0
2,Ouch! This one was a bit painful to sit throug...,0
3,"I've seen some crappy movies in my life, but t...",0
4,"""Carriers"" follows the exploits of two guys an...",0


In [3]:
# Train a byte-level BPE tokenizer directly on the raw review texts.
bpe_training_options = dict(
    vocab_size=1024,
    min_frequency=2,
    special_tokens=["[PAD]", "[MASK]"],
)
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(samples['Review'], **bpe_training_options)






In [4]:
def review_to_ids(text):
    """Encode one review with the trained `tokenizer` and return its token ids."""
    return tokenizer.encode(text).ids


# Store the token-id list of every review in a new 'tokenized' column.
samples['tokenized'] = samples['Review'].apply(review_to_ids)
samples['tokenized']

0        [56, 279, 76, 275, 353, 416, 285, 262, 764, 35...
1        [56, 558, 599, 85, 264, 78, 654, 298, 13, 262,...
2        [48, 587, 2, 625, 416, 340, 259, 955, 289, 414...
3        [42, 811, 718, 461, 276, 375, 432, 90, 681, 29...
4        [3, 36, 290, 367, 393, 3, 270, 676, 323, 84, 2...
                               ...                        
49995    [752, 361, 297, 276, 807, 318, 627, 14, 68, 28...
49996    [47, 601, 288, 522, 259, 425, 911, 349, 815, 6...
49997    [43, 70, 287, 449, 277, 80, 362, 327, 283, 296...
49998    [56, 673, 387, 262, 681, 32, 298, 477, 287, 39...
49999    [42, 263, 669, 330, 361, 321, 319, 55, 286, 51...
Name: tokenized, Length: 50000, dtype: object

In [5]:
def ids_to_string(token_ids):
    """Render a sequence of token ids as a single string, ids separated by '_'."""
    return '_'.join(str(token_id) for token_id in token_ids)


# Keep the '_'-joined form of each review's ids in a 'stringified' column.
samples['stringified'] = samples['tokenized'].apply(ids_to_string)
samples['stringified']

0        56_279_76_275_353_416_285_262_764_359_73_680_3...
1        56_558_599_85_264_78_654_298_13_262_1019_905_2...
2        48_587_2_625_416_340_259_955_289_414_655_288_2...
3        42_811_718_461_276_375_432_90_681_294_530_762_...
4        3_36_290_367_393_3_270_676_323_84_262_415_571_...
                               ...                        
49995    752_361_297_276_807_318_627_14_68_282_310_790_...
49996    47_601_288_522_259_425_911_349_815_684_516_84_...
49997    43_70_287_449_277_80_362_327_283_296_645_288_2...
49998    56_673_387_262_681_32_298_477_287_394_529_387_...
49999    42_263_669_330_361_321_319_55_286_514_758_313_...
Name: stringified, Length: 50000, dtype: object

In [6]:
# Second-stage BPE, trained on the '_'-joined id strings instead of raw text.
# The initial alphabet is seeded with the digits and the '_' separator.
# NOTE(review): the learned vocab (see the `get_vocab()` output) has entries such
# as '7_318' and '2_84' that appear to start or end partway through an id, i.e.
# merges look character-level rather than whole-id -- confirm this is intended.
id_alphabet = list("0123456789_")
trainer = trainers.BpeTrainer(
    vocab_size=1024,
    min_frequency=2,
    initial_alphabet=id_alphabet,
)
pairizer = Tokenizer(models.BPE())
pairizer.train_from_iterator(samples['stringified'], trainer=trainer)






In [7]:
# Inspect the token -> id mapping learned by `pairizer`.
# NOTE(review): this renders the complete mapping; a sorted slice would be
# easier to read.
pair_vocab = pairizer.get_vocab()
pair_vocab

{'_15_386': 553,
 '_812': 742,
 '_418': 268,
 '79': 323,
 '_769': 825,
 '_41': 43,
 '7_318': 820,
 '_788': 857,
 '49': 126,
 '_67_3': 414,
 '_444': 284,
 '_917': 1016,
 '_733': 805,
 '_801': 773,
 '_923': 949,
 '_769_86': 962,
 '17': 633,
 '_524': 468,
 '_969': 882,
 '_13_373': 298,
 '92_3': 840,
 '42': 443,
 '_822': 778,
 '_13_373_3': 591,
 '9_3': 25,
 '_547': 901,
 '_4': 13,
 '_524_3': 581,
 '_402': 221,
 '_516': 325,
 '_477': 282,
 '_273_3': 313,
 '_611_3': 827,
 '_50': 80,
 '2_84': 418,
 '_459': 594,
 '_737': 638,
 '_5': 16,
 '76': 176,
 '_292_3': 220,
 '_293': 248,
 '_592': 379,
 '_558': 378,
 '_545': 522,
 '_659': 784,
 '_291_3': 322,
 '_935': 963,
 '_692': 640,
 '_42': 45,
 '_796': 636,
 '_589': 686,
 '_784': 699,
 '_326': 154,
 '_709': 521,
 '_285_262_3': 649,
 '_86_3': 351,
 '_754': 848,
 '_295': 245,
 '93_3': 983,
 '_435': 357,
 '_259_30': 781,
 '_99': 151,
 '_280_3': 664,
 '_297_262': 710,
 '_861': 715,
 '00': 144,
 '_479': 271,
 '65': 202,
 '_933': 993,
 '_929': 936,
 '46':