# SentencePiece
## https://arxiv.org/pdf/1808.06226.pdf
## https://github.com/google/sentencepiece

In [9]:
import numpy as np
import pandas as pd
import csv

import os
import urllib.request

import sentencepiece as spm

In [2]:
data_path = "../data/"
urllib.request.urlretrieve("https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv", filename=data_path + "IMDb_Reviews.csv")

('../data/IMDb_Reviews.csv', <http.client.HTTPMessage at 0x1101b2510>)

In [3]:
df = pd.read_csv(data_path + "IMDb_Reviews.csv")
df.head(5)

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


## Make Model using SentencePiece

In [4]:
# convert csv to txt for sentencepiece
with open(data_path + "imdb_review.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(df["review"])) # only review column

In [8]:
model_path = "../model/"
model_name = model_path + "imdb_bpe.model"
vocab_name = model_path + "imdb_bpe.vocab"

spm.SentencePieceTrainer.Train('--input=../data/imdb_review.txt --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')

# move model and vocab file to model folder
os.rename("imdb.model", model_name)
os.rename("imdb.vocab", vocab_name)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=../data/imdb_review.txt --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/imdb_review.txt
  input_format: 
  model_prefix: imdb
  model_type: BPE
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 9999
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2


### sentencepiece parameters
#### https://github.com/google/sentencepiece#usage-instructions
--input: one-sentence-per-line raw corpus file. No need to run tokenizer, normalizer or preprocessor. By default, SentencePiece normalizes the input with Unicode NFKC. You can pass a comma-separated list of files.
--model_prefix: output model name prefix. <model_name>.model and <model_name>.vocab are generated.
--vocab_size: vocabulary size, e.g., 8000, 16000, or 32000
--character_coverage: amount of characters covered by the model, good defaults are: 0.9995 for languages with rich character set like Japanese or Chinese and 1.0 for other languages with small character set.
--model_type: model type. Choose from unigram (default), bpe, char, or word. The input sentence must be pretokenized when using word type.

In [13]:
# check vocab file
vocab_list = pd.read_csv(model_path + "imdb_bpe.vocab", sep="\t", header=None, quoting=csv.QUOTE_NONE) # no quoting

print(len(vocab_list))
vocab_list.head(10)

5000


Unnamed: 0,0,1
0,<unk>,0
1,<s>,0
2,</s>,0
3,▁t,0
4,▁a,-1
5,he,-2
6,in,-3
7,▁the,-4
8,▁s,-5
9,re,-6


In [14]:
# load model file
sp = spm.SentencePieceProcessor()
sp.Load(model_name)

True

## Using SentencePiece

In [15]:
lines = [
"I didn't like the movie at all.",
    "The movie was good.",
    "The movie was not bad.",
    "I liked the movie very much."
]

In [21]:
for line in lines:
    print(sp.encode_as_pieces(line)) # tokenize
    print(sp.encode_as_ids(line)) # index
    
print(sp.GetPieceSize()) # vocab size
print(sp.IdToPiece(4945)) # index to token
print(sp.PieceToId(".")) # token to index
print(sp.DecodeIds([41, 1412, 7, 106, 270, 347, 4945]))
print(sp.DecodePieces(['▁The', '▁movie', '▁was', '▁not', '▁bad', '.']))

['▁I', '▁didn', "'", 't', '▁like', '▁the', '▁movie', '▁at', '▁all', '.']
[41, 624, 4950, 4926, 197, 7, 106, 139, 170, 4945]
['▁The', '▁movie', '▁was', '▁good', '.']
[105, 106, 84, 253, 4945]
['▁The', '▁movie', '▁was', '▁not', '▁bad', '.']
[105, 106, 84, 120, 349, 4945]
['▁I', '▁liked', '▁the', '▁movie', '▁very', '▁much', '.']
[41, 1412, 7, 106, 270, 347, 4945]
5000
.
4945
I liked the movie very much.
The movie was not bad.
