# 1. BPE-based tokenizer

In [1]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.3 MB[0m [31m13.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [2]:
import sentencepiece as spm
import pandas as pd
import urllib.request
import csv

# 1.1 IMDB 코퍼스에 대한 tokenizer 훈련시키고 확인하기

In [3]:
# Download the IMDb reviews CSV into the working directory.
imdb_url = "https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv"
urllib.request.urlretrieve(imdb_url, filename="IMDb_Reviews.csv")

('IMDb_Reviews.csv', <http.client.HTTPMessage at 0x7ceb0ae7b370>)

In [4]:
# Load the downloaded IMDb reviews CSV into a DataFrame.
train_df = pd.read_csv('IMDb_Reviews.csv')

In [5]:
# Display the loaded DataFrame (pandas truncates the middle rows).
train_df

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0
...,...,...
49995,the people who came up with this are SICK AND ...,0
49996,"The script is so so laughable... this in turn,...",0
49997,"""So there's this bride, you see, and she gets ...",0
49998,Your mind will not be satisfied by this nobud...,0


In [6]:
# List the column labels of the IMDb DataFrame.
train_df.keys()

Index(['review', 'sentiment'], dtype='object')

In [7]:
# Peek at the first three review texts.
train_df['review'].head(3)

0    My family and I normally do not watch local mo...
1    Believe it or not, this was at one time the wo...
2    After some internet surfing, I found the "Home...
Name: review, dtype: object

In [8]:
# Write all IMDb reviews to a plain-text file, joined by newlines, to serve
# as the tokenizer's training corpus.
with open('imdb_review.txt', mode='w', encoding='utf8') as corpus_file:
    corpus_text = '\n'.join(train_df['review'])
    corpus_file.write(corpus_text)

* input : 학습시킬 파일
* model_prefix : 만들어질 모델 이름
* vocab_size : 단어 집합의 크기
* model_type : 사용할 모델 (unigram(default), bpe, char, word)
* max_sentence_length: 문장의 최대 길이
* pad_id, pad_piece: pad token id, 값
* unk_id, unk_piece: unknown token id, 값
* bos_id, bos_piece: begin of sentence token id, 값
* eos_id, eos_piece: end of sentence token id, 값
* user_defined_symbols: 사용자 정의 토큰

In [9]:
# Train a 5000-piece BPE model on the IMDb corpus. The trainer writes
# imdb.model and imdb.vocab (named after model_prefix).
spm.SentencePieceTrainer.Train(
    input='imdb_review.txt',
    model_prefix='imdb',
    vocab_size=5000,
    model_type='bpe',
    max_sentence_length=9999,
)

위의 과정으로 `{model_prefix}.vocab`과 `{model_prefix}.model` 두 개의 파일 생성

In [10]:
# Inspect the vocabulary file written by the trainer: tab-separated rows with
# the piece in column 0 and its score in column 1, and no header row.
# quoting=csv.QUOTE_NONE keeps quote characters as literal piece text.
# keep_default_na=False stops pandas from converting pieces such as "nan",
# "NA" or "null" into missing values.
vocab_list = pd.read_csv(
    'imdb.vocab',
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
# Show ten randomly chosen vocabulary entries.
vocab_list.sample(10)

Unnamed: 0,0,1
3404,▁Best,-3401
706,▁nothing,-703
3226,inn,-3223
2459,icks,-2456
4208,▁Bo,-4205
1555,▁tou,-1552
3892,▁lover,-3889
1646,arth,-1643
4060,▁scientist,-4057
4392,▁tur,-4389


In [11]:
# Load the trained IMDB BPE model into a processor; the cell displays the
# value returned by load().
vocab_file = "imdb.model"  # note: this is the .model file, not the .vocab file
sp = spm.SentencePieceProcessor()
sp.load(vocab_file)

True

In [12]:
# Tokenize two English sentences with the IMDB model and print, for each one,
# the raw text, its subword pieces, and the corresponding ids.
lines = [
  "I didn't at all think of it this way.",
  "I have waited a long time for someone to film"
]
for line in lines:
  piece_list = sp.encode_as_pieces(line)
  id_list = sp.encode_as_ids(line)
  # One item per output line, followed by a blank separator line.
  print(line, piece_list, id_list, sep='\n', end='\n\n')

I didn't at all think of it this way.
['▁I', '▁didn', "'", 't', '▁at', '▁all', '▁think', '▁of', '▁it', '▁this', '▁way', '.']
[41, 624, 4950, 4926, 139, 170, 378, 30, 58, 73, 413, 4945]

I have waited a long time for someone to film
['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 142, 1364, 1121, 4, 668, 285, 93, 1079, 33, 91]



In [13]:
# Number of pieces in the loaded model's vocabulary.
sp.GetPieceSize()

5000

In [14]:
# Look up the subword piece for a given token id.
sp.IdToPiece(413)

'▁way'

In [15]:
# Look up the token id for a given subword piece (inverse of IdToPiece).
sp.PieceToId('▁way')

413

In [16]:
# Decode a list of token ids back into text.
sp.DecodeIds([41, 142, 1364, 1121, 4, 668, 285, 93, 1079, 33, 91])

'I have waited a long time for someone to film'

In [17]:
# Decode a list of subword pieces back into text.
sp.DecodePieces(['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film'])

'I have waited a long time for someone to film'

In [18]:
# encode() with out_type=str yields subword pieces; out_type=int yields ids.
sample_text = 'I have waited a long time for someone to film'
for piece_type in (str, int):
    print(sp.encode(sample_text, out_type=piece_type))

['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 142, 1364, 1121, 4, 668, 285, 93, 1079, 33, 91]


# 1.2 NSMC 코퍼스에 대한 tokenizer 훈련시키고 사용하기

In [19]:
# Download the NSMC ratings file into the working directory.
nsmc_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt"
urllib.request.urlretrieve(nsmc_url, filename="ratings.txt")

('ratings.txt', <http.client.HTTPMessage at 0x7ceb0ac8f0a0>)

In [20]:
# Load the tab-separated NSMC ratings file into a DataFrame and display it.
naver_df = pd.read_csv('ratings.txt', sep='\t')
naver_df

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1
...,...,...,...
199995,8963373,포켓 몬스터 짜가 ㅡㅡ;;,0
199996,3302770,쓰.레.기,0
199997,5458175,완전 사이코영화. 마지막은 더욱더 이 영화의질을 떨어트린다.,0
199998,6908648,왜난 재미없었지 ㅠㅠ 라따뚜이 보고나서 스머프 봐서 그런가 ㅋㅋ,0


In [21]:
# List the column labels of the NSMC DataFrame.
naver_df.keys()

Index(['id', 'document', 'label'], dtype='object')

In [22]:
# Peek at the first three review texts.
naver_df['document'].head(3)

0                                  어릴때보고 지금다시봐도 재밌어요ㅋㅋ
1    디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...
2                 폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.
Name: document, dtype: object

In [23]:
# Write the NSMC reviews to a plain-text file, joined by newlines, to serve
# as the tokenizer's training corpus.
# Missing reviews are read by pandas as NaN; casting them with astype(str)
# alone would put literal "nan" lines into the corpus, so drop them first.
# The astype(str) cast is kept as a safety net for any remaining non-string
# values.
nsmc_reviews = naver_df['document'].dropna().astype(str)
with open('nsmc_review.txt', 'w', encoding='UTF8') as f:
    f.write('\n'.join(nsmc_reviews))

In [24]:
# Train a 5000-piece BPE model on the NSMC corpus. The trainer writes
# nsmc.model and nsmc.vocab (named after model_prefix).
spm.SentencePieceTrainer.Train(
    input='nsmc_review.txt',
    model_prefix='nsmc',
    vocab_size=5000,
    model_type='bpe',
    max_sentence_length=9999,
)

In [25]:
# Inspect the vocabulary file written by the trainer: tab-separated rows with
# the piece in column 0 and its score in column 1, and no header row.
# quoting=csv.QUOTE_NONE keeps quote characters as literal piece text.
# keep_default_na=False stops pandas from converting pieces such as "nan",
# "NA" or "null" into missing values.
vocab_list = pd.read_csv(
    'nsmc.vocab',
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
# Show ten randomly chosen vocabulary entries.
vocab_list.sample(10)

Unnamed: 0,0,1
2991,▁몇번을,-2988
3142,이션,-3139
1325,그런,-1322
3947,십,-3944
3152,▁에이,-3149
1395,▁사회,-1392
2432,순간,-2429
3578,올,-3575
4037,촌,-4034
4114,옹,-4111


In [26]:
# Load the trained NSMC BPE model into a fresh processor. This rebinds `sp`,
# so later cells tokenize with the NSMC model. The cell displays the value
# returned by load().
vocab_file = "nsmc.model"  # note: this is the .model file, not the .vocab file
sp = spm.SentencePieceProcessor()
sp.load(vocab_file)

True

In [27]:
# Tokenize two Korean sentences with the NSMC model and print, for each one,
# the raw text, its subword pieces, and the corresponding ids.
lines = [
  "안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화",
  "이거어렸을때되게재밌게봄ㅋㅋ이정재 이범수ㅋㅋㅋㅋ연기쩜"
]
for line in lines:
  piece_list = sp.encode_as_pieces(line)
  id_list = sp.encode_as_ids(line)
  # One item per output line, followed by a blank separator line.
  print(line, piece_list, id_list, sep='\n', end='\n\n')

안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화
['▁안', '개', '▁자', '욱', '한', '▁밤', '하', '늘', '에', '▁떠', '▁있는', '▁초', '승', '달', '▁같은', '▁영화']
[41, 3370, 87, 3856, 3291, 2468, 3284, 3695, 3288, 697, 214, 272, 3724, 3591, 348, 5]

이거어렸을때되게재밌게봄ㅋㅋ이정재 이범수ㅋㅋㅋㅋ연기쩜
['▁이거', '어', '렸', '을때', '되', '게', '재밌게', '봄', 'ᄏᄏ', '이', '정', '재', '▁이', '범', '수', 'ᄏᄏᄏᄏ', '연기', '쩜']
[188, 3293, 3656, 557, 3387, 3300, 2144, 3800, 9, 3277, 3318, 3310, 6, 3688, 3335, 70, 273, 4175]



In [28]:
# Number of pieces in the loaded model's vocabulary.
sp.GetPieceSize()

5000

In [29]:
# Decode a list of token ids back into text.
sp.DecodeIds([41, 3370, 87, 3856, 3291, 2468, 3284, 3695, 3288, 697, 214, 272, 3724, 3591, 348, 5])

'안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화'

In [30]:
# Decode a list of subword pieces back into text.
sp.DecodePieces(['▁안', '개', '▁자', '욱', '한', '▁밤', '하', '늘', '에', '▁떠', '▁있는', '▁초', '승', '달', '▁같은', '▁영화'])

'안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화'

In [31]:
# encode() with out_type=str yields subword pieces; out_type=int yields ids.
sample_text = '안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화'
for piece_type in (str, int):
    print(sp.encode(sample_text, out_type=piece_type))

['▁안', '개', '▁자', '욱', '한', '▁밤', '하', '늘', '에', '▁떠', '▁있는', '▁초', '승', '달', '▁같은', '▁영화']
[41, 3370, 87, 3856, 3291, 2468, 3284, 3695, 3288, 697, 214, 272, 3724, 3591, 348, 5]
