In [12]:
import numpy as np
import pandas as pd

import os
import csv

import sentencepiece as spm

In [3]:
# Directory holding the corpus; the trailing slash lets paths be built by plain concatenation.
data_path = "../data/"
ratings_file = data_path + "ratings.txt"

# read_table parses the file as tab-separated text.
df = pd.read_table(ratings_file)

# Row count first, then a preview of the first five rows as the cell's display value.
print(df.shape[0])
df.head()

200000


Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


# Data Preprocessing

In [8]:
# Drop rows with missing values, then drop duplicate reviews.
# The frame is rebound (df = df.dropna()) rather than mutated with inplace=True:
# in-place pandas calls return None, so wrapping one in print() only ever
# printed "None".
print(df.isnull().values.any())  # any missing values before cleaning?
df = df.dropna()

print(len(df))                   # rows left after removing missing values
print(df.isnull().values.any())  # should now be False

print(df['document'].nunique())  # number of distinct review texts
# Keep only the first row for each distinct text in the `document` column.
df = df.drop_duplicates(subset=['document'])
print(len(df))

# Sanity check: after de-duplication every remaining review text is unique.
assert df['document'].is_unique

False
None
199992
False
194543
194543


# Tokenizing with SentencePiece

In [11]:
# Write every value of the `document` column to a UTF-8 text file, one per line
# (joined with '\n', no trailing newline).
corpus_file = data_path + 'ratings_processed.txt'
documents = df['document'].tolist()
with open(file=corpus_file, mode="w", encoding='utf-8') as corpus:
    corpus.write('\n'.join(documents))

In [13]:
# Train a BPE SentencePiece model on the exported review corpus.
# The trainer takes its options as one command-line style string.
train_args = '--input={} --model_prefix={} --vocab_size={} --model_type={} --max_sentence_length={}'.format(
    data_path + 'ratings_processed.txt',
    'naver',  # output prefix: naver.model / naver.vocab are produced in the working directory
    5000,     # vocab_size
    'bpe',    # model_type
    9999,     # max_sentence_length
)
spm.SentencePieceTrainer.Train(train_args)

# Move the trained artifacts into the model directory.
model_path = "../model/"
# Create the target directory if needed; a bare rename fails when it is missing.
os.makedirs(model_path, exist_ok=True)
for artifact in ('naver.model', 'naver.vocab'):
    # os.replace overwrites an existing destination on every platform, so the
    # cell can be re-run (os.rename raises on Windows if the target exists).
    os.replace(artifact, model_path + artifact)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=../data/ratings_processed.txt --model_prefix=naver --vocab_size=5000 --model_type=bpe --max_sentence_length=9999
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/ratings_processed.txt
  input_format: 
  model_prefix: naver
  model_type: BPE
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 9999
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 

In [15]:
# Inspect the vocabulary file: read it as tab-separated text with no header row
# and with quote handling turned off (csv.QUOTE_NONE).
vocab_file = model_path + 'naver.vocab'
vocab = pd.read_csv(vocab_file, sep='\t', header=None, quoting=csv.QUOTE_NONE)

# Row count, then ten randomly sampled rows as the cell's display value.
print(vocab.shape[0])
vocab.sample(n=10)

5000


Unnamed: 0,0,1
2251,보이,-2248
1485,▁나와서,-1482
4946,흩,-4943
3284,가,-3281
3520,ᅮ,-3517
3495,출,-3492
2495,▁리뷰,-2492
4768,딪,-4765
4897,닠,-4894
3436,애,-3433


In [16]:
# Load the model file into a fresh processor; the return value of load() is the
# cell's display value.
model_file = model_path + 'naver.model'
sp = spm.SentencePieceProcessor()
sp.load(model_file)

True

# Using the trained Naver model

In [17]:
# Two sample sentences used to try out the tokenizer.
lines = ["뭐 이딴 것도 영화냐.", "진짜 최고의 영화입니다 ㅋㅋ"]

In [18]:
# For each sample sentence, print its subword pieces and then its ids,
# each on its own line.
for line in lines:
    print(sp.encode_as_pieces(line), sp.encode_as_ids(line), sep='\n')

['▁뭐', '▁이딴', '▁것도', '▁영화냐', '.']
[135, 969, 1296, 2762, 3275]
['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']
[54, 210, 826, 122]


In [20]:
# Print the results of encode() and encode_as_ids() for the first sample
# sentence, one per line, so they can be compared.
first_line = lines[0]
print(sp.encode(first_line), sp.encode_as_ids(first_line), sep='\n')

[135, 969, 1296, 2762, 3275]
[135, 969, 1296, 2762, 3275]
