In [61]:
import tensorflow as tf
import numpy as np
import matplotlib as plt
import konlpy
import pandas as pd

print(tf.__version__)
print(np.__version__)
print(plt.__version__)
print(konlpy.__version__)

2.6.0
1.21.4
3.4.3
0.5.2


In [62]:
def tokenize(corpus):  # corpus: Tokenized Sentence's List
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(corpus)

    tensor = tokenizer.texts_to_sequences(corpus)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    return tensor, tokenizer

In [63]:
import os
path_to_file = os.getenv('HOME')+'/aiffel/sp_tokenizer/data/korean-english-park.train.ko'

with open(path_to_file, "r") as f:
    raw = f.read().splitlines()

max_len = 150
min_len = 10
    
print("Data Size:", len(raw))

print("Example:")
for sen in raw[0:100][::20]: print(">>", sen)

cleaned_corpus = list(set(raw)) 

filtered_corpus = [s for s in cleaned_corpus if (len(s) < max_len) & (len(s) >= min_len)]

print("문장의 최단 길이:", min_len)
print("문장의 최장 길이:", max_len)

Data Size: 94123
Example:
>> 개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?"
>> 북한의 핵무기 계획을 포기하도록 하려는 압력이 거세지고 있는 가운데, 일본과 북한의 외교관들이 외교 관계를 정상화하려는 회담을 재개했다.
>> "경호 로보트가 침입자나 화재를 탐지하기 위해서 개인적으로, 그리고 전문적으로 사용되고 있습니다."
>> 수자원부 당국은 논란이 되고 있고, 막대한 비용이 드는 이 사업에 대해 내년에 건설을 시작할 계획이다.
>> 또한 근력 운동은 활발하게 걷는 것이나 최소한 20분 동안 뛰는 것과 같은 유산소 활동에서 얻는 운동 효과를 심장과 폐에 주지 않기 때문에, 연구학자들은 근력 운동이 심장에 큰 영향을 미치는지 여부에 대해 논쟁을 해왔다.
문장의 최단 길이: 10
문장의 최장 길이: 150


In [64]:
# SentencePiece 모델 학습

import sentencepiece as spm
import os
temp_file = os.getenv('HOME')+'/aiffel/sp_tokenizer/data/korean-english-park.train.ko.temp'

vocab_size = 16000

with open(temp_file, 'w') as f:
    for row in filtered_corpus:   # 이전에 나왔던 정제했던 corpus를 활용해서 진행해야 합니다.
        f.write(str(row) + '\n')

spm.SentencePieceTrainer.Train(
    '--input={} --model_prefix=korean_spm --vocab_size={} --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3'.format(temp_file, vocab_size)    
)
#위 Train에서  --model_type = unigram이 디폴트 적용되어 있습니다. --model_type = bpe로 옵션을 주어 변경할 수 있습니다.

!ls -l korean_spm*

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=/aiffel/aiffel/sp_tokenizer/data/korean-english-park.train.ko.temp --model_prefix=korean_spm --vocab_size=16000 --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /aiffel/aiffel/sp_tokenizer/data/korean-english-park.train.ko.temp
  input_format: 
  model_prefix: korean_spm
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  h

-rw-r--r-- 1 root root 535805 May  9 06:58 korean_spm.model
-rw-r--r-- 1 root root 312443 May  9 06:58 korean_spm.vocab


In [65]:
s = spm.SentencePieceProcessor()
s.Load('korean_spm.model')

# SentencePiece를 활용한 sentence -> encoding
tokensIDs = s.EncodeAsIds('아버지가방에들어가신다.')
print(tokensIDs)

# SentencePiece를 활용한 sentence -> encoded pieces
print(s.SampleEncodeAsPieces('아버지가방에들어가신다.',1, 0.0))

# SentencePiece를 활용한 encoding -> sentence 복원
print(s.DecodeIds(tokensIDs))

[1074, 12, 691, 10, 3212, 12, 304, 41, 4]
['▁아버지', '가', '방', '에', '들어', '가', '신', '다', '.']
아버지가방에들어가신다.


In [66]:
# Tokenizer 함수 작성
from tqdm import tqdm
def sp_tokenize(s, corpus, maxlen=150,add_bos=True, add_eos=True): 

    tensor = []
    bos_id = s.bos_id()
    eos_id = s.eos_id()
    
    corpus = corpus.astype(str)

    for sen in tqdm(corpus, desc="SentencePiece Tokenizing"):
        ids = s.EncodeAsIds(sen)
        if add_bos:
            ids = [bos_id] + ids
        if add_eos:
            ids = ids + [eos_id]
        tensor.append(ids)

    with open("./korean_spm.vocab", 'r') as f:
        vocab = f.readlines()

    word_index = {}
    index_word = {}

    for idx, line in enumerate(vocab):
        word = line.split("\t")[0]

        word_index.update({word:idx})
        index_word.update({idx:word})

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post',maxlen=maxlen)

    return tensor, word_index, index_word

In [67]:
data_path = os.getenv('HOME')+'/aiffel/sp_tokenizer/data/nsmc/ratings_train.txt'

with open(data_path, 'r') as f:
    file = f.readlines()
    column_name = file[0].strip().split('\t')
    data_split = [x.strip().split('\t')for x in file[1:]]
    data = pd.DataFrame(data_split, columns=column_name)
data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [68]:
import re

def preprocessing(seq):
    seq = str(seq).lower()
    
    slang_map = {
        "ㅅㅂ": "시발",
        "ㅄ": "병신",
        "ㅈㄴ": "아주",
        "ㅂㅅ": "병신",
        "ㅁㅊ": "미친",
    }

    # 패턴을 하나로 결합
    pattern = r'\b(' + '|'.join(map(re.escape, slang_map.keys())) + r')\b'

    # 매치된 문자열을 dict에서 찾아서 치환
    seq = re.sub(pattern, lambda m: slang_map[m.group()], seq)
    
    # 'O'만 구성된 단어 중 반복된 것만 욕설로 치환
    seq = re.sub(r'\bO{2,}\b', '욕설', seq)
    
    return seq
data['document'] = data['document'].astype(str).apply(preprocessing)

In [69]:
tensor, word_index, index_word = sp_tokenize(s, data.iloc[:]['document'])

SentencePiece Tokenizing: 100%|██████████| 150000/150000 [00:03<00:00, 41691.71it/s]


In [70]:
word_index

{'<pad>': 0,
 '<s>': 1,
 '</s>': 2,
 '<unk>': 3,
 '.': 4,
 '을': 5,
 '▁': 6,
 '의': 7,
 '를': 8,
 '는': 9,
 '에': 10,
 '이': 11,
 '가': 12,
 '은': 13,
 ',': 14,
 '고': 15,
 '에서': 16,
 '▁“': 17,
 '로': 18,
 '”': 19,
 '한': 20,
 '인': 21,
 '일': 22,
 ')': 23,
 '(': 24,
 '▁이': 25,
 '과': 26,
 '▁있다': 27,
 '으로': 28,
 '와': 29,
 '▁수': 30,
 '도': 31,
 '▁밝혔다': 32,
 '▁말했다': 33,
 '할': 34,
 '년': 35,
 '지': 36,
 '▁있는': 37,
 '며': 38,
 '▁그': 39,
 '하고': 40,
 '다': 41,
 '하는': 42,
 '했다': 43,
 '▁그는': 44,
 '▁전': 45,
 '▁2': 46,
 '▁1': 47,
 '▁대한': 48,
 '▁위해': 49,
 '만': 50,
 '월': 51,
 '▁전했다': 52,
 '▁한': 53,
 '▁미국': 54,
 '해': 55,
 '▁이번': 56,
 '▁3': 57,
 '기': 58,
 '▁지난': 59,
 '현지시간': 60,
 '▁중': 61,
 '▁대해': 62,
 '자': 63,
 '"': 64,
 '된': 65,
 '▁미': 66,
 '▁것으로': 67,
 '▁‘': 68,
 '에게': 69,
 '스': 70,
 '▁것이라고': 71,
 '명이': 72,
 '▁"': 73,
 '▁것': 74,
 '이라고': 75,
 '▁있다고': 76,
 '▁것을': 77,
 's': 78,
 '▁4': 79,
 '나': 80,
 '’': 81,
 '▁6': 82,
 '▁이라크': 83,
 '시': 84,
 '▁그러나': 85,
 '리': 86,
 '▁5': 87,
 '게': 88,
 '▁더': 89,
 '▁다른': 90,
 '히': 91,


In [71]:
#print(word_index)
#print(index_word)

for sequence in tensor:
    print([index_word[word]for word in sequence])
    break

['<s>', '▁아', '▁더', '빙', '.', '.', '▁진짜', '▁짜', '증', '나', '네', '요', '▁목소리', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'

LSTM 기반의 모델 학습

In [72]:

from konlpy.tag import Mecab
import numpy as np
from collections import Counter

# Mecab : 한국어 형태소 분석기
#tokenizer = Mecab()

def load_data(train_data, test_data, s,num_words=10000):
    # 중복 제거
    train_data = train_data.drop_duplicates(subset=['document'])
    test_data = test_data.drop_duplicates(subset=['document'])

    # Nan 결측치 제거
    # https://wikidocs.net/153202
    train_data = train_data.dropna(how='any')
    test_data = test_data.dropna(how='any')

    # Use SentencePiece
    train_tokens, word_to_index, _ = sp_tokenize(s, train_data['document'], maxlen=40)
    test_tokens, _, _ = sp_tokenize(s, test_data['document'],maxlen=40)
    
    return train_tokens, np.array(train_data['label']), test_tokens, test_data['label'], word_to_index

train_data = pd.read_table(os.getenv('HOME')+'/aiffel/sp_tokenizer/data/nsmc/ratings_train.txt')
test_data = pd.read_table(os.getenv('HOME')+'/aiffel/sp_tokenizer/data/nsmc/ratings_test.txt')

train_data['document'] = train_data['document'].astype(str).apply(preprocessing)
test_data['document'] = test_data['document'].astype(str).apply(preprocessing)

X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data, s)

SentencePiece Tokenizing: 100%|██████████| 146162/146162 [00:03<00:00, 41934.70it/s]
SentencePiece Tokenizing: 100%|██████████| 49150/49150 [00:01<00:00, 45488.72it/s]


In [73]:
X_val = X_train[:20000]   
y_val = y_train[:20000]

partial_x_train = X_train[20000:]  
partial_y_train = y_train[20000:]

print(partial_x_train.shape)
print(partial_y_train.shape)

print(np.unique(partial_y_train))
print(np.unique(y_val))

(126162, 40)
(126162,)
[0 1]
[0 1]


In [74]:
X_val = X_train[:20000]   
y_val = y_train[:20000]

partial_x_train = X_train[20000:]  
partial_y_train = y_train[20000:]

print(partial_x_train.shape)
print(partial_y_train.shape)

print(np.unique(partial_y_train))
print(np.unique(y_val))

(126162, 40)
(126162,)
[0 1]
[0 1]


In [75]:
word_vector_dim = 32
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 32)          512000    
_________________________________________________________________
lstm_4 (LSTM)                (None, 16)                3136      
_________________________________________________________________
dense_8 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 9         
Total params: 515,281
Trainable params: 515,281
Non-trainable params: 0
_________________________________________________________________


In [76]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
              
epochs=20

early_stop = EarlyStopping(monitor='val_loss',
                           patience=2,
                           restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=epochs,
                    batch_size=512,
                    validation_data=(X_val, y_val),
                    verbose=1,
                    callbacks=[early_stop, checkpoint])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [77]:
results = model.evaluate(X_test, y_test, verbose=2)
print(results)

1536/1536 - 3s - loss: 0.3940 - accuracy: 0.8245
[0.3939913809299469, 0.824516773223877]


# 영화 데이터로 토크나이저 학습

In [78]:
train_data = pd.read_table(data_path)
train_data = train_data.dropna(subset=['document'])
train_data['document'] = train_data['document'].astype(str).apply(preprocessing)

clean_path = "spm_input.txt"

with open(clean_path, "w", encoding="utf-8") as f:
    for line in train_data['document']:
        f.write(str(line).strip() + "\n")


In [79]:
spm.SentencePieceTrainer.Train(
    '--input={} --model_prefix=korean_spm --vocab_size={}  --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3'.format(clean_path, vocab_size)    
)
s = spm.SentencePieceProcessor()
s.load("korean_spm.model")

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=spm_input.txt --model_prefix=korean_spm --vocab_size=16000  --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: spm_input.txt
  input_format: 
  model_prefix: korean_spm
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
  bos_id: 1
  eos_id: 2
  pad_id: 0
  unk_piece: <unk>


KeyboardInterrupt: 

In [None]:
tensor, word_index, index_word = sp_tokenize(s, train_data['document'])

In [None]:
word_index

In [None]:
#print(word_index)
#print(index_word)

for sequence in tensor:
    print([index_word[word]for word in sequence])
    break

In [None]:
train_data = pd.read_table(os.getenv('HOME')+'/aiffel/sp_tokenizer/data/nsmc/ratings_train.txt')
test_data = pd.read_table(os.getenv('HOME')+'/aiffel/sp_tokenizer/data/nsmc/ratings_test.txt')

train_data['document'] = train_data['document'].astype(str).apply(preprocessing)
test_data['document'] = test_data['document'].astype(str).apply(preprocessing)

X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data, s)

X_val = X_train[:20000]   
y_val = y_train[:20000]

partial_x_train = X_train[20000:]  
partial_y_train = y_train[20000:]

print(partial_x_train.shape)
print(partial_y_train.shape)

print(np.unique(partial_y_train))
print(np.unique(y_val))

In [None]:
word_vector_dim = 32
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
              
epochs=20

early_stop = EarlyStopping(monitor='val_loss',
                           patience=2,
                           restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=epochs,
                    batch_size=512,
                    validation_data=(X_val, y_val),
                    verbose=1,
                    callbacks=[early_stop, checkpoint])

In [None]:
results = model.evaluate(X_test, y_test, verbose=2)
print(results)

# KoNLPy 형태소 분석기 사용 모델

In [None]:
from konlpy.tag import Mecab

# Mecab : 한국어 형태소 분석기
tokenizer = Mecab()
maxlen = 40

def load_data(train_data, test_data, s,num_words=10000):
    # 중복 제거
    train_data = train_data.drop_duplicates(subset=['document'])
    test_data = test_data.drop_duplicates(subset=['document'])

    # Nan 결측치 제거
    # https://wikidocs.net/153202
    train_data = train_data.dropna(how='any')
    test_data = test_data.dropna(how='any')

    # 토큰화, 형태소에 대한 구문 분석
    # https://konlpy.org/ko/v0.6.0/api/konlpy.tag/
    train_tokens = [tokenizer.morphs(sentence) for sentence in train_data['document']]
    test_tokens =  [tokenizer.morphs(sentence) for sentence in test_data['document']]

    # Use SentencePiece
    #train_tokens, word_to_index, _ = sp_tokenize(s, train_data['document'])
    #test_tokens, _, _ = sp_tokenize(s, test_data['document'])
    
    # word_to_index 구성
    # Counter를 사용하여 많이 사용된 num_words 개의 단어 vocab 만들기
    word_to_index = {}
    # 2-d list를 1-d list로 변환
    words = np.concatenate(train_tokens).tolist()
    counter = Counter(words)
    counter = counter.most_common(num_words-4)
    vocab = [key for key, _ in counter]
    # 미리 정의된 토큰 4개를 제외하고 단어 인덱스 부여
    word_to_index = {word:index+4 for index, word in enumerate(vocab)}

    word_to_index['<pad>']=  0# 패딩용 단어
    word_to_index['<s>']=  1# 문장의 시작지점
    word_to_index['</s>']= 2
    word_to_index['<unk>']= 3
    
    # text string to vocab index string
    # X_train의 상위 10000개의 단어만 단어에 등록되었기 때문에, X_train에도 <UNK>토큰이 발생할 수 있다.
    X_train = [[word_to_index['<s>']]+[word_to_index[token] if token in word_to_index else word_to_index['<unk>'] for token in train_token] for train_token in train_tokens]
    X_test = [[word_to_index['<s>']]+[word_to_index[token] if token in word_to_index else word_to_index['<unk>'] for token in test_token] for test_token in test_tokens]
    
    X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, padding='pre', value=word_to_index["<pad>"], maxlen=maxlen)
    X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, padding='pre', value=word_to_index["<pad>"], maxlen=maxlen)
    
    return X_train, np.array(train_data['label']), X_test, test_data['label'], word_to_index

train_data = pd.read_table(os.getenv('HOME')+'/aiffel/sp_tokenizer/data/nsmc/ratings_train.txt')
test_data = pd.read_table(os.getenv('HOME')+'/aiffel/sp_tokenizer/data/nsmc/ratings_test.txt')

train_data['document'] = train_data['document'].astype(str).apply(preprocessing)
test_data['document'] = test_data['document'].astype(str).apply(preprocessing)

X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data, s)

In [None]:
X_val = X_train[:20000]   
y_val = y_train[:20000]

partial_x_train = X_train[20000:]  
partial_y_train = y_train[20000:]

print(partial_x_train.shape)
print(partial_y_train.shape)

print(np.unique(partial_y_train))
print(np.unique(y_val))

In [None]:
word_vector_dim = 32
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
              
epochs=20

early_stop = EarlyStopping(monitor='val_loss',
                           patience=2,
                           restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=epochs,
                    batch_size=512,
                    validation_data=(X_val, y_val),
                    verbose=1,
                    callbacks=[early_stop, checkpoint])

In [None]:
results = model.evaluate(X_test, y_test, verbose=2)
print(results)

In [84]:
import numpy as np
from collections import Counter

def oov_ratio(encoded, unk_id=0):
    total = sum(len(seq) for seq in encoded)
    unk   = sum((tok == unk_id) for seq in encoded for tok in seq)
    return round(100 * unk / total, 2)

print("Train OOV% :", oov_ratio(X_train))
print("Test  OOV% :", oov_ratio(X_test))

Train OOV% : 46.68
Test  OOV% : 46.98


# 함수화

In [85]:
def load_data(train_data, test_data, tokenizer, max_len=40, num_words=10000, use_sentence_piece=True):
    # 중복 제거
    train_data = train_data.drop_duplicates(subset=['document'])
    test_data = test_data.drop_duplicates(subset=['document'])

    # Nan 결측치 제거
    train_data = train_data.dropna(how='any')
    test_data = test_data.dropna(how='any')

    if use_sentence_piece:
        train_tokens, word_to_index, _ = sp_tokenize(tokenizer, train_data['document'], maxlen=max_len)
        test_tokens, _, _ = sp_tokenize(tokenizer, test_data['document'], maxlen=max_len)
        
        return train_tokens, np.array(train_data['label']), test_tokens, test_data['label'], word_to_index
    else:
        stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다', '.']

        # 토큰화, 형태소에 대한 구문 분석
        train_tokens = [tokenizer.morphs(sentence) for sentence in train_data['document']]
        test_tokens =  [tokenizer.morphs(sentence) for sentence in test_data['document']]
        
        # 불용어 제거
        train_tokens = [[token for token in sentence if token not in stopwords] for sentence in train_tokens]
        test_tokens =  [[token for token in sentence if token not in stopwords] for sentence in test_tokens]

        
        # word_to_index 구성
        word_to_index = {}
        words = np.concatenate(train_tokens).tolist()
        counter = Counter(words)
        counter = counter.most_common(num_words-4)
        vocab = [key for key, _ in counter]
        word_to_index = {word:index+4 for index, word in enumerate(vocab)}

        word_to_index['<pad>']=  0# 패딩용 단어
        word_to_index['<s>']=  1# 문장의 시작지점
        word_to_index['</s>']= 2
        word_to_index['<unk>']= 3
    
        X_train = [[word_to_index['<s>']]+[word_to_index[token] if token in word_to_index else word_to_index['<unk>'] for token in train_token] for train_token in train_tokens]
        X_test = [[word_to_index['<s>']]+[word_to_index[token] if token in word_to_index else word_to_index['<unk>'] for token in test_token] for test_token in test_tokens]

        X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, padding='pre', value=word_to_index["<pad>"], maxlen=maxlen)
        X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, padding='pre', value=word_to_index["<pad>"], maxlen=maxlen)
    
        return X_train, np.array(train_data['label']), X_test, test_data['label'], word_to_index

In [97]:
from konlpy.tag import Hannanum,Kkma,Komoran,Mecab,Okt

def review_model(train_path, test_path, model, tokenizer='sp_ke_park', vocab_size = 16000, val_len=20000, epochs=20):
    assert tokenizer in ['sp_ke_park','se', 'mecab', 'hannanum', 'kkma', 'komoran', 'okt']
    
    train_data = pd.read_table(train_path)
    test_data = pd.read_table(test_path)
    
    train_data['document'] = train_data['document'].astype(str).apply(preprocessing)
    test_data['document'] = test_data['document'].astype(str).apply(preprocessing)

    if tokenizer == 'sp_ke_park':
        temp_file = os.getenv('HOME')+'/aiffel/sp_tokenizer/data/korean-english-park.train.ko.temp'

        with open(temp_file, 'w') as f:
            for row in filtered_corpus:   # 이전에 나왔던 정제했던 corpus를 활용해서 진행해야 합니다.
                f.write(str(row) + '\n')

        spm.SentencePieceTrainer.Train(
            '--input={} --model_prefix=korean_spm --vocab_size={} --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3'.format(temp_file, vocab_size)    
        )
        tokenizer = spm.SentencePieceProcessor()
        tokenizer.load("korean_spm.model")
        
        X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data, tokenizer)
    elif tokenizer == 'se':
        clean_path = "spm_input.txt"

        with open(clean_path, "w", encoding="utf-8") as f:
            for line in train_data['document']:
                f.write(str(line).strip() + "\n")

        spm.SentencePieceTrainer.Train(
            '--input={} --model_prefix=korean_spm --vocab_size={}  --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3'.format(clean_path, vocab_size)    
        )
        tokenizer = spm.SentencePieceProcessor()
        tokenizer.load("korean_spm.model")
        
        X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data, tokenizer)
    elif tokenizer == 'mecab':
        tokenizer = Mecab()
        X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data, tokenizer, use_sentence_piece=False, num_words=vocab_size)
    elif tokenizer == 'hannanum':
        tokenizer = Hannanum()
        X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data, tokenizer, use_sentence_piece=False, num_words=vocab_size)
    elif tokenizer == 'kkma':
        tokenizer = Kkma()
        X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data, tokenizer, use_sentence_piece=False, num_words=vocab_size)
    elif tokenizer == 'komoran':
        tokenizer = Komoran()
        X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data, tokenizer, use_sentence_piece=False, num_words=vocab_size)
    elif tokenizer == 'okt':
        tokenizer = Okt()
        X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data, tokenizer, use_sentence_piece=False, num_words=vocab_size)
    
    
    #print("Train OOV% :", oov_ratio(X_train))
    #print("Test  OOV% :", oov_ratio(X_test))
    
    X_val = X_train[:val_len]   
    y_val = y_train[:val_len]

    partial_x_train = X_train[val_len:]  
    partial_y_train = y_train[val_len:]
    

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    early_stop = EarlyStopping(monitor='val_loss',
                               patience=2,
                               restore_best_weights=True)
    checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

    history = model.fit(partial_x_train,
                        partial_y_train,
                        epochs=epochs,
                        batch_size=512,
                        validation_data=(X_val, y_val),
                        verbose=1,
                        callbacks=[early_stop, checkpoint])
    results = model.evaluate(X_test, y_test, verbose=2)
    print(results)
    return model, history

In [87]:
word_vector_dim = 32
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 32)          512000    
_________________________________________________________________
lstm_6 (LSTM)                (None, 16)                3136      
_________________________________________________________________
dense_12 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 9         
Total params: 515,281
Trainable params: 515,281
Non-trainable params: 0
_________________________________________________________________


In [88]:
train_path = os.getenv('HOME')+'/aiffel/sp_tokenizer/data/nsmc/ratings_train.txt'
test_path = os.getenv('HOME')+'/aiffel/sp_tokenizer/data/nsmc/ratings_test.txt'

model, history = review_model(train_path, test_path, model, tokenizer = 'se')

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=spm_input.txt --model_prefix=korean_spm --vocab_size=16000  --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: spm_input.txt
  input_format: 
  model_prefix: korean_spm
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
  bos_id: 1
  eos_id: 2
  pad_id: 0
  unk_piece: <unk>


Train OOV% : 59.16
Test  OOV% : 59.1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
1536/1536 - 3s - loss: 0.3521 - accuracy: 0.8483
[0.3521427512168884, 0.8483214378356934]


In [89]:
word_vector_dim = 32
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()
model, history = review_model(train_path, test_path, model, tokenizer = 'mecab')

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 32)          512000    
_________________________________________________________________
lstm_7 (LSTM)                (None, 16)                3136      
_________________________________________________________________
dense_14 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 9         
Total params: 515,281
Trainable params: 515,281
Non-trainable params: 0
_________________________________________________________________
Train OOV% : 62.1
Test  OOV% : 62.22
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
1536/1536 - 3s - loss: 0.3554 - accuracy: 0.8467
[0.35537293553352356, 0.8466938138008118]


In [90]:
word_vector_dim = 32
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()
model, history = review_model(train_path, test_path, model, tokenizer = 'sp_ke_park')

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 32)          512000    
_________________________________________________________________
lstm_8 (LSTM)                (None, 16)                3136      
_________________________________________________________________
dense_16 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 9         
Total params: 515,281
Trainable params: 515,281
Non-trainable params: 0
_________________________________________________________________


sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=/aiffel/aiffel/sp_tokenizer/data/korean-english-park.train.ko.temp --model_prefix=korean_spm --vocab_size=16000 --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /aiffel/aiffel/sp_tokenizer/data/korean-english-park.train.ko.temp
  input_format: 
  model_prefix: korean_spm
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  h

Train OOV% : 46.68
Test  OOV% : 46.98
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
1536/1536 - 3s - loss: 0.3972 - accuracy: 0.8227
[0.3971611559391022, 0.8226653337478638]


In [92]:
"""
# hannanum은 java.lang.ArrayIndexOutOfBoundsException 예외 발생, 특수문자 때문인 것으로 추정됨.
word_vector_dim = 32
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()
model, history = review_model(train_path, test_path, model, tokenizer = 'hannanum')"""

"word_vector_dim = 32\nmodel = tf.keras.Sequential()\nmodel.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))\nmodel.add(tf.keras.layers.LSTM(16))\nmodel.add(tf.keras.layers.Dense(8, activation='relu'))\nmodel.add(tf.keras.layers.Dense(1, activation='sigmoid'))\n\nmodel.summary()\nmodel, history = review_model(train_path, test_path, model, tokenizer = 'hannanum')"

In [94]:
"""
# 너무 오래걸림
word_vector_dim = 32
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()
model, history = review_model(train_path, test_path, model, tokenizer = 'kkma')"""

"\n# 너무 오래걸림\nword_vector_dim = 32\nmodel = tf.keras.Sequential()\nmodel.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))\nmodel.add(tf.keras.layers.LSTM(16))\nmodel.add(tf.keras.layers.Dense(8, activation='relu'))\nmodel.add(tf.keras.layers.Dense(1, activation='sigmoid'))\n\nmodel.summary()\nmodel, history = review_model(train_path, test_path, model, tokenizer = 'kkma')"

In [100]:
word_vector_dim = 32
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()
model, history = review_model(train_path, test_path, model, tokenizer = 'se', vocab_size=24000)

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, None, 32)          512000    
_________________________________________________________________
lstm_14 (LSTM)               (None, 16)                3136      
_________________________________________________________________
dense_28 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 9         
Total params: 515,281
Trainable params: 515,281
Non-trainable params: 0
_________________________________________________________________


sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=spm_input.txt --model_prefix=korean_spm --vocab_size=24000  --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: spm_input.txt
  input_format: 
  model_prefix: korean_spm
  model_type: UNIGRAM
  vocab_size: 24000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
  bos_id: 1
  eos_id: 2
  pad_id: 0
  unk_piece: <unk>


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
1536/1536 - 3s - loss: 0.3611 - accuracy: 0.8416
[0.36105504631996155, 0.8416480422019958]


In [101]:
"""word_vector_dim = 32
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()
model, history = review_model(train_path, test_path, model, tokenizer = 'okt')"""

"word_vector_dim = 32\nmodel = tf.keras.Sequential()\nmodel.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))\nmodel.add(tf.keras.layers.LSTM(16))\nmodel.add(tf.keras.layers.Dense(8, activation='relu'))\nmodel.add(tf.keras.layers.Dense(1, activation='sigmoid'))\n\nmodel.summary()\nmodel, history = review_model(train_path, test_path, model, tokenizer = 'okt')"

# 회고

otk, hannanum, kkma를 진행해보고싶었는데, 속도가 느린건지 잘 돌아가지 않아서 시도해보지 못한 게 아쉽다.