In [1]:
from konlpy.tag import Okt

In [2]:
# Create KoNLPy's Okt tagger and split the sample sentence into tokens.
# As the printed list shows, particles such as '는' and '를' come out as
# separate tokens from the words they attach to.
okt = Okt()
tokens = okt.morphs('나는 자연어 처리를 배운다')
print(tokens)

['나', '는', '자연어', '처리', '를', '배운다']


In [4]:
# Build the vocabulary: every distinct token gets one integer index, assigned
# in order of first appearance. dict.fromkeys() removes repeated tokens (while
# keeping their order) *before* enumerating, so the indices are always the
# contiguous range 0 .. vocab_size - 1. Enumerating the raw token list instead
# would leave gaps whenever a token repeats, and any vector sized by
# len(word_to_index) would then be too short for the largest index.
word_to_index = {word: idx for idx, word in enumerate(dict.fromkeys(tokens))}
print('Vocabulary:', word_to_index)

Vocabulary: {'나': 0, '는': 1, '자연어': 2, '처리': 3, '를': 4, '배운다': 5}


In [5]:
def one_hot_encoding(word, word_to_index):
    """Return the one-hot vector for ``word`` as a list of 0/1 integers.

    Args:
        word: Token to encode; must be a key of ``word_to_index``.
        word_to_index: Mapping from token to its integer index.

    Returns:
        A list containing a single 1 at position ``word_to_index[word]`` and
        0 everywhere else. The list is long enough to address the largest
        index in the vocabulary, so every word of the same vocabulary is
        encoded with the same dimensionality.

    Raises:
        KeyError: If ``word`` is not in ``word_to_index``.
    """
    # Look the word up first so an unknown word fails with KeyError before any
    # sizing work (this also guarantees the mapping is non-empty below).
    index = word_to_index[word]

    # Size by the largest index rather than only by the number of entries:
    # a vocabulary with gaps or 1-based indices has max index >= len(), which
    # would otherwise overflow the vector. Never shorter than len() so results
    # for contiguous 0-based vocabularies are unchanged.
    size = max(len(word_to_index), max(word_to_index.values()) + 1)

    one_hot_vector = [0] * size
    one_hot_vector[index] = 1
    return one_hot_vector

In [6]:
# Encode one token from the vocabulary; the cell's value is the resulting
# one-hot list, with the 1 placed at that token's vocabulary index.
one_hot_encoding(word="자연어", word_to_index=word_to_index)

[0, 0, 1, 0, 0, 0]

### One-Hot Encoding Using Keras

In [7]:
# Sample sentence used to fit the tokenizer. Several words repeat on purpose
# ('갈래' three times, '점심' and '햄버거' twice each), so word frequencies differ.
text = "나랑 점심 먹으러 갈래 점심 메뉴는 햄버거 갈래 갈래 햄버거 최고야"

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [10]:
# Fit a Keras Tokenizer on the sentence to build its word -> index vocabulary.
# As the printed vocabulary shows, indices start at 1 (0 is not assigned to any
# word) and the more frequent words receive the smaller indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
print('Vocab:', tokenizer.word_index)

Vocab: {'갈래': 1, '점심': 2, '햄버거': 3, '나랑': 4, '먹으러': 5, '메뉴는': 6, '최고야': 7}


In [12]:
# Map a shorter sentence onto the fitted vocabulary's integer indices.
sub_text = "점심 먹으러 갈래 메뉴는 햄버거 최고야"
# texts_to_sequences() returns one index sequence per input text; exactly one
# text is passed in, so the single resulting sequence is unpacked directly.
[encoded] = tokenizer.texts_to_sequences([sub_text])
print(encoded)

[2, 5, 1, 6, 3, 7]


In [13]:
# One-hot encode the index sequence, one row per word.
# num_classes is pinned to vocabulary size + 1 (Tokenizer indices start at 1,
# so column 0 is never set). Without it, to_categorical infers the width from
# max(encoded) + 1, and a sentence that happens not to contain the
# highest-indexed word would yield narrower vectors than the vocabulary needs.
one_hot = to_categorical(encoded, num_classes=len(tokenizer.word_index) + 1)
print(one_hot)

[[0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]
