<a href="https://colab.research.google.com/github/finale22/NLP_study/blob/main/one_hot_encoding_of_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. one-hot encoding 구현

In [40]:
import numpy as np

In [41]:
# Two toy sentences that serve as the corpus for the hand-written one-hot encoder.
samples = [
    "AlphaGo has intelligized through deep learning.",
    "Recurrent neural networks are a kind of deep learning algorithm.",
]

In [42]:
# Index of every token found in the data: maps token string -> integer index.
token_index = dict()

In [43]:
# Word-level tokenization: split every sample on whitespace and hand each
# previously unseen token the next free index. Index 0 is never assigned.
for sample in samples:
    for word in sample.split():
        # setdefault only inserts when the token is missing, so tokens that
        # were already indexed keep their original number.
        token_index.setdefault(word, len(token_index) + 1)

token_index

{'AlphaGo': 1,
 'has': 2,
 'intelligized': 3,
 'through': 4,
 'deep': 5,
 'learning.': 6,
 'Recurrent': 7,
 'neural': 8,
 'networks': 9,
 'are': 10,
 'a': 11,
 'kind': 12,
 'of': 13,
 'learning': 14,
 'algorithm.': 15}

In [44]:
# Only the first max_length words of each sample (here, one sentence) are encoded.
max_length = 10

# 3-D result array: len(samples) matrices, each of shape
# max_length x (max(token_index.values()) + 1). The "+ 1" makes room for
# index 0, which is never assigned to a token.
results = np.zeros((len(samples), max_length, 1 + max(token_index.values())))

In [45]:
# Fill in the one-hot vectors: results[i, j, k] == 1 means that the j-th word
# of sample i has index k in token_index.
for i, sample in enumerate(samples):
    # Truncate the word list itself instead of materializing every
    # (position, word) pair and slicing afterwards.
    for j, word in enumerate(sample.split()[:max_length]):
        index = token_index.get(word)
        if index is None:
            # A word missing from token_index would otherwise be used as a
            # None index, which NumPy interprets as np.newaxis and which
            # would set the entire row to 1. Leave such rows all-zero.
            continue
        results[i, j, index] = 1.

print(results)

[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]]

# 2. one-hot encoding Tokenizer로 구현

In [46]:
from keras.preprocessing.text import Tokenizer

In [47]:
# Same two toy sentences, redefined here as input for the Keras Tokenizer.
samples = [
    "AlphaGo has intelligized through deep learning.",
    "Recurrent neural networks are a kind of deep learning algorithm.",
]

In [48]:
tokenizer = Tokenizer(num_words=1000) # create a Tokenizer restricted to the most frequent words (num_words=1000)
tokenizer.fit_on_texts(samples) # build the word index from the samples

sequences = tokenizer.texts_to_sequences(samples) # convert each string into a list of integer word indices

sequences

[[3, 4, 5, 6, 1, 2], [7, 8, 9, 10, 11, 12, 13, 1, 2, 14]]

In [49]:
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary') # binary matrix with one row per sample; a 1 marks a word index that occurs in that sample

one_hot_results

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.]])

In [50]:
word_index = tokenizer.word_index # retrieve the word -> index mapping computed by the tokenizer

word_index

{'deep': 1,
 'learning': 2,
 'alphago': 3,
 'has': 4,
 'intelligized': 5,
 'through': 6,
 'recurrent': 7,
 'neural': 8,
 'networks': 9,
 'are': 10,
 'a': 11,
 'kind': 12,
 'of': 13,
 'algorithm': 14}