In [1]:
import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [3]:
import numpy as np

In [4]:
from tensorflow.keras.preprocessing.text import one_hot

In [5]:
# Toy corpus: seven short English phrases used by every exercise in this notebook.
# NOTE(review): "wnat" and "mand" look like typos for "want" / "man" — confirm
# before correcting, since any change alters the integer encodings of these docs.
docs = [
    "glass of orange juice",
    "bottle of mango juice",
    "glass of mango shake",
    "drink bottle of banana shake",
    "I wnat a glass of cold water",
    "The king and the queen",
    "mand and woman",
]

In [6]:
# Size of the index space: passed to the text encoder and used as the
# Embedding layer's input dimension.
vocab_size = 10_000

### 문제1: 원핫인코딩 연습해보자

In [7]:
# Encode every document as a list of integer word indices.
# Despite its name, `one_hot` yields one integer per word (see the printed
# result), not a one-hot vector.
encoded_docs = list(map(lambda text: one_hot(text, vocab_size), docs))

In [8]:
# Show the integer-encoded documents: one list of word indices per document.
print(encoded_docs)

[[5921, 2686, 4800, 4480], [1119, 2686, 6445, 4480], [5921, 2686, 6445, 8020], [8560, 1119, 2686, 2633, 8020], [8078, 5011, 7029, 5921, 2686, 507, 2944], [5883, 1425, 2165, 5883, 9471], [6915, 2165, 2302]]


# 단어 임베딩 (Word Embeddings)

In [10]:
# Embedding length: each word is represented by a vector of 5 units.
embedding_length = 5
# Sentence length: every document is padded to this many tokens.
max_doc_len = 10

### 문제2: 패딩을 시도해보자

In [29]:
# Exercise: use pad_sequences (signature below) to pad every sequence to a
# length of 10. Use 'post' instead of the default 'pre'.
# (https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences)
# tf.keras.preprocessing.sequence.pad_sequences(
#     sequences, maxlen=None, dtype='int32', padding='pre',
#     truncating='pre', value=0.0)

In [13]:
# Pad each document at the end (and truncate at the end if it is too long)
# so that every row has exactly max_doc_len entries.
# NOTE: this rebinds encoded_docs from a list of lists to a 2-D array; later
# cells rely on that name, so it is kept.
encoded_docs = pad_sequences(
    encoded_docs,
    maxlen=max_doc_len,
    padding='post',
    truncating='post',
)

In [14]:
# Show the padded matrix: one row per document, zeros fill the unused tail.
print(encoded_docs)

[[5921 2686 4800 4480    0    0    0    0    0    0]
 [1119 2686 6445 4480    0    0    0    0    0    0]
 [5921 2686 6445 8020    0    0    0    0    0    0]
 [8560 1119 2686 2633 8020    0    0    0    0    0]
 [8078 5011 7029 5921 2686  507 2944    0    0    0]
 [5883 1425 2165 5883 9471    0    0    0    0    0]
 [6915 2165 2302    0    0    0    0    0    0    0]]


### 문제3: 5개의 유닛으로 만드는 임베딩 모델을 구축해보자.

In [15]:
# Hint: look up Embedding() (https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding)
# tf.keras.layers.Embedding(
#     input_dim, output_dim, embeddings_initializer='uniform',
#     embeddings_regularizer=None, activity_regularizer=None,
#     embeddings_constraint=None, mask_zero=False, input_length=None, **kwargs
# )
# Of these, define input_dim, output_dim and input_length.

In [19]:
# Embedding-only model: maps each of the max_doc_len word indices in a
# document to an embedding_length-dimensional vector.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_length,
        input_length=max_doc_len,
    ),
])

# The model is never trained in this notebook; it is compiled and summarised
# only so its structure can be inspected and predict() can be called.
model.compile(optimizer='adam', loss='mse')
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 10, 5)             50000     
Total params: 50,000
Trainable params: 50,000
Non-trainable params: 0
_________________________________________________________________


### 문제4: encoded_docs를 (배치크기, 문장길이, 임베딩차원)의 모양으로 만들어라.

In [20]:
# Run the padded documents through the embedding model built above.
# The result has one embedding vector per token position:
# (number of documents, max_doc_len, embedding_length).
output = model.predict(encoded_docs)
print(output.shape, output, sep='\n')

(7, 10, 5)
[[[-0.04889632  0.00252714  0.02168825 -0.00817809  0.04854068]
  [ 0.00541852  0.03419889 -0.02202311  0.01589291 -0.00808783]
  [ 0.03997257 -0.02933958  0.03178154  0.02849341 -0.02209885]
  [-0.02928739  0.00815045 -0.044402    0.01274708  0.0162443 ]
  [ 0.01284481  0.03459806  0.04131451  0.02524319 -0.00135282]
  [ 0.01284481  0.03459806  0.04131451  0.02524319 -0.00135282]
  [ 0.01284481  0.03459806  0.04131451  0.02524319 -0.00135282]
  [ 0.01284481  0.03459806  0.04131451  0.02524319 -0.00135282]
  [ 0.01284481  0.03459806  0.04131451  0.02524319 -0.00135282]
  [ 0.01284481  0.03459806  0.04131451  0.02524319 -0.00135282]]

 [[ 0.00339971  0.00449334 -0.00039792  0.02644164 -0.01212122]
  [ 0.00541852  0.03419889 -0.02202311  0.01589291 -0.00808783]
  [-0.00629584 -0.03414214 -0.01625671 -0.03276785  0.04267308]
  [-0.02928739  0.00815045 -0.044402    0.01274708  0.0162443 ]
  [ 0.01284481  0.03459806  0.04131451  0.02524319 -0.00135282]
  [ 0.01284481  0.03459806 

### 문제5: LSTM 모델을 구축해보라. 

In [21]:
# Embedding followed by an LSTM whose hidden state has 64 dimensions.
# NOTE(review): the Embedding layer keeps the default mask_zero=False, so the
# LSTM also consumes the zero-padded positions — consider mask_zero=True.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_length,
        input_length=max_doc_len,
    ),
    LSTM(64),
])
model.compile(optimizer='adam', loss='mse')
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 10, 5)             50000     
_________________________________________________________________
lstm (LSTM)                  (None, 64)                17920     
Total params: 67,920
Trainable params: 67,920
Non-trainable params: 0
_________________________________________________________________


### 문제6: 문제5의 LSTM 모델의 출력을 확인하라.

In [22]:
# Compute the LSTM output for every document and inspect its dimensions:
# one 64-dimensional vector per document.
output = model.predict(encoded_docs)
print(output.shape, output, sep='\n')

(7, 64)
[[ 5.35600260e-03 -6.15130179e-03  2.79492437e-04  2.15256092e-04
   6.11688616e-03 -3.68430838e-03  9.31514369e-04  5.27960598e-04
   2.61307717e-03  2.67142383e-03 -2.71752104e-03 -8.82898737e-03
   4.17436194e-03 -4.22805548e-03  1.13755814e-03  1.04035507e-03
   1.71633437e-03  6.04047766e-03  5.71901584e-03 -3.72357015e-03
   8.67256313e-04  7.07795331e-03 -5.76936036e-05  5.92362694e-03
  -2.35443213e-03  2.73992633e-03 -7.47268787e-04 -5.49585605e-03
  -5.88087458e-03 -6.05518278e-03  8.08181800e-03 -4.15831571e-03
   2.78576999e-03  6.07422180e-03  2.78777815e-03 -1.24800671e-03
   5.45005838e-04  4.88117430e-03  4.30778787e-03 -1.25636719e-03
  -4.03690245e-03  4.05657152e-03 -4.41830234e-05 -4.55471163e-04
  -4.86987038e-03  4.87070793e-05  6.52943039e-03 -1.52577041e-03
   3.47894500e-03 -1.98283861e-03 -1.45035074e-03 -1.67792430e-03
   1.03880870e-04 -3.59702785e-03 -2.10074242e-03 -1.00562703e-02
  -1.06676237e-03 -3.15825036e-03 -1.20852480e-03 -2.15679477e-03
  

### 문제7: 감성분석을 할 수 있도록 LSTM을 이용한 로지스틱(이진 분류) 예측 모델을 만들어라.

In [24]:
# Hint: Embedding -> LSTM -> Dense; there is a single output, so use sigmoid as the activation.

In [25]:
# Binary sentiment classifier: Embedding -> LSTM(64) -> Dense(1, sigmoid).
# The single sigmoid unit outputs a value in (0, 1) per document.
# NOTE(review): the Embedding layer keeps the default mask_zero=False, so the
# LSTM also consumes the zero-padded positions — consider mask_zero=True.
model=Sequential()
model.add(Embedding(vocab_size, embedding_length, input_length=max_doc_len))
model.add(LSTM(units=64))
model.add(Dense(1, activation='sigmoid'))

# Keras registers this loss under the identifier 'binary_crossentropy'.
# The previous spelling 'binary_cross_entropy' is not a known loss name and is
# rejected as an unknown loss function once the loss is resolved (it went
# unnoticed here because the notebook only calls predict()).
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 10, 5)             50000     
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                17920     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 67,985
Trainable params: 67,985
Non-trainable params: 0
_________________________________________________________________


### 문제8: 문제 7의 확장된 LSTM 모델의 출력을 확인하라

In [27]:
# Compute the output of the extended LSTM model and check its dimensions.

In [28]:
# Predict with the (untrained) sentiment model: one sigmoid score per document.
output = model.predict(encoded_docs)
print(output.shape, output, sep='\n')

(7, 1)
[[0.4986678 ]
 [0.49853685]
 [0.49872452]
 [0.49894178]
 [0.49891144]
 [0.49867383]
 [0.498584  ]]
