In [46]:
# NOTE: PLEASE MAKE SURE YOU ARE RUNNING THIS IN A PYTHON3 ENVIRONMENT

import tensorflow as tf
print(tf.__version__)

# This is needed for the iterator over the data
# But not necessary if you have TF 2.0 installed
#!pip install tensorflow==2.0.0-beta0


# !pip install -q tensorflow-datasets

2.4.0


In [47]:
import tensorflow_datasets as tfds
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

In [48]:
import numpy as np

train_data, test_data = imdb['train'], imdb['test']

# 문장과 라벨에 대한 리스트 생성
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

# Python3에서는 str(s.tonumpy()) 사용
# train_data의 string부분과 label부분 불러와서 리스트에 저장
# 둘다 numpy로 바꾸고 단어는 다시 스트링으로
for s, l in train_data: 
  training_sentences.append(str(s.numpy()))
  training_labels.append(l.numpy())

for s,l in test_data:
  testing_sentences.append(str(s.numpy()))
  testing_labels.append(l.numpy())

training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [49]:
# 파라미터 설정

# 10000개의 단어, 16차원
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = "<OOV>"

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# 1 토크나이저 생성
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
# 2 학습데이터에 적용
tokenizer.fit_on_texts(training_sentences)
# 3 인덱스 추출
word_index = tokenizer.word_index
print(len(word_index))
# 4 문장들을 단어인덱스의 리스트로
sequences = tokenizer.texts_to_sequences(training_sentences)
print(sequences[0])
# 5 패딩 붙여
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
print(padded[0])

# 토크나이저는 테스트문장이아닌!! 훈련문장에서만 적합
# 테스트는 시퀀스만 적용
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length)

86539
[59, 12, 14, 35, 439, 400, 18, 174, 29, 1, 9, 33, 1378, 3401, 42, 496, 1, 197, 25, 88, 156, 19, 12, 211, 340, 29, 70, 248, 213, 9, 486, 62, 70, 88, 116, 99, 24, 5740, 12, 3317, 657, 777, 12, 18, 7, 35, 406, 8228, 178, 2477, 426, 2, 92, 1253, 140, 72, 149, 55, 2, 1, 7525, 72, 229, 70, 2962, 16, 1, 2880, 1, 1, 1506, 4998, 3, 40, 3947, 119, 1608, 17, 3401, 14, 163, 19, 4, 1253, 927, 7986, 9, 4, 18, 13, 14, 4200, 5, 102, 148, 1237, 11, 240, 692, 13, 44, 25, 101, 39, 12, 7232, 1, 39, 1378, 1, 52, 409, 11, 99, 1214, 874, 145, 10]
[   0    0   59   12   14   35  439  400   18  174   29    1    9   33
 1378 3401   42  496    1  197   25   88  156   19   12  211  340   29
   70  248  213    9  486   62   70   88  116   99   24 5740   12 3317
  657  777   12   18    7   35  406 8228  178 2477  426    2   92 1253
  140   72  149   55    2    1 7525   72  229   70 2962   16    1 2880
    1    1 1506 4998    3   40 3947  119 1608   17 3401   14  163   19
    4 1253  927 7986    9    4   18   

In [66]:
# 패딩된 목록을 살펴보려면?
# 토큰을 단어로 다시 해독 하기위해 뒤집어야함

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
  return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_review(padded[1]))
print('\n')
print(training_sentences[1])

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?


b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of what was causing them or why. I admit, I may have missed part of the film, but i watched the majority of it and everything just seemed to happen of its own accord without any real concern for anything else. I cant recommend this film at all.'


In [51]:
model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(6, activation='relu'),
      tf.keras.layers.Dense(1, activation='sigmoid') # 긍정, 부정 결정
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 11526     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [52]:
num_epochs = 10
model.fit(padded, training_labels_final, epochs=num_epochs, 
          validation_data=(testing_padded, testing_labels_final))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0fd1213860>

In [53]:
# 모델의 첫번째 레이어의 첫번째 하이퍼 파라미터
e = model.layers[0]
weights = e.get_weights()
print(weights)

[array([[-0.0249884 , -0.05608271, -0.01796454, ..., -0.03067191,
        -0.01025001, -0.00353848],
       [ 0.03361741, -0.14514115, -0.03363437, ...,  0.02246208,
         0.09371507,  0.0123951 ],
       [-0.01348947, -0.12386089, -0.04216803, ...,  0.05610391,
         0.05610269,  0.00875582],
       ...,
       [ 0.13037515, -0.00837902,  0.08629876, ..., -0.12398137,
        -0.06137634,  0.09817248],
       [ 0.08552732, -0.0132495 , -0.01277668, ..., -0.06396361,
         0.00276506,  0.01976584],
       [-0.08214451, -0.09223964,  0.00202092, ...,  0.10990557,
         0.00491677, -0.03085126]], dtype=float32)]


In [54]:
# 2차원배열이므로 속에 꺼 빼내
weights = weights[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(10000, 16)


In [55]:
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

# 메타데이터배열 out_m에는 그냥 단어를씀.
# 벡터파일 out_v에는 단어에 대한 벡터의 각 차원의 계수 작성

for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word+"\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [56]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [67]:
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences(sentence)
print(sequence)

[[11], [], [1431], [966], [4], [1537], [1537], [4715], [], [790], [2019], [11], [2929], [2184], [], [790], [2019], [11], [579], [], [11], [579], [], [4], [1782], [4], [4517], [11], [2929], [1275], [], [], [2019], [1003], [2929], [966], [579], [790], []]
