In [1]:
!pip install gensim



# 텍스트 분류

Copied and modified from the TensorFlow tutorial: https://www.tensorflow.org/tutorials/keras/text_classification

In [2]:
## Settings
VOCA_SIZE = 4000 # size of the vocabulary
EMBEDDING_SIZE = 64 # size of the vector each word is embedded into

# 데이터

## 데이터 로딩

In [3]:
import tensorflow as tf
import numpy as np

def load_imdb_data(num_words=VOCA_SIZE):
  """Load the IMDB review dataset together with its word-to-index dictionary.

  Args:
    num_words: Vocabulary size forwarded to
      `tf.keras.datasets.imdb.load_data`. Defaults to `VOCA_SIZE`.

  Returns:
    A `(dataset, word_index)` tuple. `dataset` is
    `((train_x, train_y), (test_x, test_y))`; `word_index` is a dictionary
    mapping each word to its integer index,
    e.g. `{'fawn': 34701, 'tsukino': 52006, 'nunnery': 52007, ...}`.
  """
  print('Loading data...')

  # Data: (train_x, train_y), (test_x, test_y)
  # Pass the caller's `num_words` through; the previous version always used
  # VOCA_SIZE here, silently ignoring the argument.
  dataset = tf.keras.datasets.imdb.load_data(num_words=num_words)

  # Dictionary mapping words to integer indices
  word_index = tf.keras.datasets.imdb.get_word_index()

  return dataset, word_index


In [4]:
# Unpack both splits plus the vocabulary, then report the shape of every array.
((train_x, train_y), (test_x, test_y)), word_index = load_imdb_data()
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape, sep='\n')

Loading data...
(25000,)
(25000,)
(25000,)
(25000,)


## 데이터 보기

In [5]:
# First training review (as token ids) followed by its label.
print(train_x[0], train_y[0], sep='\n')

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 2, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
1


In [6]:
# The first few indices are reserved for special tokens, so every word index
# is shifted up by 3. The guard keeps this cell idempotent: once "<PAD>" maps
# to 0 the shift has already been applied, and re-running the cell must not
# shift the indices a second time.
if word_index.get("<PAD>") != 0:
    word_index = {k:(v+3) for k,v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

# Inverse mapping: integer index -> word
reverse_word_index = {value: key for key, value in word_index.items()}
# reverse_word_index = {34704: 'fawn', 52009: 'tsukino', 52010: 'nunnery', ... }

def decode_review(text):
    """Convert a sequence of token ids back into a space-separated string.

    Each id is looked up in `reverse_word_index`; ids that are not present
    there are rendered as '?'.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)

# Raw token ids of the first review, then the same review decoded to words.
print(train_x[0], decode_review(train_x[0]), sep='\n')

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 2, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
<START> this film was just brilliant casting location scenery story direction <UNK> rea

## 각 데이터의 길이

In [7]:
# Lengths of the first five reviews, one per line (they differ before padding).
print(*map(len, train_x[:5]), sep='\n')

218
189
141
550
147


## 데이터 길이 일정하게 하기

In [8]:
# First review and its length before padding.
print(train_x[0], len(train_x[0]), sep='\n')

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 2, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
218


In [9]:
from tensorflow.keras.preprocessing import sequence

# Bring every review to exactly 400 tokens, appending padding at the end
# ('post'), and apply the same transformation to both splits.
train_x, test_x = (
    sequence.pad_sequences(split, maxlen=400, padding='post')
    for split in (train_x, test_x)
)
print(train_x.shape, test_x.shape, sep='\n')

(25000, 400)
(25000, 400)


In [10]:
# First review and its length after padding.
print(train_x[0], len(train_x[0]), sep='\n')

[   1   14   22   16   43  530  973 1622 1385   65  458    2   66 3941
    4  173   36  256    5   25  100   43  838  112   50  670    2    9
   35  480  284    5  150    4  172  112  167    2  336  385   39    4
  172    2 1111   17  546   38   13  447    4  192   50   16    6  147
 2025   19   14   22    4 1920    2  469    4   22   71   87   12   16
   43  530   38   76   15   13 1247    4   22   17  515   17   12   16
  626   18    2    5   62  386   12    8  316    8  106    5    4 2223
    2   16  480   66 3785   33    4  130   12   16   38  619    5   25
  124   51   36  135   48   25 1415   33    6   22   12  215   28   77
   52    5   14  407   16   82    2    8    4  107  117    2   15  256
    4    2    7 3766    5  723   36   71   43  530  476   26  400  317
   46    7    4    2 1029   13  104   88    4  381   15  297   98   32
 2071   56   26  141    6  194    2   18    4  226   22   21  134  476
   26  480    5  144   30    2   18   51   36   28  224   92   25  104
    4 

# embedding 학습하며 예측 실행

In [11]:
# Download a gzip file and decompress it (equivalent of `!gzip -d ---.gz`)
def download_and_decompress_gzip(http_link, target_link):
  """Download a gzip archive and write its decompressed content to a file.

  Nothing is done when `target_link` already exists. All intermediate files
  live in a temporary directory next to the target, and the finished file is
  moved into place only after decompression succeeded. A failed download or
  a corrupt archive therefore never leaves a partial `target_link` behind
  that a later call would mistake for a complete file.

  Args:
    http_link: URL of the gzip-compressed file.
    target_link: Path where the decompressed file is written.

  Raises:
    OSError: if the download fails, the archive is not valid gzip data, or
      the target directory does not exist / is not writable.
  """
  import gzip, shutil, os, tempfile
  from urllib.request import urlretrieve

  if os.path.exists(target_link): return

  # Work inside the target's own directory so the final os.replace() stays on
  # one filesystem.
  target_dir = os.path.dirname(os.path.abspath(target_link))

  with tempfile.TemporaryDirectory(dir=target_dir) as tmp_dir:
    gz_path = os.path.join(tmp_dir, 'download.gz')
    out_path = os.path.join(tmp_dir, 'decompressed')

    # An explicit destination keeps the downloaded archive inside tmp_dir, so
    # it is removed together with the directory.
    urlretrieve(http_link, gz_path)

    with gzip.open(gz_path, 'rb') as gz:
      with open(out_path, 'wb') as f_out:
        shutil.copyfileobj(gz, f_out)

    # Publish the result only once it is complete.
    os.replace(out_path, target_link)

In [12]:
# Compute the set of distinct words
def make_unique_word_set(*X):
  """Collect every distinct word that occurs in the given datasets.

  Each sentence of each dataset is decoded with `decode_review` and split on
  single spaces. The size of the resulting set is printed before the set is
  returned.
  """
  word_set = {
      word
      for dataset in X
      for sentence in dataset
      for word in decode_review(sentence).split(" ")
  }

  print(len(word_set))
  return word_set

# Distinct words found across the train and test sequences.
word_set = make_unique_word_set(train_x, test_x)

3999


In [13]:
# Functions that load an external word2vec model

def load_word2vec_model():
  """Load the 'word2vec-google-news-300' vectors through gensim's downloader."""
  from gensim import downloader
  return downloader.load('word2vec-google-news-300')

def load_slim_word2vec_model():
  """Load the slim GoogleNews word2vec vectors, downloading them if needed.

  The gzip archive is fetched and decompressed to
  'GoogleNews-vectors-negative300-SLIM.bin' by `download_and_decompress_gzip`,
  then read as a binary word2vec file.
  """
  from gensim.models import KeyedVectors

  # Fetch the smaller model.
  model_path = 'GoogleNews-vectors-negative300-SLIM.bin'
  download_and_decompress_gzip(
      "https://github.com/eyaler/word2vec-slim/raw/master/GoogleNews-vectors-negative300-SLIM.bin.gz",
      model_path,
  )

  vectors = KeyedVectors.load_word2vec_format(model_path, binary=True)
  return vectors

In [14]:
# Define Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM

# CNN Model
def create_cnn_model(voca_size=VOCA_SIZE, embedding_size=EMBEDDING_SIZE, max_len=400):
  """Build and compile a 1D-convolutional binary text classifier.

  Args:
    voca_size: Number of rows of the embedding table (vocabulary size).
    embedding_size: Dimensionality of each embedding vector.
    max_len: Length of the input token-id sequences. Defaults to 400, the
      value previously hard-coded in the Input layer.

  Returns:
    A compiled `Sequential` model (binary cross-entropy loss, Adam optimizer,
    accuracy metric) with a single sigmoid output.
  """
  model = Sequential()
  # Pass the shape as a tuple: a bare int is rejected by newer Keras versions.
  model.add(Input(shape=(max_len,)))
  model.add(Embedding(voca_size, embedding_size))  # text is fed through an embedding
  model.add(Dropout(0.2))
  model.add(Conv1D(250, 3))
  model.add(GlobalMaxPooling1D())
  model.add(Dense(250))
  model.add(Dropout(0.2))
  model.add(Activation('relu'))
  model.add(Dense(1))
  model.add(Activation('sigmoid'))

  model.compile(loss='binary_crossentropy',
                optimizer='adam', metrics=['accuracy'])

  return model

# RNN Model
def create_rnn_model(voca_size=VOCA_SIZE, embedding_size=EMBEDDING_SIZE, max_len=400):
  """Build and compile a bidirectional-LSTM binary text classifier.

  Args:
    voca_size: Number of rows of the embedding table (vocabulary size).
    embedding_size: Dimensionality of each embedding vector.
    max_len: Length of the input token-id sequences. Defaults to 400, the
      value previously hard-coded in the Input layer.

  Returns:
    A compiled `Sequential` model (binary cross-entropy loss, Adam optimizer,
    accuracy metric) with a single sigmoid output.
  """
  model = Sequential()
  # Pass the shape as a tuple: a bare int is rejected by newer Keras versions.
  model.add(Input(shape=(max_len,)))
  model.add(Embedding(voca_size, embedding_size))
  model.add(Dropout(0.2))
  # The bidirectional LSTM takes the place of the Conv1D + pooling pair used
  # in create_cnn_model.
  model.add(Bidirectional(LSTM(64)))
  model.add(Dense(250))
  model.add(Dropout(0.2))
  model.add(Activation('relu'))
  model.add(Dense(1))
  model.add(Activation('sigmoid'))

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model

# RNN Model: From word2vec
def create_rnn_model_from_word2vec(word2vec, voca_size=VOCA_SIZE, word_to_index=None, max_len=400):
  """Build and compile a bidirectional-LSTM classifier on frozen word2vec embeddings.

  Args:
    word2vec: Pre-trained vectors exposing `vector_size`, `word in word2vec`
      and `word2vec[word]`.
    voca_size: Number of rows of the embedding table; token ids must be
      smaller than this.
    word_to_index: Dictionary mapping each word to the token id used in the
      input sequences. When None, the notebook-level `word_index` is used,
      which must already contain the +3 offset for the special tokens.
    max_len: Length of the input token-id sequences (default 400).

  Returns:
    A compiled `Sequential` model whose embedding layer is initialised from
    `word2vec` and is not trainable.
  """
  if word_to_index is None:
    word_to_index = word_index

  embedding_size = word2vec.vector_size
  embedding_matrix = np.zeros((voca_size, embedding_size))

  # Row i of the matrix must hold the vector of the word whose token id is i,
  # because the Embedding layer looks rows up by token id. The previous
  # version used the enumeration order of a set as the row index, which is
  # unrelated to the ids in the data. Words unknown to word2vec (and ids
  # outside the vocabulary) keep an all-zero row.
  for word, idx in word_to_index.items():
    if 0 <= idx < voca_size and word in word2vec:
      embedding_matrix[idx] = word2vec[word]

  model = Sequential()
  # Pass the shape as a tuple: a bare int is rejected by newer Keras versions.
  # The sequence length is defined here, so the Embedding layer needs no
  # separate input_length argument.
  model.add(Input(shape=(max_len,)))
  model.add(Embedding(voca_size,
                      embedding_size,
                      weights=[embedding_matrix],
                      trainable=False
                      )
            )
  model.add(Dropout(0.2))
  model.add(Bidirectional(LSTM(64)))
  model.add(Dense(250))
  model.add(Dropout(0.2))
  model.add(Activation('relu'))
  model.add(Dense(1))
  model.add(Activation('sigmoid'))

  model.compile(loss='binary_crossentropy',
                optimizer='adam', metrics=['accuracy'])

  return model


# Template

In [15]:
# Training hyper-parameters
BATCH_SIZE = 32
EPOCHS = 1

# To train on pre-trained embeddings instead, uncomment one of the loaders
# below and build the model with create_rnn_model_from_word2vec(word2vec).
# word2vec = load_word2vec_model()
# word2vec = load_slim_word2vec_model()
model = create_rnn_model() # create_rnn_model_from_word2vec(word2vec)
model.summary()
# NOTE(review): the test split is also passed as validation data here, so the
# validation metrics are not independent of the final evaluation below.
model.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(test_x, test_y))

# Evaluation
loss, acc = model.evaluate(test_x, test_y)
print("loss =", loss)
print("acc =", acc)

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2021-10-28 13:24:55.501539: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-10-28 13:24:55.501621: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2021-10-28 13:24:55.697424: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-10-28 13:24:55.697604: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 64)           256000    
_________________________________________________________________
dropout (Dropout)            (None, 400, 64)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 250)               32250     
_________________________________________________________________
dropout_1 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation (Activation)      (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 2

2021-10-28 13:24:56.452885: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-28 13:24:56.663167: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-28 13:24:56.682649: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-28 13:24:57.624793: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-28 13:24:57.638388: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2021-10-28 13:27:35.359131: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-28 13:27:35.403942: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-28 13:27:35.411632: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


loss = 0.6564628481864929
acc = 0.6138800382614136
