# 문자열 분류 Template

# 데이터 준비

In [None]:
## 설정
VOCA_SIZE = 4000 # size of the vocabulary (word dictionary)
EMBEDDING_SIZE = 64 # size of the vector each word is embedded into

## 데이터 로딩

In [None]:
import tensorflow as tf
import numpy as np

def load_imdb_data(num_words=VOCA_SIZE):
  """Load the Keras IMDB review dataset together with its word index.

  Args:
    num_words: Vocabulary-size limit forwarded to
      `tf.keras.datasets.imdb.load_data`. Defaults to `VOCA_SIZE`.

  Returns:
    A `(dataset, word_index)` tuple. `dataset` is whatever `load_data`
    returns, i.e. `((train_x, train_y), (test_x, test_y))`, and
    `word_index` is the word -> integer-index dict from `get_word_index`.
  """
  print('Loading data...')

  # Forward the caller's limit. The global VOCA_SIZE used to be passed here
  # unconditionally, so the `num_words` argument had no effect.
  dataset = tf.keras.datasets.imdb.load_data(num_words=num_words)

  # Dictionary mapping each word to its integer index, e.g.
  # word_index = {'fawn': 34701, 'tsukino': 52006, 'nunnery': 52007, ... }
  word_index = tf.keras.datasets.imdb.get_word_index()

  return dataset, word_index


In [None]:
# Load the four IMDB arrays plus the word index, then print one shape per line.
((train_x, train_y), (test_x, test_y)), word_index = load_imdb_data()
print(*(split.shape for split in (train_x, train_y, test_x, test_y)), sep='\n')

Loading data...
(25000,)
(25000,)
(25000,)
(25000,)


## 데이터 보기

In [None]:
# First training review (a sequence of word indices) followed by its label.
print(train_x[0], train_y[0], sep='\n')

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 2, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
1


In [None]:
# The first few indices are reserved for special tokens, so every real word
# is shifted up by 3. The "<PAD>" guard keeps the cell idempotent: running it
# again must not shift the already-shifted indices a second time.
if "<PAD>" not in word_index:
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

# Inverse mapping (index -> word), e.g.
# reverse_word_index = {34704: 'fawn', 52009: 'tsukino', 52010: 'nunnery', ... }
reverse_word_index = {value: key for key, value in word_index.items()}

def decode_review(text):
    """Turn a sequence of word indices back into a space-separated string.

    Each index is looked up in the module-level `reverse_word_index`;
    indices that have no entry are rendered as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)

# Raw index sequence of the first review, then the same review decoded to words.
print(train_x[0], decode_review(train_x[0]), sep='\n')

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 2, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
<START> this film was just brilliant casting location scenery story direction <UNK> rea

## 각 데이터의 길이

In [None]:
# Lengths of the first five training reviews, one per line.
print(*(len(train_x[i]) for i in range(5)), sep='\n')

218
189
141
550
147


## 데이터 길이 일정하게 하기

In [None]:
# First training review and its length, before padding is applied.
print(train_x[0], len(train_x[0]), sep='\n')

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 2, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
218


In [None]:
from tensorflow.keras.preprocessing import sequence

# Bring every review to exactly 400 entries, padding at the end ('post').
train_x, test_x = (
    sequence.pad_sequences(reviews, maxlen=400, padding='post')
    for reviews in (train_x, test_x)
)
print(train_x.shape, test_x.shape, sep='\n')

(25000, 400)
(25000, 400)


In [None]:
# The same first review after padding, followed by its new length.
print(train_x[0], len(train_x[0]), sep='\n')

[   1   14   22   16   43  530  973 1622 1385   65  458    2   66 3941
    4  173   36  256    5   25  100   43  838  112   50  670    2    9
   35  480  284    5  150    4  172  112  167    2  336  385   39    4
  172    2 1111   17  546   38   13  447    4  192   50   16    6  147
 2025   19   14   22    4 1920    2  469    4   22   71   87   12   16
   43  530   38   76   15   13 1247    4   22   17  515   17   12   16
  626   18    2    5   62  386   12    8  316    8  106    5    4 2223
    2   16  480   66 3785   33    4  130   12   16   38  619    5   25
  124   51   36  135   48   25 1415   33    6   22   12  215   28   77
   52    5   14  407   16   82    2    8    4  107  117    2   15  256
    4    2    7 3766    5  723   36   71   43  530  476   26  400  317
   46    7    4    2 1029   13  104   88    4  381   15  297   98   32
 2071   56   26  141    6  194    2   18    4  226   22   21  134  476
   26  480    5  144   30    2   18   51   36   28  224   92   25  104
    4 

# Template

## 단순 RNN

In [None]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM

# Text classifier: Embedding -> BatchNorm -> Bidirectional LSTM -> Dense head.
model = Sequential()
# Keras documents `shape` as a tuple; a bare int (`Input(400)`) is not
# accepted by every Keras version. 400 is the padded review length.
model.add(Input(shape=(400,)))
model.add(Embedding(VOCA_SIZE, EMBEDDING_SIZE))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(64)))
model.add(Dense(250, activation="relu"))
model.add(BatchNormalization())
# Single sigmoid unit, paired with the binary cross-entropy loss below.
model.add(Dense(1, activation="sigmoid"))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# 10% of the training data is held out for validation during fitting.
model.fit(train_x, train_y, batch_size=32, epochs=10, validation_split=0.1)

# Evaluation
loss, acc = model.evaluate(test_x, test_y)
print("loss =", loss)
print("acc =", acc)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 400, 64)           256000    
_________________________________________________________________
batch_normalization (BatchNo (None, 400, 64)           256       
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_4 (Dense)              (None, 250)               32250     
_________________________________________________________________
batch_normalization_1 (Batch (None, 250)               1000      
_________________________________________________________________
activation_4 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                

## Word2Vec 사용 RNN

### Word2Vec 사용 위한 라이브러리 설치

In [None]:
!pip install gensim



### Word2Vec 읽기

In [None]:
def download_and_decompress_gzip(http_link, target_link):
  """Download a gzip archive and decompress it to `target_link`.

  Does nothing when `target_link` already exists. The archive is downloaded
  to a temporary file and decompressed into `target_link + '.part'`; only a
  fully decompressed file is moved onto `target_link`. A failed download or
  a corrupt archive therefore never leaves a truncated target behind that a
  later call would mistake for a finished one.

  Args:
    http_link: URL of the `.gz` file to download.
    target_link: Path where the decompressed file is written.

  Raises:
    OSError: If the download fails or the payload is not valid gzip data
      (propagated from `urlretrieve` / `gzip`).
  """
  import gzip, shutil, os, tempfile
  from urllib.request import urlretrieve

  if os.path.exists(target_link):
    return

  # Explicit temp file so the downloaded archive can be removed afterwards.
  fd, archive_path = tempfile.mkstemp(suffix='.gz')
  os.close(fd)
  # Sits next to the target so the final os.replace stays on one filesystem.
  partial_path = target_link + '.part'

  try:
    urlretrieve(http_link, archive_path)

    with gzip.open(archive_path, 'rb') as gz, open(partial_path, 'wb') as f_out:
      shutil.copyfileobj(gz, f_out)

    os.replace(partial_path, target_link)
  finally:
    # Remove the archive always, and the partial output if we failed midway.
    for leftover in (archive_path, partial_path):
      if os.path.exists(leftover):
        os.remove(leftover)

def get_unique_word_set(train_x, test_x):
  """Return the set of distinct words found in the train and test reviews.

  Every encoded review is decoded with `decode_review` and split on single
  spaces; the resulting words from both collections go into one set.
  """
  vocabulary = set()

  for reviews in (train_x, test_x):
    for encoded_review in reviews:
      vocabulary |= set(decode_review(encoded_review).split(" "))

  return vocabulary

def load_embedding_matrix_from_word2vec(train_x, test_x):
  """Build an embedding matrix from pretrained word2vec vectors.

  Downloads the slim GoogleNews word2vec file if it is not present, loads
  it with gensim, and copies the vector of every word occurring in
  `train_x` / `test_x` into the row given by the module-level `word_index`.

  Args:
    train_x: Encoded training reviews (sequences of word indices).
    test_x: Encoded test reviews (sequences of word indices).

  Returns:
    A NumPy array of shape `(VOCA_SIZE, word2vec.vector_size)`. Rows of
    words that word2vec does not contain stay all-zero.
  """
  model_path = 'GoogleNews-vectors-negative300-SLIM.bin'

  # Download and decompress the word2vec file (skipped if already present).
  download_and_decompress_gzip(
      "https://github.com/eyaler/word2vec-slim/raw/master/GoogleNews-vectors-negative300-SLIM.bin.gz",
      model_path,
  )

  # Load word2vec.
  from gensim.models import KeyedVectors
  word2vec = KeyedVectors.load_word2vec_format(model_path, binary=True)

  # Set of words appearing in the train_x / test_x sentences, e.g.
  # word_set = { it, took, kill, rough, power, ... }
  word_set = get_unique_word_set(train_x, test_x)

  # Create the embedding matrix.
  embedding_size = word2vec.vector_size
  embedding_matrix = np.zeros((VOCA_SIZE, embedding_size))

  # Copy the word2vec vector of each known word into its row, e.g.
  #   word, idx: it 12 / took 562 / kill 516 / rough 2683
  #   embedding_matrix[12] = [ 0.05521541 -0.00023065  0.0347889 ... ]
  for word in word_set:
    if word not in word2vec:
      continue
    # decode_review can emit tokens (such as '?') that have no entry in
    # word_index; skip them, and any index that does not fit the matrix,
    # instead of raising KeyError / IndexError.
    idx = word_index.get(word)
    if idx is None or not 0 <= idx < VOCA_SIZE:
      continue
    embedding_matrix[idx] = word2vec[word]

  return embedding_matrix

### embedding_matrix 값 생성

In [None]:
# Build the word2vec-based embedding matrix for the words in the train/test reviews.
embedding_matrix = load_embedding_matrix_from_word2vec(train_x, test_x)

In [None]:
# Matrix shape, the index of the word 'it', and the vector stored at that row.
idx = word_index['it']
print(embedding_matrix.shape, idx, embedding_matrix[idx], sep='\n')

(4000, 300)
12
[ 0.05521541 -0.00023065  0.0347889   0.06510951 -0.09702593 -0.04085302
  0.03686347 -0.04244884  0.04308717  0.01228782 -0.06064121 -0.1525605
 -0.04340634 -0.08425936 -0.13404898  0.01484114  0.04308717  0.03095893
  0.08872766 -0.09128097  0.02250108  0.05330043  0.05968371 -0.08681267
  0.08106772 -0.01484114 -0.03861887 -0.01867111  0.0088967   0.02569272
 -0.04436383  0.06287535 -0.05298126 -0.05298126  0.04915129 -0.02808645
 -0.00666255  0.0031318   0.03319308  0.09319596  0.05553458 -0.02537356
  0.11106916 -0.03367183 -0.06670532  0.05585374 -0.03431015  0.03798055
  0.01938923  0.02345857  0.10213256  0.00428877  0.02018714 -0.02824604
  0.03191642  0.01220803  0.00172548 -0.04659798  0.00504678 -0.00172548
  0.00841796  0.05776873 -0.05585374 -0.01053242 -0.05744956 -0.0223415
  0.00115198  0.00674234 -0.04883213  0.06096037  0.05808789 -0.01364427
  0.04117219 -0.05744956 -0.0437255  -0.00630349  0.08362103  0.09000432
  0.05617291  0.11170749  0.03446974 -

### 학습 Template

In [None]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM

# Same Bidirectional-LSTM classifier, but the embedding layer starts from the
# pretrained word2vec matrix and is kept frozen.
model = Sequential()
# Keras documents `shape` as a tuple; a bare int (`Input(400)`) is not
# accepted by every Keras version. 400 is the padded review length.
model.add(Input(shape=(400,)))
# `input_length` is omitted: the Input layer already fixes the sequence
# length, and newer Keras releases deprecate that argument.
model.add(Embedding(embedding_matrix.shape[0],
                    embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    trainable=False  # keep the pretrained vectors fixed
                    )
          )
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(64)))
model.add(Dense(250))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# 10% of the training data is held out for validation during fitting.
model.fit(train_x, train_y, batch_size=32, epochs=10, validation_split=0.1)

# Evaluation
loss, acc = model.evaluate(test_x, test_y)
print("loss =", loss)
print("acc =", acc)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 300)          1200000   
_________________________________________________________________
batch_normalization (BatchNo (None, 400, 300)          1200      
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               186880    
_________________________________________________________________
dense (Dense)                (None, 250)               32250     
_________________________________________________________________
batch_normalization_1 (Batch (None, 250)               1000      
_________________________________________________________________
activation (Activation)      (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                

## CNN

In [None]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM

# 1D-CNN classifier: Embedding -> BatchNorm -> 2x Conv1D -> global max pool
# -> Dense head.
model = Sequential()
# Keras documents `shape` as a tuple; a bare int (`Input(400)`) is not
# accepted by every Keras version. 400 is the padded review length.
model.add(Input(shape=(400,)))
model.add(Embedding(VOCA_SIZE, EMBEDDING_SIZE))
model.add(BatchNormalization())
# ReLU after each convolution: with no activation the two stacked Conv1D
# layers are purely linear and add no capacity over a single convolution.
model.add(Conv1D(250, 3, padding="same", activation="relu"))
model.add(Conv1D(250, 3, padding="same", activation="relu"))
model.add(GlobalMaxPooling1D())
model.add(Dense(250))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))


model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# 10% of the training data is held out for validation during fitting.
model.fit(train_x, train_y, batch_size=32, epochs=10, validation_split=0.1)

# Evaluation
loss, acc = model.evaluate(test_x, test_y)
print("loss =", loss)
print("acc =", acc)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 400, 64)           256000    
_________________________________________________________________
batch_normalization_2 (Batch (None, 400, 64)           256       
_________________________________________________________________
conv1d (Conv1D)              (None, 400, 250)          48250     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 400, 250)          187750    
_________________________________________________________________
global_max_pooling1d (Global (None, 250)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 250)               62750     
_________________________________________________________________
batch_normalization_3 (Batch (None, 250)              