In [1]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import SimpleRNN, Dense, Input, Embedding
# model = Sequential()
# model.add(Input(shape=(80))) # 입력하는 영화평의 길이를 80으로 제한, 길면 자르고, 짧으면 zero padding
# model.add(Embedding(input_dim=10000, output_dim=32))
# model.add(SimpleRNN(64))
# # model.add(Dense(1, activation='sigmoid'))
# model.add(Dense(2, activation='softmax'))
# model.summary()

In [2]:
# 트랜스포머를 이용한 영화평 분류 모델
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, Model

inputs = layers.Input(shape=(80,))

input_embedding = layers.Embedding(input_dim=10000, output_dim=32)(inputs)
positions = tf.range(0, 80)
pos_encoding = layers.Embedding(input_dim=80, output_dim=32)(positions)
pos_enc_output = pos_encoding + input_embedding

attention_output = layers.MultiHeadAttention(3, 32)(pos_enc_output, pos_enc_output)
x = layers.add([pos_enc_output, attention_output])
x = layers.BatchNormalization()(x)

ffnn = Sequential([layers.Dense(64, activation='relu'),
                   layers.Dense(32, activation='relu')])(x)
x = layers.add([ffnn, x])
x = layers.BatchNormalization()(x)
x = layers.GlobalAveragePooling1D()(x)
# x = layers.Dropout(0.1)(x)

x = layers.Dense(64, activation='relu')(x)
# x = layers.Dropout(0.1)(x)

outputs = layers.Dense(2, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 80)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 80, 32)               320000    ['input_1[0][0]']             
                                                                                                  
 tf.__operators__.add (TFOp  (None, 80, 32)               0         ['embedding[0][0]']           
 Lambda)                                                                                          
                                                                                                  
 multi_head_attention (Mult  (None, 80, 32)               12608     ['tf.__operators__.add[0][

In [3]:
# model.compile(loss='binary_crossentropy', optimizer='adam',
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

In [4]:
from tensorflow.keras.datasets import imdb
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

In [5]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(25000,) (25000,) (25000,) (25000,)


In [6]:
print(X_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [7]:
word_index = imdb.get_word_index()
word_index['the']

1

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train = pad_sequences(X_train, maxlen=80)
X_test = pad_sequences(X_test, maxlen=80)

In [9]:
%%time
model.fit(X_train, y_train, epochs=10, batch_size=200)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [10]:
model.evaluate(X_test, y_test)



[0.5730395913124084, 0.7817999720573425]

In [11]:
print(X_test.shape)

(25000, 80)


In [12]:
import numpy as np
pred = model.predict(X_test)
# pred = (pred > 0.5).astype(int)
pred = np.argmax(pred, axis=1)



In [13]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred))

[[11807   693]
 [ 4762  7738]]


In [14]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.7818

In [15]:
# 옵티마이저를 sgd로 바꿔보세요. accuracy: 0.57784
# 전체 단어의 개수를 1000개로 바꿔보세요. accuracy: 0.72104
# 영화평의 길이를 200개로 바꿔보세요. accuracy: 0.51976, 0.50936
# pad_sequence의 truncating과 padding을 pre로 바꿔보세요. accuracy: 0.81016

In [16]:
# 아래 택스트를 긍/부정 분류하세요.
text = "My God the actors who potrayed the VIP people cannot act. I cringed everytime they said a line. It felt like they were just reading them. Even the intonation was off. It was like when we were kids and had to read a play in class and we exagerated the intonation. Terrible, just awful."

In [17]:
# 텍스트 안에 사용한 10000의 단어집에 없는 단어가 있을 수 있음
# 사용하지 않는 단어가 있을 수 있음

In [18]:
word_index = imdb.get_word_index()
# print(word_index)

In [19]:
word_to_index = {k:(v+3) for k, v in word_index.items()}
# 특별한 문자를 4개 미리 넣어둠 <PAD>, <START>, <UNK>, <UNUSED>
word_to_index["<PAD>"] = 0
word_to_index["<START>"] = 1
word_to_index["<UNK>"] = 2
word_to_index["<UNUSED>"] = 3

In [20]:
def encoding(review_text):
    encoded = []
    for word in review_text:
        try:
            idx = word_to_index[word]
            if idx>10000:
                encoded.append(3)
            else:
                encoded.append(idx)
        except:
            encoded.append(2)
    return encoded

encoding(['the', 'god', 'my'])

[4, 558, 61]

In [21]:
input_text = text.lower().split()
print(input_text)
input_encoded = encoding(input_text)
print(input_encoded)

['my', 'god', 'the', 'actors', 'who', 'potrayed', 'the', 'vip', 'people', 'cannot', 'act.', 'i', 'cringed', 'everytime', 'they', 'said', 'a', 'line.', 'it', 'felt', 'like', 'they', 'were', 'just', 'reading', 'them.', 'even', 'the', 'intonation', 'was', 'off.', 'it', 'was', 'like', 'when', 'we', 'were', 'kids', 'and', 'had', 'to', 'read', 'a', 'play', 'in', 'class', 'and', 'we', 'exagerated', 'the', 'intonation.', 'terrible,', 'just', 'awful.']
[61, 558, 4, 156, 37, 2, 4, 3, 84, 566, 2, 13, 3, 3, 36, 301, 6, 2, 12, 421, 40, 36, 71, 43, 886, 2, 60, 4, 3, 16, 2, 12, 16, 40, 54, 75, 71, 362, 5, 69, 8, 332, 6, 297, 11, 707, 5, 75, 3, 4, 2, 2, 43, 2]


In [22]:
# np.array(input_encoded).reshape(-1, len(input_encoded))
np.array(input_encoded)[np.newaxis, :]

array([[ 61, 558,   4, 156,  37,   2,   4,   3,  84, 566,   2,  13,   3,
          3,  36, 301,   6,   2,  12, 421,  40,  36,  71,  43, 886,   2,
         60,   4,   3,  16,   2,  12,  16,  40,  54,  75,  71, 362,   5,
         69,   8, 332,   6, 297,  11, 707,   5,  75,   3,   4,   2,   2,
         43,   2]])

In [23]:
input_pad = pad_sequences(np.array(input_encoded)[np.newaxis, :], maxlen=80)
print(input_pad)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0  61 558   4 156  37   2   4   3  84 566
    2  13   3   3  36 301   6   2  12 421  40  36  71  43 886   2  60   4
    3  16   2  12  16  40  54  75  71 362   5  69   8 332   6 297  11 707
    5  75   3   4   2   2  43   2]]


In [24]:
pred1 = model.predict(input_pad)
pred1, np.argmax(pred1) #0: 부정, 1: 긍정



(array([[0.60500175, 0.39499822]], dtype=float32), 0)