In [31]:
import os
os.getcwd()
os.chdir("/content/drive/My Drive/Colab Notebooks")

In [32]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Conv1D, GlobalMaxPooling1D, Flatten, Concatenate, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [33]:
imdb_data = pd.read_csv("IMDB Dataset.csv")
print(imdb_data.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [37]:
# y값 지정 : pos = 1, neg = 0으로 변형
imdb_data['sentiment'] = imdb_data['sentiment'].replace('positive', 1)
imdb_data['sentiment'] = imdb_data['sentiment'].replace('negative', 0)

# 전처리
## 1) 단어 아니면 삭제
imdb_data['review'] = imdb_data['review'].str.replace('[^\w]|br', ' ') 
## 2) 공백만 있는 경우 null array로 변환 후 제거
imdb_data['review'] = imdb_data['review'].replace('', np.nan)
imdb_data['sentiment'] = imdb_data['sentiment'].replace('', np.nan)
imdb_data = imdb_data.dropna(how='any', axis=0)

# train, test 분리
review_train_full, review_test, y_train_full, y_test = train_test_split(imdb_data['review'], imdb_data['sentiment'], shuffle=False, random_state=34)
review_train, review_valid, y_train, y_valid = train_test_split(review_train_full, y_train_full, shuffle=False, random_state=34)

# 토큰화
stopwords = ['a', 'an', 'the']

X_train = []
for stc in review_train:
    token = []
    words = stc.split()
    for word in words:
        if word not in stopwords:
            token.append(word)
    X_train.append(token)

X_valid = []
for stc in review_valid:
    token = []
    words = stc.split()
    for word in words:
        if word not in stopwords:
            token.append(word)
    X_valid.append(token)

X_test = []
for stc in review_test:
    token = []
    words = stc.split()
    for word in words:
        if word not in stopwords:
            token.append(word)
    X_test.append(token)

tokenizer = Tokenizer(5000) # 단어 수
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_valid = tokenizer.texts_to_sequences(X_valid)
X_test = tokenizer.texts_to_sequences(X_test)

# 패딩(각 문장별로 가지고 있는 단어 개수를 맞춤)
X_train = pad_sequences(X_train, maxlen=200)
X_valid = pad_sequences(X_valid, maxlen=200)
X_test = pad_sequences(X_test, maxlen=200)

In [43]:
# 모델링(함수형)

input_ = Input(shape=X_train.shape[1:]) # (200, 5000)
emb_vec = Embedding(5000, 256)(input_)  # (200, 5000) -> (200, 256)
conv = Conv1D(256, 3, padding='valid', activation='relu')(emb_vec) # (200, 256) -> (198, 1, 256) # filters=256개, kernel size=3
pool = GlobalMaxPooling1D()(conv) # (1, 256)
output_ = Dense(1, activation='sigmoid')(pool)

model = Model(input_, output_)
model.summary()

Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        [(None, 200)]             0         
_________________________________________________________________
embedding_14 (Embedding)     (None, 200, 256)          1280000   
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 198, 256)          196864    
_________________________________________________________________
global_max_pooling1d_22 (Glo (None, 256)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 257       
Total params: 1,477,121
Trainable params: 1,477,121
Non-trainable params: 0
_________________________________________________________________


In [39]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
es = EarlyStopping(patience=5)
model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid), callbacks=[es])
print(model.evaluate(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
[0.4213278889656067, 0.8954399824142456]


---

In [40]:
# 모델링(서브클래스형)

class CNN(Model):
    
    def __init__(self, activation='relu', **kwargs):
        super().__init__(**kwargs)
        
        self.emb_vec = Embedding(5000, 256)
        self.conv = Conv1D(256, 3, padding='valid', activation=activation)
        self.pool = GlobalMaxPooling1D()
        self.output_ = Dense(1, activation='sigmoid')
        
    def call(self, input_):
        emb_vec = self.emb_vec(input_)
        conv = self.conv(emb_vec)
        pool = self.pool(conv)
        output_ = self.output_(pool)
        
        return output_

model = CNN()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
es = EarlyStopping(patience=5)
model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid), callbacks=[es])
print(model.evaluate(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
[0.42242690920829773, 0.8981599807739258]


---

In [45]:
# 앙상블 모델 생성-------------------------------------------
# CNN 모델 : Embedding → Conv → Pooling → Flatten → Dense
# 함수형 API 사용

# 함수형 케라스 -- 복잡한 모델을 구현할 때 사용
inputs = Input(shape=(200, ))
embed = Embedding(5000, 256)(inputs)


# 모델 합성
concat_layers = []

conv = Conv1D(256, 3, padding='valid', activation='relu')(embed)
pool = GlobalMaxPooling1D()(conv)
flat = Flatten()(pool)
concat_layers.append(flat)

conv = Conv1D(256, 4, padding='valid', activation='relu')(embed)
pool = GlobalMaxPooling1D()(conv)
flat = Flatten()(pool)
concat_layers.append(flat)

conv = Conv1D(256, 5, padding='valid', activation='relu')(embed)
pool = GlobalMaxPooling1D()(conv)
flat = Flatten()(pool)
concat_layers.append(flat)

####
concat = Concatenate()(concat_layers)
outputs = Dense(1, activation='sigmoid')(concat)
model = Model(inputs, outputs)

model.summary()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_15 (Embedding)        (None, 200, 256)     1280000     input_13[0][0]                   
__________________________________________________________________________________________________
conv1d_23 (Conv1D)              (None, 198, 256)     196864      embedding_15[0][0]               
__________________________________________________________________________________________________
conv1d_24 (Conv1D)              (None, 197, 256)     262400      embedding_15[0][0]               
____________________________________________________________________________________________

In [46]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
es = EarlyStopping(patience=5)
model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid), callbacks=[es])
print(model.evaluate(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
[0.46563926339149475, 0.9029600024223328]
