In [1]:
# Pandas 를 포함한 실습에 필요한 모듈 추가가
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 데이터셋 파일 Pandas 데이터프레임으로 불러오기
df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [3]:
# 상위 5개 데이터 읽어오기
df.head(5)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [4]:
# 1단계 데이터셋 만들기
target = df.pop('sentiment') # sentiment 컬럼의 데이터만 추출(df에서 해당 컬럼 사라짐)

# 텐서플로 데이터셋 객체로 생성,
# 하나의 데이터는 (리뷰 텍스트, sentiment 레이블 값)
ds_raw = tf.data.Dataset.from_tensor_slices((df.values, target.values))

In [5]:
# 확인
for ex in ds_raw.take(5):
    tf.print(ex[0].numpy()[0][ : 50], ex[1])

b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0
b'hi for all the people who have seen this wonderful' 1
b'I recently bought the DVD, forgetting just how muc' 0


In [6]:
tf.random.set_seed(1)

ds_raw = ds_raw.shuffle(50000, reshuffle_each_iteration=False)

ds_raw_test = ds_raw.take(25000)
ds_raw_train_valid = ds_raw.skip(25000)

ds_raw_train = ds_raw_train_valid.take(20000)
ds_raw_valid = ds_raw_train_valid.skip(20000)

In [7]:
# 2단계 : 고유 토큰(단어) 찾기
from collections import Counter

tokenizer = tfds.deprecated.text.Tokenizer()
token_counts = Counter()

max_seq_length = 100 # 추가코드 1: Sequence 슬라이스용 변수

for example in ds_raw_train:
    tokens = tokenizer.tokenize(example[0].numpy()[0])
    tokens = tokens[-max_seq_length : ] # 추가코드 2: 시퀸스의 마지막에서부터 max_seq_length 만큼 슬라이스
    token_counts.update(tokens)

print("어휘 사전 크기:", len(token_counts))

어휘 사전 크기: 58063


In [8]:
# 3단계: 고유 토큰을 정수로 인코딩

encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)

example_str = 'This is an Example!'
print(encoder.encode(example_str))

[374, 209, 104, 26885]


In [9]:
# 인코더에 토큰 사이즈 37p
encoder.vocab_size

58065

In [10]:
# 3-A 단계: 변환을 위한 함수 정의
def encode(text_tensor, label):
    text = text_tensor.numpy()[0] # map() 메서드에서는 즉시 실행 불가
    encoded_text = encoder.encode(text)

    encoded_text = encoded_text[-max_seq_length : ] # 추가코드 3: 시퀸스의 마지막에서부터 max_seq_length 만큼 슬라이스

    return encoded_text, label

In [11]:
# 3-B 단계: 함수를 TF 연산으로 변환
# tf.py_function으로 감싸 map() 메서드가 가능하도록 텐서플로 연산으로 변환
def encode_map_fn(text, label):
    return tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

In [12]:
ds_train = ds_raw_train.map(encode_map_fn)
ds_valid = ds_raw_valid.map(encode_map_fn)
ds_test = ds_raw_test.map(encode_map_fn)

In [13]:
# 샘플의 크기 확인
tf.random.set_seed(1)

for example in ds_train.shuffle(1000).take(5):
    print("시퀀스 길이 :", example[0].shape)

시퀀스 길이 : (24,)
시퀀스 길이 : (100,)
시퀀스 길이 : (100,)
시퀀스 길이 : (100,)
시퀀스 길이 : (100,)


In [14]:
# padded_batch() 테스트를 위한 일부 데이터 추출
ds_subset = ds_train.take(8)

for example in ds_subset:
    print("개별 샘플크기:", example[0].shape)

개별 샘플크기: (100,)
개별 샘플크기: (100,)
개별 샘플크기: (100,)
개별 샘플크기: (100,)
개별 샘플크기: (100,)
개별 샘플크기: (100,)
개별 샘플크기: (100,)
개별 샘플크기: (100,)


In [15]:
# 배치 데이터 만들기
ds_batched = ds_subset.padded_batch(4, padded_shapes=([-1], []))

for batch in ds_batched:
    print("배치 지원:", batch[0].shape)

배치 지원: (4, 100)
배치 지원: (4, 100)


In [16]:
train_data = ds_train.padded_batch(32, padded_shapes=([-1], []))
valid_data = ds_valid.padded_batch(32, padded_shapes=([-1], []))
test_data = ds_test.padded_batch(32, padded_shapes=([-1], []))

In [17]:
embedding_dim = 20 # 임베딩 차원수를 저장하기 위한 변수
vocab_size = len(token_counts) + 2 # 임베딩 레이어 입력 차원 수, 고유 단어 수+2

tf.random.set_seed(1)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(      # 임베딩 레이어
        input_dim = vocab_size,
        output_dim = embedding_dim,
        name = 'embbed-layer'),

    tf.keras.layers.SimpleRNN(
        units = 64,                 # RNN 레이어
        return_sequences = True,
        name = 'simple_rnn_1'),

    tf.keras.layers.Dense(64, activation='relu'),   # 완전연결층 FC64
    tf.keras.layers.Dense(1, activation='sigmoid'), # 완전연결층 FC 1-출력층
])

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embbed-layer (Embedding)     (None, None, 20)          1161300   
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, None, 64)          5440      
_________________________________________________________________
dense (Dense)                (None, None, 64)          4160      
_________________________________________________________________
dense_1 (Dense)              (None, None, 1)           65        
Total params: 1,170,965
Trainable params: 1,170,965
Non-trainable params: 0
_________________________________________________________________


In [19]:
model.compile(optimizer = tf.keras.optimizers.Adam(1e-3),
                loss = tf.keras.losses.BinaryCrossentropy(from_logits=False),
                metrics = ['accuracy'])

In [21]:
history = model.fit(train_data, validation_data=valid_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
test_results = model.evaluate(test_data)



In [24]:
print('테스트 정확도 : {:.2f}%'.format(test_results[1]*100))

테스트 정확도 : 71.50%


In [26]:
bidirect_rnn_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(      # 임베딩 레이어
        input_dim = vocab_size,
        output_dim = embedding_dim,
        name = 'embbed-layer'),

    # tf.keras.layers.SimpleRNN(    # RNN 레이어
    #     units = 64,                 
    #     return_sequences = True,
    #     name = 'simple_rnn_1'),

    tf.keras.layers.Bidirectional(  # RNN 레이어를 양방향 레이어로
        tf.keras.layers.SimpleRNN(
            units = 64,
            return_sequences = True,
            name='simple_rnn_1'),
        name = 'bidirect-simple-rnn'),

    tf.keras.layers.Dense(64, activation = 'relu'), # 완전연결층 FC64

    tf.keras.layers.Dense(1, activation = 'sigmoid') # 완전연결층 FC 1-출력층
])

In [27]:
bidirect_rnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embbed-layer (Embedding)     (None, None, 20)          1161300   
_________________________________________________________________
bidirect-simple-rnn (Bidirec (None, None, 128)         10880     
_________________________________________________________________
dense_2 (Dense)              (None, None, 64)          8256      
_________________________________________________________________
dense_3 (Dense)              (None, None, 1)           65        
Total params: 1,180,501
Trainable params: 1,180,501
Non-trainable params: 0
_________________________________________________________________


In [29]:
bidirect_rnn_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False),
metrics=['accuracy'])

In [30]:
history = bidirect_rnn_model.fit(train_data, validation_data = valid_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
test_results = bidirect_rnn_model.evaluate(test_data)
print('테스트 정확도: {:.2f}%'.format(test_results[1]*100))

테스트 정확도: 77.62%


In [32]:
lstm_rnn_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(          # 임베딩 레이어
        input_dim = vocab_size,
        output_dim = embedding_dim,
        name = 'embbed-layer'),

    tf.keras.layers.LSTM(               # LSTM 레이어
        units = 64,
        return_sequences = True,
        name = 'lstm-layer_1'),

    tf.keras.layers.Dense(64, activation = 'relu'), # 완전연결층 FC64

    tf.keras.layers.Dense(1, activation = 'sigmoid') # 완전연결층 FC 1-출력층
])

In [33]:
lstm_rnn_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embbed-layer (Embedding)     (None, None, 20)          1161300   
_________________________________________________________________
lstm-layer_1 (LSTM)          (None, None, 64)          21760     
_________________________________________________________________
dense_4 (Dense)              (None, None, 64)          4160      
_________________________________________________________________
dense_5 (Dense)              (None, None, 1)           65        
Total params: 1,187,285
Trainable params: 1,187,285
Non-trainable params: 0
_________________________________________________________________


In [34]:
lstm_rnn_model.compile(optimizer = tf.keras.optimizers.Adam(1e-3),
                        loss = tf.keras.losses.BinaryCrossentropy(from_logits = False),
                        metrics = ['accuracy'])

In [35]:
history = lstm_rnn_model.fit(train_data, validation_data = valid_data, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
test_results = lstm_rnn_model.evaluate(test_data)
print('테스트 정확도:{:.2f}%'.format(test_results[1]*100))

테스트 정확도:69.52%


In [37]:
bi_lstm_rnn_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(      #임베딩 레이어
        input_dim = vocab_size,
        output_dim = embedding_dim,
        name='embbed-layer'),

    tf.keras.layers.Bidirectional(  # Bidirectional LSTM layer
        tf.keras.layers.LSTM(
            units = 64,
            return_sequences = True,
            name='lstm-layer_1')),

    tf.keras.layers.Dense(64, activation = 'relu'), # 완전연결층 FC 64

    tf.keras.layers.Dense(1, activation = 'sigmoid') # 완전연결층 FC 1 - 출력층
])

In [38]:
bi_lstm_rnn_model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embbed-layer (Embedding)     (None, None, 20)          1161300   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         43520     
_________________________________________________________________
dense_6 (Dense)              (None, None, 64)          8256      
_________________________________________________________________
dense_7 (Dense)              (None, None, 1)           65        
Total params: 1,213,141
Trainable params: 1,213,141
Non-trainable params: 0
_________________________________________________________________


In [39]:
bi_lstm_rnn_model.compile(optimizer = tf.keras.optimizers.Adam(1e-3),
                            loss = tf.keras.losses.BinaryCrossentropy(from_logits=False),
                            metrics = ['accuracy'])

In [40]:
history = bi_lstm_rnn_model.fit(train_data, validation_data=valid_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [42]:
test_results = bi_lstm_rnn_model.evaluate(test_data)
print('테스트 정확도:{:.2f}%'.format(test_results[1]*100))

테스트 정확도:82.05%
