# 16. 순환 신경망으로 순차 데이터 모델링

## 16.1 순차 데이터 소개

## 16.2 시퀀스 모델링을 위한 RNN

### 16.2.3 은닉 순환과 출력 순환

In [2]:
import tensorflow as tf
# Seed TensorFlow's global random generator.
tf.random.set_seed(1)
# SimpleRNN with 2 units that returns the output of every time step.
rnn_layer = tf.keras.layers.SimpleRNN(
    units=2, use_bias=True, return_sequences=True)
# Build for inputs with 5 features; batch size and sequence length stay unspecified.
rnn_layer.build(input_shape=(None, None, 5))
# The layer exposes three weight tensors, unpacked here as the input weights,
# the recurrent weights and the bias.
w_xh, w_oo, b_h = rnn_layer.weights
print('W_xh 크기:', w_xh.shape)
print('W_oo 크기:', w_oo.shape)
print('b_h 크기:', b_h.shape)

W_xh 크기: (5, 2)
W_oo 크기: (2, 2)
b_h 크기: (2,)


In [3]:
# Compare the SimpleRNN layer output with a manual, step-by-step computation.
# x_seq holds 3 time steps with 5 features each.
x_seq = tf.convert_to_tensor([[1.0]*5, [2.0]*5, [3.0]*5], dtype=tf.float32)
## Output of the SimpleRNN layer (input reshaped to 1 batch x 3 steps x 5 features)
output = rnn_layer(tf.reshape(x_seq, shape=(1, 3, 5)))

## Compute the output manually
out_man = []
for t in range(len(x_seq)):
    xt = tf.reshape(x_seq[t], (1, 5))
    print('타임 스텝 {} =>'.format(t))
    print('  입력             :', xt.numpy())

    # Input contribution: x_t @ W_xh + b_h
    ht = tf.matmul(xt, w_xh) + b_h
    print('  은닉             :', ht.numpy())

    # The recurrence feeds back the previous output; zeros at the first step.
    if t > 0:
        prev_o = out_man[t-1]
    else:
        prev_o = tf.zeros(shape=(ht.shape))
    ot = ht + tf.matmul(prev_o, w_oo)
    ot = tf.math.tanh(ot)
    out_man.append(ot)
    print('  출력(수동)       :', ot.numpy())
    # The label has no placeholder, so it is printed as-is (the previous
    # .format(t) call was a no-op).
    print('  SimpleRNN 출력   :', output[0][t].numpy())
    print()

타임 스텝 0 =>
  입력             : [[1. 1. 1. 1. 1.]]
  은닉             : [[-0.69677734 -0.8956299 ]]
  출력(수동)       : [[-0.6023183  -0.71416336]]
  SimpleRNN 출력   : [-0.6023183  -0.71416336]

타임 스텝 1 =>
  입력             : [[2. 2. 2. 2. 2.]]
  은닉             : [[-1.3935547 -1.7912598]]
  출력(수동)       : [[-0.94736236 -0.9896641 ]]
  SimpleRNN 출력   : [-0.94736236 -0.9896641 ]

타임 스텝 2 =>
  입력             : [[3. 3. 3. 3. 3.]]
  은닉             : [[-2.090332  -2.6868896]]
  출력(수동)       : [[-0.9921783  -0.99914104]]
  SimpleRNN 출력   : [-0.9921783  -0.99914104]



## 16.3 텐서플로로 시퀀스 모델링을 위한 RNN 구현

### 16.3.1 첫 번째 프로젝트: IMDb 영화 리뷰의 감성 분석

영화 리뷰 데이터 준비

In [4]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import os
import gzip
import shutil

# Load the movie reviews from a local CSV file into a DataFrame.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# Display the last rows of the DataFrame.
df.tail()

Unnamed: 0,review,sentiment
49995,"OK, lets start with the best. the building. al...",0
49996,The British 'heritage film' industry is out of...,0
49997,I don't even know where to begin on this one. ...,0
49998,Richard Tyler is a little boy who is scared of...,0
49999,I waited long to watch this movie. Also becaus...,1


In [5]:
## Step 1: create the dataset
# pop() removes the 'sentiment' column from df and returns it as the labels.
target = df.pop('sentiment')
# Each dataset element pairs one row of the remaining columns with its label.
ds_raw = tf.data.Dataset.from_tensor_slices((df.values, target.values))

## Check: print the first 50 bytes of the row's first field and the label
for ex in ds_raw.take(3):
    tf.print(ex[0].numpy()[0][:50], ex[1])

b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0


In [6]:
# Set the seed to get consistent results
tf.random.set_seed(1)

# Shuffle with a buffer of 50,000 and keep the same order on every iteration
# (reshuffle_each_iteration=False) so the take/skip splits below do not change.
ds_raw = ds_raw.shuffle(50000, reshuffle_each_iteration=False)
# First 25,000 examples -> test set
ds_raw_test = ds_raw.take(25000)
# Of the remainder: first 20,000 -> training set, everything after -> validation set
ds_raw_train_valid = ds_raw.skip(25000)
ds_raw_train = ds_raw_train_valid.take(20000)
ds_raw_valid = ds_raw_train_valid.skip(20000)

In [7]:
## Step 2: find the unique tokens (words)
from collections import Counter
tokenizer = tfds.deprecated.text.Tokenizer()
token_counts = Counter()
# Count token occurrences over the training split only.
for example in ds_raw_train:
    tokens = tokenizer.tokenize(example[0].numpy()[0])
    token_counts.update(tokens)

# The number of distinct tokens is the vocabulary size.
print('어휘 사전 크기:', len(token_counts))

어휘 사전 크기: 87007


In [8]:
## Step 3: encode the unique tokens as integers
encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)
# Quick check: encode a short example sentence.
example_str = 'This is an example!'
print(encoder.encode(example_str))

[232, 9, 270, 1123]


In [9]:
## Step 3-A: define the function used for the transformation
def encode(text_tensor, label):
    """Encode the first field of ``text_tensor`` with the module-level encoder.

    Returns the encoded text together with the unchanged ``label``.
    """
    return encoder.encode(text_tensor.numpy()[0]), label

## Step 3-B: wrap the function as a TF operation
def encode_map_fn(text, label):
    """Run ``encode`` through ``tf.py_function`` so it can be used with ``Dataset.map``.

    Both outputs (encoded text and label) are declared as int64.
    """
    output_types = (tf.int64, tf.int64)
    return tf.py_function(encode, inp=[text, label], Tout=output_types)

# Apply the integer encoding to each of the three splits.
ds_train = ds_raw_train.map(encode_map_fn)
ds_valid = ds_raw_valid.map(encode_map_fn)
ds_test = ds_raw_test.map(encode_map_fn)

# Check the sequence lengths of a few samples:
tf.random.set_seed(1)
for example in ds_train.shuffle(1000).take(5):
    print('시퀀스 길이', example[0].shape)

시퀀스 길이 (24,)
시퀀스 길이 (179,)
시퀀스 길이 (262,)
시퀀스 길이 (535,)
시퀀스 길이 (130,)


In [10]:
## Take a small subset of the data
ds_subset = ds_train.take(8)
# Print the shape of each encoded sequence in the subset.
for example in ds_subset:
    print('개별 샘플 크기:', example[0].shape)

개별 샘플 크기: (119,)
개별 샘플 크기: (688,)
개별 샘플 크기: (308,)
개별 샘플 크기: (204,)
개별 샘플 크기: (326,)
개별 샘플 크기: (240,)
개별 샘플 크기: (127,)
개별 샘플 크기: (453,)


In [11]:
## Create batches
# Batches of 4; padded_shapes=([-1], []) pads the variable-length sequence
# dimension within each batch and leaves the scalar label unpadded.
ds_batched = ds_subset.padded_batch(
             4, padded_shapes=([-1], []))
for batch in ds_batched:
    print('배치 차원:', batch[0].shape)

배치 차원: (4, 688)
배치 차원: (4, 453)


In [12]:
# Create padded batches of 32 examples for each split.
train_data = ds_train.padded_batch(32, padded_shapes=([-1], []))
valid_data = ds_valid.padded_batch(32, padded_shapes=([-1], []))
test_data = ds_test.padded_batch(32, padded_shapes=([-1], []))

In [14]:
from tensorflow.keras.layers import Embedding
# Minimal model with a single embedding layer: 100 possible integer indices,
# 6-dimensional embedding vectors, input sequences of length 20.
model = tf.keras.Sequential()
model.add(Embedding(input_dim=100, output_dim=6, input_length=20, name='embed-layer'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, 20, 6)             600       
                                                                 
Total params: 600
Trainable params: 600
Non-trainable params: 0
_________________________________________________________________


In [15]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Dense

# Example model: embedding -> two stacked SimpleRNN layers -> one output unit.
model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=32))
# The first recurrent layer returns the full sequence so the next recurrent
# layer receives one input per time step.
model.add(SimpleRNN(32, return_sequences=True))
model.add(SimpleRNN(32))
model.add(Dense(1))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          32000     
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, None, 32)          2080      
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 36,193
Trainable params: 36,193
Non-trainable params: 0
_________________________________________________________________


In [17]:
embedding_dim = 20
# NOTE(review): the +2 presumably accounts for a padding index and an
# out-of-vocabulary bucket used by the encoder — confirm against TokenTextEncoder.
vocab_size = len(token_counts) + 2
tf.random.set_seed(1)

## Build the model
bi_lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name='embed-layer'),
    
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, name='lstm-layer'),
        name='bidir-lstm'),
    
    tf.keras.layers.Dense(64, activation='relu'),
    
    # Sigmoid output, so the loss below is configured with from_logits=False.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

bi_lstm_model.summary()

## Compile and train
bi_lstm_model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'])
history = bi_lstm_model.fit(
    train_data,
    validation_data=valid_data,
    epochs=10)

## Evaluate on the test data
test_results = bi_lstm_model.evaluate(test_data)
# test_results[1] is the accuracy metric requested in compile().
print('테스트 정확도: {:.2f}%'.format(test_results[1]*100))

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, None, 20)          1740180   
                                                                 
 bidir-lstm (Bidirectional)  (None, 128)               43520     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,792,021
Trainable params: 1,792,021
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
테스트 정확도: 84.24%


In [18]:
from collections import Counter
def preprocess_datasets(
    ds_raw_train,
    ds_raw_valid,
    ds_raw_test,
    max_seq_length=None,
    batch_size=32):
    """Tokenize, integer-encode and batch the three raw splits.

    The vocabulary is counted on ``ds_raw_train`` only. When
    ``max_seq_length`` is given, only the trailing ``max_seq_length`` tokens
    of each text are kept, both while counting and while encoding.

    Returns:
        A tuple ``(train_data, valid_data, test_data, n_tokens)`` where the
        first three items are padded-batch datasets and ``n_tokens`` is the
        number of distinct tokens counted on the training split.
    """

    def _keep_tail(sequence):
        # Keep only the last max_seq_length items when a limit is configured.
        if max_seq_length is None:
            return sequence
        return sequence[-max_seq_length:]

    ## Step 1: (dataset creation is already done)
    ## Step 2: find the unique tokens
    word_tokenizer = tfds.deprecated.text.Tokenizer()
    vocab_counter = Counter()
    for raw_example in ds_raw_train:
        review_text = raw_example[0].numpy()[0]
        vocab_counter.update(_keep_tail(word_tokenizer.tokenize(review_text)))

    n_tokens = len(vocab_counter)
    print('어휘 사전 크기:', n_tokens)

    ## Step 3: encode the text
    text_encoder = tfds.deprecated.text.TokenTextEncoder(vocab_counter)

    def encode(text_tensor, label):
        token_ids = text_encoder.encode(text_tensor.numpy()[0])
        return _keep_tail(token_ids), label

    def encode_map_fn(text, label):
        return tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

    ## Step 4: create the batches
    def _encode_and_batch(raw_split):
        encoded_split = raw_split.map(encode_map_fn)
        return encoded_split.padded_batch(batch_size, padded_shapes=([-1], []))

    return (
        _encode_and_batch(ds_raw_train),
        _encode_and_batch(ds_raw_valid),
        _encode_and_batch(ds_raw_test),
        n_tokens,
    )

In [19]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU

def build_rnn_model(embedding_dim, vocab_size, recurrent_type='SimpleRNN', n_recurrent_units=64,
                    n_recurrent_layers=1, bidirectional=True):
    """Build a binary classifier: embedding -> recurrent layer(s) -> dense head.

    Args:
        embedding_dim: Output size of the embedding layer.
        vocab_size: Number of integer token ids (the embedding's input_dim).
        recurrent_type: One of 'SimpleRNN', 'LSTM' or 'GRU'.
        n_recurrent_units: Number of units in every recurrent layer.
        n_recurrent_layers: Number of stacked recurrent layers.
        bidirectional: When true, wrap every recurrent layer in Bidirectional.

    Returns:
        An uncompiled tf.keras.Sequential model ending in one sigmoid unit.

    Raises:
        ValueError: If ``recurrent_type`` is not a supported name.
    """
    # Validate up front: an unknown type used to fall through the if/elif
    # chain and fail later with an unbound-variable error.
    supported_types = ('SimpleRNN', 'LSTM', 'GRU')
    if recurrent_type not in supported_types:
        raise ValueError(
            'Unsupported recurrent_type {!r}; expected one of {}'.format(
                recurrent_type, supported_types))

    tf.random.set_seed(1)

    # Create the model
    model = tf.keras.Sequential()

    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='embed-layer'))

    for i in range(n_recurrent_layers):
        # Every recurrent layer except the last returns the full sequence so
        # that the following recurrent layer gets one input per time step.
        return_sequences = (i < n_recurrent_layers - 1)

        if recurrent_type == 'SimpleRNN':
            recurrent_layer = SimpleRNN(
                units=n_recurrent_units, return_sequences=return_sequences,
                name='simplernn-layer-{}'.format(i))
        elif recurrent_type == 'LSTM':
            recurrent_layer = LSTM(
                units=n_recurrent_units, return_sequences=return_sequences,
                name='lstm-layer-{}'.format(i))
        else:  # 'GRU' (guaranteed by the validation above)
            recurrent_layer = GRU(
                units=n_recurrent_units, return_sequences=return_sequences,
                name='gru-layer-{}'.format(i))

        if bidirectional:
            recurrent_layer = Bidirectional(
                recurrent_layer, name='bidir-' + recurrent_layer.name)

        model.add(recurrent_layer)

    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    return model

In [20]:
# Preprocess the splits again, this time keeping only the last
# max_seq_length tokens of every review.
batch_size = 32
embedding_dim = 20
max_seq_length = 100
train_data, valid_data, test_data, n = preprocess_datasets(
    ds_raw_train, ds_raw_valid, ds_raw_test, max_seq_length=max_seq_length, batch_size=batch_size)

# n is the number of distinct tokens returned by preprocess_datasets.
# NOTE(review): the +2 presumably covers a padding index and an
# out-of-vocabulary bucket — confirm against the encoder.
vocab_size = n + 2
rnn_model = build_rnn_model(embedding_dim, vocab_size, recurrent_type='SimpleRNN',
                            n_recurrent_units=64, n_recurrent_layers=1, bidirectional=True)
rnn_model.summary()

어휘 사전 크기: 58063
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, None, 20)          1161300   
                                                                 
 bidir-simplernn-layer-0 (Bi  (None, 128)              10880     
 directional)                                                    
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,180,501
Trainable params: 1,180,501
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Compile with binary cross-entropy on probabilities (from_logits=False)
# and track accuracy.
rnn_model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(
        from_logits=False), metrics=['accuracy'])

# Train for 10 epochs, validating on valid_data after each epoch.
history = rnn_model.fit(
    train_data, validation_data=valid_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
# Evaluate on the test data; results[1] is the accuracy metric from compile().
results = rnn_model.evaluate(test_data)
print('테스트 정확도: {:.2f}%'.format(results[1]*100))

테스트 정확도: 76.10%


In [23]:
import numpy as np
import pandas as pd
import keras
import tensorflow as tf
from IPython.display import display
import PIL
from tensorflow.python.client import device_lib

# Print the devices returned by TensorFlow's device_lib.
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 7899578670518958593
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5769199616
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6458989066248694261
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3060 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


### 16.3.2 두 번째 프로젝트: 텐서플로로 글자 단위 언어 모델 구현

In [5]:
import numpy as np
## Read and preprocess the text
with open('1268-0.txt', 'r', encoding='UTF8') as fp:
    text=fp.read()

# Keep only the span from the title marker up to the Project Gutenberg end marker.
# NOTE(review): str.find returns -1 when a marker is missing, which would
# silently produce a wrong slice — confirm the file contains both markers.
start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')
text = text[start_indx:end_indx]
# The set of distinct characters in the text.
char_set = set(text)
print('전체 길이:', len(text))
print('고유한 문자:', len(char_set))

전체 길이: 1112350
고유한 문자: 80


In [7]:
# Sort the characters so the character <-> integer mapping is deterministic.
chars_sorted = sorted(char_set)
# char2int maps a character to its index; char_array maps an index back to
# the character.
char2int = {ch:i for i,ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)

# Encode the whole text as an int32 array of character indices.
text_encoded = np.array([char2int[ch] for ch in text], dtype=np.int32)
print('인코딩된 텍스트 크기:', text_encoded.shape)
# Show an encoding example and a decoding example.
print(text[:15], '    == 인코딩 ==>', text_encoded[:15])
print(text_encoded[15:21], '    == 디코딩 ==>', ''.join(char_array[text_encoded[15:21]]))

인코딩된 텍스트 크기: (1112350,)
THE MYSTERIOUS      == 인코딩 ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28]     == 디코딩 ==> ISLAND


In [9]:
import tensorflow as tf
# Dataset whose elements are the individual encoded characters.
ds_text_encoded = tf.data.Dataset.from_tensor_slices(text_encoded)
# Print the first five elements together with their decoded characters.
for ex in ds_text_encoded.take(5):
    print('{} -> {}'.format(ex.numpy(), char_array[ex.numpy()]))

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [11]:
# Each chunk holds seq_length + 1 characters: the extra character lets the
# target be the input shifted by one position.
seq_length = 40
chunk_size = seq_length + 1
# drop_remainder=True discards a final chunk shorter than chunk_size.
ds_chunks = ds_text_encoded.batch(chunk_size, drop_remainder=True)

## Define the function that splits a chunk into x and y.
def split_input_target(chunk):
    """Return ``(input_seq, target_seq)`` for one chunk.

    The input is the chunk without its last element and the target is the
    chunk without its first element, i.e. the input shifted by one.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
ds_sequences = ds_chunks.map(split_input_target)

In [12]:
# Decode and print the input and target of the first two pairs.
for example in ds_sequences.take(2):
    print('입력 (x):', repr(''.join(char_array[example[0].numpy()])))
    print('타깃 (y):', repr(''.join(char_array[example[1].numpy()])))
    print()

입력 (x): 'THE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced b'
타깃 (y): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by'

입력 (x): ' Anthony Matonak, and Trevor Carlson\n\n\n\n'
타깃 (y): 'Anthony Matonak, and Trevor Carlson\n\n\n\n\n'



In [13]:
# Shuffle with a buffer of BUFFER_SIZE elements, then group into batches of
# BATCH_SIZE sequences.
BATCH_SIZE = 64
BUFFER_SIZE = 10000
ds = ds_sequences.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

In [14]:
def build_model(vocab_size, embedding_dim, rnn_units):
    """Build the character-level model: Embedding -> LSTM -> Dense.

    The LSTM returns its output at every time step and the final Dense layer
    has ``vocab_size`` units with no activation.
    """
    embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    recurrent = tf.keras.layers.LSTM(rnn_units, return_sequences=True)
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

## Set the parameters
# The vocabulary is the set of distinct characters.
charset_size = len(char_array)
embedding_dim = 256
rnn_units = 512
tf.random.set_seed(1)
model = build_model(
    vocab_size=charset_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 256)         20480     
                                                                 
 lstm (LSTM)                 (None, None, 512)         1574912   
                                                                 
 dense (Dense)               (None, None, 80)          41040     
                                                                 
Total params: 1,636,432
Trainable params: 1,636,432
Non-trainable params: 0
_________________________________________________________________


In [16]:
# The loss is configured with from_logits=True because the model's last
# Dense layer has no activation.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
# Train for 20 epochs on the batched sequence dataset.
model.fit(ds, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x23e86836610>

In [17]:
tf.random.set_seed(1)
# Three equal logits: softmax gives each class the same probability.
logits = [[1.0, 1.0, 1.0]]
print('확률:', tf.math.softmax(logits).numpy()[0])

확률: [0.33333334 0.33333334 0.33333334]


In [20]:
# Draw 10 class indices from the distribution defined by the logits.
samples = tf.random.categorical(
    logits=logits, num_samples=10)
tf.print(samples.numpy())

array([[1, 2, 0, 1, 0, 1, 1, 2, 1, 1]], dtype=int64)


In [21]:
tf.random.set_seed(1)
# The third logit is larger, so softmax gives it the largest probability.
logits = [[1.0, 1.0, 3.0]]
print('확률: ', tf.math.softmax(logits).numpy()[0])

확률:  [0.10650698 0.10650698 0.78698605]


In [22]:
# Draw 10 class indices from the skewed distribution.
samples = tf.random.categorical(
    logits=logits, num_samples=10)
tf.print(samples.numpy())

array([[2, 2, 0, 2, 2, 2, 2, 2, 1, 2]], dtype=int64)


In [23]:
def sample(model, starting_str, len_generated_text=500, max_input_length=40, scale_factor=1.0):
    """Generate text one character at a time by sampling from the model.

    Uses the module-level ``char2int`` and ``char_array`` lookups.

    Args:
        model: Model called on a (1, time_steps) tensor of character indices
            that returns logits for every time step.
        starting_str: Non-empty seed text; every character must be a key of
            ``char2int``.
        len_generated_text: Number of characters to generate.
        max_input_length: Maximum number of trailing characters fed back to
            the model at each step.
        scale_factor: Multiplier applied to the logits before sampling.

    Returns:
        ``starting_str`` followed by the generated characters.

    Raises:
        ValueError: If ``starting_str`` is empty.
        KeyError: If ``starting_str`` contains a character missing from
            ``char2int``.
    """
    if not starting_str:
        raise ValueError('starting_str must contain at least one character')

    encoded_input = [char2int[s] for s in starting_str]
    encoded_input = tf.reshape(encoded_input, (1, -1))

    # Collect the pieces and join once at the end instead of growing a string.
    generated_parts = [starting_str]

    model.reset_states()
    for _ in range(len_generated_text):
        logits = model(encoded_input)
        # Drop the batch dimension: (1, time_steps, vocab) -> (time_steps, vocab)
        logits = tf.squeeze(logits, 0)

        scaled_logits = logits * scale_factor
        # One sample per time step: shape (time_steps, 1)
        new_char_indx = tf.random.categorical(
            scaled_logits, num_samples=1)

        # Take the sample of the last time step by explicit indexing. A bare
        # tf.squeeze would collapse a (1, 1) result to a scalar and make the
        # [-1] lookup fail whenever only one time step is fed to the model.
        new_char_indx = new_char_indx[-1, 0].numpy()

        generated_parts.append(str(char_array[new_char_indx]))

        # Append the new character and keep only the trailing window.
        new_char_indx = tf.expand_dims([new_char_indx], 0)
        encoded_input = tf.concat([encoded_input, new_char_indx], axis=1)
        encoded_input = encoded_input[:, -max_input_length:]

    return ''.join(generated_parts)

In [24]:
tf.random.set_seed(1)
# Generate text from the seed 'The island' with the default scale_factor (1.0).
print(sample(model, starting_str='The island'))

The island was extended on the reporter.

“Spored up, a ciff making a minute the promontory, the rope had more trade, enther at the state of the convicts
shore, Tabor Island indeed,” answered Herbert.

“No, captainly,” replied Neb, “you did not sufficient soil to establish ourselves of the lake at the five or freeze, let us we likely to conden of
elects with
its splude
which is
extremely pieces.

Of abyssible daily fever did not more any emergency. The cart was a cold with the rooms, although
not walking 


In [25]:
# Show how multiplying the logits by a factor below 1 flattens the softmax
# probabilities toward uniform.
logits = np.array([[1.0, 1.0, 3.0]])
print('스케일 조정 전의 확률:', tf.math.softmax(logits).numpy()[0])
print('0.5배 조정 후 확률:', tf.math.softmax(0.5*logits).numpy()[0])
print('0.1배 조정 후 확률:', tf.math.softmax(0.1*logits).numpy()[0])

스케일 조정 전의 확률: [0.10650698 0.10650698 0.78698604]
0.5배 조정 후 확률: [0.21194156 0.21194156 0.57611688]
0.1배 조정 후 확률: [0.31042377 0.31042377 0.37915245]


In [26]:
tf.random.set_seed(1)
# Generate text with the logits multiplied by 2.0 before sampling.
print(sample(model, starting_str='The island', scale_factor=2.0))

The island was extended on the road to the mouth of the coast when the mark for two miles from the extreme voice. The engineer was heard, and then, in the corral and an abundance of each other.

It was not to be felt. The shade
of Prospect Heights and the sea, the sea between the two country he could distinguish a double bain almost more brought be suitable.

“And yet Ayrton! I will be able to remain standing in a beautiful trees. The engineer was very darked as the terrible danger in the right bank of th


In [27]:
tf.random.set_seed(1)
# Generate text with the logits multiplied by 0.5 before sampling.
print(sample(model, starting_str='The island', scale_factor=0.5))

The island was egbleer fox togetray? go Heave, Herbert! ciff master to
me, Harding,--“To casctobary?” added; cerrud to wohes
abonavh tak;
the mosn, open,
sin heat, evaxisyen ocerrasieas, mingodesion, projeeved
up,
listeng again
barquiticy of
ocions delb.
 Top, this inglete. Hadd Herb,! Then,
for, indedicalent free! tullor 23 molmidutes; cother-glee-?”
said Peninsuland-Afficatily over-tinking my lib. He had cast
illyful-promolt of the ansle. He further. Lalts
spreaked, with us. Towarded Falrs?
To Juty 9uri
