In [1]:
# 텍스트를 읽고 전처리( 불필요한 부분 삭제)
import numpy as np

with open('1268-0.txt', 'r', encoding='UTF8') as fp:
    text = fp.read()

start_index = text.find('THE MYSTERIOUS ISLAND')
end_index = text.find('END OF THE PROJECT GUTENBERG')

text = text[start_index:end_index]

char_set = set(text)

print("전체 길이", len(text))
print("고유 문자 수", len(char_set))

전체 길이 1112300
고유 문자 수 80


# 새 섹션

In [2]:
# 문자-정수 매핑 인코더 / 디코더 생성
chars_sorted = sorted(char_set)

char2int = {ch:i for i, ch in enumerate(chars_sorted)}

char_array = np.array(chars_sorted)

In [3]:
# 데이터셋 정수로 인코딩
text_encoded = np.array(
    [char2int[ch] for ch in text], dtype = np.int32)

print("인코딩된 텍스트 크기:", text_encoded.shape)

인코딩된 텍스트 크기: (1112300,)


In [4]:
# 데이터셋 인코딩 결과 확인
print(text[: 15], "\t==인코딩==>\t", text_encoded[:15])

THE MYSTERIOUS  	==인코딩==>	 [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]


In [5]:
# 데이터셋 디코딩 테스트
print(text_encoded[15:21], "\t==디코딩==>\t", char_array[text_encoded[15:21]])

[33 43 36 25 38 28] 	==디코딩==>	 ['I' 'S' 'L' 'A' 'N' 'D']


In [6]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # TF warning 메세지 제외
os.environ["CUDA_VISIBLE_DEVICES"] = '0' # 0 번 GRU 선택

import tensorflow as tf

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.compat.v1.Session(config=config)

In [7]:
import tensorflow as tf

ds_text_encoded = tf.data.Dataset.from_tensor_slices(text_encoded)

In [8]:
for ex in ds_text_encoded.take(5):
    print("{}->{}".format(ex.numpy(), char_array[ex.numpy()]))

44->T
32->H
29->E
1-> 
37->M


In [9]:
seq_length = 40
chunk_size = seq_length + 1

# ds_chunks 입력 / 타겟 데이터를 만들기 위하여 길이 41(=배치 41)로 만든 데이터셋
ds_chunks = ds_text_encoded.batch(chunk_size, drop_remainder=True)

# x&y를 나누기 위한 함수 정의
def split_input_target(chunk):
    input_seq = chunk[:-1]
    target_seq = chunk[1:]
    return input_seq, target_seq

# map 메서드를 이용하여 입력/ 타겟 데이터로 나누어진 데이터 셋
ds_sequences = ds_chunks.map(split_input_target)

# 입력 / 타겟 데이터 쌍 2개를 추출하여 출력
for example in ds_sequences.take(2):
    print('입력 (x):', repr(''.join(char_array[example[0].numpy()])))
    print('입력 (y):', repr(''.join(char_array[example[0].numpy()])))
    print()

입력 (x): 'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER'
입력 (y): 'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER'

입력 (x): 'OUS ISLAND\n\nby Jules Verne\n\n1874\n\n\n\n\nPAR'
입력 (y): 'OUS ISLAND\n\nby Jules Verne\n\n1874\n\n\n\n\nPAR'



In [10]:
# 배치생성
BATCH_SIZE = 64
BUFFER_SIZE = 10000

tf.random.set_seed(1)
ds = ds_sequences.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

In [11]:
# 매개변수 설정
vocab_size = len(char_array)
embedding_dim = 256 
rnn_units = 512 

# RNN 모델 생성
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True),
    tf.keras.layers.Dense(vocab_size)
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 256)         20480     
                                                                 
 lstm (LSTM)                 (None, None, 512)         1574912   
                                                                 
 dense (Dense)               (None, None, 80)          41040     
                                                                 
Total params: 1,636,432
Trainable params: 1,636,432
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.compile(optimizer='adam',
              loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

In [13]:
hist = model.fit(ds, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [14]:
tf.random.set_seed(1)

# 로짓 출력 1.0, 1.0, 3.0 이라 가정 (출력유닛 3개)
logits = [[1.0, 1.0, 3.0]]

print("클래스 별 확률 값", tf.math.softmax(logits).numpy()[0])

클래스 별 확률 값 [0.10650698 0.10650698 0.78698605]


In [15]:
# logits 의 각 클래스별 확률 분포로부터 길이 10의 샘플 데이터 생성
samples= tf.random.categorical(logits=logits, num_samples=10)

tf.print(samples.numpy())

array([[2, 2, 0, 2, 2, 2, 2, 2, 1, 2]])


In [16]:
def generate_samples(model, starting_str,
                     len_generated_text=500,
                     max_input_length=40,
                     scale_factor=1.0):
  encoded_input = [char2int[s] for s in starting_str] # encoded_input.shape: (텍스트길이,)
  encoded_input = tf.reshape(encoded_input, (1, -1)) # encoded_input.shape: (1, 텍스트길이)

  generated_str = starting_str

  # 모델의 상태 매개변수 초기화
  model.reset_states()

  for i in range(len_generated_text):
    logits = model(encoded_input) # logits.shape (1, 텍스트길이 ~ max_input_length, 80)
    
    # squeeze 크기가 1인 차원 제거 logits.shape (텍스트길이 ~ max_input_length, 80)
    logits = tf.squeeze(logits, 0)
    
    scaled_logits = logits * scale_factor
    
    # 각 타임스텝에서 1개 글자 생성
    new_char_idx = tf.random.categorical(scaled_logits, num_samples=1)

    # 가장 마지막 글자를 갖고와서 생성할 텍스트에 추가
    new_char_idx = tf.squeeze(new_char_idx)[-1].numpy()

    generated_str += str(char_array[new_char_idx])

    # 현재 출력을 다음 입력으로 넣기위해 준비
    new_char_idx = tf.expand_dims([new_char_idx], 0)
    encoded_input = tf.concat([encoded_input, new_char_idx], axis=1)

    encoded_input = encoded_input[:, -max_input_length:]

  return generated_str

In [17]:
tf.random.set_seed(1)
print(generate_samples(model, starting_str="The island"))

The island was explored them for it was Herbert.

“Yes! captain!” answered Harding, “this is to arout geneme cerrider, would absopph to make their first time he had no
worthy cold. “Take a new wairical blade, and,
listen
the case of the capes cast fire a senterchies of the dutara to the corral was complete. The excetellars in Lake Grant, bedno-glees, is excession wander.

It was evident that his having began to black useful to the intense sea. The vessel was
suspected. They swarmed into shricks, situation


In [19]:
# scale_factor 조정에 따른 확률 값
logits = np.array([[1.0, 1.0, 3.0]])

print('스케일 조정 전의 확률:   ', tf.math.softmax(logits).numpy()[0])
print('0.5배 조정 후 확률:   ', tf.math.softmax(0.5*logits).numpy()[0])
print('0.1배 조정 후 확률:   ', tf.math.softmax(0.1*logits).numpy()[0])

스케일 조정 전의 확률:    [0.10650698 0.10650698 0.78698604]
0.5배 조정 후 확률:    [0.21194156 0.21194156 0.57611688]
0.1배 조정 후 확률:    [0.31042377 0.31042377 0.37915245]


In [20]:
tf.random.set_seed(1)
print(generate_samples(model, starting_str="The island", scale_factor=2.0))

The island was explored the shore, and the most part of the vessel never having turned the obtain.

“The colonists had been the dark glass. The poor productions of the island was as easy to escape the most side, and soon reserved the colony.

It was evidently no result to the approach of the corral. The water from the morning of the mountain.

“And what did you are no marshes was there as it might have been able to find them.”

The colonists had already been sent at the bottom of the mountain, and the isl


In [21]:
tf.random.set_seed(1)
print(generate_samples(model, starting_str="The island", scale_factor=0.5))

The island had egbly place Cyrut 9
gig 1Ha” wound you again,” returned
Herbert-lap,--tho case to roub gunf-mace,
supportohes
visipeh gaz;
unflun-Dourtry,
sin heart; Anxisyen Jup’, sixaptimmed awaid, carble voice at Farr,
ragage
barqey. But Chouse, five feet in thier no sogoous.

He’n,! answered Cyrus Harding an excetelly Ayrtonly, untilet
the
followings,
smenizen glneemiar animare.

Yet have millid. He had calredly
found
gual, ob0,”
 answered. These ungfaltnws were wele
by the rooms, dlazures
no Jutt 9 Ex


In [24]:
class TransformerBlock(tf.keras.layers.Layer): 
  def __init__(self, embed_dim, num_heads, ff_dim):
    super(TransformerBlock, self).__init__()
    self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.ffn = tf.keras.Sequential(
        [tf.keras.layers.Dense(ff_dim, activation='relu'),
         tf.keras.layers.Dense(embed_dim)]
    )

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self, inputs, training):
    attn_output = self.att(inputs, inputs)
    out1 = self.layernorm1(inputs + attn_output)
    ffn_output = self.ffn(out1)

    return self.layernorm2(out1 + ffn_output)

In [25]:
# Pandas 를 포함한 실습에 필요한 모듈 추가가
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

import pandas as pd

In [27]:
# 데이터셋 파일 Pandas 데이터프레임으로 불러오기
df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [28]:
# 상위 5개 데이터 읽어오기
df.head(5)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [29]:
# 1단계 데이터셋 만들기
target = df.pop('sentiment') # sentiment 컬럼의 데이터만 추출(df에서 해당 컬럼 사라짐)

# 텐서플로 데이터셋 객체로 생성,
# 하나의 데이터는 (리뷰 텍스트, sentiment 레이블 값)
ds_raw = tf.data.Dataset.from_tensor_slices((df.values, target.values))

In [30]:
# 확인
for ex in ds_raw.take(5):
    tf.print(ex[0].numpy()[0][ : 50], ex[1])

b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0
b'hi for all the people who have seen this wonderful' 1
b'I recently bought the DVD, forgetting just how muc' 0


In [31]:
tf.random.set_seed(1)

ds_raw = ds_raw.shuffle(50000, reshuffle_each_iteration=False)

ds_raw_test = ds_raw.take(25000)
ds_raw_train_valid = ds_raw.skip(25000)

ds_raw_train = ds_raw_train_valid.take(20000)
ds_raw_valid = ds_raw_train_valid.skip(20000)

In [32]:
# 2단계 : 고유 토큰(단어) 찾기
from collections import Counter

tokenizer = tfds.deprecated.text.Tokenizer()
token_counts = Counter()

max_seq_length = 100 # 추가코드 1: Sequence 슬라이스용 변수

for example in ds_raw_train:
    tokens = tokenizer.tokenize(example[0].numpy()[0])
    tokens = tokens[-max_seq_length : ] # 추가코드 2: 시퀸스의 마지막에서부터 max_seq_length 만큼 슬라이스
    token_counts.update(tokens)

print("어휘 사전 크기:", len(token_counts))

어휘 사전 크기: 58063


In [33]:
# 3단계: 고유 토큰을 정수로 인코딩

encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)

example_str = 'This is an Example!'
print(encoder.encode(example_str))

[374, 209, 104, 26885]


In [34]:
# 인코더에 토큰 사이즈 37p
encoder.vocab_size

58065

In [35]:
# 3-A 단계: 변환을 위한 함수 정의
def encode(text_tensor, label):
    text = text_tensor.numpy()[0] # map() 메서드에서는 즉시 실행 불가
    encoded_text = encoder.encode(text)

    encoded_text = encoded_text[-max_seq_length : ] # 추가코드 3: 시퀸스의 마지막에서부터 max_seq_length 만큼 슬라이스

    return encoded_text, label

In [36]:
# 3-B 단계: 함수를 TF 연산으로 변환
# tf.py_function으로 감싸 map() 메서드가 가능하도록 텐서플로 연산으로 변환
def encode_map_fn(text, label):
    return tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

In [37]:
ds_train = ds_raw_train.map(encode_map_fn)
ds_valid = ds_raw_valid.map(encode_map_fn)
ds_test = ds_raw_test.map(encode_map_fn)

In [38]:
# 샘플의 크기 확인
tf.random.set_seed(1)

for example in ds_train.shuffle(1000).take(5):
    print("시퀀스 길이 :", example[0].shape)

시퀀스 길이 : (24,)
시퀀스 길이 : (100,)
시퀀스 길이 : (100,)
시퀀스 길이 : (100,)
시퀀스 길이 : (100,)


In [39]:
# padded_batch() 테스트를 위한 일부 데이터 추출
ds_subset = ds_train.take(8)

for example in ds_subset:
    print("개별 샘플크기:", example[0].shape)

개별 샘플크기: (100,)
개별 샘플크기: (100,)
개별 샘플크기: (100,)
개별 샘플크기: (100,)
개별 샘플크기: (100,)
개별 샘플크기: (100,)
개별 샘플크기: (100,)
개별 샘플크기: (100,)


In [40]:
# 배치 데이터 만들기
ds_batched = ds_subset.padded_batch(4, padded_shapes=([-1], []))

for batch in ds_batched:
    print("배치 지원:", batch[0].shape)

배치 지원: (4, 100)
배치 지원: (4, 100)


In [41]:
train_data = ds_train.padded_batch(32, padded_shapes=([-1], []))
valid_data = ds_valid.padded_batch(32, padded_shapes=([-1], []))
test_data = ds_test.padded_batch(32, padded_shapes=([-1], []))

In [42]:
embedding_dim = 32 # 임베딩 차원수를 저장하기 위한 변수
vocab_size = len(token_counts) + 2 # 임베딩 레이어 입력 차원 수, 고유 단어 수 + 2

tf.random.set_seed(1)

transformer_block = TransformerBlock(embedding_dim, num_heads=2, ff_dim=32)

inputs = tf.keras.layers.Input(shape=(max_seq_length,))

x = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)
x = transformer_block(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dense(20, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

In [43]:
model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 100, 32)           1858080   
                                                                 
 transformer_block (Transfor  (None, 100, 32)          10656     
 merBlock)                                                       
                                                                 
 global_average_pooling1d (G  (None, 32)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_3 (Dense)             (None, 20)                660       
                                                                 
 dense_4 (Dense)             (None, 1)                 21    

In [44]:
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss = tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [45]:
history = model.fit(train_data, validation_data = valid_data, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [46]:
test_results = model.evaluate(test_data)
print('테스트 정확도: {: .2f}%'.format(test_results[1]*100))

테스트 정확도:  82.60%
