In [1]:
# 토큰화 수동 구현
import string

class Vectorizer:
  def standardize(self, text):
    text = text.lower()
    return "".join(char for char in text if char not in string.punctuation)

  def tokenize(self, text):
    return text.split()

  def make_vocabulary(self, dataset):
    self.vocabulary = {"": 0, "[UNK]" : 1} # 패딩 토큰 0, 사전에 없는 토큰 1
    for text in dataset:
      text = self.standardize(text)
      tokens = self.tokenize(text)
      for token in tokens:
        if token not in self.vocabulary:
          self.vocabulary[token] = len(self.vocabulary)
    self.inverse_vocabulary = dict((v, k) for k, v in self.vocabulary.items())

  def encode(self, text):
    text = self.standardize(text)
    tokens = self.tokenize(text)
    return [self.vocabulary.get(token, 1) for token in tokens]

  def decode(self, int_sequence):
    return " ".join(self.inverse_vocabulary.get(i, "[UNK]") for i in int_sequence)

vectorizer = Vectorizer()
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]
vectorizer.make_vocabulary(dataset)

In [2]:
# 사용
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
print(encoded_sentence)

[2, 3, 5, 7, 1, 5, 6]


In [3]:
decoded_sentence = vectorizer.decode(encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again


### 구현은 비슷하나 더 빠르고 효율적인 TextVectorization 층

In [4]:
from tensorflow.keras.layers import TextVectorization
text_vectorization = TextVectorization(output_mode = "int", )

In [5]:
import re
import string
import tensorflow as tf

def custom_standardization(string_tensor):
  lowercase_string = tf.strings.lower(string_tensor) # 문자열 소문자로 변경
  return tf.strings.regex_replace(
      lowercase_string, f"[{re.escape(string.punctuation)}]", ""
  )

def custom_split(string_tensor):
  return tf.strings.split(string_tensor)

text_vectorization = TextVectorization(
    output_mode = 'int',
    standardize = custom_standardization,
    split = custom_split
)

In [6]:
# 텍스트 말뭉치Corpus의 어휘사전 인덱싱 : Dataset 객체로 adapt() 메서드를 호출하면 된다.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]
text_vectorization.adapt(dataset)

In [7]:
# 계산된 어휘사전 추출하기
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [8]:
# 예시 문장 인코딩과 디코딩
vocabulary = text_vectorization.get_vocabulary()
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

inverse_vocab = dict(enumerate(vocabulary))
decoded_sentence = " ".join(inverse_vocab[int(i)] for i in encoded_sentence)
print(decoded_sentence)

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)
i write rewrite and [UNK] rewrite again


## 데이터 준비

In [9]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
!rm -r aclImdb/train/unsup

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  30.9M      0  0:00:02  0:00:02 --:--:-- 30.9M


In [10]:
# 데이터 확인
!cat aclImdb/train/pos/4077_10.txt

I first saw this back in the early 90s on UK TV, i did like it then but i missed the chance to tape it, many years passed but the film always stuck with me and i lost hope of seeing it TV again, the main thing that stuck with me was the end, the hole castle part really touched me, its easy to watch, has a great story, great music, the list goes on and on, its OK me saying how good it is but everyone will take there own best bits away with them once they have seen it, yes the animation is top notch and beautiful to watch, it does show its age in a very few parts but that has now become part of it beauty, i am so glad it has came out on DVD as it is one of my top 10 films of all time. Buy it or rent it just see it, best viewing is at night alone with drink and food in reach so you don't have to stop the film.<br /><br />Enjoy

In [11]:
# 훈련 텍스트 파일에서 20%를 aclImdb/val로 덜어 검증 세트 생성
import os, pathlib, shutil, random

base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
  os.makedirs(val_dir / category)
  files = os.listdir(train_dir / category)
  random.Random(1337).shuffle(files)

  # 20% 검증 세트
  num_val_samples = int(0.2 * len(files))
  val_files = files[-num_val_samples:]

  # 검증 데이터 옮기기
  for fname in val_files:
    shutil.move(train_dir / category / fname,
                val_dir / category / fname)


In [12]:
# 디렉터리 -> 데이터셋 만들기
from tensorflow import keras
batch_size = 32

train_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/train", batch_size = batch_size
)
val_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/val", batch_size = batch_size
)
test_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size = batch_size
)


Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [13]:
for inputs, targets in train_ds:
  print("inputs.shape : ", inputs.shape)
  print("inputs.dtype : ", inputs.dtype)
  print("targets.shape : ", targets.shape)
  print("targets.dtype : ", targets.dtype)
  print("inputs[0] : ", inputs[0])
  print("targets[0] : ", targets[0])
  break

inputs.shape :  (32,)
inputs.dtype :  <dtype: 'string'>
targets.shape :  (32,)
targets.dtype :  <dtype: 'int32'>
inputs[0] :  tf.Tensor(b'MAJOR SPOILERS!! THIS IS FOR PEOPLE WHO HAVE SEEN THE MOVIE!!<br /><br />Commenters have touched on the major theme of "honor" in the film, and too many comparisons to "Braveheart." I\'ll point out a few things about this movie that I have not seen other comments touch on:<br /><br />This movie has a decidedly different take on abortion. The first character to get pregnant is the villain\'s (Roth) girlfriend, and when he coldly suggests an abortion, she states it is too late for that. The shame of her situation ("I\'m to have a bastard\'s bastard.") leads her to commit suicide in a much later scene. The second character to find herself pregnant is Mary, Rob\'s wife, after a rape by Roth\'s character (and at least one sex scene with her husband, Rob). Late in the movie, as Rob is leaving for a final confrontation with Roth, Mary asks what she should d

### BoW 방식

In [14]:
text_vectorization = TextVectorization(
    max_tokens = 20000,
    output_mode = "multi_hot" # 멀티-핫 이진 벡터로 출력 토큰 인코딩
)

text_only_train_ds = train_ds.map(lambda x, y : x) # 레이블 없는 데이터셋
text_vectorization.adapt(text_only_train_ds) # 데이터셋의 어휘사전 인덱싱

# 데이터셋 전처리 : 다중 CPU 코어 활용
binary_1gram_train_ds = train_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4
)
binary_1gram_val_ds = val_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4
)
binary_1gram_test_ds = test_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4
)

In [15]:
# 데이터셋 출력 확인
for inputs, targets in binary_1gram_train_ds:
  print("inputs.shape : ", inputs.shape)
  print("inputs.dtype : ", inputs.dtype)
  print("targets.shape : ", targets.shape)
  print("targets.dtype : ", targets.dtype)
  print("inputs[0] : ", inputs[0])
  print("targets[0] : ", targets[0])
  break

inputs.shape :  (32, 20000)
inputs.dtype :  <dtype: 'float32'>
targets.shape :  (32,)
targets.dtype :  <dtype: 'int32'>
inputs[0] :  tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0] :  tf.Tensor(0, shape=(), dtype=int32)


In [16]:
# 모델 생성 함수 : 모든 예제에서 쓸 거임
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens = 20000, hidden_dim = 16):
  inputs = keras.Input(shape = (max_tokens, ))
  x = layers.Dense(hidden_dim, activation = 'relu')(inputs)
  x = layers.Dropout(0.5)(x)
  outputs = layers.Dense(1, activation = 'sigmoid')(x)
  model = keras.Model(inputs, outputs)
  model.compile(optimizer = 'rmsprop',
                loss = 'binary_crossentropy',
                metrics = ['accuracy'])
  return model

In [17]:
# 모델 훈련 및 테스트
model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("binary_1gram.keras", save_best_only = True)
]
model.fit(binary_1gram_train_ds.cache(),
          validation_data = binary_1gram_val_ds.cache(),
          epochs = 10,
          callbacks = callbacks)

model = keras.models.load_model('binary_1gram.keras')
print(f"테스트 정확도 : {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
테스트 정확도 : 0.889


### 이진 인코딩 & 바이그램

In [18]:
text_vectorization = TextVectorization(
    ngrams = 2, # 정슈 튜플로도 전달 가능. (2, 3)이면 2~3단어로 구성된 어휘사전을 만든다. 1단어는 제외.
    max_tokens = 20000,
    output_mode = "multi_hot" # 멀티-핫 이진 벡터로 출력 토큰 인코딩
)

text_only_train_ds = train_ds.map(lambda x, y : x) # 레이블 없는 데이터셋
text_vectorization.adapt(text_only_train_ds) # 데이터셋의 어휘사전 인덱싱

# 데이터셋 전처리 : 다중 CPU 코어 활용
binary_2gram_train_ds = train_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4
)
binary_2gram_val_ds = val_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4
)
binary_2gram_test_ds = test_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4
)

model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("binary_2gram.keras", save_best_only = True)
]
model.fit(binary_2gram_train_ds.cache(),
          validation_data = binary_2gram_val_ds.cache(),
          epochs = 10,
          callbacks = callbacks)

model = keras.models.load_model('binary_2gram.keras')
print(f"테스트 정확도 : {model.evaluate(binary_2gram_test_ds)[1]:.3f}")

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
테스트 정확도 : 0.897


In [19]:
text_vectorization =TextVectorization(
									  ngrams = 2,
									  max_tokens = 20000,
									  output_mode = 'tf-idf'
)

text_only_train_ds = train_ds.map(lambda x, y : x) # 레이블 없는 데이터셋
text_vectorization.adapt(text_only_train_ds) # 데이터셋의 어휘사전 인덱싱

# 데이터셋 전처리 : 다중 CPU 코어 활용
tfidf_2gram_train_ds = train_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4
)
tfidf_2gram_val_ds = val_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4
)
tfidf_2gram_test_ds = test_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4
)

model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("tfidf_2gram.keras", save_best_only = True)
]
model.fit(tfidf_2gram_train_ds.cache(),
          validation_data = tfidf_2gram_val_ds.cache(),
          epochs = 10,
          callbacks = callbacks)

model = keras.models.load_model('tfidf_2gram.keras')
print(f"테스트 정확도 : {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}")

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_4 (Dense)             (None, 16)                320016    
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
테스트 정확도 : 0.894


# 시퀀스 모델

## 1번째 예제

In [20]:
from tensorflow.keras import layers

max_length = 600
max_tokens = 20000

# 600개 단어를 넘는 데이터는 5%라서 합리적인 선택이래
text_vectorization = layers.TextVectorization(
    max_tokens = max_tokens, # 600개 단어 이후는 자른다
    output_mode = 'int',
    output_sequence_length = max_length
)


text_vectorization.adapt(text_only_train_ds)

# 데이터셋 전처리 : 다중 CPU 코어 활용
int_train_ds = train_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4
)
int_val_ds = val_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4
)
int_test_ds = test_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4
)

In [21]:
# 모델
import tensorflow as tf

inputs = keras.Input(shape = (None, ), dtype = 'int64') # 정수 시퀀스의 입력
embedded = tf.one_hot(inputs, depth = max_tokens) # 2만 차원 이진 벡터로 인코딩
x = layers.Bidirectional(layers.LSTM(32))(embedded) # 양방향 LSTM 층
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation = 'sigmoid')(x) # 분류층
model = keras.Model(inputs, outputs)
model.compile(optimizer = 'rmsprop',
			 loss = 'binary_crossentropy',
			 metrics = ['accuracy'])
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 tf.one_hot (TFOpLambda)     (None, None, 20000)       0         
                                                                 
 bidirectional (Bidirectiona  (None, 64)               5128448   
 l)                                                              
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5,128,513
Trainable params: 5,128,513
Non-trainable params: 0
_________________________________________________

In [22]:
# 훈련

callbacks =[
			keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras",
											save_best_only = True)
]

model.fit(int_train_ds, validation_data = int_val_ds, epochs = 10, callbacks = callbacks)
model = keras.models.load_model("one_hot_bidir_lstm.keras")
print(f"테스트 정확도 : {model.evaluate(int_test_ds)[1]:.3f}")


Epoch 1/10
 61/625 [=>............................] - ETA: 2:04 - loss: 0.6923 - accuracy: 0.5225

KeyboardInterrupt: ignored

## 단어 임베딩
- 원핫인코딩은 차원이 너무 커지는 데다가, 성능도 이진분류 바이그램보다 떨어짐(속도, 정확도 모두)


In [23]:
embedding_layer = layers.Embedding(input_dim = max_tokens,
								   output_dim = 256)

In [26]:
import tensorflow as tf

inputs = keras.Input(shape = (None, ), dtype = 'int64') # 정수 시퀀스의 입력
embedded = layers.Embedding(input_dim = max_tokens,
								   output_dim = 256)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded) # 양방향 LSTM 층
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation = 'sigmoid')(x) # 분류층
model = keras.Model(inputs, outputs)
model.compile(optimizer = 'rmsprop',
			 loss = 'binary_crossentropy',
			 metrics = ['accuracy'])
model.summary()

callbacks =[
			keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras",
											save_best_only = True)
]

model.fit(int_train_ds, validation_data = int_val_ds, epochs = 10, callbacks = callbacks)
model = keras.models.load_model("one_hot_bidir_lstm.keras")
print(f"테스트 정확도 : {model.evaluate(int_test_ds)[1]:.3f}")

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_3 (Embedding)     (None, None, 256)         5120000   
                                                                 
 bidirectional_3 (Bidirectio  (None, 64)               73984     
 nal)                                                            
                                                                 
 dropout_6 (Dropout)         (None, 64)                0         
                                                                 
 dense_9 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5,194,049
Trainable params: 5,194,049
Non-trainable params: 0
_________________________________________________

### 패딩에 마스킹 적용하기
- 실전에서는 수동으로 마스킹을 관리하지 않아도, 케라스가 자동으로 전달한다.
- 패딩 : 정해진 길이보다 짧은 문장은 0으로 채워짐.
- 마스킹 : 패딩을 RNN의 학습에 적용시키지 않게 함

In [27]:
import tensorflow as tf

inputs = keras.Input(shape = (None, ), dtype = 'int64') # 정수 시퀀스의 입력
embedded = layers.Embedding(input_dim = max_tokens,
								   output_dim = 256,
								   mask_zero = True)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded) # 양방향 LSTM 층
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation = 'sigmoid')(x) # 분류층
model = keras.Model(inputs, outputs)
model.compile(optimizer = 'rmsprop',
			 loss = 'binary_crossentropy',
			 metrics = ['accuracy'])
model.summary()

callbacks =[
			keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras",
											save_best_only = True)
]

model.fit(int_train_ds, validation_data = int_val_ds, epochs = 10, callbacks = callbacks)
model = keras.models.load_model("one_hot_bidir_lstm.keras")
print(f"테스트 정확도 : {model.evaluate(int_test_ds)[1]:.3f}")

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_4 (Embedding)     (None, None, 256)         5120000   
                                                                 
 bidirectional_4 (Bidirectio  (None, 64)               73984     
 nal)                                                            
                                                                 
 dropout_7 (Dropout)         (None, 64)                0         
                                                                 
 dense_10 (Dense)            (None, 1)                 65        
                                                                 
Total params: 5,194,049
Trainable params: 5,194,049
Non-trainable params: 0
_________________________________________________

## 사전 훈련 임베딩(GloVe) 적용하기

In [29]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2023-07-25 07:36:13--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-07-25 07:36:14--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-07-25 07:36:14--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [31]:
# glove 파싱
import numpy as np

path_to_glove_file = "glove.6B.100d.txt"
embeddings_index = {}
with open(path_to_glove_file) as f:
	for line in f:
		word, coefs = line.split(maxsplit = 1)
		coefs = np.fromstring(coefs, "f", sep = " ")
		embeddings_index[word] = coefs

print(f"단어 벡터 갯수 : {len(embeddings_index)}")

단어 벡터 갯수 : 400000


In [32]:
# Embedding 행렬 만들기
embedding_dim = 100

# 인덱싱 단어 추출
voca = text_vectorization.get_vocabulary()

# 어휘 사전의 단어와 인덱스 매핑
word_index = dict(zip(voca, range(len(voca))))

# GloVe 벡터를 담는 행렬 준비
embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():

	if i  < max_tokens:
		embedding_vector = embeddings_index.get(word)

	# 인덱스 i에 대한 단어 벡터로 행렬의 i번째 항목을 채운다
	# 임베딩 인덱스에 단어가 없다면 0
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

# 초기화 : 사전 훈련 임베딩 층에 넣기, 훈련 중 변경 X
embedding_layer = layers.Embedding(
								   max_tokens,
								   embedding_dim,
								   embeddings_initializer= keras.initializers.Constant(embedding_matrix),
								   trainable = False, # 훈련 중 가중치 변경 X
								   mask_zero = True
)

In [33]:
# 모델 훈련
inputs = keras.Input(shape = (None, ), dtype = 'int64') # 정수 시퀀스의 입력
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded) # 양방향 LSTM 층
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation = 'sigmoid')(x) # 분류층
model = keras.Model(inputs, outputs)
model.compile(optimizer = 'rmsprop',
			 loss = 'binary_crossentropy',
			 metrics = ['accuracy'])
model.summary()

callbacks =[
			keras.callbacks.ModelCheckpoint("glove_embeddings_sequence_model.keras",
											save_best_only = True)
]

model.fit(int_train_ds, validation_data = int_val_ds, epochs = 10, callbacks = callbacks)
model = keras.models.load_model("glove_embeddings_sequence_model.keras")
print(f"테스트 정확도 : {model.evaluate(int_test_ds)[1]:.3f}")

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_5 (Embedding)     (None, None, 100)         2000000   
                                                                 
 bidirectional_5 (Bidirectio  (None, 64)               34048     
 nal)                                                            
                                                                 
 dropout_8 (Dropout)         (None, 64)                0         
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 2,034,113
Trainable params: 34,113
Non-trainable params: 2,000,000
____________________________________________

## 트랜스포머 인코더

In [37]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class TransformerEncoder(layers.Layer):
  def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    self.embed_dim = embed_dim # 입력 토큰 벡터 크기
    self.dense_dim = dense_dim # 내부 밀집 층 크기
    self.num_heads = num_heads # 어텐션 헤드 개수
    self.attention = layers.MultiHeadAttention(
        num_heads = num_heads, key_dim = embed_dim
    )
    self.dense_proj = keras.Sequential(
        [
            layers.Dense(dense_dim, activation = 'relu'),
            layers.Dense(embed_dim),
        ]
    )
    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()

  # 연산 수행
  def call(self, inputs, mask = None):

    # Embedding층의 마스크는 2D이나, 어텐션층은 3D나 4D 기대
    if mask is not None:
      mask = mask[:, tf.newaxis, :]

    attention_output = self.attention(inputs, inputs, attention_mask = mask)
    proj_input = self.layernorm_1(inputs + attention_output)
    proj_output = self.dense_proj(proj_input)
    return self.layernorm_2(proj_input + proj_output)

  # 모델 저장을 위한 직렬화 구현
  def get_config(self):
    config = super().get_config()
    config.update({
        'embed_dim' : self.embed_dim,
        'num_heads' : self.num_heads,
        'dense_dim' : self.dense_dim
    })
    return config

In [38]:
voca_size = 20000
embed_dim = 256
num_heads = 2
dense_dim = 32

inputs = keras.Input(shape = (None, ), dtype = 'int64')
x = layers.Embedding(voca_size, embed_dim)(inputs)
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
# 위 인코더는 전체 시퀀스를 반환하므로, 분류를 위해 전역 풀링층으로 각 시퀀스를 1개의 벡터로 만든다.
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation = 'sigmoid')(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_11 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_7 (Embedding)     (None, None, 256)         5120000   
                                                                 
 transformer_encoder (Transf  (None, None, 256)        543776    
 ormerEncoder)                                                   
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout_9 (Dropout)         (None, 256)               0         
                                                                 
 dense_14 (Dense)            (None, 1)                 257 

In [39]:
# 훈련
callbacks = [
			 keras.callbacks.ModelCheckpoint("transformer.encoder.keras",
												 save_best_only = True)
]

model.fit(int_train_ds, validation_data = int_val_ds, epochs = 20, callbacks = callbacks)

# 불러오기 & 추론
model = keras.models.load_model("transformer.encoder.keras",
							   custom_objects = {"TransformerEncoder": TransformerEncoder})
print(f"테스트 정확도 : {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
테스트 정확도 : 0.874


### 위치 임베딩

In [40]:
class PositionalEmbedding(layers.Layer):

  # 위치 임베딩 : 시퀀스 길이를 미리 알아야 한다는 게 단점임
  def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
    super().__init__(**kwargs)
    self.token_embeddings = layers.Embedding(
        input_dim = input_dim, output_dim = output_dim
    )
    self.position_embeddings = layers.Embedding(
        input_dim = sequence_length, output_dim = output_dim
    )
    self.sequence_length = sequence_length
    self.input_dim = input_dim
    self.output_dim = output_dim

  def call(self, inputs):
    length = tf.shape(inputs)[-1]
    positions = tf.range(start = 0, limit = length, delta = 1)
    embedded_tokens = self.token_embeddings(inputs)
    embedded_positions = self.position_embeddings(positions)
    return embedded_tokens + embedded_positions

  # 입력 0 패딩을 무시하는 마스킹 생성.
  # 프레임워크에 의해 자동으로 호출되며, 마스킹은 다음 층으로 전달된다.
  def compute_mask(self, inputs, mask = None):
    return tf.math.not_equal(inputs, 0)

  def get_config(self):
    config = super().get_config()
    config.update({
        "output_dim" : self.output_dim,
        "sequence_length" : self.sequence_length,
        "input_dim" : self.input_dim
    })
    return config


In [42]:
# 트랜스포머 인코더와 위치 임베딩 합치기

voca_size = 20000
sequence_length = 600
embed_dim = 256
num_heads = 2
dense_dim = 32

inputs = keras.Input(shape = (None, ), dtype = 'int64')
x = PositionalEmbedding(sequence_length, voca_size, embed_dim)(inputs) # 위치 임베딩 추가
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation = 'sigmoid')(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.summary()

# 훈련
callbacks = [
			 keras.callbacks.ModelCheckpoint("full_transformer.encoder.keras",
												 save_best_only = True)
]

model.fit(int_train_ds, validation_data = int_val_ds, epochs = 20, callbacks = callbacks)

# 불러오기 & 추론
model = keras.models.load_model("full_transformer.encoder.keras",
							   custom_objects = {"TransformerEncoder": TransformerEncoder,
                            "PositionalEmbedding" : PositionalEmbedding})
print(f"테스트 정확도 : {model.evaluate(int_test_ds)[1]:.3f}")

Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_13 (InputLayer)       [(None, None)]            0         
                                                                 
 positional_embedding_1 (Pos  (None, None, 256)        5273600   
 itionalEmbedding)                                               
                                                                 
 transformer_encoder_2 (Tran  (None, None, 256)        543776    
 sformerEncoder)                                                 
                                                                 
 global_max_pooling1d_2 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_10 (Dropout)        (None, 256)               0         
                                                          

# 시퀀스 투 시퀀스 : 기계 번역 예제

In [43]:
!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
!unzip -q spa-eng.zip

--2023-07-25 09:16:37--  http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.214.128, 172.253.114.128, 172.253.119.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.214.128|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2638744 (2.5M) [application/zip]
Saving to: ‘spa-eng.zip’


2023-07-25 09:16:37 (268 MB/s) - ‘spa-eng.zip’ saved [2638744/2638744]



In [44]:
# 데이터 확인하기
text_file = 'spa-eng/spa.txt'
with open(text_file) as f:
	lines = f.read().split("\n")[:-1]
text_pairs = []
for line in lines:
	english, spanish = line.split("\t")
	spanish = '[start] ' + spanish + ' [end]'
	text_pairs.append((english, spanish))

# text_pairs 내용
import random
print(random.choice(text_pairs))

("You're fired.", '[start] Están despedidas. [end]')


In [45]:
# 데이터 섞은 뒤 훈련/검증/테스트 나누기
import random

random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples : ]

In [46]:
# 2개의 TextVectorizer 층 준비
import tensorflow as tf
import string
import re

strip_chars = string.punctuation + " ¿ "
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
	lower_case = tf.strings.lower(input_string)
	return tf.strings.regex_replace(
	lower_case, f"[{re.escape(strip_chars)}]", "")

voca_size = 15000
sequence_length = 20

# 영어
source_vectorization = layers.TextVectorization(
												max_tokens = voca_size,
												output_mode = 'int',
												output_sequence_length = sequence_length
)

# 스페인어
target_vectorization = layers.TextVectorization(
												max_tokens = voca_size,
												output_mode = 'int',
												output_sequence_length = sequence_length + 1, # 훈련 중 한 스텝 앞선 문장이 필요하기 때문에 토큰 1개 추가
												standardize = custom_standardization
)

train_english_texts = [pair[0] for pair in train_pairs]
train_spanish_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

In [48]:
# 데이터 -> 파이프라인 변환
batch_size = 64

def format_dataset(eng, spa):
	eng = source_vectorization(eng)
	spa = target_vectorization(spa)
	return ({
		'english' : eng,
		'spanish' : spa[:, :-1] # 입력 스페인어 문장은 마지막 토큰 포함 X라서 입출력 길이 동일
	}, spa[:, 1:]) # 타깃 스페인어 문장

def make_dataset(pairs):
	eng_texts, spa_texts = zip(*pairs)
	eng_texts = list(eng_texts)
	spa_texts = list(spa_texts)
	dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
	dataset = dataset.batch(batch_size)
	dataset = dataset.map(format_dataset, num_parallel_calls = 4)
	return dataset.shuffle(2048).prefetch(16).cache() # 전처리 속도 높이기 위한 캐싱

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

# 크기 확인
for inputs, targets in train_ds.take(1):
	print(f"inputs['english'].shape : {inputs['english'].shape}")
	print(f"inputs['spanish'].shape : {inputs['spanish'].shape}")
	print(f"targets.shape : {targets.shape}")

inputs['english'].shape : (64, 20)
inputs['spanish'].shape : (64, 20)
targets.shape : (64, 20)


## RNN seq2seq 모델 만들기

In [51]:
# GRU 기반 모델
from tensorflow import keras
from tensorflow.keras import layers

embed_dim = 256
latent_dim = 1024

# 인코더
source = keras.Input(shape = (None,), dtype = 'int64', name = 'english') # 영어 소스 문장. 이름 지정 시 입력 딕셔너리로 모델을 훈련할 수 있다.
x = layers.Embedding(voca_size, embed_dim, mask_zero = True)(source) # 마스킹 필수!
encoded_source = layers.Bidirectional(layers.GRU(latent_dim), merge_mode = 'sum')(x)

# 디코더
past_target = keras.Input(shape = (None,), dtype = 'int64', name = 'spanish')
x = layers.Embedding(voca_size, embed_dim, mask_zero = True)(past_target)
decoder_gru = layers.GRU(latent_dim ,return_sequences = True)
x = decoder_gru(x, initial_state = encoded_source)
x = layers.Dropout(0.5)(x)
target_next_step = layers.Dense(voca_size, activation = 'softmax')(x) # 다음 토큰 예측
seq2seq_rnn = keras.Model([source, past_target], target_next_step)

In [52]:
# 컴파일 & 훈련
seq2seq_rnn.compile(optimizer = 'rmsprop',
				   loss = 'sparse_categorical_crossentropy',
				   metrics = ['accuracy'])
seq2seq_rnn.fit(train_ds, epochs= 15, validation_data = val_ds)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7ac5d07c9360>

In [54]:
# 추론
import numpy as np

# 예측 인덱스 문자열 토큰으로 변환하는 딕셔너리 준비
spa_voca = target_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_voca)), spa_voca))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
	tokenized_input_sentence = source_vectorization([input_sentence])
	decoded_sentence = "[start]"
	for i in range(max_decoded_sentence_length):
		tokenized_target_sentence = target_vectorization([decoded_sentence])

		# 다음 토큰 샘플링
		next_token_predictions = seq2seq_rnn.predict(
		[tokenized_input_sentence, tokenized_target_sentence])
		sampled_token_index = np.argmax(next_token_predictions[0, i, :])

		# 다음 토큰 -> 문자열로 변경 & 문장에 추가
		sampled_token = spa_index_lookup[sampled_token_index]
		decoded_sentence += " " + sampled_token
		if sampled_token == "[end]":
			break
	return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(20):
	input_sentence = random.choice(test_eng_texts)
	print("-")
	print(input_sentence)
	print(decode_sequence(input_sentence))

-
Tom is snoring.
[start]                    
-
The cat lay hidden in the bushes.
[start]                    
-
I watched a tennis match on TV.
[start]                    
-
What makes you think that Tom likes heavy metal?
[start]                    
-
She writes to her son every now and then.
[start]                    
-
He will wait.
[start]                    
-
You could be right, I suppose.
[start]                    
-
He picked out the best book.
[start]                    
-
They were sitting on the sofa in our living room.
[start]                    
-
I must talk to you.
[start]                    
-
Why don't you want to tell us the truth?
[start]                    
-
I can't allow that to happen.
[start]                    
-
He likes to sing popular songs.
[start]                    
-
The town was deserted by its inhabitants.
[start]                    
-
And that's how Tom met Mary.
[start]                    
-
I am lighting my cigar.
[start]                    
-
Tom

KeyboardInterrupt: ignored

## 트랜스포머 시퀀스 투 시퀀스 모델

In [None]:
class TransformerDecoder(layers.Layer):
	def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
		super().__init__(**kwargs)
		self.embed_dim = embed_dim
		self.dense_dim = dense_dim
		self.num_heads = num_heads
		self.attention_1 = layers.MultiHeadAttention(num_heads = num_heads,
		key_dim = embed_dim)
		self.attention_2 = layers.MultiHeadAttention(num_heads = num_heads,
		key_dim = embed_dim)
		self.attention_3 = layers.MultiHeadAttention(num_heads = num_heads,
		key_dim = embed_dim)
		self.dense_proj = keras.Sequential([layers.Dense(dense_dim, activation = 'relu'),
		layers.Dense(embed_dim),])
		self.layer_norm_1 = layers.LayerNormalization()
		self.layer_norm_2 = layers.LayerNormalization()
		self.layer_norm_3 = layers.LayerNormalization()
		self.supports_masking = True # 입력 마스킹을 출력으로 전달하게 함
		# layer.compute_mask()는 위 속성이 False이면 에러를 반환한다.
	def get_config(self):
		config = super().get_config()
		config.update({
		'embed_dim' : self.embed_dim,
		'num_heads' : self.num_heads,
		'dense_dim' : self.dense_dim
		  })
		return config

	# 코잘 마스킹 : 미래 타임스텝의 데이터를 사용하지 못하게 한다
	def get_casual_attention_mask(self, inputs):
		input_shape= tf.shape(inputs)
		batch_size, sequence_length = input_shape[0], input_shape[1]
		i = tf.range(sequence_length)[:, tf.newaxis]
		j = tf.range(sequence_length)
		mask = tf.cast(i >= j, dtype = 'int32') #
		mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))

		mult = tf.concat([tf.expand_dims(batch_size, -1),
						  tf.constant([1, 1], dtype = tf.int32)], axis = 0)
		return tf.tile(mask, mult)

	def call(self, inputs, encoder_outputs, mask = None):
		casual_mask = self.get_casual_attention_mask(inputs)

		if mask is not None:
		  padding_mask = tf.cast(
			  mask[:, tf.newaxis, :], dtype = 'int32'
		  )
    padding_mask = tf.minimum(padding_mask, casual_mask)

		attention_output_1 = self.attention_1(
			query = inputs,
			value = inputs,
			key = inputs,
			attention_mask = casual_mask
		)
		attention_output_1 = self.layernorm_1(inputs + attention_output_1)

		attention_output_2 = self.attention_2(
			query = attention_output_1,
			value = encoder_outputs,
			key = encoder_outputs,
			attention_mask = padding_mask, # 합친 마스킹을 소스 + 타깃 시퀀스를 연결시키는 2번째 어텐션 층에 전달
		)
		attention_output_2 = self.layernorm_2(
			attention_output_1 + attention_output_2
		)

		proj_output = self.dense_proj(attention_output_2)
		return self.layernorm_3(attention_output_2 + proj_output)

In [None]:
# 엔드 투 엔드 트랜스포머
embed_dim = 256
dense_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape = (None,), dtype = 'int64', name = 'english')
x = PositionalEmbedding(sequence_length, voca_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape = (None, ), dtype = 'int64', name = 'spanish')
x = PositionalEmbedding(sequence_length, voca_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(ebmed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocas_size, activation = 'softmax')(x)

transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

transformer.compile(
					 optimizer = 'rmsprop',
					 loss = 'sparse_categorical_crossentropy',
					 metrics = ['accuracy']
)
transformer.fit(train_ds, epochs = 30, validation_data = val_ds)

In [None]:
import numpy as np

spa_voca = target_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_voca)), spa_voca))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
	tokenized_input_sentence = source_vectorization([input_sentence])
	decoded_sentence = "[start]"
	for i in range(max_decoded_sentence_length):
		tokenized_target_sentence = target_vectorization(
		[decoded_sentence])[:, :-1]
		predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
		sampled_token_index = np.argmax(predictions[0, i, :])

		# 다음 토큰 예측 문자열로 바꾸고 생성된 문장에 추가
		sampled_token = spa_index_lookup[sampled_token_index]
		decoded_sentence += " " + sampled_token
		if sampled_token == "[end]":
			break
	return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(20):
	input_sentence = random.choice(test_eng_Texts)
	print("-")
	print(input_sentence)
	print(decode_sequence(input_sentence))