# NLP with RNN and Attention

### shakespeare dataset

In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt


shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare


In [4]:
print(len(shakespeare_text))
print(shakespeare_text[:40])

1115394
First Citizen:
Before we proceed any fur


In [5]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [6]:
tokenizer.texts_to_sequences(["first"])

[[20, 6, 9, 8, 3]]

In [7]:
tokenizer.sequences_to_texts([[20, 6, 9, 8,3]])

['f i r s t']

In [8]:
max_id = len(tokenizer.word_index)
dataset_size = tokenizer.document_count

In [9]:
print(max_id)
print(dataset_size)

39
1115394


In [10]:
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

In [11]:
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

2022-07-16 11:15:39.826903: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-16 11:15:39.859292: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-16 11:15:39.859478: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-16 11:15:39.860046: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [12]:
n_steps = 100
window_length = n_steps + 1
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [13]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [15]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [16]:
dataset = dataset.map(lambda X_batch, y_batch:(tf.one_hot(X_batch, depth=max_id), y_batch))

In [17]:
dataset = dataset.prefetch(1)

In [19]:
model = keras.models.Sequential([
    keras.layers.LSTM(128, return_sequences=True, input_shape=[None, max_id],
                     dropout=0.2,recurrent_dropout=0.2), # input shape is [None, max_id] because time_step size can be any!
    keras.layers.LSTM(128, return_sequences=True,
                     dropout=0.2,recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])



In [20]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, epochs=20)

Epoch 1/20
Epoch 2/20

KeyboardInterrupt: 

In [23]:
def preprocess(texts):
    X = np.array(tokenizer.texts_to_sequences(texts)) -1
    return tf.one_hot(X, max_id)

In [24]:
X_new = preprocess(["How are yo"])
y_pred = np.argmax(model.predict(X_new), axis=-1)
print(tokenizer.sequences_to_texts(y_pred + 1)[0][-1])

u


In [32]:
def next_char(text, temperature=1):
    X_new = preprocess([text])
    y_proba = model.predict(X_new)[0, -1:, :] # use last probability for getting next char!
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [33]:
def complete_text(text, n_chars=50, temperature=1):
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

In [38]:
print(complete_text("I", temperature=1))

In humblion by a mouth: sing gods
my father, let us


In [42]:
# for split data!!!!!
batch_size=32
encoded_parts = np.array_split(encoded[:train_size], batch_size)# array_split splits the dataset into batch_size segments
datasets = []
for encoded_part in encoded_parts:
    dataset = tf.data.Dataset.from_tensor_slices(encoded_part)
    dataset = dataset.window(window_length, shift = n_steps, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(window_length))
    datasets.append(dataset)
dataset = (tf.data.Dataset.zip(tuple(datasets))).map(lambda *windows: tf.stack(windows))

dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(lambda X_batch, y_batch: (tf.one_hot(X_batch, depth=max_id), y_batch))
dataset = dataset.prefetch(1)

In [44]:
model = keras.models.Sequential([
    keras.layers.LSTM(128, return_sequences=True, stateful=True, dropout=0.2, batch_input_shape=[batch_size, None, max_id]), # recurrnet_dropout=0.2), ## for hardware acceleration
    keras.layers.LSTM(128, return_sequences=True, stateful=True, dropout=0.2), # recurrnet_dropout=0.2), ## for hardware acceleration
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax")),
])

In [45]:
class ResetStatesCallback(keras.callbacks.Callback):
    def on_epoch_begin(self, epoch, logs):
        self.model.reset_states()

In [54]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
model.fit(dataset, epochs=200, callbacks=[ResetStatesCallback()])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x7fb264fde200>

In [55]:
# 가중치 복사하여 여러 배치 처리
stateless_model = keras.models.Sequential([ 
    keras.layers.LSTM(128, return_sequences=True, input_shape=[None, max_id]), # a stateless model!! input shape's batch_size is not important
    keras.layers.LSTM(128, return_sequences=True), # dropout is not necessary!!
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
])

가중치를 복사하려면 먼저 (가중치를 만들기 위해) 모델을 빌드합니다:

In [56]:
stateless_model.build(tf.TensorShape([None, None, max_id]))

In [57]:
stateless_model.set_weights(model.get_weights()) #이것만 부르면 다 됨!! -> stat 신경 쓸 필요 X

In [58]:
model = stateless_model
print(complete_text("i"))

im: be sorrow by my own
as i did practise your thit


## IMDB dataset

In [59]:
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
X_train[0][:10]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [61]:
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {id_ + 3: word for word, id_ in word_index.items()} #items returns the word and id tokens!
for id_, token in enumerate(("<pad>", "<sos>", "<unk>")):
    id_to_word[id_] = token
print(" ".join([id_to_word[id_] for id_ in X_train[0][:20]]))

<sos> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you


In [65]:
import tensorflow_datasets as tfds
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_size = info.splits["train"].num_examples

In [67]:
for data in datasets["train"].take(5):
    print(data)

(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on

2022-07-18 10:19:00.207690: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [64]:
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.regex_replace(X_batch, b"<br\\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-z']", b" ")
    X_batch = tf.strings.split(X_batch) # 단어 조각이 나옴
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch # 패딩 씌워서 작은 문장 처리!

In [69]:
# 어휘 사전 구축
from collections import Counter
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy())) # numpy for real-value, list to make it into list!

In [70]:
vocabulary.most_common()[:3]

[(b'<pad>', 214316), (b'the', 61137), (b'a', 38562)]

In [71]:
len(vocabulary)

54104

In [72]:
# 가장 많이 등장한 단어 10000개만 사용!
vocab_size = 10000
truncated_vocabulary = [word for word, count in vocabulary.most_common()[:vocab_size]] # Counter-> most_common returns sorted array!

In [75]:
# preprocessing word dictionary
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64) # same as range
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids) # give ids for preallocated words -> give keys to vocabs
num_oov_buckets = 1000 # number of unknown words
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [76]:
table.lookup(tf.constant([b"dongho is Fantastic".split()]))

<tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[10729,     7,  6018]])>

In [77]:
def encode_words(X_batch, y_batch):
    return table.lookup(X_batch), y_batch

train_set = datasets["train"].batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [80]:
embed_size = 256
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None]), # input shape is None because it will be N * 300 -> will be determined
    keras.layers.LSTM(256, return_sequences=True),
    keras.layers.LSTM(256),
    keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [81]:
history = model.fit(train_set, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


#### Masking

In [83]:
# use mask_zero = True !
embed_size = 256
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None],
                          mask_zero=True), # input shape is None because it will be N * 300 -> will be determined
    keras.layers.LSTM(256, return_sequences=True),
    keras.layers.LSTM(256),
    keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [84]:
test_set = datasets["test"].batch(32).map(preprocess)
test_set = test_set.map(encode_words).prefetch(1)

In [85]:
model.evaluate(test_set)



[1.579132080078125, 0.7292400002479553]

In [86]:
# 직점 마스킹을 처리!
K = keras.backend
inputs = keras.layers.Input(shape=[None])
mask = keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)
z = keras.layers.Embedding(vocab_size+num_oov_buckets, embed_size)(inputs)
z = keras.layers.LSTM(256, return_sequences=True)(z, mask=mask) # give mask like this!! dim between z and mask is different!
z = keras.layers.LSTM(256)(z, mask=mask)
outputs = keras.layers.Dense(1, activation="sigmoid")(z)

model = keras.Model(inputs=[inputs], outputs=[outputs])

In [87]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Using Pretrained Embeddings

In [92]:
# use tf hub!!
import tensorflow_hub as hub

model = keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
                    dtype=tf.string, input_shape=[], output_shape=[50]), # returns one vector for one sentence!
    keras.layers.Dense(128, activation="relu", kernel_initializer="he_normal"),
    keras.layers.Dense(1, activation="sigmoid")
])

In [96]:
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_size = info.splits["train"].num_examples
batch_size = 32
train_set = datasets["train"].batch(batch_size).prefetch(1)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Encoder-decoder Network for Translation

In [107]:
import tensorflow_addons as tfa

vocab_size = 100
embed_size = 10

encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32) # don't know the size of the sentence yet
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

embeddings = keras.layers.Embedding(vocab_size, embed_size) # initialize class ilke this!
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs) # use same embedding like this!

encoder = keras.layers.LSTM(512, return_state=True) # return state!
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

sampler = tfa.seq2seq.sampler.TrainingSampler()

decoder_cell = keras.layers.LSTMCell(512)
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler, output_layer = output_layer)

final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings, initial_state=encoder_state, sequence_length=sequence_lengths)

Y_proba = tf.nn.softmax(final_outputs.rnn_output)

model = keras.Model(inputs=[encoder_inputs, decoder_inputs, sequence_lengths], outputs=[Y_proba])

In [108]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
X = np.random.randint(100, size=10*1000).reshape(1000, 10)
Y = np.random.randint(100, size=15*1000).reshape(1000, 15)
X_decoder = np.c_[np.zeros((1000, 1)), Y[:, :-1]]
seq_lengths = np.full([1000], 15)

history = model.fit([X, X_decoder, seq_lengths], Y, epochs=2)

Epoch 1/2
Epoch 2/2


### 양방향 순환 층

In [109]:
layer = keras.layers.Bidirectional(keras.layers.GRU(10, return_sequences=True)) #이것만 하면 됨!

### 빔 검색

```
beam_width = 10
decoder = tfa.seq2seq.beam_search_decoder.BeamSearchDecoder(
            cell=decoder_cell, beam_width=beam_width, output_layer=output_layer) # 훈련된걸 감쌈?
decoder_initial_state = tfa.seq2seq.beam_search_decoder.tile_batch(
            encoder_state, multiplier=beam_width)
outputs, _, _ = decoder(embedding_decoder, start_tokens=start_tokens,end_token=endtoken, initial_state=decoder_initial_state
```

## Attention

```
attention_mechanism = tfa.seq2seq.attention_wrapper.LuongAttention(
    units, encoder_state, memory_sequence_length=encoder_sequence_length)
attention_decoder_cell = tfa.seq2seq.attention_wrapper.AttentionWrapper(
    decoder_cell, attention_mechanism, attention_layer_size=n_units)
```

* Visual Attention

## Transformer : Attentions is all you need

In [125]:
# position encoding
class PositionalEncoding(keras.layers.Layer):
    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        if max_dims % 2 == 1: max_dims+=1
        p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))
        pos_embed = np.empty((1, max_steps, max_dims))
        pos_embed[0, :, ::2] = np.sin(p / 10000 ** (2*i / max_dims)).T
        pos_embed[0, :, 1::2] = np.cos(p / 10000 ** (2*i / max_dims)).T
        self.positional_embedding = tf.constant(pos_embed.astype(self.dtype))
        pass
    def call(self, inputs):
        shape = tf.shape(inputs)
        return inputs + self.positional_embedding[:, :shape[-2], :shape[-1]] # use broadcasting!!

In [126]:
# First layers of transformer
embed_size = 512
max_steps = 500
vocab_size = 10000

encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
embeddings =keras.layers.Embedding(vocab_size, embed_size) # share embedding layer!
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

positional_embedding = PositionalEncoding(max_steps, max_dims=embed_size) # share encoding layer!
encoder_in = positional_embedding(encoder_embeddings)
decoder_in = positional_embedding(decoder_embeddings)

In [127]:
# 트렌스포머의 나머지 부분
Z = encoder_in
for N in range(6):
    Z = keras.layers.Attention(use_scale=True)([Z, Z])
    
encoder_outputs = Z

Z = decoder_in
for N in range(6):
    Z = keras.layers.Attention(use_scale=True, causal=True)([Z, Z])
    Z = keras.layers.Attention(use_scale=True)([Z, encoder_outputs])
    
outputs = keras.layers.TimeDistributed(keras.layers.Dense(vocab_size, activation="softmax"))(Z)