In [1]:
import tensorflow as tf
from tensorflow.keras import layers as tl
from tensorflow.data import Dataset
import tensorflow_datasets as tfds
from pathlib import Path
import numpy as np


In [2]:
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
filepath = tf.keras.utils.get_file('shakespeare.txt', url)

with open(filepath) as f:
  text = f.read()


In [3]:
print(text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [4]:
text_vec_layer = tl.TextVectorization(split='character', standardize='lower')
text_vec_layer.adapt([text])
encoded = text_vec_layer([text])[0]

2023-11-29 11:04:53.650436: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2023-11-29 11:04:53.650455: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2023-11-29 11:04:53.650458: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2023-11-29 11:04:53.650495: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-29 11:04:53.650708: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-11-29 11:04:53.769220: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [5]:
n_tokens = text_vec_layer.vocabulary_size()
dataset_size = len(encoded)

In [6]:
def to_window(sequence, length):
  ds = Dataset.from_tensor_slices(sequence) # character id list로 부터 dataset을 만든다.
  ds = ds.window(length + 1, shift=1, drop_remainder=True) # length + 1 size로 한칸씩 오른쪽으로 이동하는 window dataset list의 데이터셋을 만든다.
  ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
  return ds


def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
  ds = to_window(sequence, length)

  if shuffle:
    ds = ds.shuffle(buffer_size=100_000, seed=seed)

  ds = ds.batch(batch_size)
  return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)




sample = tf.range(10) + 1


sample_ds = to_window(sample, 6)
sample_ds = sample_ds.batch(2)
for data in sample_ds:
  print(data)

dataset = to_dataset(sample, 6, batch_size=2)

for data in dataset:
  print(data)

tf.Tensor(
[[1 2 3 4 5 6 7]
 [2 3 4 5 6 7 8]], shape=(2, 7), dtype=int32)
tf.Tensor(
[[ 3  4  5  6  7  8  9]
 [ 4  5  6  7  8  9 10]], shape=(2, 7), dtype=int32)
(<tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[1, 2, 3, 4, 5, 6],
       [2, 3, 4, 5, 6, 7]], dtype=int32)>, <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[2, 3, 4, 5, 6, 7],
       [3, 4, 5, 6, 7, 8]], dtype=int32)>)
(<tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[3, 4, 5, 6, 7, 8],
       [4, 5, 6, 7, 8, 9]], dtype=int32)>, <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[ 4,  5,  6,  7,  8,  9],
       [ 5,  6,  7,  8,  9, 10]], dtype=int32)>)


In [7]:
length = 100
tf.random.set_seed(42)

train_set = to_dataset(encoded[:100_000], length=length, shuffle=True, seed=42, batch_size=64)
valid_set = to_dataset(encoded[100_000:106_000], length=length, batch_size=64)
test_set = to_dataset(encoded[106_000:120_000], length=length, batch_size=64)

In [8]:
# test with simple GRU model

model = tf.keras.Sequential([
    tl.Embedding(input_dim=n_tokens, output_dim=16),
    tl.GRU(128, return_sequences=True),
    tl.Dense(n_tokens, activation='softmax')
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          656       
                                                                 
 gru (GRU)                   (None, None, 128)         56064     
                                                                 
 dense (Dense)               (None, None, 41)          5289      
                                                                 
Total params: 62009 (242.22 KB)
Trainable params: 62009 (242.22 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    'checkpoints/my_shakespeare_model', monitor='val_accuracy', save_best_only=True)

skip = True

if not skip:
    history = model.fit(train_set, validation_data=valid_set, epochs=1, callbacks=[model_ckpt]) # epoch should be at least 10


In [10]:
generation_model = tf.keras.Sequential(
    [
        text_vec_layer,
        model
    ]
)

In [11]:
y_proba = generation_model.predict(['to be or not to b'])[0, -1]
y_pred = tf.argmax(y_proba)
text_vec_layer.get_vocabulary()[y_pred]



'j'

In [12]:
def next_char(text, temperature=1):
  pred = generation_model.predict([text])
  # print(pred.shape) # 넣은 문자의 각각 다음 문자가 예측되어 나온다. text가 18개면 shape 이 (1, 18, 41) 이다.
  y_proba = pred[0, -1:] # 마지막 것만 사용한다.
  # print(y_proba.shape)
  rescaled_logits = tf.math.log(y_proba) / temperature
  char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0,0]
  return text_vec_layer.get_vocabulary()[char_id]

In [13]:
def extend_text(text, n_chars=50, temperature=1):
  for _ in range(n_chars):
    text += next_char(text, temperature)
  return text

In [14]:
print(extend_text('to be or not to b', temperature=0.01))

to be or not to bn,z3ypy pfdt'3j.vpich3jdbr yfzapbkh::cq3j?$ptsb


In [15]:
# 감성 분석

raw_train_set, raw_valid_set, raw_test_set = tfds.load(
    name='imdb_reviews',
    split=['train[:90%]', 'train[90%:]', 'test'],
    as_supervised=True
)

In [16]:
tf.random.set_seed(42)

In [17]:
list(raw_train_set.take(1))

2023-11-29 11:04:59.676217: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


[(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">,
  <tf.Tensor: shape=(), dtype=int64, numpy=0>)]

In [18]:
batch_size = 64

train_set = raw_train_set.shuffle(5000, seed=42).batch(batch_size).prefetch(1)
valid_set = raw_valid_set.batch(batch_size).prefetch(1)
test_set = raw_valid_set.batch(batch_size).prefetch(1)

In [19]:
X, y = list(raw_train_set.take(1))[0]
print(X)

vocab_size = 1000

text_vec_layer = tl.TextVectorization(max_tokens=vocab_size)
text_vec_layer.adapt(train_set.map(lambda reviews, labels: reviews))


tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)


2023-11-29 11:04:59.715840: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [20]:
embed_size = 128
tf.random.set_seed(42)

model = tf.keras.Sequential([
    text_vec_layer,
    tl.Embedding(vocab_size, embed_size),
    tl.GRU(128),
    tl.Dense(1, activation='sigmoid')
])
X, y = list(train_set.take(1))[0]
print('X shape', X.shape)
output1 = model.layers[0](X)
print('vectorize result:', output1.shape, output1[0])
output2 = model.layers[1](output1)
print('embedding result: ', output2.shape, output2[0])
output3 = model.layers[2](output2)
print('GRU result:', output3.shape)
output4 = model.layers[3](output3)
print('final result:', output4.shape)




X shape (64,)
vectorize result: (64, 926) tf.Tensor(
[ 10  42 366 143  36   2   1   1   5   1   1 142 753  15 563   6 161   4
   1 913  21   2  18 669   2   1   7  43   1 391  34 772   3   1   1   1
   1  30  34 772   3   1   1   5  93  17   4   1   1  13  22  12  10 178
   6   1  56 120 132  19  10 209  36   4 385 514  17   4 707 409  71 129
 459 138   1   1  21   2   1 151   1 368  87  69 181 332  69  28   1   6
  80 136 194 321   1  26 832 952   1   5   1  30   4 146   1   1  23 340
   6 543 123  97  79 107  39   2 377  30 263   6   1 219   4 169   5  82
   1 888  12 321   1  24   1   1   5 405   1   1 781  24   1  65 517   3
   1  43  87  65 120   8  65   1   1   7   1  13   2 146   1   5   2  18
 206  43  49   7   1  99  30  43  11   1   1   1   3   1   1  65 517   3
 258   6 972  11 843   5   4 394 514   1   1   9  74  30   4   1  58 656
 476   7   1   3   9  69 188 168   1   8  12 415 111   1   1 107 480   4
   1  16   1   4   1   1   1  24   1  79 107   2 436   1   5   4 211 51

In [21]:
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])

skip = True

if not skip:
    with tf.device('/cpu:0'):  # using gpu results in very bad performance for this task.
        history = model.fit(train_set, validation_data=valid_set, epochs=1)  # result is bad because of 0 vectors

In [22]:
# masked version

inputs = tl.Input(shape=[], dtype=tf.string)
token_ids = text_vec_layer(inputs)
mask = tf.math.not_equal(token_ids, 0)
Z = tl.Embedding(vocab_size, embed_size)(token_ids)
Z = tl.GRU(128, dropout=0.2)(Z, mask=mask)
outputs = tl.Dense(1, activation='sigmoid')(Z)
model = tf.keras.Model(inputs=[inputs], outputs=[outputs])

In [23]:
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])

skip = True

if not skip:
    with tf.device('/cpu:0'):  # using gpu results in very bad performance for this task.
            history = model.fit(train_set, validation_data=valid_set, epochs=2)

In [24]:
import os
import tensorflow_hub as hub

os.environ['TFHUB_CACHE_DIR'] = 'cache'
model = tf.keras.Sequential([
    hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4',
                   trainable=True, dtype=tf.string, input_shape=[]),
    tl.Dense(64, activation='relu'),
    tl.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])


In [25]:
skip = True

if not skip:
    with tf.device('/cpu:0'):  # using gpu results in very bad performance for this task.
        model.fit(train_set, validation_data=valid_set, epochs=1)

In [26]:
# NMT 

url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file('spa-eng.zip', origin=url, cache_dir='datasets', extract=True)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [27]:
text = (Path(path).with_name('spa-eng') / 'spa.txt').read_text()

In [28]:
text = text.replace('¡', '').replace('¿','')
pairs = [line.split('\t') for line in text.splitlines()]
np.random.seed(42)
np.random.shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)

In [29]:
for i in range(3):
    print(f'{sentences_en[i]} => {sentences_es[i]}')

How boring! => Qué aburrimiento!
I love sports. => Adoro el deporte.
Would you like to swap jobs? => Te gustaría que intercambiemos los trabajos?


In [30]:
vocab_size = 1000
max_length = 50

text_vec_layer_en = tl.TextVectorization(vocab_size, output_sequence_length=max_length)
text_vec_layer_es = tl.TextVectorization(vocab_size, output_sequence_length=max_length)

text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt([f'startofseq {s} endofseq' for s in sentences_es])

In [31]:
text_vec_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [32]:
text_vec_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']

In [33]:
X_train = tf.constant(sentences_en[:100_000])
X_valid = tf.constant(sentences_en[100_000:])

X_train_dec = tf.constant([f'startofseq {s}' for s in sentences_es[:100_000]])
X_valid_dec = tf.constant([f'startofseq {s}' for s in sentences_es[100_000:]])

Y_train = text_vec_layer_es([f'{s} endofseq' for s in sentences_es[:100_000]])
Y_valid = text_vec_layer_es([f'{s} endofseq' for s in sentences_es[100_000:]])


In [34]:
print(X_train.shape)
print(X_train.shape)

(100000,)
(100000,)


In [35]:
tf.random.set_seed(42)  # 추가 코드 - CPU에서 재현성 보장
encoder_inputs = tl.Input(shape=[], dtype=tf.string)
decoder_inputs = tl.Input(shape=[], dtype=tf.string)

embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

encoder_embedding_layer = tl.Embedding(vocab_size, embed_size, mask_zero=True)
decoder_embedding_layer = tl.Embedding(vocab_size, embed_size, mask_zero=True)

encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

encoder = tl.LSTM(512, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
print('encoder state:', encoder_state)

decoder = tl.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

output_layer = tl.Dense(vocab_size, activation='softmax')
y_proba = output_layer(decoder_outputs)

model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[y_proba])

encoder state: [<KerasTensor: shape=(None, 512) dtype=float32 (created by layer 'lstm')>, <KerasTensor: shape=(None, 512) dtype=float32 (created by layer 'lstm')>]


In [36]:
optimizer = tf.keras.optimizers.legacy.Nadam(
    learning_rate=0.05)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [37]:
print(Y_train.shape)
model.summary()

(100000, 50)
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 text_vectorization_2 (Text  (None, 50)                   0         ['input_2[0][0]']             
 Vectorization)                                                                                   
                                                                                                  
 text_vectorization_3 (Text  (None, 50)                   0         ['input_3[0

In [39]:
skip = True

if not skip:
    model.fit((X_train, X_train_dec), Y_train, epochs=5, validation_data=((X_valid, X_valid_dec), Y_valid))

In [40]:
def translate(model, sentence_en):
    translation = ''
    for word_idx in range(max_length):
        X = np.array([sentence_en]) # batch 1로 되게끔 감쌈
        X_dec = np.array(["startofseq " + translation]) # batch 1로 되게끔 감쌈
        y_proba = model.predict((X, X_dec))[0, word_idx]
        predicted_word_id = np.argmax(y_proba)
        predicted_word = text_vec_layer_es.get_vocabulary()[predicted_word_id]
        if predicted_word == 'endofseq':
            break
        translation += " " + predicted_word
    return translation.strip()

In [41]:
translate(model, 'I like you')



'[UNK] [UNK]'

In [59]:
causal_mask = tf.linalg.band_part(
    tf.ones((10, 10), tf.bool), -1, 0
)
causal_mask

<tf.Tensor: shape=(10, 10), dtype=bool, numpy=
array([[ True, False, False, False, False, False, False, False, False,
        False],
       [ True,  True, False, False, False, False, False, False, False,
        False],
       [ True,  True,  True, False, False, False, False, False, False,
        False],
       [ True,  True,  True,  True, False, False, False, False, False,
        False],
       [ True,  True,  True,  True,  True, False, False, False, False,
        False],
       [ True,  True,  True,  True,  True,  True, False, False, False,
        False],
       [ True,  True,  True,  True,  True,  True,  True, False, False,
        False],
       [ True,  True,  True,  True,  True,  True,  True,  True, False,
        False],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        False],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True]])>