In [1]:
import os

import tensorflow as tf
import tensorflow_datasets as tfds

# `display` and `HTML` belong to the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Stretch the notebook container to the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# Fetch every file through `tf.keras.utils.get_file` and keep the returned paths.
downloaded_paths = [
    tf.keras.utils.get_file(file_name, origin=DIRECTORY_URL + file_name)
    for file_name in FILE_NAMES
]

# The directory of the last download is used as the location of all files.
text_dir = downloaded_paths[-1]
parent_dir = os.path.dirname(text_dir)
parent_dir

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt


'/root/.keras/datasets'

In [3]:
def labeler(example, index):
    """Pair `example` with `index` cast to `tf.int64`.

    Returns:
        A `(example, label)` tuple where `label` is `tf.cast(index, tf.int64)`.
    """
    label = tf.cast(index, tf.int64)
    return example, label

def _labeled_lines(file_name, label):
    """Return a line dataset for `file_name` with every line paired with `label`."""
    file_path = os.path.join(parent_dir, file_name)
    return tf.data.TextLineDataset(file_path).map(lambda line: labeler(line, label))

# One labeled dataset per file; the label is the file's position in FILE_NAMES.
labeled_data_sets = [
    _labeled_lines(file_name, label)
    for label, file_name in enumerate(FILE_NAMES)
]

In [4]:
# Inspect the list of per-file labeled datasets.
labeled_data_sets

[<MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>]

In [5]:
# Buffer size passed to `Dataset.shuffle`.
BUFFER_SIZE = 50000
# Number of examples per padded batch.
BATCH_SIZE = 64
# Number of examples taken for the test split; the remainder is used for training.
TAKE_SIZE = 5000

In [6]:
# Chain the per-file datasets into a single dataset, in list order.
all_labeled_data, *other_data_sets = labeled_data_sets
for labeled_dataset in other_data_sets:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

# Shuffle with `reshuffle_each_iteration=False` so the order is not reshuffled
# on every new iteration over the dataset.
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

In [7]:
# Peek at the first five (text, label) pairs.
for labeled_example in all_labeled_data.take(5):
    print(labeled_example)

(<tf.Tensor: shape=(), dtype=string, numpy=b'Wast thou as timid, tell me, when with those'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'counsel, father Jove has sent me to bear you this message--so long as'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'trying hard to embrace his knees, for he would fain live, not die.'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Great glory hath shed; now headlong on the Greeks'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'through. Therefore there can be no understanding between you and me,'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)


In [9]:
tokenizer = tfds.features.text.Tokenizer()
vocabulary_set = set()

# Dry run on a single example: show the raw line, its tokens, and add the
# tokens to the vocabulary.
for text_tensor, _ in all_labeled_data.take(1):
    raw_line = text_tensor.numpy()
    print(raw_line)
    some_tokens = tokenizer.tokenize(raw_line)
    print(some_tokens)
    vocabulary_set.update(some_tokens)

b'Wast thou as timid, tell me, when with those'
['Wast', 'thou', 'as', 'timid', 'tell', 'me', 'when', 'with', 'those']


In [10]:
tokenizer = tfds.features.text.Tokenizer()

# Collect every distinct token produced by tokenizing each line of the dataset.
vocabulary_set = {
    token
    for line_tensor, _label in all_labeled_data
    for token in tokenizer.tokenize(line_tensor.numpy())
}

vocab_size = len(vocabulary_set)
vocab_size

17178

In [11]:
# Build a token -> integer id encoder from the collected vocabulary.
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

# Take the text component of the first dataset element as a sample line.
first_labeled_example = next(iter(all_labeled_data))
example_text = first_labeled_example[0].numpy()
print(example_text)

b'Wast thou as timid, tell me, when with those'


In [12]:
# Encode the sample line with the encoder and show the resulting integer ids.
encoded_example = encoder.encode(example_text)
print(encoded_example)

[245, 14032, 8721, 2068, 15472, 1779, 4996, 3677, 7111]


In [13]:
def encode(text_tensor, label):
    """Encode one text tensor with the module-level `encoder`.

    Calls `.numpy()` on `text_tensor`, so it needs a tensor that holds a
    concrete value.

    Returns:
        `(encoded_text, label)` with `label` passed through unchanged.
    """
    raw_text = text_tensor.numpy()
    return encoder.encode(raw_text), label

이 함수를 데이터 세트의 각 요소에 적용하기 위해 ```Dataset.map``` 을 사용하려고 합니다. 하지만 ```Dataset.map``` 은 그래프 모드에서 실행됩니다.
- 그래프 텐서는 값이 없습니다.
- 그래프 모드에서는 TensorFlow Ops 및 함수만 사용할 수 있습니다.

따라서 이 함수를 직접 ```.map``` 할 수 없습니다. ```tf.py_function``` 으로 래핑해야 합니다. ```tf.py_function``` 은 래핑된 파이썬 함수에 일반 텐서(값이 있고 ```.numpy()``` 로 접근할 수 있는 텐서)를 전달합니다.

In [15]:
def encode_map_fn(text, label):
    """Wrap `encode` in `tf.py_function` so it can be used with `Dataset.map`.

    Returns:
        The `(encoded_text, label)` pair as Tensor objects with their static
        shapes set.
    """
    # tf.py_function does not set the shape of the tensors it returns.
    py_outputs = tf.py_function(
        encode,
        inp=[text, label],
        Tout=(tf.int64, tf.int64),
    )
    encoded_text, label = py_outputs

    # `tf.data.Datasets` work best if all components have a shape set,
    # so declare them by hand: `[None]` for the encoded text, `[]` for the label.
    encoded_text.set_shape([None])
    label.set_shape([])

    return encoded_text, label

# Apply the py_function-backed wrapper to every (text, label) element.
all_encoded_data = all_labeled_data.map(encode_map_fn)

In [16]:
# Training split: everything after the first TAKE_SIZE examples, shuffled and
# grouped into padded batches.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
)

# Test split: the first TAKE_SIZE examples, grouped into padded batches.
test_data = all_encoded_data.take(TAKE_SIZE).padded_batch(BATCH_SIZE)

In [17]:
# Pull the first padded batch from the test split and show its first example.
first_test_batch = next(iter(test_data))
sample_text, sample_labels = first_test_batch

sample_text[0], sample_labels[0]

(<tf.Tensor: shape=(15,), dtype=int64, numpy=
 array([  245, 14032,  8721,  2068, 15472,  1779,  4996,  3677,  7111,
            0,     0,     0,     0,     0,     0])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>)

In [18]:
# Second example of the same batch, for comparison with the first.
sample_text[1], sample_labels[1]

(<tf.Tensor: shape=(15,), dtype=int64, numpy=
 array([ 4018,  5531,  3414,  8775,   954,  1779, 13093,  8409,  4324,
        12279,  2569,  1008,  8541,  8721,     0])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=2>)

In [19]:
# Size the embedding table from the encoder itself: besides the tokens in
# `vocabulary_set`, it reserves id 0 for padding and an extra id for
# out-of-vocabulary tokens, and `encoder.vocab_size` accounts for both.
# Plain assignment (instead of `+= 1`) also keeps this cell idempotent when
# it is re-run.
vocab_size = encoder.vocab_size

In [20]:
# Start from an empty Sequential model; layers are added in the following cells.
model = tf.keras.Sequential()

In [21]:
# Embedding layer: `vocab_size` possible integer ids, each mapped to a 64-dimensional vector.
model.add(tf.keras.layers.Embedding(vocab_size, 64))

In [22]:
# Bidirectional wrapper around an LSTM layer with 64 units.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

In [23]:
# One or more dense layers.
# Edit `hidden_layer_sizes` to experiment with layer sizes.
hidden_layer_sizes = [64, 64]
for units in hidden_layer_sizes:
    model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer: one logit per label. Labels are the file indices assigned when
# the per-file datasets were built, so the count is derived from FILE_NAMES
# instead of being hard-coded.
model.add(tf.keras.layers.Dense(len(FILE_NAMES)))

In [24]:
# The output layer has no activation, so the loss is told to expect logits.
sparse_ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(
    optimizer='adam',
    loss=sparse_ce_loss,
    metrics=['accuracy'],
)

In [25]:
# Train for 3 epochs on train_data, validating against test_data.
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fd66466f518>

In [26]:
# Evaluate on the test split; with the compiled metrics this returns loss and accuracy.
eval_loss, eval_acc = model.evaluate(test_data)

# Report both values rounded to three decimals.
summary_template = '\nEval loss: {:.3f}, Eval accuracy: {:.3f}'
print(summary_template.format(eval_loss, eval_acc))


Eval loss: 0.358, Eval accuracy: 0.841
