In [31]:
import tensorflow as tf

import tensorflow_datasets as tfds
import os

In [32]:
# Base URL and file names of the three text files to download.
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# Fetch every file via Keras' download helper, which returns the local path.
local_paths = [
    tf.keras.utils.get_file(name, origin=DIRECTORY_URL + name)
    for name in FILE_NAMES
]

# Despite its name, `text_dir` is the path of the last downloaded *file*;
# the directory holding all of the files is its parent.
text_dir = local_paths[-1]
parent_dir = os.path.dirname(text_dir)

parent_dir

'/root/.keras/datasets'

In [33]:
# List the download directory to confirm that the text files are present.
! ls {parent_dir}

HIGGS.csv.gz   butler.txt  derby.txt	  flower_photos		mnist.npz
auto-mpg.data  cowper.txt  fashion-mnist  flower_photos.tar.gz


In [34]:
def labeler(example, index):
  """Pair an example with its label.

  Args:
    example: The dataset element to label; returned unchanged.
    index: The label value; it is cast to `tf.int64`.

  Returns:
    A tuple `(example, label)` where `label` is `index` cast to `tf.int64`.
  """
  label = tf.cast(index, tf.int64)
  return example, label

# Build one line-level dataset per file and tag every line with the position
# of its file in FILE_NAMES (0, 1, 2), which serves as the class label.
labeled_data_sets = []
for label_index, file_name in enumerate(FILE_NAMES):
  file_path = os.path.join(parent_dir, file_name)
  labeled_data_sets.append(
      tf.data.TextLineDataset(file_path).map(
          lambda line: labeler(line, label_index)))

In [35]:
# Size of the shuffle buffer passed to `Dataset.shuffle`.
BUFFER_SIZE = 50000
# Number of examples per padded batch.
BATCH_SIZE = 64
# Number of examples taken from the front of the shuffled data as the test
# split; the remainder (everything after skipping these) is the training split.
TAKE_SIZE = 5000

In [36]:
# Chain the per-file datasets into a single dataset.
first_dataset, *remaining_datasets = labeled_data_sets
all_labeled_data = first_dataset
for labeled_dataset in remaining_datasets:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

# Shuffle once and keep that order on every later iteration
# (reshuffle_each_iteration=False).
all_labeled_data = all_labeled_data.shuffle(
    buffer_size=BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

In [37]:
# Print the first five (text, label) pairs as a sanity check.
for labeled_example in all_labeled_data.take(5):
  print(labeled_example)

(<tf.Tensor: shape=(), dtype=string, numpy=b"Behind the steeds pitch'd headlong, and expired;">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'To whom great Hector of the glancing helm:'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Tom from his arms by violence away.'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'dangling in the mid firmament. So far am I above all others either of'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"Was plash'd with blood the axle, and the rails">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)


In [38]:
# `tfds.features.text` was deprecated and later removed from TFDS; the same
# classes live under `tfds.deprecated.text`. Use whichever namespace the
# installed release provides so the cell runs on both old and new versions.
try:
  text_api = tfds.deprecated.text
except AttributeError:
  text_api = tfds.features.text

# Collect the set of unique tokens that occur anywhere in the labeled data.
tokenizer = text_api.Tokenizer()
vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
  # Split the raw line (bytes) into tokens.
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

# Number of distinct tokens (reserved ids such as padding are not counted).
vocab_size = len(vocabulary_set)

# Display the tokens of the last line processed, as an example.
some_tokens

['the',
 'Gods',
 'forbidden',
 'by',
 'Jupiter',
 'On',
 'the',
 'contrary',
 'the',
 'Trojans',
 'see',
 'their']

In [39]:
# `tfds.features.text` was deprecated and later removed from TFDS; the same
# classes live under `tfds.deprecated.text`. Use whichever namespace the
# installed release provides so the cell runs on both old and new versions.
try:
  text_api = tfds.deprecated.text
except AttributeError:
  text_api = tfds.features.text

# Build the encoder from the vocabulary collected above.
encoder = text_api.TokenTextEncoder(vocabulary_set)

# Encode the first example to check the text -> integer id mapping.
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text)
encoded_example = encoder.encode(example_text)
print(encoded_example)

b"Behind the steeds pitch'd headlong, and expired;"
[1171, 9546, 10241, 9677, 1248, 5669, 10570, 4662]


In [None]:
# This function cannot be passed to `Dataset.map` directly: inside `map` the
# argument is a symbolic tensor such as
# Tensor("args_0:0", shape=(), dtype=string), which carries no value, so
# `.numpy()` is not available there. It has to be wrapped into an op first.
def encode(text_tensor, label):
  """Encode one line of text into a list of integer token ids.

  Args:
    text_tensor: A tensor whose `.numpy()` value is the text to encode.
    label: The label of the example; returned unchanged.

  Returns:
    A tuple `(encoded_text, label)`.
  """
  raw_text = text_tensor.numpy()
  token_ids = encoder.encode(raw_text)
  return token_ids, label

def encode_map_fn(text, label):
  """Wrap `encode` in `tf.py_function` so that it can be used in `Dataset.map`.

  Args:
    text: The text component of a dataset element.
    label: The label component of a dataset element.

  Returns:
    A tuple `(encoded_text, label)` of int64 tensors with static shapes
    `[None]` and `[]` respectively.
  """
  # Use py_function to turn the Python-level `encode` into an op.
  encoded_outputs = tf.py_function(
      encode, inp=[text, label], Tout=[tf.int64, tf.int64])
  encoded_text, encoded_label = encoded_outputs

  # Set the static shapes explicitly on the py_function outputs.
  encoded_text.set_shape([None])
  encoded_label.set_shape([])
  return encoded_text, encoded_label


all_encoded_data = all_labeled_data.map(encode_map_fn)

In [48]:
# Show the encoded token ids of the first example.
first_encoded_example = next(iter(all_encoded_data))
first_encoded_example[0]

<tf.Tensor: shape=(8,), dtype=int64, numpy=array([ 1171,  9546, 10241,  9677,  1248,  5669, 10570,  4662])>

In [49]:
# The first TAKE_SIZE examples form the test split; lines differ in length,
# so each batch is padded to the length of its longest example.
test_data = all_encoded_data.take(TAKE_SIZE).padded_batch(BATCH_SIZE)

# Everything after the first TAKE_SIZE examples is shuffled and used for
# training.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
)

In [50]:
# Pull the first test batch and show its first example; the trailing zeros in
# the token ids are the padding added by `padded_batch`.
first_test_batch = next(iter(test_data))
sample_text, sample_labels = first_test_batch

(sample_text[0], sample_labels[0])

(<tf.Tensor: shape=(16,), dtype=int64, numpy=
 array([ 1171,  9546, 10241,  9677,  1248,  5669, 10570,  4662,     0,
            0,     0,     0,     0,     0,     0,     0])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>)

In [51]:
# Size of the id space the Embedding layer must cover. Besides the tokens in
# `vocabulary_set`, the encoder reserves id 0 for padding and (by default) one
# extra id for out-of-vocabulary tokens, so `1 + len(vocabulary_set)` is one
# short whenever an unseen token is encoded. Ask the encoder for the real size.
vocab_size = encoder.vocab_size

In [52]:
# Embedding -> bidirectional LSTM -> dense stack -> 3-way logits.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    # One or more dense layers.
    # Edit the list in the comprehension to experiment with layer sizes.
    *[tf.keras.layers.Dense(units, activation='relu') for units in [64, 64]],
    # Output layer. The first argument is the number of labels.
    tf.keras.layers.Dense(3),
])

# The output layer has no activation, so the loss is computed from logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

# Train for three epochs, validating on the held-out split after each epoch.
model.fit(train_data, validation_data=test_data, epochs=3)

# Final loss and accuracy on the held-out split.
eval_metrics = model.evaluate(test_data)
eval_loss, eval_acc = eval_metrics

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))

Epoch 1/3
Epoch 2/3
Epoch 3/3

Eval loss: 0.389, Eval accuracy: 0.825
