In [42]:
import tensorflow as tf
import keras

#Turn tensor into dataset
X = tf.range(10) # any data tensor
dataset = tf.data.Dataset.from_tensor_slices(X)
print("Dataset")
for item in dataset:
    print(item)

#Transform chains (slices -> repeat x3 -> batch 7)
dataset = dataset.repeat(3).batch(7)
print("Chained")
for item in dataset:
    print(item)

#Maps (Operate on each data instance)
print("Mapped")
dataset_map = dataset.map(lambda x: x * 2,     
    num_parallel_calls=tf.data.experimental.AUTOTUNE) 
print(dataset_map)

#TAke pieces
print("Taken")
for item in dataset.take(3):
    print(item)


Dataset
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
Chained
tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)
Mapped
<ParallelMapDataset shapes: (None,), types: tf.int32>
Taken
tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)


In [43]:
#Everyday we're shuffling (not good for large data)
dataset = tf.data.Dataset.range(10).repeat(3) # 0 to 9, three times
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64)
tf.Tensor([3 6], shape=(2,), dtype=int64)


In [44]:
#Some preprocessing code
n_inputs = 8
def preprocess(line, X_mean, X_std):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return (x - X_mean) / X_std, y

print(preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,\
    37.47,-122.2,2.782', 0, 1))

#Together a CSV reader
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
        n_read_threads=None, shuffle_buffer_size=10000,
        n_parse_threads=5, batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths)
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    return dataset.batch(batch_size).prefetch(1)


(<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([   4.2083,   44.    ,    5.3232,    0.9171,  846.    ,    2.337 ,
         37.47  , -122.2   ], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)


In [45]:
#Most stuff in the OneNote document

In [46]:
#Writing a TFRecord file
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

person_example = Example(
    features=Features(
        feature={
            "name": Feature(bytes_list=BytesList(value=[b"Alice"])),
            "id": Feature(int64_list=Int64List(value=[123])),
            "emails": Feature(bytes_list=BytesList(value=[b"a@b.com",
                        b"c@d.com"]))
    }))
with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    f.write(person_example.SerializeToString())

feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
    "emails": tf.io.VarLenFeature(tf.string),
}

for serialized_example in tf.data.TFRecordDataset(["my_contacts.tfrecord"]):
    parsed_example = tf.io.parse_single_example(serialized_example,
            feature_description)

tf.sparse.to_dense(parsed_example["emails"], default_value=b"")

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.com', b'c@d.com'], dtype=object)>

In [47]:
#One hot encoding
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)
print(cat_one_hot)

tf.Tensor(
[[0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]], shape=(4, 7), dtype=float32)


In [51]:
#Embedding
embedding_dim = 2
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
print(tf.nn.embedding_lookup(embedding_matrix, cat_indices))

embedding = keras.layers.Embedding(input_dim=len(vocab) + num_oov_buckets,
    output_dim=embedding_dim)
print(embedding(cat_indices))

#Full keras model with embedded categorical features
regular_inputs = keras.layers.Input(shape=[8])
categories = keras.layers.Input(shape=[], dtype=tf.string)
cat_indices = keras.layers.Lambda(lambda cats: table.lookup(cats))  (categories)
cat_embed = keras.layers.Embedding(input_dim=6, output_dim=2)(cat_indices)
encoded_inputs = keras.layers.concatenate([regular_inputs, cat_embed])
outputs = keras.layers.Dense(1)(encoded_inputs)
model = keras.models.Model(inputs=[regular_inputs, categories],
    outputs=[outputs])
print(model)

tf.Tensor(
[[0.7547101  0.3076043 ]
 [0.94363654 0.22225738]
 [0.65452874 0.57899797]
 [0.65452874 0.57899797]], shape=(4, 2), dtype=float32)
tf.Tensor(
[[-0.0123139   0.00910963]
 [ 0.00032943  0.00796934]
 [ 0.01987575  0.04442216]
 [ 0.01987575  0.04442216]], shape=(4, 2), dtype=float32)
<tensorflow.python.keras.engine.functional.Functional object at 0x000001A2D4D228E0>


In [54]:
#TF Transform
import tensorflow_transform as tft

def preprocess(inputs): # inputs = a batch of input features
    median_age = inputs["housing_median_age"]
    ocean_proximity = inputs["ocean_proximity"]
    standardized_age = tft.scale_to_z_score(median_age)
    ocean_proximity_id = tft.compute_and_apply_vocabulary(ocean_proximity)
    return {
        "standardized_median_age": standardized_age,
        "ocean_proximity_id": ocean_proximity_id
    }


In [56]:
#Some TF datasets
import tensorflow_datasets as tfds

dataset = tfds.load(name="mnist")
mnist_train, mnist_test = dataset["train"], dataset["test"]

mnist_train = mnist_train.shuffle(10000).batch(32).prefetch(1)
for item in mnist_train:
    images = item["image"]
    labels = item["label"]

mnist_train = mnist_train.shuffle(10000).batch(32)
mnist_train = mnist_train.map(lambda items: (items["image"], items["label"]))
mnist_train = mnist_train.prefetch(1)

dataset = tfds.load(name="mnist", batch_size=32, as_supervised=True)
mnist_train = dataset["train"].prefetch(1)
model = keras.models.Sequential([...])
model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd")
model.fit(mnist_train, epochs=5)



Dl Completed...: 0 url [00:00, ? url/s][A

Dl Size...: 0 MiB [00:00, ? MiB/s][A[A


Extraction completed...: 0 file [00:00, ? file/s]
Dl Size...: 0 MiB [00:00, ? MiB/s]
Dl Completed...: 0 url [00:00, ? url/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s][A

Generating train examples...: 0 examples [00:00, ? examples/s][A[A

Generating train examples...: 12 examples [00:00, 119.60 examples/s][A[A[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\Cameron James\tensorflow_datasets\mnist\3.0.1...[0m





Generating train examples...: 146 examples [00:00, 834.87 examples/s][A[A

Generating train examples...: 261 examples [00:00, 977.04 examples/s][A[A

Generating train examples...: 393 examples [00:00, 1110.71 examples/s][A[A

Generating train examples...: 543 examples [00:00, 1249.31 examples/s][A[A

Generating train examples...: 685 examples [00:00, 1305.74 examples/s]

TypeError: The added layer must be an instance of class Layer. Found: Ellipsis