## Data API

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [2]:
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)

2022-06-27 22:19:57.954697: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-27 22:19:57.979404: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-27 22:19:57.979566: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-27 22:19:57.980175: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [3]:
dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [4]:
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [5]:
dataset = dataset.repeat(3).batch(7)

In [6]:
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [7]:
dataset = dataset.map(lambda x: x*2, num_parallel_calls=12)

In [8]:
# Split every batch back into individual elements. `Dataset.unbatch()` is the
# supported replacement for the deprecated `tf.data.experimental.unbatch()`.
dataset = dataset.unbatch()

Instructions for updating:
Use `tf.data.Dataset.unbatch()`.


In [9]:
dataset = dataset.filter(lambda x : x<18)

In [10]:
for item in dataset.take(3):
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


#### Data Shuffling

In [11]:
dataset = tf.data.Dataset.range(10).repeat(3)
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64)
tf.Tensor([3 6], shape=(2,), dtype=int64)


### Data Preprocessing

In [12]:
X_mean, X_std = 1.0, 0.4

In [13]:
n_inputs = 8

def preprocess(line):
    """Parse one CSV line into a scaled feature tensor and a target tensor.

    The line is decoded as `n_inputs` float columns (default 0.) followed by
    one float32 column whose default is an empty tensor.

    Args:
        line: a CSV record, passed straight to `tf.io.decode_csv`.

    Returns:
        A tuple `(features, target)`: the first `n_inputs` columns stacked and
        scaled as `(features - X_mean) / X_std`, and the last column stacked
        into a one-element tensor.
    """
    feature_defaults = [0. for _ in range(n_inputs)]
    target_default = tf.constant([], dtype=tf.float32)
    columns = tf.io.decode_csv(
        line, record_defaults=feature_defaults + [target_default])

    # Every column but the last is an input feature; the last is the target.
    *feature_columns, target_column = columns
    features = tf.stack(feature_columns)
    target = tf.stack([target_column])

    scaled_features = (features - X_mean) / X_std
    return scaled_features, target

In [14]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5, n_read_threads=None,
                       shuffle_buffer_size=10000, n_parse_threads=5,
                       batch_size=32):
    """Build a shuffled, batched dataset of preprocessed CSV records.

    Args:
        filepaths: file pattern(s) passed to `tf.data.Dataset.list_files`.
        repeat: number of times the list of files is repeated.
        n_readers: `cycle_length` of the interleave step, i.e. how many files
            are read from in turn.
        n_read_threads: `num_parallel_calls` for the interleave step.
        shuffle_buffer_size: buffer size handed to `Dataset.shuffle`.
        n_parse_threads: `num_parallel_calls` for the `preprocess` map step.
        batch_size: number of parsed records per batch.

    Returns:
        A dataset of batches of whatever `preprocess` returns, with
        `prefetch(1)` applied.
    """
    dataset = tf.data.Dataset.list_files(filepaths).repeat(repeat)
    # skip(1) drops the first line of every file (presumably the CSV header).
    # The keyword is `num_parallel_calls`; the previous `num_parallel_cells`
    # spelling raised a TypeError as soon as the function was called.
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    return dataset.batch(batch_size).prefetch(1)

### Using tf.keras

```
train_set = csv_reader_dataset(train_filepaths)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)
```

```
model = keras.models.Sequential([...])
model.compile([...])
model.fit(train_set, epochs=10, validation_data=valid_set) # can fit without X and y
```

In [15]:
@tf.function
def train(model, optimizer, loss_fn, n_epochs, **kwargs):
    """Run a custom training loop over the CSV training set.

    Args:
        model: callable model exposing `losses` and `trainable_variables`.
        optimizer: optimizer exposing `apply_gradients`.
        loss_fn: loss called as `loss_fn(y_true, y_pred)`; its result is
            averaged with `tf.reduce_mean`.
        n_epochs: number of passes over the data, forwarded as `repeat` to
            `csv_reader_dataset`.
        **kwargs: extra keyword arguments forwarded to `csv_reader_dataset`.
    """
    # NOTE(review): `train_filepaths` is read from the notebook's global scope
    # and must be defined before this function is called.
    train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, **kwargs)

    # Each dataset element is already a batch, unpacked into inputs and targets.
    for X_batch, y_batch in train_set:
        with tf.GradientTape() as tape:
            y_pred = model(X_batch)
            # Targets come first: losses are called as (y_true, y_pred).
            # The loss is computed per instance and then averaged.
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            # Add the model's extra losses (e.g. regularization terms).
            loss = tf.add_n([main_loss] + model.losses)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

## TFRecord Format

In [16]:
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    f.write(b"This is the first record")
    f.write(b"And this is the second record")

In [17]:
filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths) # can input more than one filepaths!
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


In [18]:
from tensorflow.train import BytesList, FloatList, Int64List, Features, Feature, Example

In [19]:
person_example = Example(
    features=Features(
        feature={
            "name" : Feature(bytes_list=BytesList(value=[b"Alice"])),
            "id" : Feature(int64_list=Int64List(value=[123])),
            "emails" : Feature(bytes_list=BytesList(value=[b"a@b.com",
                                                           b"c@d.com"]))
        }))

In [20]:
with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    f.write(person_example.SerializeToString())


In [21]:
feature_description = {
    "name" : tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id" : tf.io.FixedLenFeature([], tf.int64, default_value=0),
    "emails" : tf.io.VarLenFeature(tf.string),
}

for serialized_example in tf.data.TFRecordDataset(["my_contacts.tfrecord"]):
    parsed_example = tf.io.parse_single_example(serialized_example, feature_description)
    print(parsed_example)

{'emails': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7efe1c3ed240>, 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alice'>}


In [22]:
parsed_example["emails"].values

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.com', b'c@d.com'], dtype=object)>

## 입력 특성 전처리 

In [23]:
X_train = np.arange(36).reshape(6, 6)

In [24]:
means = np.mean(X_train, axis=0, keepdims=True)
stds = np.std(X_train, axis=0, keepdims=True)
eps = keras.backend.epsilon()

In [25]:
model = keras.models.Sequential([
    keras.layers.Lambda(lambda inputs: (inputs - means) / (stds + eps)),
    #... other layers
])

In [26]:
print(means)
print(stds)
print(eps)

[[15. 16. 17. 18. 19. 20.]]
[[10.24695077 10.24695077 10.24695077 10.24695077 10.24695077 10.24695077]]
1e-07


In [27]:
# Custom (user-defined) standardization layer
class Standardization(keras.layers.Layer):
    """Layer that standardizes its inputs with statistics stored by `adapt`."""

    def adapt(self, data_sample):
        """Store the mean and standard deviation of `data_sample`.

        Both statistics are reduced along axis 0 with `keepdims=True` and kept
        on the layer as `means_` and `stds_`.
        """
        reduce_options = {"axis": 0, "keepdims": True}
        self.means_ = np.mean(data_sample, **reduce_options)
        self.stds_ = np.std(data_sample, **reduce_options)

    def call(self, inputs):
        """Return `(inputs - means_) / (stds_ + epsilon)`.

        `adapt` must have been called first so that the statistics exist.
        """
        centered = inputs - self.means_
        # The Keras backend epsilon keeps the divisor away from zero.
        scale = self.stds_ + keras.backend.epsilon()
        return centered / scale

In [28]:
std_layer = Standardization()
std_layer.adapt(X_train)

In [29]:
model = keras.Sequential()
model.add(std_layer)
# make model
model.compile()

In [30]:
norm_layer = keras.layers.Normalization()
norm_layer.adapt(X_train)
model.add(norm_layer)

In [31]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

In [32]:
X_train[:5]

array([[ 3.52140000e+00,  1.50000000e+01,  3.04994451e+00,
         1.10654828e+00,  1.44700000e+03,  1.60599334e+00,
         3.76300000e+01, -1.22430000e+02],
       [ 5.32750000e+00,  5.00000000e+00,  6.49005964e+00,
         9.91053678e-01,  3.46400000e+03,  3.44333996e+00,
         3.36900000e+01, -1.17390000e+02],
       [ 3.10000000e+00,  2.90000000e+01,  7.54237288e+00,
         1.59152542e+00,  1.32800000e+03,  2.25084746e+00,
         3.84400000e+01, -1.22980000e+02],
       [ 7.17360000e+00,  1.20000000e+01,  6.28900256e+00,
         9.97442455e-01,  1.05400000e+03,  2.69565217e+00,
         3.35500000e+01, -1.17700000e+02],
       [ 2.05490000e+00,  1.30000000e+01,  5.31245745e+00,
         1.08509190e+00,  3.29700000e+03,  2.24438393e+00,
         3.39300000e+01, -1.16930000e+02]])

In [33]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/rickiepark/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into `housing_path`.

    Args:
        housing_url: URL of the `.tgz` archive to download.
        housing_path: directory that receives both `housing.tgz` and the
            extracted files; created if it does not exist.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    # NOTE(review): members of the downloaded archive are extracted as-is;
    # only point this at a trusted URL.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

In [34]:
fetch_housing_data()

In [35]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read `housing.csv` from `housing_path` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [36]:
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [37]:
vocab = ['<1H OCEAN', 'INLAND',  'NEAR OCEAN', 'NEAR BAY', 'ISLAND']
indices = tf.range(len(vocab), dtype=tf.int64)
print(indices)
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices) # initialize table 

tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64)


In [38]:
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

In [39]:
#one-hot-vector encoding
categories = tf.constant(['NEAR OCEAN', 'NEAR BAY', 'ISLAND', "DESERT"]) # with one unknown
cat_indices = table.lookup(categories)

In [40]:
print(cat_indices)

tf.Tensor([2 3 4 5], shape=(4,), dtype=int64)


In [41]:
cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)
print(cat_one_hot)

tf.Tensor(
[[0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]], shape=(4, 7), dtype=float32)


In [42]:
text_vec = keras.layers.TextVectorization()

In [43]:
text_vec.adapt(housing["ocean_proximity"])

In [44]:
text_vec(housing["ocean_proximity"])

<tf.Tensor: shape=(20640, 2), dtype=int64, numpy=
array([[5, 6],
       [5, 6],
       [5, 6],
       ...,
       [4, 0],
       [4, 0],
       [4, 0]])>

In [45]:
embedding_dim = 2
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)

In [46]:
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.19123924, 0.07143486],
       [0.20266116, 0.5653901 ],
       [0.10385895, 0.5766828 ],
       [0.7025596 , 0.9996774 ],
       [0.7523098 , 0.0173403 ],
       [0.75574756, 0.68333125],
       [0.8093147 , 0.61791205]], dtype=float32)>

In [47]:
tf.nn.embedding_lookup(embedding_matrix, cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.10385895, 0.5766828 ],
       [0.7025596 , 0.9996774 ],
       [0.7523098 , 0.0173403 ],
       [0.75574756, 0.68333125]], dtype=float32)>

In [48]:
embedding = keras.layers.Embedding(input_dim = len(vocab)+num_oov_buckets, output_dim=embedding_dim)
print(embedding(cat_indices))

tf.Tensor(
[[-0.04380808 -0.02991445]
 [ 0.01114583 -0.02278848]
 [ 0.00278368 -0.01677831]
 [ 0.00017037 -0.01561464]], shape=(4, 2), dtype=float32)


In [49]:
# Two-input functional model: 8 numeric features plus one string category
# that is mapped to an id and then to a trainable 2-D embedding.
regular_inputs = keras.layers.Input(shape=[8])
categories = keras.layers.Input(shape=[], dtype=tf.string)
# Wrap the lookup table in a Lambda layer so it can be used in the functional API.
cat_indices = keras.layers.Lambda(lambda cats: table.lookup(cats))(categories)
# The table can emit len(vocab) + num_oov_buckets distinct ids, so the
# embedding needs that many rows; a smaller input_dim leaves the last
# out-of-vocabulary bucket without a row.
cat_embed = keras.layers.Embedding(
    input_dim=len(vocab) + num_oov_buckets, output_dim=2)(cat_indices)
encoded_inputs = keras.layers.concatenate([regular_inputs, cat_embed])
outputs = keras.layers.Dense(1)(encoded_inputs)
model = keras.models.Model(inputs=[regular_inputs, categories], outputs=[outputs])

## 케라스 전처리 층 

```
normalization = keras.layers.Normalization()
discretization = keras.layers.Discretization([...])
pipeline = keras.layers.PreprocessingStage([normalization, discretization])
pipeline.adapt(data_sample)
```

#### Tf transformation

In [50]:
#import tensorflow_transform as tft

```
def preprocess(inputs):
    median_age = inputs["housing_median_age"]
    ocean_proximity = inputs["ocean_proximity"]
    standardized_age = tft.scale_to_z_score(median_age)
    ocean_proximity_id = tft.compute_and_apply_vocabulary(ocean_proximity)
    return {
        "standardized_median_age" : standardized_age,
        "ocean_proximity_id" : ocean_proximity_id
    }
```

## Tensorflow Dataset Projects

In [51]:
import tensorflow_datasets as tfds

dataset = tfds.load(name="mnist")
mnist_train, mnist_test = dataset["train"], dataset["test"]

In [52]:
type(mnist_train)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset

In [53]:
# Shuffle with a 10000-element buffer, group into batches of 32, and prefetch
# one batch.
# NOTE(review): prefetch buffers upcoming batches for the consumer; nothing in
# this cell places data on a GPU — confirm before relying on that.
mnist_train = mnist_train.shuffle(10000).batch(32).prefetch(1)

In [54]:
# Convert each element from a dict to an (image, label) tuple.
mnist_train = mnist_train.map(lambda items: (items["image"], items["label"]))
# Prefetch one element of the mapped dataset.
# NOTE(review): this buffers data ahead of the consumer; a GPU transfer is
# not shown in this cell — confirm.
mnist_train = mnist_train.prefetch(1)

#### A much simpler way

In [55]:
dataset = tfds.load(name="mnist", batch_size=32, as_supervised=True)
mnist_train = dataset["train"].prefetch(1)

In [60]:
def _regularized_dense(units, activation):
    """Dense layer with He-normal init, an L2(0.01) penalty and MaxNorm(5)."""
    return keras.layers.Dense(
        units,
        activation=activation,
        kernel_initializer="he_normal",
        kernel_regularizer=keras.regularizers.L2(0.01),
        kernel_constraint=keras.constraints.MaxNorm(5),
    )

# Flatten the images, then two ELU hidden layers and a 10-way softmax output.
mnist_layers = [keras.layers.Flatten(input_shape=[28, 28])]
mnist_layers.append(_regularized_dense(300, "elu"))
mnist_layers.append(_regularized_dense(100, "elu"))
mnist_layers.append(_regularized_dense(10, "softmax"))
model = keras.models.Sequential(mnist_layers)

sgd = keras.optimizers.SGD(learning_rate=0.001)
model.compile(loss="sparse_categorical_crossentropy", optimizer=sgd,
              metrics=["accuracy"])
# The dataset already yields (image, label) batches, so it is passed to fit directly.
model.fit(mnist_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7efaf0284070>

In [61]:
model.save("mnist_train.h5")