#### TensorFlow data storage and preprocessing

In [1]:
import numpy as np
import tensorflow as tf
# Switch every visible GPU to on-demand memory allocation and echo the
# resulting setting for each device.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print('gpu', gpu)
    tf.config.experimental.set_memory_growth(device=gpu, enable=True)
    growth_enabled = tf.config.experimental.get_memory_growth(gpu)
    print('memory growth:', growth_enabled)

gpu PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
memory growth: True


In [2]:
# tf.data.Dataset.from_tensor_slices: the source tensor is kept in memory and
# the dataset yields one element per slice along its first axis.
x = tf.range(1000)
dataset = tf.data.Dataset.from_tensor_slices(tensors=x)
print(dataset)

<TensorSliceDataset shapes: (), types: tf.int32>


In [3]:
# Iterate over the dataset and print each element; every element is a scalar
# tf.Tensor, and this loop prints all 1000 of them.
for tensor in dataset:
    print(tensor)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(11, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(13, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(15, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(17, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(19, shape=(), dtype=int32)
tf.Tensor(20, shape=(), dtype=int32)
tf.Tensor(21, shape=(), dtype=int32)
tf.Tensor(22, shape=(), dtype=int32)
tf.Tensor(23, shape=(), dtype=int32)
tf.Tensor(24, shape=(), dtype=int32)
tf.Tensor(25, shape=(), dtype=int32)
tf.Tensor(26, shape=(), dtype=int32)
tf.Tensor(2

In [4]:
# Shuffle with a small 10-element buffer (fixed seed), then group the stream
# into batches of 32, discarding a final batch that is not full.
dataset = (
    dataset
    .shuffle(buffer_size=10, seed=42)
    .batch(batch_size=32, drop_remainder=True)
)

In [5]:
def double(batch):
    """Return the given dataset element with every value multiplied by two."""
    return batch * 2


# NOTE: this rebinds `dataset`, so re-running the cell applies the doubling again.
dataset = dataset.map(double)
for tensor in dataset:
    print(tensor)

tf.Tensor(
[10  4 16 22 14 18 24  0 20 28 32  6 42 40 38 46 34 12 54 56 26 60 48  2
 64 66  8 62 36 52 30 78], shape=(32,), dtype=int32)
tf.Tensor(
[ 82  76  58  88  72  84  70  86  92 100 102  90  80 104  50 112 114 106
  68  94 108  98 118 122 130  44 134 136 126 110 128 140], shape=(32,), dtype=int32)
tf.Tensor(
[132 116 148 152  74 156 120 142 158 150 162 144 170 160 154 174 168 180
 172 146 124 178 188 192 138  96 164 198 202 196 166 182], shape=(32,), dtype=int32)
tf.Tensor(
[210 204 212 208 186 190 194 214 218 184 220 200 230 206 222 236 238 232
 240 216 250 244 234 248 258 176 228 246 226 256 224 260], shape=(32,), dtype=int32)
tf.Tensor(
[264 276 278 254 282 268 270 266 242 280 252 286 274 290 288 262 306 302
 300 304 308 314 312 272 322 284 316 296 330 294 328 318], shape=(32,), dtype=int32)
tf.Tensor(
[332 310 338 324 342 320 346 292 354 348 356 336 298 340 350 326 358 334
 344 366 364 372 352 384 378 388 374 370 376 360 398 400], shape=(32,), dtype=int32)
tf.Tensor(
[382 39

### Numeric data normalization
Subtract the mean and divide by the standard deviation (implemented here as a custom Keras layer with NumPy statistics, comparable to Scikit-Learn's `StandardScaler`)

In [6]:
class Standardization(tf.keras.layers.Layer):
    """Keras layer that standardizes its inputs with statistics learned by `adapt`.

    Call `adapt(data_sample)` once before using the layer; `call` then returns
    `(inputs - mean) / (std + epsilon)` using the stored statistics.
    """

    def adapt(self, data_sample):
        """Compute and store the mean and standard deviation of a data sample.

        Args:
            data_sample: array-like; statistics are taken along axis 0.

        Sets:
            self.means_: mean along axis 0 (the reduced axis is kept with length 1).
            self.stds_: standard deviation along axis 0 (same shape as `means_`).
        """
        # keepdims=True keeps the reduced axis with length 1 so the statistics
        # broadcast against the inputs in `call`.
        self.means_ = np.mean(data_sample, axis=0, keepdims=True)
        self.stds_ = np.std(data_sample, axis=0, keepdims=True)

    def call(self, inputs):
        """Standardize `inputs` with the statistics stored by `adapt`.

        Args:
            inputs: values to standardize; must broadcast against `means_` / `stds_`.

        Returns:
            `(inputs - means_) / (stds_ + epsilon)`; the Keras backend epsilon
            is added to the denominator to avoid division by zero.
        """
        return (inputs - self.means_) / (self.stds_ + tf.keras.backend.epsilon())
    
#std_layer = Standardization()
#std_layer.adapt(data_sample)

### Categorical data 
With one-hot encoding

In [8]:
# Known categories map to their position in `vocab`; anything else is assigned
# to one of `num_oov_buckets` extra out-of-vocabulary buckets.
num_oov_buckets = 2
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(keys=vocab, values=indices)
table = tf.lookup.StaticVocabularyTable(
    initializer=table_init,
    num_oov_buckets=num_oov_buckets,
)

In [9]:
# Translate category strings to integer ids; strings that are not in the
# vocabulary ("DESERT", "BYLAND") receive ids from the out-of-vocabulary buckets.
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "BYLAND"])
cat_indices = table.lookup(keys=categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 6])>

In [10]:
# One column per vocabulary entry plus one per out-of-vocabulary bucket.
one_hot_depth = len(vocab) + num_oov_buckets
cat_one_hot = tf.one_hot(indices=cat_indices, depth=one_hot_depth)
cat_one_hot

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1.]], dtype=float32)>